From 7936c5283885d2ae119d40af6ce71c9291703365 Mon Sep 17 00:00:00 2001 From: "shenping.matt" Date: Mon, 9 May 2022 15:53:05 +0800 Subject: [PATCH 0001/5344] bytedance: driver: smith: high cpu usage due to udp_recvmsg kretprobe High threshold value caused endless loop of on/off switching, so kretprobe would be installed right after unloading. Here we adjusted the thresholds according to the data from production servers, and also unified the judgement condition to udp rate to avoid possible periodic vibration. Signed-off-by: shenping.matt --- drivers/bytedance/smith/src/smith_hook.c | 69 ++++++++++++++---------- 1 file changed, 40 insertions(+), 29 deletions(-) diff --git a/drivers/bytedance/smith/src/smith_hook.c b/drivers/bytedance/smith/src/smith_hook.c index 75ad1c64b7f2a..67dd3b1b29d0a 100644 --- a/drivers/bytedance/smith/src/smith_hook.c +++ b/drivers/bytedance/smith/src/smith_hook.c @@ -1788,9 +1788,6 @@ int udp_recvmsg_entry_handler(struct kretprobe_instance *ri, struct msghdr *msg; struct udp_recvmsg_data *data; - /* counting udpv4_recvmsg callings */ - smith_count_dnsv4_kretprobe(); - data = (struct udp_recvmsg_data *) ri->data; #if LINUX_VERSION_CODE > KERNEL_VERSION(4, 0, 9) @@ -1871,6 +1868,10 @@ int udp_recvmsg_entry_handler(struct kretprobe_instance *ri, return -EINVAL; do_kretprobe: + + /* counting dns requests */ + smith_count_dnsv4_kretprobe(); + return 0; } @@ -1887,9 +1888,6 @@ int udpv6_recvmsg_entry_handler(struct kretprobe_instance *ri, void *tmp_sk; int flags; - /* counting udpv6_recvmsg callings */ - smith_count_dnsv6_kretprobe(); - data = (struct udp_recvmsg_data *)ri->data; #if LINUX_VERSION_CODE > KERNEL_VERSION(4, 0, 9) @@ -1985,6 +1983,10 @@ int udpv6_recvmsg_entry_handler(struct kretprobe_instance *ri, return -EINVAL; do_kretprobe: + + /* counting dns requests */ + smith_count_dnsv6_kretprobe(); + return 0; } #endif @@ -1994,8 +1996,8 @@ void unregister_udp_recvmsg_kprobe(void); int register_udpv6_recvmsg_kprobe(void); void unregister_udpv6_recvmsg_kprobe(void); -#define SMITH_DNS_THRESHOLD (5000) /* DNS threshold */ -#define SMITH_UDP_THRESHOLD (28000 * num_online_cpus()) /* UDP threshold */ +#define SMITH_DNS_THRESHOLD (800) /* DNS threshold: ops/s */ +#define SMITH_UDP_THRESHOLD (80000) /* UDP threshold: ops/s */ #define SMITH_DNS_KRP_INTERVAL (10) /* 10 seconds */ #define SMITH_DNS_NET_INTERVAL (20) /* 20 seconds, WARNING: atomic_t may overflow */ @@ -2004,7 +2006,6 @@ struct smith_dns_switch { atomic_t krp ____cacheline_aligned_in_smp; /* udp_recvmsg kretprobe handling count */ atomic_t ops ____cacheline_aligned_in_smp; /* udp in traffic count */ uint64_t start ____cacheline_aligned_in_smp; /* start time stamp of counting */ - uint64_t anchor ____cacheline_aligned_in_smp; /* anchor time stamp */ int *regs; /* kretprobe registration result */ int (*enable)(void); void (*disable)(void); @@ -2017,43 +2018,45 @@ static void smith_count_dnsv4_kretprobe(void) atomic_inc(&g_dns_v4_switch.krp); } +/* turn off kretprobe if dns requests exceed threshold */ static void smith_dns_kretprobe(struct smith_dns_switch *ds) { - uint64_t now = smith_get_seconds(); + uint64_t now = smith_get_seconds(), delta; - if (ds->start + SMITH_DNS_KRP_INTERVAL < now) { - if (atomic_read(&ds->krp) > SMITH_DNS_THRESHOLD * (now - ds->start)) { + delta = now - ds->start; + if (delta > SMITH_DNS_KRP_INTERVAL) { + if (atomic_read(&ds->krp) > delta * SMITH_DNS_THRESHOLD || + atomic_read(&ds->ops) > delta * SMITH_UDP_THRESHOLD ){ /* trying to avoid concurrent issue */ - if (atomic_cmpxchg(&ds->armed, 1, 0) == 1) { - atomic_set(&ds->ops, 0); - ds->anchor = smith_get_seconds(); + if (atomic_cmpxchg(&ds->armed, 1, 0) == 1) ds->disable(); - } } - atomic_set(&ds->krp, 0); /* inaccurate for SMP, but acceptable */ + atomic_set(&ds->krp, 0); + atomic_set(&ds->ops, 0); ds->start = smith_get_seconds(); } } +/* try to trun on kretprobe for udp_recvmsg */ static void smith_dns_try_switch(struct smith_dns_switch *ds) { - uint64_t now = smith_get_seconds(); + uint64_t now = smith_get_seconds(), delta; if (0 == DNS_HOOK || atomic_read(&ds->armed) || *(ds->regs)) return; - if (ds->anchor + SMITH_DNS_NET_INTERVAL < now) { - if (atomic_read(&ds->ops) < SMITH_UDP_THRESHOLD * (now - ds->anchor)) { + delta = now - ds->start; + if (delta > SMITH_DNS_NET_INTERVAL) { + if (atomic_read(&ds->ops) < delta * SMITH_UDP_THRESHOLD) { /* trying to avoid concurrent issue */ if (atomic_cmpxchg(&ds->armed, 0, 1) == 0) { - atomic_set(&ds->krp, 0); - ds->start = smith_get_seconds(); if (ds->enable()) atomic_set(&ds->armed, 0); } } - atomic_set(&ds->ops, 0); /* inaccurate for SMP, but acceptable */ - ds->anchor = smith_get_seconds(); + atomic_set(&ds->krp, 0); + atomic_set(&ds->ops, 0); + ds->start = smith_get_seconds(); } } @@ -2081,7 +2084,11 @@ static unsigned int smith_nf_udp_v4_handler( #endif ) { - atomic_inc(&g_dns_v4_switch.ops); + struct iphdr *iph = ip_hdr(skb); + + /* only counting udp packets */ + if (iph->protocol == IPPROTO_UDP) + atomic_inc(&g_dns_v4_switch.ops); return NF_ACCEPT; } @@ -2119,7 +2126,11 @@ static unsigned int smith_nf_udp_v6_handler( #endif ) { - atomic_inc(&g_dns_v6_switch.ops); + struct iphdr *iph = ip_hdr(skb); + + /* only counting udp packets */ + if (iph->protocol == IPPROTO_UDP) + atomic_inc(&g_dns_v6_switch.ops); return NF_ACCEPT; } #endif @@ -2184,10 +2195,10 @@ static int smith_dns_work_handler(void *argu) { unsigned long timeout = msecs_to_jiffies(1000); - /* reset anchor timestamp */ - g_dns_v4_switch.anchor = smith_get_seconds(); + /* reset start timestamp */ + g_dns_v4_switch.start = smith_get_seconds(); #if IS_ENABLED(CONFIG_IPV6) - g_dns_v6_switch.anchor = smith_get_seconds(); + g_dns_v6_switch.start = smith_get_seconds(); #endif do { From e39871f2bf530a420ea9b81f67b080b6cb268525 Mon Sep 17 00:00:00 2001 From: Gang Li Date: Mon, 20 Jun 2022 18:27:03 +0800 Subject: [PATCH 0002/5344] bytedance: driver: toa: fix uninitialized memory read in ip_option_to_four_tuple An uninitialized kernel memory read has been identified in method ip_option_to_four_tuple (defined in line 684 of toa.c), which handles packets with custom IP options. Specifically, the issue exists when incoming packets contain custom IP options with header.ipv4.operation or header.ipv6.optlen set to neither 0 or 1.This would result in the method skipping the memcpy lines, leaving any data from the allocated toa_map_cache uninitialized. Fix it by initializing dst to zero. ``` static int ip_option_to_four_tuple(int outside, struct ip_option* src, struct four_tuple* dst) { int inside = -1; if (outside == 0){ inside = src->header.ipv4.operation; // attacker-controlled value } else if (outside == 1){ if (src->header.ipv6.optlen == IPV6_HEADER_OPTION_IPV4_LEN) // attacker-controlled value inside = 0; else if (src->header.ipv6.optlen == IPV6_HEADER_OPTION_IPV6_LEN) // attacker-controlled value inside = 1; } if (inside == 0){ // skipped if inside is not 0 memset(dst, 0, sizeof(struct four_tuple)); memcpy(dst, src, IP_OPTION_IPV4_LEN); dst->type = 0; return IP_OPTION_IPV4_LEN; } else if (inside == 1){ // skipped if inside is not 1 memcpy(dst, src, IP_OPTION_IPV6_LEN); dst->type = 1; return IP_OPTION_IPV6_LEN; } return -1; // this return value is ignored } ``` Fixes: c035d78df981 ("bytedance: driver: add toa module") Signed-off-by: Gang Li --- drivers/bytedance/toa.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/bytedance/toa.c b/drivers/bytedance/toa.c index 8d0b9bb98fd89..ba64a99fcc0b9 100644 --- a/drivers/bytedance/toa.c +++ b/drivers/bytedance/toa.c @@ -695,8 +695,9 @@ static int ip_option_to_four_tuple(int outside, struct ip_option* src, struct fo inside = 1; } + memset(dst, 0, sizeof(*dst)); + if (inside == 0){ - memset(dst, 0, sizeof(struct four_tuple)); memcpy(dst, src, IP_OPTION_IPV4_LEN); dst->type = 0; From ebe36e7e3004fb5b9b430589d6d3ce01879588d4 Mon Sep 17 00:00:00 2001 From: Gang Li Date: Mon, 20 Jun 2022 18:44:03 +0800 Subject: [PATCH 0003/5344] bytedance: driver: toa: fix Out-of-bounds kernel memory read in ip_option_to_four_tuple An out-of-bounds read has been identified in method ip_option_to_four_tuple (defined in line 684 of toa.c), which handles packets with custom IP options. Specifically, the issue exists when the IP options field size of an incoming packet is shorter than 16 bytes for IPv4, or 40 bytes for IPv6, the memcpy line will end up copying any remaining data from the TCP header for IPv4, and beyond in the case of IPv6. Fix by checking option length. ``` static int ip_option_to_four_tuple(int outside, struct ip_option* src, struct four_tuple* dst) { int inside = -1; if (outside == 0){ inside = src->header.ipv4.operation; // attacker-controlled value } else if (outside == 1){ if (src->header.ipv6.optlen == IPV6_HEADER_OPTION_IPV4_LEN) // attacker-controlled value inside = 0; else if (src->header.ipv6.optlen == IPV6_HEADER_OPTION_IPV6_LEN) // attacker-controlled value inside = 1; } ... } ``` Fixes: c035d78df981 ("bytedance: driver: add toa module") Signed-off-by: Gang Li --- drivers/bytedance/toa.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/bytedance/toa.c b/drivers/bytedance/toa.c index ba64a99fcc0b9..af0119ae34b1e 100644 --- a/drivers/bytedance/toa.c +++ b/drivers/bytedance/toa.c @@ -686,7 +686,10 @@ static int ip_option_to_four_tuple(int outside, struct ip_option* src, struct fo int inside = -1; if (outside == 0){ - inside = src->header.ipv4.operation; + if (src->header.ipv4.operation == 0 && src->header.ipv4.length == IP_OPTION_IPV4_LEN) + inside = 0; + else if (src->header.ipv4.operation == 1 && src->header.ipv4.length == IP_OPTION_IPV6_LEN ) + inside = 1; } else if (outside == 1){ if (src->header.ipv6.optlen == IPV6_HEADER_OPTION_IPV4_LEN) From f42cf717707ceb4a7a9f350f8511852973e98f06 Mon Sep 17 00:00:00 2001 From: "shenping.matt" Date: Mon, 20 Jun 2022 18:08:56 +0800 Subject: [PATCH 0004/5344] bytedance: driver: smith: kretprobe events missing on 2-core KVM guest kretprobe events missing due to too small maxactive value: the default value of maxactive is set as num_possible_cpus() for non-preemptable systems, i.e. only 2 kretprobe instances are to be allocated for a 2-core system, and it's very likely to be used up for a cron task. Here we increase the minimum of maxactive to 32 to avoid possible missings. Signed-off-by: shenping.matt --- drivers/bytedance/smith/src/smith_hook.c | 97 ++++++++++++++---------- 1 file changed, 57 insertions(+), 40 deletions(-) diff --git a/drivers/bytedance/smith/src/smith_hook.c b/drivers/bytedance/smith/src/smith_hook.c index 67dd3b1b29d0a..e925e39ad1514 100644 --- a/drivers/bytedance/smith/src/smith_hook.c +++ b/drivers/bytedance/smith/src/smith_hook.c @@ -3403,22 +3403,20 @@ struct kprobe ptrace_kprobe = { .pre_handler = ptrace_pre_handler, }; -struct kretprobe udp_recvmsg_kretprobe_value = { +struct kretprobe udp_recvmsg_kretprobe = { .kp.symbol_name = "udp_recvmsg", .data_size = sizeof(struct udp_recvmsg_data), .handler = udp_recvmsg_handler, .entry_handler = udp_recvmsg_entry_handler, }; -struct kretprobe udp_recvmsg_kretprobe; #if IS_ENABLED(CONFIG_IPV6) -struct kretprobe udpv6_recvmsg_kretprobe_value = { +struct kretprobe udpv6_recvmsg_kretprobe = { .kp.symbol_name = "udpv6_recvmsg", .data_size = sizeof(struct udp_recvmsg_data), .handler = udp_recvmsg_handler, .entry_handler = udpv6_recvmsg_entry_handler, }; -struct kretprobe udpv6_recvmsg_kretprobe; struct kretprobe ip6_datagram_connect_kretprobe = { .kp.symbol_name = "ip6_datagram_connect", @@ -3582,10 +3580,31 @@ static struct notifier_block smith_usb_notifier = { .notifier_call = smith_usb_ncb, }; +/* + * set minimal of maxactive as 32 to avoid possible missings + * of kretprobe events, especially for NON-PREEMPTED systems + * with small number of CPU cores (ex: 2-core KVM guest) + */ +static int smith_register_kretprobe(struct kretprobe *kr) +{ + int ninsts = max_t(int, 32, 2 * num_present_cpus()); + if (kr->maxactive < ninsts) + kr->maxactive = ninsts; + return register_kretprobe(kr); +} + +static void smith_unregister_kretprobe(struct kretprobe *kr) +{ + unregister_kretprobe(kr); + + /* set addr to NULL to enable re-registeration */ + kr->kp.addr = NULL; +} + int register_bind_kprobe(void) { int ret; - ret = register_kretprobe(&bind_kretprobe); + ret = smith_register_kretprobe(&bind_kretprobe); if (ret == 0) bind_kprobe_state = 0x1; @@ -3595,7 +3614,7 @@ int register_bind_kprobe(void) void unregister_bind_kprobe(void) { - unregister_kretprobe(&bind_kretprobe); + smith_unregister_kretprobe(&bind_kretprobe); } int register_call_usermodehelper_exec_kprobe(void) @@ -3681,7 +3700,7 @@ void unregister_link_kprobe(void) int register_execve_kprobe(void) { int ret; - ret = register_kretprobe(&execve_kretprobe); + ret = smith_register_kretprobe(&execve_kretprobe); if (ret == 0) execve_kretprobe_state = 0x1; @@ -3692,7 +3711,7 @@ int register_execve_kprobe(void) int register_execveat_kprobe(void) { int ret; - ret = register_kretprobe(&execveat_kretprobe); + ret = smith_register_kretprobe(&execveat_kretprobe); if (ret == 0) execveat_kretprobe_state = 0x1; @@ -3704,7 +3723,7 @@ int register_execveat_kprobe(void) int register_compat_execve_kprobe(void) { int ret; - ret = register_kretprobe(&compat_execve_kretprobe); + ret = smith_register_kretprobe(&compat_execve_kretprobe); if (ret == 0) compat_execve_kretprobe_state = 0x1; @@ -3713,14 +3732,14 @@ int register_compat_execve_kprobe(void) void unregister_compat_execve_kprobe(void) { - unregister_kretprobe(&compat_execve_kretprobe); + smith_unregister_kretprobe(&compat_execve_kretprobe); } #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) int register_compat_execveat_kprobe(void) { int ret; - ret = register_kretprobe(&compat_execveat_kretprobe); + ret = smith_register_kretprobe(&compat_execveat_kretprobe); if (ret == 0) compat_execveat_kretprobe_state = 0x1; @@ -3729,20 +3748,20 @@ int register_compat_execveat_kprobe(void) void unregister_compat_execveat_kprobe(void) { - unregister_kretprobe(&compat_execveat_kretprobe); + smith_unregister_kretprobe(&compat_execveat_kretprobe); } #endif #endif void unregister_execve_kprobe(void) { - unregister_kretprobe(&execve_kretprobe); + smith_unregister_kretprobe(&execve_kretprobe); } #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) void unregister_execveat_kprobe(void) { - unregister_kretprobe(&execveat_kretprobe); + smith_unregister_kretprobe(&execveat_kretprobe); } #endif @@ -3765,8 +3784,7 @@ void unregister_ptrace_kprobe(void) int register_udp_recvmsg_kprobe(void) { int ret; - udp_recvmsg_kretprobe = udp_recvmsg_kretprobe_value; - ret = register_kretprobe(&udp_recvmsg_kretprobe); + ret = smith_register_kretprobe(&udp_recvmsg_kretprobe); if (ret == 0) udp_recvmsg_kprobe_state = 0x1; @@ -3776,7 +3794,7 @@ int register_udp_recvmsg_kprobe(void) void unregister_udp_recvmsg_kprobe(void) { - unregister_kretprobe(&udp_recvmsg_kretprobe); + smith_unregister_kretprobe(&udp_recvmsg_kretprobe); udp_recvmsg_kprobe_state = 0; } @@ -3784,8 +3802,7 @@ void unregister_udp_recvmsg_kprobe(void) int register_udpv6_recvmsg_kprobe(void) { int ret; - udpv6_recvmsg_kretprobe = udpv6_recvmsg_kretprobe_value; - ret = register_kretprobe(&udpv6_recvmsg_kretprobe); + ret = smith_register_kretprobe(&udpv6_recvmsg_kretprobe); if (ret == 0) udpv6_recvmsg_kprobe_state = 0x1; @@ -3795,14 +3812,14 @@ int register_udpv6_recvmsg_kprobe(void) void unregister_udpv6_recvmsg_kprobe(void) { - unregister_kretprobe(&udpv6_recvmsg_kretprobe); + smith_unregister_kretprobe(&udpv6_recvmsg_kretprobe); udpv6_recvmsg_kprobe_state = 0; } int register_ip6_datagram_connect_kprobe(void) { int ret; - ret = register_kretprobe(&ip6_datagram_connect_kretprobe); + ret = smith_register_kretprobe(&ip6_datagram_connect_kretprobe); if (ret == 0) ip6_datagram_connect_kprobe_state = 0x1; @@ -3812,13 +3829,13 @@ int register_ip6_datagram_connect_kprobe(void) void unregister_ip6_datagram_connect_kprobe(void) { - unregister_kretprobe(&ip6_datagram_connect_kretprobe); + smith_unregister_kretprobe(&ip6_datagram_connect_kretprobe); } int register_tcp_v6_connect_kprobe(void) { int ret; - ret = register_kretprobe(&tcp_v6_connect_kretprobe); + ret = smith_register_kretprobe(&tcp_v6_connect_kretprobe); if (ret == 0) tcp_v6_connect_kprobe_state = 0x1; @@ -3828,14 +3845,14 @@ int register_tcp_v6_connect_kprobe(void) void unregister_tcp_v6_connect_kprobe(void) { - unregister_kretprobe(&tcp_v6_connect_kretprobe); + smith_unregister_kretprobe(&tcp_v6_connect_kretprobe); } #endif int register_ip4_datagram_connect_kprobe(void) { int ret; - ret = register_kretprobe(&ip4_datagram_connect_kretprobe); + ret = smith_register_kretprobe(&ip4_datagram_connect_kretprobe); if (ret == 0) ip4_datagram_connect_kprobe_state = 0x1; @@ -3845,13 +3862,13 @@ int register_ip4_datagram_connect_kprobe(void) void unregister_ip4_datagram_connect_kprobe(void) { - unregister_kretprobe(&ip4_datagram_connect_kretprobe); + smith_unregister_kretprobe(&ip4_datagram_connect_kretprobe); } int register_tcp_v4_connect_kprobe(void) { int ret; - ret = register_kretprobe(&tcp_v4_connect_kretprobe); + ret = smith_register_kretprobe(&tcp_v4_connect_kretprobe); if (ret == 0) tcp_v4_connect_kprobe_state = 0x1; @@ -3861,13 +3878,13 @@ int register_tcp_v4_connect_kprobe(void) void unregister_tcp_v4_connect_kprobe(void) { - unregister_kretprobe(&tcp_v4_connect_kretprobe); + smith_unregister_kretprobe(&tcp_v4_connect_kretprobe); } int register_connect_syscall_kprobe(void) { int ret; - ret = register_kretprobe(&connect_syscall_kretprobe); + ret = smith_register_kretprobe(&connect_syscall_kretprobe); if (ret == 0) connect_syscall_kprobe_state = 0x1; @@ -3877,13 +3894,13 @@ int register_connect_syscall_kprobe(void) void unregister_connect_syscall_kprobe(void) { - unregister_kretprobe(&connect_syscall_kretprobe); + smith_unregister_kretprobe(&connect_syscall_kretprobe); } int register_accept_kprobe(void) { int ret; - ret = register_kretprobe(&accept_kretprobe); + ret = smith_register_kretprobe(&accept_kretprobe); if (ret == 0) accept_kretprobe_state = 0x1; @@ -3893,13 +3910,13 @@ int register_accept_kprobe(void) void unregister_accept_kprobe(void) { - unregister_kretprobe(&accept_kretprobe); + smith_unregister_kretprobe(&accept_kretprobe); } int register_accept4_kprobe(void) { int ret; - ret = register_kretprobe(&accept4_kretprobe); + ret = smith_register_kretprobe(&accept4_kretprobe); if (ret == 0) accept4_kretprobe_state = 0x1; @@ -3909,7 +3926,7 @@ int register_accept4_kprobe(void) void unregister_accept4_kprobe(void) { - unregister_kretprobe(&accept4_kretprobe); + smith_unregister_kretprobe(&accept4_kretprobe); } int register_create_file_kprobe(void) @@ -4011,7 +4028,7 @@ void unregister_prctl_kprobe(void) int register_update_cred_kprobe(void) { int ret; - ret = register_kretprobe(&update_cred_kretprobe); + ret = smith_register_kretprobe(&update_cred_kretprobe); if (ret == 0) update_cred_kprobe_state = 0x1; @@ -4020,7 +4037,7 @@ int register_update_cred_kprobe(void) void unregister_update_cred_kprobe(void) { - unregister_kretprobe(&update_cred_kretprobe); + smith_unregister_kretprobe(&update_cred_kretprobe); } int register_mprotect_kprobe(void) @@ -4116,7 +4133,7 @@ void unregister_nanosleep_kprobe(void) int register_security_path_rmdir_kprobe(void) { int ret; - ret = register_kretprobe(&security_path_rmdir_kprobe); + ret = smith_register_kretprobe(&security_path_rmdir_kprobe); if (ret == 0) security_path_rmdir_kprobe_state = 0x1; @@ -4125,13 +4142,13 @@ int register_security_path_rmdir_kprobe(void) void unregister_security_path_rmdir_kprobe(void) { - unregister_kretprobe(&security_path_rmdir_kprobe); + smith_unregister_kretprobe(&security_path_rmdir_kprobe); } int register_security_path_unlink_kprobe(void) { int ret; - ret = register_kretprobe(&security_path_unlink_kprobe); + ret = smith_register_kretprobe(&security_path_unlink_kprobe); if (ret == 0) security_path_unlink_kprobe_state = 0x1; @@ -4140,7 +4157,7 @@ int register_security_path_unlink_kprobe(void) void unregister_security_path_unlink_kprobe(void) { - unregister_kretprobe(&security_path_unlink_kprobe); + smith_unregister_kretprobe(&security_path_unlink_kprobe); } int register_kill_kprobe(void) From 8c1aa89cbf4fabad0df0d5d8c4884bd369857910 Mon Sep 17 00:00:00 2001 From: Jia Hao Date: Wed, 23 Mar 2022 14:45:48 +0800 Subject: [PATCH 0005/5344] bytedance: sched: Add burst instance data structure For burst instances we need to identify tasks by burst_credit, and isolate the burst data structure by BYTEDANCE_BURST_SCHED. Signed-off-by: Jia Hao --- config.aarch64 | 1 + config.x86_64 | 1 + include/linux/sched.h | 4 +++- init/Kconfig | 10 ++++++++++ 4 files changed, 15 insertions(+), 1 deletion(-) diff --git a/config.aarch64 b/config.aarch64 index 63c378fddf701..7040d2b9c1651 100644 --- a/config.aarch64 +++ b/config.aarch64 @@ -116,6 +116,7 @@ CONFIG_GENERIC_SCHED_CLOCK=y # # Scheduler features # +CONFIG_BYTEDANCE_BURST_SCHED=y # CONFIG_UCLAMP_TASK is not set # end of Scheduler features diff --git a/config.x86_64 b/config.x86_64 index 0633c4302c60d..f0d99f2989056 100644 --- a/config.x86_64 +++ b/config.x86_64 @@ -133,6 +133,7 @@ CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y # # Scheduler features # +CONFIG_BYTEDANCE_BURST_SCHED=y # CONFIG_UCLAMP_TASK is not set # end of Scheduler features diff --git a/include/linux/sched.h b/include/linux/sched.h index 78faffc70dd60..56b72c64e2834 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -849,7 +849,9 @@ struct task_struct { #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN struct vtime vtime; #endif - +#ifdef CONFIG_BYTEDANCE_BURST_SCHED + void *burst_credit; +#endif #ifdef CONFIG_NO_HZ_FULL atomic_t tick_dep_mask; #endif diff --git a/init/Kconfig b/init/Kconfig index 6d64930be40de..a6ed1ced39315 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -686,6 +686,16 @@ config GENERIC_SCHED_CLOCK menu "Scheduler features" +config BYTEDANCE_BURST_SCHED + bool "Enable burst instance scheduling" + default n + help + This feature enables the scheduler to calculate and modify the credit value + of burst instance tasks, and schedule tasks according to the credit value. + + The specific details of the burst instance scheduler are implemented + in the module. + config UCLAMP_TASK bool "Enable utilization clamping for RT/FAIR tasks" depends on CPU_FREQ_GOV_SCHEDUTIL From 16438fb9f766d366da2fb5245bfd362d1464dc78 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sat, 12 Mar 2022 23:22:20 +0800 Subject: [PATCH 0006/5344] livepatch: Don't block removal of patches that are safe to unload commit 2957308343fa7c621df9f342fab88cb970b8d5f3 upstream. module_put() is not called for a patch with "forced" flag. It should block the removal of the livepatch module when the code might still be in use after forced transition. klp_force_transition() currently sets "forced" flag for all patches on the list. In fact, any patch can be safely unloaded when it passed through the consistency model in KLP_UNPATCHED transition. In other words, the "forced" flag must be set only for livepatches that are being removed. In particular, set the "forced" flag: + only for klp_transition_patch when the transition to KLP_UNPATCHED state was forced. + all replaced patches when the transition to KLP_PATCHED state was forced and the patch was replacing the existing patches. Signed-off-by: Chengming Zhou Acked-by: Joe Lawrence Reviewed-by: Petr Mladek Tested-by: Petr Mladek [mbenes@suse.cz: wording improvements] Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220312152220.88127-1-zhouchengming@bytedance.com --- kernel/livepatch/transition.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index e422d49925929..98d328d7aee13 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -643,6 +643,13 @@ void klp_force_transition(void) for_each_possible_cpu(cpu) klp_update_patch_state(idle_task(cpu)); - klp_for_each_patch(patch) - patch->forced = true; + /* Set forced flag for patches being removed. */ + if (klp_target_state == KLP_UNPATCHED) + klp_transition_patch->forced = true; + else if (klp_transition_patch->replace) { + klp_for_each_patch(patch) { + if (patch != klp_transition_patch) + patch->forced = true; + } + } } From 6859b4a8a8649af007279158e1d03c2d5edbad7b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 20 Jan 2022 11:04:01 -0600 Subject: [PATCH 0007/5344] cgroup-v1: Require capabilities to set release_agent commit 24f6008564183aa120d07c03d9289519c2fe02af upstream. The cgroup release_agent is called with call_usermodehelper. The function call_usermodehelper starts the release_agent with a full set fo capabilities. Therefore require capabilities when setting the release_agaent. Reported-by: Tabitha Sable Tested-by: Tabitha Sable Fixes: 81a6a5cdd2c5 ("Task Control Groups: automatic userspace notification of idle cgroups") Cc: stable@vger.kernel.org # v2.6.24+ Signed-off-by: "Eric W. Biederman" Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman Signed-off-by: Muchun Song --- kernel/cgroup/cgroup-v1.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 94026c700ba26..227c59a15e0fa 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -549,6 +549,14 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + /* + * Release agent gets called with all capabilities, + * require capabilities to set release agent. + */ + if ((of->file->f_cred->user_ns != &init_user_ns) || + !capable(CAP_SYS_ADMIN)) + return -EPERM; + cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; @@ -962,6 +970,12 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) /* Specifying two release agents is forbidden */ if (ctx->release_agent) return cg_invalf(fc, "cgroup1: release_agent respecified"); + /* + * Release agent gets called with all capabilities, + * require capabilities to set release agent. + */ + if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) + return cg_invalf(fc, "cgroup1: Setting release_agent not allowed"); ctx->release_agent = param->string; param->string = NULL; break; From 1e7af4d98a19b8162fe1834811fa68cbc7787033 Mon Sep 17 00:00:00 2001 From: Hui Su Date: Fri, 6 Nov 2020 22:47:40 +0800 Subject: [PATCH 0008/5344] cgroup/cgroup.c: replace 'of->kn->priv' with of_cft() commit 5a7b5f32c5aa628841502d19a813c633ff6ecbe4 upstream. we have supplied the inline function: of_cft() in cgroup.h. So replace the direct use 'of->kn->priv' with inline func of_cft(), which is more readable. Signed-off-by: Hui Su Signed-off-by: Tejun Heo Signed-off-by: Muchun Song --- kernel/cgroup/cgroup.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 2db181e6f26bb..02f261f7e3d21 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3718,7 +3718,7 @@ static ssize_t cgroup_freeze_write(struct kernfs_open_file *of, static int cgroup_file_open(struct kernfs_open_file *of) { - struct cftype *cft = of->kn->priv; + struct cftype *cft = of_cft(of); if (cft->open) return cft->open(of); @@ -3727,7 +3727,7 @@ static int cgroup_file_open(struct kernfs_open_file *of) static void cgroup_file_release(struct kernfs_open_file *of) { - struct cftype *cft = of->kn->priv; + struct cftype *cft = of_cft(of); if (cft->release) cft->release(of); @@ -3738,7 +3738,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, { struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct cgroup *cgrp = of->kn->parent->priv; - struct cftype *cft = of->kn->priv; + struct cftype *cft = of_cft(of); struct cgroup_subsys_state *css; int ret; @@ -3785,7 +3785,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt) { - struct cftype *cft = of->kn->priv; + struct cftype *cft = of_cft(of); if (cft->poll) return cft->poll(of, pt); From ffd0bbd39d9c7d85fbba3d8bfd111bd886b6a74a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 6 Jan 2022 11:02:29 -1000 Subject: [PATCH 0009/5344] cgroup: Allocate cgroup_file_ctx for kernfs_open_file->priv MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 0d2b5955b36250a9428c832664f2079cbf723bec upstream. of->priv is currently used by each interface file implementation to store private information. This patch collects the current two private data usages into struct cgroup_file_ctx which is allocated and freed by the common path. This allows generic private data which applies to multiple files, which will be used to in the following patch. Note that cgroup_procs iterator is now embedded as procs.iter in the new cgroup_file_ctx so that it doesn't need to be allocated and freed separately. v2: union dropped from cgroup_file_ctx and the procs iterator is embedded in cgroup_file_ctx as suggested by Linus. v3: Michal pointed out that cgroup1's procs pidlist uses of->priv too. Converted. Didn't change to embedded allocation as cgroup1 pidlists get stored for caching. Signed-off-by: Tejun Heo Cc: Linus Torvalds Reviewed-by: Michal Koutný Signed-off-by: Muchun Song --- kernel/cgroup/cgroup-internal.h | 17 +++++++++++ kernel/cgroup/cgroup-v1.c | 26 ++++++++-------- kernel/cgroup/cgroup.c | 53 +++++++++++++++++++++------------ 3 files changed, 65 insertions(+), 31 deletions(-) diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 87d8c36f8bb60..963f286e8856c 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -65,6 +65,23 @@ static inline struct cgroup_fs_context *cgroup_fc2context(struct fs_context *fc) return container_of(kfc, struct cgroup_fs_context, kfc); } +struct cgroup_pidlist; + +struct cgroup_file_ctx { + struct { + void *trigger; + } psi; + + struct { + bool started; + struct css_task_iter iter; + } procs; + + struct { + struct cgroup_pidlist *pidlist; + } procs1; +}; + /* * A cgroup can be associated with multiple css_sets as different tasks may * belong to different cgroups on different hierarchies. In the other diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 227c59a15e0fa..63a1656eba609 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -398,6 +398,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) * next pid to display, if any */ struct kernfs_open_file *of = s->private; + struct cgroup_file_ctx *ctx = of->priv; struct cgroup *cgrp = seq_css(s)->cgroup; struct cgroup_pidlist *l; enum cgroup_filetype type = seq_cft(s)->private; @@ -407,25 +408,24 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) mutex_lock(&cgrp->pidlist_mutex); /* - * !NULL @of->priv indicates that this isn't the first start() - * after open. If the matching pidlist is around, we can use that. - * Look for it. Note that @of->priv can't be used directly. It - * could already have been destroyed. + * !NULL @ctx->procs1.pidlist indicates that this isn't the first + * start() after open. If the matching pidlist is around, we can use + * that. Look for it. Note that @ctx->procs1.pidlist can't be used + * directly. It could already have been destroyed. */ - if (of->priv) - of->priv = cgroup_pidlist_find(cgrp, type); + if (ctx->procs1.pidlist) + ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type); /* * Either this is the first start() after open or the matching * pidlist has been destroyed inbetween. Create a new one. */ - if (!of->priv) { - ret = pidlist_array_load(cgrp, type, - (struct cgroup_pidlist **)&of->priv); + if (!ctx->procs1.pidlist) { + ret = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist); if (ret) return ERR_PTR(ret); } - l = of->priv; + l = ctx->procs1.pidlist; if (pid) { int end = l->length; @@ -453,7 +453,8 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) static void cgroup_pidlist_stop(struct seq_file *s, void *v) { struct kernfs_open_file *of = s->private; - struct cgroup_pidlist *l = of->priv; + struct cgroup_file_ctx *ctx = of->priv; + struct cgroup_pidlist *l = ctx->procs1.pidlist; if (l) mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, @@ -464,7 +465,8 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v) static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) { struct kernfs_open_file *of = s->private; - struct cgroup_pidlist *l = of->priv; + struct cgroup_file_ctx *ctx = of->priv; + struct cgroup_pidlist *l = ctx->procs1.pidlist; pid_t *p = v; pid_t *end = l->list + l->length; /* diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 02f261f7e3d21..acc9289c8f01f 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3624,6 +3624,7 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, enum psi_res res) { + struct cgroup_file_ctx *ctx = of->priv; struct psi_trigger *new; struct cgroup *cgrp; struct psi_group *psi; @@ -3642,7 +3643,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, return PTR_ERR(new); } - psi_trigger_replace(&of->priv, new); + psi_trigger_replace(&ctx->psi.trigger, new); cgroup_put(cgrp); @@ -3673,12 +3674,16 @@ static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, poll_table *pt) { - return psi_trigger_poll(&of->priv, of->file, pt); + struct cgroup_file_ctx *ctx = of->priv; + + return psi_trigger_poll(&ctx->psi.trigger, of->file, pt); } static void cgroup_pressure_release(struct kernfs_open_file *of) { - psi_trigger_replace(&of->priv, NULL); + struct cgroup_file_ctx *ctx = of->priv; + + psi_trigger_replace(&ctx->psi.trigger, NULL); } #endif /* CONFIG_PSI */ @@ -3719,18 +3724,31 @@ static ssize_t cgroup_freeze_write(struct kernfs_open_file *of, static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of_cft(of); + struct cgroup_file_ctx *ctx; + int ret; - if (cft->open) - return cft->open(of); - return 0; + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + of->priv = ctx; + + if (!cft->open) + return 0; + + ret = cft->open(of); + if (ret) + kfree(ctx); + return ret; } static void cgroup_file_release(struct kernfs_open_file *of) { struct cftype *cft = of_cft(of); + struct cgroup_file_ctx *ctx = of->priv; if (cft->release) cft->release(of); + kfree(ctx); } static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, @@ -4658,21 +4676,21 @@ void css_task_iter_end(struct css_task_iter *it) static void cgroup_procs_release(struct kernfs_open_file *of) { - if (of->priv) { - css_task_iter_end(of->priv); - kfree(of->priv); - } + struct cgroup_file_ctx *ctx = of->priv; + + if (ctx->procs.started) + css_task_iter_end(&ctx->procs.iter); } static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) { struct kernfs_open_file *of = s->private; - struct css_task_iter *it = of->priv; + struct cgroup_file_ctx *ctx = of->priv; if (pos) (*pos)++; - return css_task_iter_next(it); + return css_task_iter_next(&ctx->procs.iter); } static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, @@ -4680,21 +4698,18 @@ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, { struct kernfs_open_file *of = s->private; struct cgroup *cgrp = seq_css(s)->cgroup; - struct css_task_iter *it = of->priv; + struct cgroup_file_ctx *ctx = of->priv; + struct css_task_iter *it = &ctx->procs.iter; /* * When a seq_file is seeked, it's always traversed sequentially * from position 0, so we can simply keep iterating on !0 *pos. */ - if (!it) { + if (!ctx->procs.started) { if (WARN_ON_ONCE((*pos))) return ERR_PTR(-EINVAL); - - it = kzalloc(sizeof(*it), GFP_KERNEL); - if (!it) - return ERR_PTR(-ENOMEM); - of->priv = it; css_task_iter_start(&cgrp->self, iter_flags, it); + ctx->procs.started = true; } else if (!(*pos)) { css_task_iter_end(it); css_task_iter_start(&cgrp->self, iter_flags, it); From 93f7fdc8df86abec0fb39db1396795c6961d5712 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 5 Feb 2020 14:26:18 +0100 Subject: [PATCH 0010/5344] cgroup: unify attach permission checking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 6df970e4f5d2c273554550d40d8b92cea9bec1a0 upstream. The core codepaths to check whether a process can be attached to a cgroup are the same for threads and thread-group leaders. Only a small piece of code verifying that source and destination cgroup are in the same domain differentiates the thread permission checking from thread-group leader permission checking. Since cgroup_migrate_vet_dst() only matters cgroup2 - it is a noop on cgroup1 - we can move it out of cgroup_attach_task(). All checks can now be consolidated into a new helper cgroup_attach_permissions() callable from both cgroup_procs_write() and cgroup_threads_write(). Cc: Tejun Heo Cc: Li Zefan Cc: Johannes Weiner Cc: cgroups@vger.kernel.org Acked-by: Michal Koutný Signed-off-by: Christian Brauner Signed-off-by: Tejun Heo Signed-off-by: Muchun Song --- kernel/cgroup/cgroup.c | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index acc9289c8f01f..e02affd3ab186 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2793,11 +2793,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, { DEFINE_CGROUP_MGCTX(mgctx); struct task_struct *task; - int ret; - - ret = cgroup_migrate_vet_dst(dst_cgrp); - if (ret) - return ret; + int ret = 0; /* look up all src csets */ spin_lock_irq(&css_set_lock); @@ -4779,6 +4775,26 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp, return 0; } +static int cgroup_attach_permissions(struct cgroup *src_cgrp, + struct cgroup *dst_cgrp, + struct super_block *sb, bool threadgroup) +{ + int ret = 0; + + ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb); + if (ret) + return ret; + + ret = cgroup_migrate_vet_dst(dst_cgrp); + if (ret) + return ret; + + if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)) + ret = -EOPNOTSUPP; + + return ret; +} + static ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -4800,8 +4816,8 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of, src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); spin_unlock_irq(&css_set_lock); - ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, - of->file->f_path.dentry->d_sb); + ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, + of->file->f_path.dentry->d_sb, true); if (ret) goto out_finish; @@ -4844,16 +4860,11 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of, spin_unlock_irq(&css_set_lock); /* thread migrations follow the cgroup.procs delegation rule */ - ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, - of->file->f_path.dentry->d_sb); + ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, + of->file->f_path.dentry->d_sb, false); if (ret) goto out_finish; - /* and must be contained in the same domain */ - ret = -EOPNOTSUPP; - if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp) - goto out_finish; - ret = cgroup_attach_task(dst_cgrp, task, false); out_finish: From 2d2d8138ea2133ef8f2ec83d61f792d7c8535798 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Fri, 4 Oct 2019 12:57:40 +0200 Subject: [PATCH 0011/5344] cgroup: Optimize single thread migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 9a3284fad42f66bb43629c6716709ff791aaa457 upstream. There are reports of users who use thread migrations between cgroups and they report performance drop after d59cfc09c32a ("sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem"). The effect is pronounced on machines with more CPUs. The migration is affected by forking noise happening in the background, after the mentioned commit a migrating thread must wait for all (forking) processes on the system, not only of its threadgroup. There are several places that need to synchronize with migration: a) do_exit, b) de_thread, c) copy_process, d) cgroup_update_dfl_csses, e) parallel migration (cgroup_{proc,thread}s_write). In the case of self-migrating thread, we relax the synchronization on cgroup_threadgroup_rwsem to avoid the cost of waiting. d) and e) are excluded with cgroup_mutex, c) does not matter in case of single thread migration and the executing thread cannot exec(2) or exit(2) while it is writing into cgroup.threads. In case of do_exit because of signal delivery, we either exit before the migration or finish the migration (of not yet PF_EXITING thread) and die afterwards. This patch handles only the case of self-migration by writing "0" into cgroup.threads. For simplicity, we always take cgroup_threadgroup_rwsem with numeric PIDs. This change improves migration dependent workload performance similar to per-signal_struct state. Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo Signed-off-by: Muchun Song --- kernel/cgroup/cgroup-internal.h | 5 +++-- kernel/cgroup/cgroup-v1.c | 5 +++-- kernel/cgroup/cgroup.c | 39 +++++++++++++++++++++++++-------- 3 files changed, 36 insertions(+), 13 deletions(-) diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 963f286e8856c..c0ca96b351582 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -249,9 +249,10 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup); -struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) +struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, + bool *locked) __acquires(&cgroup_threadgroup_rwsem); -void cgroup_procs_write_finish(struct task_struct *task) +void cgroup_procs_write_finish(struct task_struct *task, bool locked) __releases(&cgroup_threadgroup_rwsem); void cgroup_lock_and_drain_offline(struct cgroup *cgrp); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 63a1656eba609..294c9d7c4018f 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -498,12 +498,13 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, struct task_struct *task; const struct cred *cred, *tcred; ssize_t ret; + bool locked; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, threadgroup); + task = cgroup_procs_write_start(buf, threadgroup, &locked); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -525,7 +526,7 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, ret = cgroup_attach_task(cgrp, task, threadgroup); out_finish: - cgroup_procs_write_finish(task); + cgroup_procs_write_finish(task, locked); out_unlock: cgroup_kn_unlock(of->kn); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index e02affd3ab186..f9d2c8df8c555 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2820,7 +2820,8 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, return ret; } -struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) +struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, + bool *locked) __acquires(&cgroup_threadgroup_rwsem) { struct task_struct *tsk; @@ -2829,7 +2830,21 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return ERR_PTR(-EINVAL); - percpu_down_write(&cgroup_threadgroup_rwsem); + /* + * If we migrate a single thread, we don't care about threadgroup + * stability. If the thread is `current`, it won't exit(2) under our + * hands or change PID through exec(2). We exclude + * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write + * callers by cgroup_mutex. + * Therefore, we can skip the global lock. + */ + lockdep_assert_held(&cgroup_mutex); + if (pid || threadgroup) { + percpu_down_write(&cgroup_threadgroup_rwsem); + *locked = true; + } else { + *locked = false; + } rcu_read_lock(); if (pid) { @@ -2860,13 +2875,16 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) goto out_unlock_rcu; out_unlock_threadgroup: - percpu_up_write(&cgroup_threadgroup_rwsem); + if (*locked) { + percpu_up_write(&cgroup_threadgroup_rwsem); + *locked = false; + } out_unlock_rcu: rcu_read_unlock(); return tsk; } -void cgroup_procs_write_finish(struct task_struct *task) +void cgroup_procs_write_finish(struct task_struct *task, bool locked) __releases(&cgroup_threadgroup_rwsem) { struct cgroup_subsys *ss; @@ -2875,7 +2893,8 @@ void cgroup_procs_write_finish(struct task_struct *task) /* release reference from cgroup_procs_write_start() */ put_task_struct(task); - percpu_up_write(&cgroup_threadgroup_rwsem); + if (locked) + percpu_up_write(&cgroup_threadgroup_rwsem); for_each_subsys(ss, ssid) if (ss->post_attach) ss->post_attach(); @@ -4801,12 +4820,13 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of, struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; ssize_t ret; + bool locked; dst_cgrp = cgroup_kn_lock_live(of->kn, false); if (!dst_cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, true); + task = cgroup_procs_write_start(buf, true, &locked); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -4824,7 +4844,7 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of, ret = cgroup_attach_task(dst_cgrp, task, true); out_finish: - cgroup_procs_write_finish(task); + cgroup_procs_write_finish(task, locked); out_unlock: cgroup_kn_unlock(of->kn); @@ -4842,6 +4862,7 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of, struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; ssize_t ret; + bool locked; buf = strstrip(buf); @@ -4849,7 +4870,7 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of, if (!dst_cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, false); + task = cgroup_procs_write_start(buf, false, &locked); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -4868,7 +4889,7 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of, ret = cgroup_attach_task(dst_cgrp, task, false); out_finish: - cgroup_procs_write_finish(task); + cgroup_procs_write_finish(task, locked); out_unlock: cgroup_kn_unlock(of->kn); From 512f54af272af5c017f5a9b6dfeb6ff96e8f0b9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Thu, 14 Jan 2021 13:44:27 +0100 Subject: [PATCH 0012/5344] cgroup: cgroup.{procs,threads} factor out common parts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit da70862efe0065bada33d67a903270cdbbaf07d9 upstream. The functions cgroup_threads_write and cgroup_procs_write are almost identical. In order to reduce duplication, factor out the common code in similar fashion we already do for other threadgroup/task functions. No functional changes are intended. Suggested-by: Hao Lee Signed-off-by: Michal Koutný Reviewed-by: Daniel Jordan Signed-off-by: Tejun Heo Signed-off-by: Muchun Song --- kernel/cgroup/cgroup.c | 55 +++++++++++------------------------------- 1 file changed, 14 insertions(+), 41 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f9d2c8df8c555..a6f35e796208a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4814,8 +4814,8 @@ static int cgroup_attach_permissions(struct cgroup *src_cgrp, return ret; } -static ssize_t cgroup_procs_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) +static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, + bool threadgroup) { struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; @@ -4826,7 +4826,7 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of, if (!dst_cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, true, &locked); + task = cgroup_procs_write_start(buf, threadgroup, &locked); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -4836,19 +4836,26 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of, src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); spin_unlock_irq(&css_set_lock); + /* process and thread migrations follow same delegation rule */ ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, - of->file->f_path.dentry->d_sb, true); + of->file->f_path.dentry->d_sb, threadgroup); if (ret) goto out_finish; - ret = cgroup_attach_task(dst_cgrp, task, true); + ret = cgroup_attach_task(dst_cgrp, task, threadgroup); out_finish: cgroup_procs_write_finish(task, locked); out_unlock: cgroup_kn_unlock(of->kn); - return ret ?: nbytes; + return ret; +} + +static ssize_t cgroup_procs_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return __cgroup_procs_write(of, buf, true) ?: nbytes; } static void *cgroup_threads_start(struct seq_file *s, loff_t *pos) @@ -4859,41 +4866,7 @@ static void *cgroup_threads_start(struct seq_file *s, loff_t *pos) static ssize_t cgroup_threads_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - struct cgroup *src_cgrp, *dst_cgrp; - struct task_struct *task; - ssize_t ret; - bool locked; - - buf = strstrip(buf); - - dst_cgrp = cgroup_kn_lock_live(of->kn, false); - if (!dst_cgrp) - return -ENODEV; - - task = cgroup_procs_write_start(buf, false, &locked); - ret = PTR_ERR_OR_ZERO(task); - if (ret) - goto out_unlock; - - /* find the source cgroup */ - spin_lock_irq(&css_set_lock); - src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); - spin_unlock_irq(&css_set_lock); - - /* thread migrations follow the cgroup.procs delegation rule */ - ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, - of->file->f_path.dentry->d_sb, false); - if (ret) - goto out_finish; - - ret = cgroup_attach_task(dst_cgrp, task, false); - -out_finish: - cgroup_procs_write_finish(task, locked); -out_unlock: - cgroup_kn_unlock(of->kn); - - return ret ?: nbytes; + return __cgroup_procs_write(of, buf, false) ?: nbytes; } /* cgroup core interface files for the default hierarchy */ From f1f3f710f421048582cc62110921392792028b04 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 6 Jan 2022 11:02:28 -1000 Subject: [PATCH 0013/5344] cgroup: Use open-time credentials for process migraton perm checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 1756d7994ad85c2479af6ae5a9750b92324685af upstream. cgroup process migration permission checks are performed at write time as whether a given operation is allowed or not is dependent on the content of the write - the PID. This currently uses current's credentials which is a potential security weakness as it may allow scenarios where a less privileged process tricks a more privileged one into writing into a fd that it created. This patch makes both cgroup2 and cgroup1 process migration interfaces to use the credentials saved at the time of open (file->f_cred) instead of current's. Reported-by: "Eric W. Biederman" Suggested-by: Linus Torvalds Fixes: 187fe84067bd ("cgroup: require write perm on common ancestor when moving processes on the default hierarchy") Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo Signed-off-by: Muchun Song --- kernel/cgroup/cgroup-v1.c | 7 ++++--- kernel/cgroup/cgroup.c | 9 ++++++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 294c9d7c4018f..80eb1891f1288 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -510,10 +510,11 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, goto out_unlock; /* - * Even if we're attaching all tasks in the thread group, we only - * need to check permissions on one of them. + * Even if we're attaching all tasks in the thread group, we only need + * to check permissions on one of them. Check permissions using the + * credentials from file open to protect against inherited fd attacks. */ - cred = current_cred(); + cred = of->file->f_cred; tcred = get_task_cred(task); if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && !uid_eq(cred->euid, tcred->uid) && diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a6f35e796208a..9978bcd762af8 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4819,6 +4819,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, { struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; + const struct cred *saved_cred; ssize_t ret; bool locked; @@ -4836,9 +4837,15 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); spin_unlock_irq(&css_set_lock); - /* process and thread migrations follow same delegation rule */ + /* + * Process and thread migrations follow same delegation rule. Check + * permissions using the credentials from file open to protect against + * inherited fd attacks. + */ + saved_cred = override_creds(of->file->f_cred); ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, of->file->f_path.dentry->d_sb, threadgroup); + revert_creds(saved_cred); if (ret) goto out_finish; From d82dc26b794a4d26d67e98b02c655e5fe4e5c858 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 6 Jan 2022 11:02:29 -1000 Subject: [PATCH 0014/5344] cgroup: Use open-time cgroup namespace for process migration perm checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit e57457641613fef0d147ede8bd6a3047df588b95 upstream. cgroup process migration permission checks are performed at write time as whether a given operation is allowed or not is dependent on the content of the write - the PID. This currently uses current's cgroup namespace which is a potential security weakness as it may allow scenarios where a less privileged process tricks a more privileged one into writing into a fd that it created. This patch makes cgroup remember the cgroup namespace at the time of open and uses it for migration permission checks instad of current's. Note that this only applies to cgroup2 as cgroup1 doesn't have namespace support. This also fixes a use-after-free bug on cgroupns reported in https://lore.kernel.org/r/00000000000048c15c05d0083397@google.com Note that backporting this fix also requires the preceding patch. Reported-by: "Eric W. Biederman" Suggested-by: Linus Torvalds Cc: Michal Koutný Cc: Oleg Nesterov Reviewed-by: Michal Koutný Reported-by: syzbot+50f5cf33a284ce738b62@syzkaller.appspotmail.com Link: https://lore.kernel.org/r/00000000000048c15c05d0083397@google.com Fixes: 5136f6365ce3 ("cgroup: implement "nsdelegate" mount option") Signed-off-by: Tejun Heo Signed-off-by: Muchun Song --- kernel/cgroup/cgroup-internal.h | 2 ++ kernel/cgroup/cgroup.c | 25 +++++++++++++++++-------- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index c0ca96b351582..e4cee104c5f89 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -68,6 +68,8 @@ static inline struct cgroup_fs_context *cgroup_fc2context(struct fs_context *fc) struct cgroup_pidlist; struct cgroup_file_ctx { + struct cgroup_namespace *ns; + struct { void *trigger; } psi; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 9978bcd762af8..f82fa1b159af4 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3745,14 +3745,19 @@ static int cgroup_file_open(struct kernfs_open_file *of) ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; + + ctx->ns = current->nsproxy->cgroup_ns; + get_cgroup_ns(ctx->ns); of->priv = ctx; if (!cft->open) return 0; ret = cft->open(of); - if (ret) + if (ret) { + put_cgroup_ns(ctx->ns); kfree(ctx); + } return ret; } @@ -3763,13 +3768,14 @@ static void cgroup_file_release(struct kernfs_open_file *of) if (cft->release) cft->release(of); + put_cgroup_ns(ctx->ns); kfree(ctx); } static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; + struct cgroup_file_ctx *ctx = of->priv; struct cgroup *cgrp = of->kn->parent->priv; struct cftype *cft = of_cft(of); struct cgroup_subsys_state *css; @@ -3783,7 +3789,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, */ if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && !(cft->flags & CFTYPE_NS_DELEGATABLE) && - ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp) + ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp) return -EPERM; if (cft->write) @@ -4759,9 +4765,9 @@ static int cgroup_procs_show(struct seq_file *s, void *v) static int cgroup_procs_write_permission(struct cgroup *src_cgrp, struct cgroup *dst_cgrp, - struct super_block *sb) + struct super_block *sb, + struct cgroup_namespace *ns) { - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct cgroup *com_cgrp = src_cgrp; struct inode *inode; int ret; @@ -4796,11 +4802,12 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp, static int cgroup_attach_permissions(struct cgroup *src_cgrp, struct cgroup *dst_cgrp, - struct super_block *sb, bool threadgroup) + struct super_block *sb, bool threadgroup, + struct cgroup_namespace *ns) { int ret = 0; - ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb); + ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns); if (ret) return ret; @@ -4817,6 +4824,7 @@ static int cgroup_attach_permissions(struct cgroup *src_cgrp, static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, bool threadgroup) { + struct cgroup_file_ctx *ctx = of->priv; struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; const struct cred *saved_cred; @@ -4844,7 +4852,8 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, */ saved_cred = override_creds(of->file->f_cred); ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, - of->file->f_path.dentry->d_sb, threadgroup); + of->file->f_path.dentry->d_sb, + threadgroup, ctx->ns); revert_creds(saved_cred); if (ret) goto out_finish; From 1c0b70801045b515e8d5ed0f27ef333f33bc1914 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Thu, 17 Feb 2022 17:11:28 +0100 Subject: [PATCH 0015/5344] cgroup-v1: Correct privileges check in release_agent writes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 467a726b754f474936980da793b4ff2ec3e382a7 upstream. The idea is to check: a) the owning user_ns of cgroup_ns, b) capabilities in init_user_ns. The commit 24f600856418 ("cgroup-v1: Require capabilities to set release_agent") got this wrong in the write handler of release_agent since it checked user_ns of the opener (may be different from the owning user_ns of cgroup_ns). Secondly, to avoid possibly confused deputy, the capability of the opener must be checked. Fixes: 24f600856418 ("cgroup-v1: Require capabilities to set release_agent") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/stable/20220216121142.GB30035@blackbody.suse.cz/ Signed-off-by: Michal Koutný Reviewed-by: Masami Ichikawa(CIP) Signed-off-by: Tejun Heo Signed-off-by: Muchun Song --- kernel/cgroup/cgroup-v1.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 80eb1891f1288..de054295c5f6a 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -550,6 +550,7 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; + struct cgroup_file_ctx *ctx; BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); @@ -557,8 +558,9 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, * Release agent gets called with all capabilities, * require capabilities to set release agent. */ - if ((of->file->f_cred->user_ns != &init_user_ns) || - !capable(CAP_SYS_ADMIN)) + ctx = of->priv; + if ((ctx->ns->user_ns != &init_user_ns) || + !file_ns_capable(of->file, &init_user_ns, CAP_SYS_ADMIN)) return -EPERM; cgrp = cgroup_kn_lock_live(of->kn, false); From 9025157b31c395f8c483e59f1c46cbff9b76a0a1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 19 Nov 2021 16:43:49 -0800 Subject: [PATCH 0016/5344] mm/damon/dbgfs: use '__GFP_NOWARN' for user-specified size buffer allocation commit db7a347b26fe05d2e8c115bb24dfd908d0252bc3 upstream. Patch series "DAMON fixes". This patch (of 2): DAMON users can trigger below warning in '__alloc_pages()' by invoking write() to some DAMON debugfs files with arbitrarily high count argument, because DAMON debugfs interface allocates some buffers based on the user-specified 'count'. if (unlikely(order >= MAX_ORDER)) { WARN_ON_ONCE(!(gfp & __GFP_NOWARN)); return NULL; } Because the DAMON debugfs interface code checks failure of the 'kmalloc()', this commit simply suppresses the warnings by adding '__GFP_NOWARN' flag. Link: https://lkml.kernel.org/r/20211110145758.16558-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211110145758.16558-2-sj@kernel.org Fixes: 4bc05954d007 ("mm/damon: implement a debugfs-based user space interface") Signed-off-by: SeongJae Park Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Muchun Song --- mm/damon/dbgfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index faee070977d80..2741ff79e8e84 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -32,7 +32,7 @@ static char *user_input_str(const char __user *buf, size_t count, loff_t *ppos) if (*ppos) return ERR_PTR(-EINVAL); - kbuf = kmalloc(count + 1, GFP_KERNEL); + kbuf = kmalloc(count + 1, GFP_KERNEL | __GFP_NOWARN); if (!kbuf) return ERR_PTR(-ENOMEM); @@ -247,7 +247,7 @@ static ssize_t dbgfs_kdamond_pid_read(struct file *file, char *kbuf; ssize_t len; - kbuf = kmalloc(count, GFP_KERNEL); + kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN); if (!kbuf) return -ENOMEM; From fc33695f6d7c89c563947ad12569d21027f7b2a9 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 7 Mar 2022 13:11:39 +0100 Subject: [PATCH 0017/5344] esp: Fix possible buffer overflow in ESP transformation commit ebe48d368e97d007bfeb76fcb065d6cfc4c96645 upstream. The maximum message size that can be send is bigger than the maximum site that skb_page_frag_refill can allocate. So it is possible to write beyond the allocated buffer. Fix this by doing a fallback to COW in that case. v2: Avoid get get_order() costs as suggested by Linus Torvalds. Fixes: cac2661c53f3 ("esp4: Avoid skb_cow_data whenever possible") Fixes: 03e2a30f6a27 ("esp6: Avoid skb_cow_data whenever possible") Reported-by: valis Signed-off-by: Steffen Klassert Signed-off-by: Vaibhav Rustagi Signed-off-by: Greg Kroah-Hartman Signed-off-by: Muchun Song --- include/net/esp.h | 2 ++ include/net/sock.h | 3 +++ net/core/sock.c | 2 -- net/ipv4/esp4.c | 5 +++++ net/ipv6/esp6.c | 5 +++++ 5 files changed, 15 insertions(+), 2 deletions(-) diff --git a/include/net/esp.h b/include/net/esp.h index 117652eb6ea32..465e38890ee98 100644 --- a/include/net/esp.h +++ b/include/net/esp.h @@ -4,6 +4,8 @@ #include +#define ESP_SKB_FRAG_MAXSIZE (PAGE_SIZE << SKB_FRAG_PAGE_ORDER) + struct ip_esp_hdr; static inline struct ip_esp_hdr *ip_esp_hdr(const struct sk_buff *skb) diff --git a/include/net/sock.h b/include/net/sock.h index 99233235ee8bd..535666196c52e 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2615,6 +2615,9 @@ extern int sysctl_optmem_max; extern __u32 sysctl_wmem_default; extern __u32 sysctl_rmem_default; + +/* On 32bit arches, an skb frag is limited to 2^15 */ +#define SKB_FRAG_PAGE_ORDER get_order(32768) DECLARE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto) diff --git a/net/core/sock.c b/net/core/sock.c index 7d22785e4a03e..c77298b2d9a1b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2350,8 +2350,6 @@ static void sk_leave_memory_pressure(struct sock *sk) } } -/* On 32bit arches, an skb frag is limited to 2^15 */ -#define SKB_FRAG_PAGE_ORDER get_order(32768) DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); /** diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 86c836fa21459..94e17790769bf 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -277,6 +277,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; + unsigned int allocsz; /* this is non-NULL only with UDP Encapsulation */ if (x->encap) { @@ -286,6 +287,10 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * return err; } + allocsz = ALIGN(skb->data_len + tailen, L1_CACHE_BYTES); + if (allocsz > ESP_SKB_FRAG_MAXSIZE) + goto cow; + if (!skb_cloned(skb)) { if (tailen <= skb_tailroom(skb)) { nfrags = 1; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 12570a73def80..06a172aa18c61 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -230,6 +230,11 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; + unsigned int allocsz; + + allocsz = ALIGN(skb->data_len + tailen, L1_CACHE_BYTES); + if (allocsz > ESP_SKB_FRAG_MAXSIZE) + goto cow; if (!skb_cloned(skb)) { if (tailen <= skb_tailroom(skb)) { From f604188e30017dc3ebe1b139becf77fb32f9ff54 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 4 Apr 2022 15:00:43 +0800 Subject: [PATCH 0018/5344] bytedance: config: enable CONFIG_PRINTK_CALLER This option is intended for environments where multiple threads concurrently call printk() for many times, for it is difficult to interpret without knowing where these lines (or sometimes individual line which was divided into multiple lines due to race) came from. Signed-off-by: Muchun Song --- config.aarch64 | 2 +- config.x86_64 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/config.aarch64 b/config.aarch64 index 7040d2b9c1651..9e939bf6057b8 100644 --- a/config.aarch64 +++ b/config.aarch64 @@ -9080,7 +9080,7 @@ CONFIG_SBITMAP=y # printk and dmesg options # CONFIG_PRINTK_TIME=y -# CONFIG_PRINTK_CALLER is not set +CONFIG_PRINTK_CALLER=y CONFIG_CONSOLE_LOGLEVEL_DEFAULT=7 CONFIG_CONSOLE_LOGLEVEL_QUIET=4 CONFIG_MESSAGE_LOGLEVEL_DEFAULT=4 diff --git a/config.x86_64 b/config.x86_64 index f0d99f2989056..372f03b499cfb 100644 --- a/config.x86_64 +++ b/config.x86_64 @@ -8912,7 +8912,7 @@ CONFIG_SBITMAP=y # printk and dmesg options # CONFIG_PRINTK_TIME=y -# CONFIG_PRINTK_CALLER is not set +CONFIG_PRINTK_CALLER=y CONFIG_CONSOLE_LOGLEVEL_DEFAULT=7 CONFIG_CONSOLE_LOGLEVEL_QUIET=4 CONFIG_MESSAGE_LOGLEVEL_DEFAULT=4 From 45b478e5002a9533497f35876e4ccbbf0b61c015 Mon Sep 17 00:00:00 2001 From: Zhang Junyu Date: Wed, 22 Sep 2021 18:12:23 +0800 Subject: [PATCH 0019/5344] bytedance: iaas: hugetlb: Introduce a sysctl to enable cleaning hpage concurrently In order to accelerate the background clean process, this commit introduces a new sysctl to let the user specify the work cpus and enable cleaning hpages concurrently. Signed-off-by: Zhang Junyu --- mm/hugetlb.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a9da159fe8d68..d75fea0930fca 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3450,8 +3450,55 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, return ret; } +/* + * The cpumask is the mask of possible cpus that can be waken + * to clean huge pages after releasing them. + */ +static struct cpumask hugetlb_background_clr_cpumask; +static unsigned long *sysctl_hugetlb_background_clr_cpumask_bits = + cpumask_bits(&hugetlb_background_clr_cpumask); + +static int hugetlb_background_clr_cpumask_handler(struct ctl_table *table, + int write, + void __user *buffer, + size_t *lenp, + loff_t *ppos) +{ + int err; + + spin_lock(&hugetlb_lock); + err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); + if (!err && write) + cpumask_and(&hugetlb_background_clr_cpumask, + &hugetlb_background_clr_cpumask, + cpu_possible_mask); + spin_unlock(&hugetlb_lock); + + return err; +} + +static struct ctl_table hugetlb_background_clr_cpumask_ctl_table[] = { + { + .procname = "hugetlb_clear_hpage_background_cpumask", + .data = &sysctl_hugetlb_background_clr_cpumask_bits, + .maxlen = NR_CPUS, + .mode = 0644, + .proc_handler = hugetlb_background_clr_cpumask_handler, + }, + { } +}; + #endif /* CONFIG_SYSCTL */ +static void __init hugetlb_init_background_clean(void) +{ +#ifdef CONFIG_SYSCTL + if (!register_sysctl("vm", hugetlb_background_clr_cpumask_ctl_table)) + pr_err("Hugetlb: Unable to register background clean sysctl"); +#endif +} +late_initcall(hugetlb_init_background_clean); + void hugetlb_report_meminfo(struct seq_file *m) { struct hstate *h; From f12278c61f0cf76ea5f8f804a44bf9b1b3785a40 Mon Sep 17 00:00:00 2001 From: Zhang Junyu Date: Thu, 31 Mar 2022 22:40:09 +0800 Subject: [PATCH 0020/5344] bytedance: iaas: hugetlb: Use per-cpu kthread instead of per-hstate Corresponding to the cpumask-based multi-thread page background cleaning, use per-cpu delayed_work instead of per-hstate to do it. Signed-off-by: Zhang Junyu --- include/linux/hugetlb.h | 1 - mm/hugetlb.c | 105 ++++++++++++++++++++++++++++------------ 2 files changed, 75 insertions(+), 31 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index af5e18ac543d6..2759fd61e8558 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -389,7 +389,6 @@ struct hstate { struct cftype cgroup_files[5]; #endif char name[HSTATE_NAME_LEN]; - struct delayed_work clear_work; }; struct huge_bootmem_page { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d75fea0930fca..32335674318be 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -906,46 +906,91 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg) static void kthread_clear_huge_pages(struct work_struct *work) { - struct hstate *h = container_of(work, struct hstate, clear_work.work); - unsigned int order = huge_page_order(h); - unsigned long i, nr_pages = 1 << order; + unsigned long i, nr_pages; struct page *page, *iter; + unsigned int order; + struct hstate *h; int nid; spin_lock(&hugetlb_lock); - for (nid = 0; nid < MAX_NUMNODES; nid++) { - while (!list_empty(&h->hugepage_wait_for_clean[nid])) { - page = list_entry( - h->hugepage_wait_for_clean[nid].next, - struct page, - lru); - list_del(&page->lru); - spin_unlock(&hugetlb_lock); + for_each_hstate(h) { + order = huge_page_order(h); + nr_pages = 1 << order; - for (i = 0, iter = page; - i < nr_pages; - i++, iter = mem_map_next(iter, page, i)) { - clear_highpage(iter); - cond_resched(); - } + for (nid = 0; nid < MAX_NUMNODES; nid++) { + while (!list_empty(&h->hugepage_wait_for_clean[nid])) { + page = list_entry( + h->hugepage_wait_for_clean[nid].next, + struct page, + lru); + list_del(&page->lru); + spin_unlock(&hugetlb_lock); - spin_lock(&hugetlb_lock); - list_add(&page->lru, &h->hugepage_freelists[nid]); + for (i = 0, iter = page; + i < nr_pages; + i++, iter = mem_map_next(iter, page, i)) { + clear_highpage(iter); + cond_resched(); + } - cond_resched_lock(&hugetlb_lock); + spin_lock(&hugetlb_lock); + list_add(&page->lru, + &h->hugepage_freelists[nid]); + + cond_resched_lock(&hugetlb_lock); + } } } spin_unlock(&hugetlb_lock); } +/* + * The cpumask is the mask of possible cpus that can be waken + * to clean huge pages after releasing them. + */ +static struct cpumask hugetlb_background_clr_cpumask; + +static DEFINE_PER_CPU(struct delayed_work, hugetlb_clean_work); + +static void wakeup_background_clean_work(void) +{ + struct delayed_work *work; + int cpu; + + if (cpumask_empty(&hugetlb_background_clr_cpumask)) { + cpu = raw_smp_processor_id(); + work = &per_cpu(hugetlb_clean_work, cpu); + schedule_delayed_work_on(cpu, work, msecs_to_jiffies(5)); + return; + } + + for_each_cpu_and(cpu, + &hugetlb_background_clr_cpumask, + cpu_online_mask) { + work = &per_cpu(hugetlb_clean_work, cpu); + schedule_delayed_work_on(cpu, work, msecs_to_jiffies(5)); + } +} + +static void flush_background_clean_work(void) +{ + struct delayed_work *work; + int cpu; + + for_each_cpu_and(cpu, + &hugetlb_background_clr_cpumask, + cpu_online_mask) { + work = &per_cpu(hugetlb_clean_work, cpu); + flush_delayed_work(work); + } +} + static void enqueue_huge_page(struct hstate *h, struct page *page) { int nid = page_to_nid(page); if (hugetlb_clear_hpage_background) { list_move(&page->lru, &h->hugepage_wait_for_clean[nid]); - schedule_delayed_work_on(raw_smp_processor_id(), - &h->clear_work, - msecs_to_jiffies(5)); + wakeup_background_clean_work(); } else { list_move(&page->lru, &h->hugepage_freelists[nid]); } @@ -965,7 +1010,7 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) if (hugetlb_clear_hpage_background && &h->hugepage_freelists[nid] == &page->lru) { spin_unlock(&hugetlb_lock); - flush_delayed_work(&h->clear_work); + flush_background_clean_work(); spin_lock(&hugetlb_lock); list_for_each_entry(page, &h->hugepage_freelists[nid], lru) @@ -3288,7 +3333,6 @@ void __init hugetlb_add_hstate(unsigned int order) h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); h->nr_huge_pages = 0; h->free_huge_pages = 0; - INIT_DELAYED_WORK(&h->clear_work, kthread_clear_huge_pages); for (i = 0; i < MAX_NUMNODES; ++i) { INIT_LIST_HEAD(&h->hugepage_freelists[i]); INIT_LIST_HEAD(&h->hugepage_wait_for_clean[i]); @@ -3450,11 +3494,6 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, return ret; } -/* - * The cpumask is the mask of possible cpus that can be waken - * to clean huge pages after releasing them. - */ -static struct cpumask hugetlb_background_clr_cpumask; static unsigned long *sysctl_hugetlb_background_clr_cpumask_bits = cpumask_bits(&hugetlb_background_clr_cpumask); @@ -3492,10 +3531,16 @@ static struct ctl_table hugetlb_background_clr_cpumask_ctl_table[] = { static void __init hugetlb_init_background_clean(void) { + int cpu; + #ifdef CONFIG_SYSCTL if (!register_sysctl("vm", hugetlb_background_clr_cpumask_ctl_table)) pr_err("Hugetlb: Unable to register background clean sysctl"); #endif + + for_each_possible_cpu(cpu) + INIT_DELAYED_WORK(per_cpu_ptr(&hugetlb_clean_work, cpu), + kthread_clear_huge_pages); } late_initcall(hugetlb_init_background_clean); From 0cd8ed547437ac2d9f91bf58acbb76e62eb43efc Mon Sep 17 00:00:00 2001 From: Zhang Junyu Date: Fri, 24 Sep 2021 07:20:01 +0000 Subject: [PATCH 0021/5344] bytedance: iaas: hugetlb: Cancel delayed cleaning page worker when cpumask is set Since the user can set the background clean cpumask whether there are running clean workers or not, this commit cancels the workers which are not in the new cpumask when set it. Signed-off-by: Zhang Junyu --- mm/hugetlb.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 32335674318be..386b151c24d61 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3504,14 +3504,29 @@ static int hugetlb_background_clr_cpumask_handler(struct ctl_table *table, loff_t *ppos) { int err; + cpumask_var_t tmpmask; + + if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; + + cpumask_copy(tmpmask, &hugetlb_background_clr_cpumask); spin_lock(&hugetlb_lock); err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); - if (!err && write) + if (!err && write) { + int cpu; + cpumask_and(&hugetlb_background_clr_cpumask, &hugetlb_background_clr_cpumask, cpu_possible_mask); + + cpumask_xor(tmpmask, tmpmask, &hugetlb_background_clr_cpumask); + for_each_cpu(cpu, tmpmask) + cancel_delayed_work(&per_cpu(hugetlb_clean_work, cpu)); + } + spin_unlock(&hugetlb_lock); + free_cpumask_var(tmpmask); return err; } From 4a8cf8ee4d936551010c9c47943f686bafc95f72 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sat, 20 Jun 2020 13:03:55 +0300 Subject: [PATCH 0022/5344] ipvs: avoid expiring many connections from timer commit f9200a52eedfe96ae8d9cbf68a363f5409a46117 upstream. Add new functions ip_vs_conn_del() and ip_vs_conn_del_put() to release many IPVS connections in process context. They are suitable for connections found in table when we do not want to overload the timers. Currently, the change is useful for the dropentry delayed work but it will be used also in following patch when flushing connections to failed destinations. Signed-off-by: Julian Anastasov Reviewed-by: Simon Horman Signed-off-by: Pablo Neira Ayuso Signed-off-by: Teng Hu --- net/netfilter/ipvs/ip_vs_conn.c | 53 +++++++++++++++++++++++---------- net/netfilter/ipvs/ip_vs_ctl.c | 6 ++-- 2 files changed, 42 insertions(+), 17 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 02f2f636798d1..b3921ae927407 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -807,6 +807,31 @@ static void ip_vs_conn_rcu_free(struct rcu_head *head) kmem_cache_free(ip_vs_conn_cachep, cp); } +/* Try to delete connection while not holding reference */ +static void ip_vs_conn_del(struct ip_vs_conn *cp) +{ + if (del_timer(&cp->timer)) { + /* Drop cp->control chain too */ + if (cp->control) + cp->timeout = 0; + ip_vs_conn_expire(&cp->timer); + } +} + +/* Try to delete connection while holding reference */ +static void ip_vs_conn_del_put(struct ip_vs_conn *cp) +{ + if (del_timer(&cp->timer)) { + /* Drop cp->control chain too */ + if (cp->control) + cp->timeout = 0; + __ip_vs_conn_put(cp); + ip_vs_conn_expire(&cp->timer); + } else { + __ip_vs_conn_put(cp); + } +} + static void ip_vs_conn_expire(struct timer_list *t) { struct ip_vs_conn *cp = from_timer(cp, t, timer); @@ -827,14 +852,17 @@ static void ip_vs_conn_expire(struct timer_list *t) /* does anybody control me? */ if (ct) { + bool has_ref = !cp->timeout && __ip_vs_conn_get(ct); + ip_vs_control_del(cp); /* Drop CTL or non-assured TPL if not used anymore */ - if (!cp->timeout && !atomic_read(&ct->n_control) && + if (has_ref && !atomic_read(&ct->n_control) && (!(ct->flags & IP_VS_CONN_F_TEMPLATE) || !(ct->state & IP_VS_CTPL_S_ASSURED))) { IP_VS_DBG(4, "drop controlling connection\n"); - ct->timeout = 0; - ip_vs_conn_expire_now(ct); + ip_vs_conn_del_put(ct); + } else if (has_ref) { + __ip_vs_conn_put(ct); } } @@ -1317,8 +1345,7 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs) drop: IP_VS_DBG(4, "drop connection\n"); - cp->timeout = 0; - ip_vs_conn_expire_now(cp); + ip_vs_conn_del(cp); } cond_resched_rcu(); } @@ -1341,19 +1368,15 @@ static void ip_vs_conn_flush(struct netns_ipvs *ipvs) hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { if (cp->ipvs != ipvs) continue; - /* As timers are expired in LIFO order, restart - * the timer of controlling connection first, so - * that it is expired after us. - */ + if (atomic_read(&cp->n_control)) + continue; cp_c = cp->control; - /* cp->control is valid only with reference to cp */ - if (cp_c && __ip_vs_conn_get(cp)) { + IP_VS_DBG(4, "del connection\n"); + ip_vs_conn_del(cp); + if (cp_c && !atomic_read(&cp_c->n_control)) { IP_VS_DBG(4, "del controlling connection\n"); - ip_vs_conn_expire_now(cp_c); - __ip_vs_conn_put(cp); + ip_vs_conn_del(cp_c); } - IP_VS_DBG(4, "del connection\n"); - ip_vs_conn_expire_now(cp); } cond_resched_rcu(); } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index f93fa0e210979..7c07d8f654d76 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -224,7 +224,8 @@ static void defense_work_handler(struct work_struct *work) update_defense_level(ipvs); if (atomic_read(&ipvs->dropentry)) ip_vs_random_dropentry(ipvs); - schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD); + queue_delayed_work(system_long_wq, &ipvs->defense_work, + DEFENSE_TIMER_PERIOD); } #endif @@ -4058,7 +4059,8 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) ipvs->sysctl_tbl = tbl; /* Schedule defense work */ INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); - schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD); + queue_delayed_work(system_long_wq, &ipvs->defense_work, + DEFENSE_TIMER_PERIOD); return 0; } From 539e927fff57f7a529c21bd1e85a92859d4ca115 Mon Sep 17 00:00:00 2001 From: Andrew Sy Kim Date: Wed, 8 Jul 2020 12:16:38 -0400 Subject: [PATCH 0023/5344] ipvs: queue delayed work to expire no destination connections if expire_nodest_conn=1 commit 35dfb013149f74c2be1ff9c78f14e6a3cd1539d1 upstream. When expire_nodest_conn=1 and a destination is deleted, IPVS does not expire the existing connections until the next matching incoming packet. If there are many connection entries from a single client to a single destination, many packets may get dropped before all the connections are expired (more likely with lots of UDP traffic). An optimization can be made where upon deletion of a destination, IPVS queues up delayed work to immediately expire any connections with a deleted destination. This ensures any reused source ports from a client (within the IPVS timeouts) are scheduled to new real servers instead of silently dropped. Signed-off-by: Andrew Sy Kim Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso Signed-off-by: Teng Hu --- include/net/ip_vs.h | 29 ++++++++++++++++++++ net/netfilter/ipvs/ip_vs_conn.c | 39 +++++++++++++++++++++++++++ net/netfilter/ipvs/ip_vs_core.c | 47 ++++++++++++++------------------- net/netfilter/ipvs/ip_vs_ctl.c | 22 +++++++++++++++ 4 files changed, 110 insertions(+), 27 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 7c37e3c3b1c79..a1540b3a27f96 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -14,6 +14,7 @@ #include /* for struct rwlock_t */ #include /* for struct atomic_t */ #include /* for struct refcount_t */ +#include #include #include @@ -885,6 +886,8 @@ struct netns_ipvs { atomic_t conn_out_counter; #ifdef CONFIG_SYSCTL + /* delayed work for expiring no dest connections */ + struct delayed_work expire_nodest_conn_work; /* 1/rate drop and drop-entry variables */ struct delayed_work defense_work; /* Work handler */ int drop_rate; @@ -1049,6 +1052,11 @@ static inline int sysctl_conn_reuse_mode(struct netns_ipvs *ipvs) return ipvs->sysctl_conn_reuse_mode; } +static inline int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) +{ + return ipvs->sysctl_expire_nodest_conn; +} + static inline int sysctl_schedule_icmp(struct netns_ipvs *ipvs) { return ipvs->sysctl_schedule_icmp; @@ -1136,6 +1144,11 @@ static inline int sysctl_conn_reuse_mode(struct netns_ipvs *ipvs) return 1; } +static inline int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) +{ + return 0; +} + static inline int sysctl_schedule_icmp(struct netns_ipvs *ipvs) { return 0; @@ -1505,6 +1518,22 @@ static inline int ip_vs_todrop(struct netns_ipvs *ipvs) static inline int ip_vs_todrop(struct netns_ipvs *ipvs) { return 0; } #endif +#ifdef CONFIG_SYSCTL +/* Enqueue delayed work for expiring no dest connections + * Only run when sysctl_expire_nodest=1 + */ +static inline void ip_vs_enqueue_expire_nodest_conns(struct netns_ipvs *ipvs) +{ + if (sysctl_expire_nodest_conn(ipvs)) + queue_delayed_work(system_long_wq, + &ipvs->expire_nodest_conn_work, 1); +} + +void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs); +#else +static inline void ip_vs_enqueue_expire_nodest_conns(struct netns_ipvs *ipvs) {} +#endif + #define IP_VS_DFWD_METHOD(dest) (atomic_read(&(dest)->conn_flags) & \ IP_VS_CONN_F_FWD_MASK) diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index b3921ae927407..a90b8eac16ac2 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1389,6 +1389,45 @@ static void ip_vs_conn_flush(struct netns_ipvs *ipvs) goto flush_again; } } + +#ifdef CONFIG_SYSCTL +void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs) +{ + int idx; + struct ip_vs_conn *cp, *cp_c; + struct ip_vs_dest *dest; + + rcu_read_lock(); + for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { + if (cp->ipvs != ipvs) + continue; + + dest = cp->dest; + if (!dest || (dest->flags & IP_VS_DEST_F_AVAILABLE)) + continue; + + if (atomic_read(&cp->n_control)) + continue; + + cp_c = cp->control; + IP_VS_DBG(4, "del connection\n"); + ip_vs_conn_del(cp); + if (cp_c && !atomic_read(&cp_c->n_control)) { + IP_VS_DBG(4, "del controlling connection\n"); + ip_vs_conn_del(cp_c); + } + } + cond_resched_rcu(); + + /* netns clean up started, abort delayed work */ + if (!ipvs->enable) + break; + } + rcu_read_unlock(); +} +#endif + /* * per netns init and exit */ diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 89aa1fc334b19..b12c43701f401 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -694,16 +694,10 @@ static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) return ipvs->sysctl_nat_icmp_send; } -static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) -{ - return ipvs->sysctl_expire_nodest_conn; -} - #else static int sysctl_snat_reroute(struct netns_ipvs *ipvs) { return 0; } static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) { return 0; } -static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return 0; } #endif @@ -2092,36 +2086,35 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int } } - if (unlikely(!cp)) { - int v; - - if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph)) - return v; - } - - IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet"); - /* Check the server status */ - if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { + if (cp && cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { /* the destination server is not available */ + if (sysctl_expire_nodest_conn(ipvs)) { + bool old_ct = ip_vs_conn_uses_old_conntrack(cp, skb); - __u32 flags = cp->flags; - - /* when timer already started, silently drop the packet.*/ - if (timer_pending(&cp->timer)) - __ip_vs_conn_put(cp); - else - ip_vs_conn_put(cp); + if (!old_ct) + cp->flags &= ~IP_VS_CONN_F_NFCT; - if (sysctl_expire_nodest_conn(ipvs) && - !(flags & IP_VS_CONN_F_ONE_PACKET)) { - /* try to expire the connection immediately */ ip_vs_conn_expire_now(cp); + __ip_vs_conn_put(cp); + if (old_ct) + return NF_DROP; + cp = NULL; + } else { + __ip_vs_conn_put(cp); + return NF_DROP; } + } - return NF_DROP; + if (unlikely(!cp)) { + int v; + + if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph)) + return v; } + IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet"); + ip_vs_in_stats(cp, skb); ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); if (cp->packet_xmit) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 7c07d8f654d76..ee25de5afd0ca 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -210,6 +210,17 @@ static void update_defense_level(struct netns_ipvs *ipvs) local_bh_enable(); } +/* Handler for delayed work for expiring no + * destination connections + */ +static void expire_nodest_conn_handler(struct work_struct *work) +{ + struct netns_ipvs *ipvs; + + ipvs = container_of(work, struct netns_ipvs, + expire_nodest_conn_work.work); + ip_vs_expire_nodest_conn_flush(ipvs); +} /* * Timer for checking the defense @@ -1164,6 +1175,12 @@ static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest, list_add(&dest->t_list, &ipvs->dest_trash); dest->idle_start = 0; spin_unlock_bh(&ipvs->dest_trash_lock); + + /* Queue up delayed work to expire all no destination connections. + * No-op when CONFIG_SYSCTL is disabled. + */ + if (!cleanup) + ip_vs_enqueue_expire_nodest_conns(ipvs); } @@ -4062,6 +4079,10 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) queue_delayed_work(system_long_wq, &ipvs->defense_work, DEFENSE_TIMER_PERIOD); + /* Init delayed work for expiring no dest conn */ + INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work, + expire_nodest_conn_handler); + return 0; } @@ -4069,6 +4090,7 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) { struct net *net = ipvs->net; + cancel_delayed_work_sync(&ipvs->expire_nodest_conn_work); cancel_delayed_work_sync(&ipvs->defense_work); cancel_work_sync(&ipvs->defense_work.work); unregister_net_sysctl_table(ipvs->sysctl_hdr); From 9241c96faa0dae717f46ea89ade9f37c41baab6a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 2 Feb 2021 23:06:36 -0800 Subject: [PATCH 0024/5344] bpf: Unbreak BPF_PROG_TYPE_KPROBE when kprobe is called via do_int3 commit 548f1191d86ccb9bde2a5305988877b7584c01eb upstream. The commit 0d00449c7a28 ("x86: Replace ist_enter() with nmi_enter()") converted do_int3 handler to be "NMI-like". That made old if (in_nmi()) check abort execution of bpf programs attached to kprobe when kprobe is firing via int3 (For example when kprobe is placed in the middle of the function). Remove the check to restore user visible behavior. Fixes: 0d00449c7a28 ("x86: Replace ist_enter() with nmi_enter()") Reported-by: Nikolay Borisov Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Tested-by: Nikolay Borisov Reviewed-by: Masami Hiramatsu Link: https://lore.kernel.org/bpf/20210203070636.70926-1-alexei.starovoitov@gmail.com Signed-off-by: hanjinke --- kernel/trace/bpf_trace.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 8409721620e49..219053748232e 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -80,9 +80,6 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { unsigned int ret; - if (in_nmi()) /* not supported yet */ - return 1; - preempt_disable(); if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { From 56872f5970af6ffe1701c53596e53d975080cb6e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 Nov 2019 11:17:59 -0800 Subject: [PATCH 0025/5344] bfq-iosched: relocate bfqg_*rwstat*() helpers commit a557f1c7fee2f2059234647fea32ed1f3c07dce2 upstream. Collect them right under #ifdef CONFIG_BFQ_CGROUP_DEBUG. The next patch will use them from !DEBUG path and this makes it easy to move them out of the ifdef block. This is pure code reorganization. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: Muchun Song --- block/bfq-cgroup.c | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 342a1cfa48c57..2276e2f15dbdb 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -1084,17 +1084,34 @@ static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, } #ifdef CONFIG_BFQ_CGROUP_DEBUG -static int bfqg_print_stat(struct seq_file *sf, void *v) +static int bfqg_print_rwstat(struct seq_file *sf, void *v) { - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, - &blkcg_policy_bfq, seq_cft(sf)->private, false); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, + &blkcg_policy_bfq, seq_cft(sf)->private, true); return 0; } -static int bfqg_print_rwstat(struct seq_file *sf, void *v) +static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) { - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, - &blkcg_policy_bfq, seq_cft(sf)->private, true); + struct blkg_rwstat_sample sum; + + blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off, &sum); + return __blkg_prfill_rwstat(sf, pd, &sum); +} + +static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq, + seq_cft(sf)->private, true); + return 0; +} + +static int bfqg_print_stat(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, + &blkcg_policy_bfq, seq_cft(sf)->private, false); return 0; } @@ -1123,15 +1140,6 @@ static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, return __blkg_prfill_u64(sf, pd, sum); } -static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - struct blkg_rwstat_sample sum; - - blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off, &sum); - return __blkg_prfill_rwstat(sf, pd, &sum); -} - static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), @@ -1140,14 +1148,6 @@ static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) return 0; } -static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq, - seq_cft(sf)->private, true); - return 0; -} - static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, int off) { From aa37ee0904cf4607ba5316d110c9765eaaf4cf91 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 Nov 2019 11:18:00 -0800 Subject: [PATCH 0026/5344] bfq-iosched: stop using blkg->stat_bytes and ->stat_ios commit fd41e60331b13b8fb35cc5048185a46de98db77c upstream. When used on cgroup1, bfq uses the blkg->stat_bytes and ->stat_ios from blk-cgroup core to populate six stat knobs. blk-cgroup core is moving away from blkg_rwstat to improve scalability and won't be able to support this usage. It isn't like the sharing gains all that much. Let's break it out to dedicated rwstat counters which are updated when on cgroup1. This makes use of bfqg_*rwstat*() helpers outside of CONFIG_BFQ_CGROUP_DEBUG. Move them out. v2: Compile fix when !CONFIG_BFQ_CGROUP_DEBUG. Signed-off-by: Tejun Heo Cc: Paolo Valente Signed-off-by: Jens Axboe Signed-off-by: Muchun Song --- block/bfq-cgroup.c | 39 +++++++++++++++++++++++++++------------ block/bfq-iosched.c | 4 ++++ block/bfq-iosched.h | 4 ++++ 3 files changed, 35 insertions(+), 12 deletions(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 2276e2f15dbdb..25c8201209af4 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -347,6 +347,14 @@ void bfqg_and_blkg_put(struct bfq_group *bfqg) bfqg_put(bfqg); } +void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq) +{ + struct bfq_group *bfqg = blkg_to_bfqg(rq->bio->bi_blkg); + + blkg_rwstat_add(&bfqg->stats.bytes, rq->cmd_flags, blk_rq_bytes(rq)); + blkg_rwstat_add(&bfqg->stats.ios, rq->cmd_flags, 1); +} + /* @stats = 0 */ static void bfqg_stats_reset(struct bfqg_stats *stats) { @@ -431,6 +439,8 @@ void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg) static void bfqg_stats_exit(struct bfqg_stats *stats) { + blkg_rwstat_exit(&stats->bytes); + blkg_rwstat_exit(&stats->ios); #ifdef CONFIG_BFQ_CGROUP_DEBUG blkg_rwstat_exit(&stats->merged); blkg_rwstat_exit(&stats->service_time); @@ -448,6 +458,10 @@ static void bfqg_stats_exit(struct bfqg_stats *stats) static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) { + if (blkg_rwstat_init(&stats->bytes, gfp) || + blkg_rwstat_init(&stats->ios, gfp)) + return -ENOMEM; + #ifdef CONFIG_BFQ_CGROUP_DEBUG if (blkg_rwstat_init(&stats->merged, gfp) || blkg_rwstat_init(&stats->service_time, gfp) || @@ -1083,7 +1097,6 @@ static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, return bfq_io_set_device_weight(of, buf, nbytes, off); } -#ifdef CONFIG_BFQ_CGROUP_DEBUG static int bfqg_print_rwstat(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, @@ -1108,6 +1121,7 @@ static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) return 0; } +#ifdef CONFIG_BFQ_CGROUP_DEBUG static int bfqg_print_stat(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, @@ -1151,7 +1165,8 @@ static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); + struct bfq_group *bfqg = blkg_to_bfqg(pd->blkg); + u64 sum = blkg_rwstat_total(&bfqg->stats.bytes); return __blkg_prfill_u64(sf, pd, sum >> 9); } @@ -1168,8 +1183,8 @@ static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, { struct blkg_rwstat_sample tmp; - blkg_rwstat_recursive_sum(pd->blkg, NULL, - offsetof(struct blkcg_gq, stat_bytes), &tmp); + blkg_rwstat_recursive_sum(pd->blkg, &blkcg_policy_bfq, + offsetof(struct bfq_group, stats.bytes), &tmp); return __blkg_prfill_u64(sf, pd, (tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]) >> 9); @@ -1252,13 +1267,13 @@ struct cftype bfq_blkcg_legacy_files[] = { /* statistics, covers only the tasks in the bfqg */ { .name = "bfq.io_service_bytes", - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_bytes, + .private = offsetof(struct bfq_group, stats.bytes), + .seq_show = bfqg_print_rwstat, }, { .name = "bfq.io_serviced", - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_ios, + .private = offsetof(struct bfq_group, stats.ios), + .seq_show = bfqg_print_rwstat, }, #ifdef CONFIG_BFQ_CGROUP_DEBUG { @@ -1295,13 +1310,13 @@ struct cftype bfq_blkcg_legacy_files[] = { /* the same statistics which cover the bfqg and its descendants */ { .name = "bfq.io_service_bytes_recursive", - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_bytes_recursive, + .private = offsetof(struct bfq_group, stats.bytes), + .seq_show = bfqg_print_rwstat_recursive, }, { .name = "bfq.io_serviced_recursive", - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_ios_recursive, + .private = offsetof(struct bfq_group, stats.ios), + .seq_show = bfqg_print_rwstat_recursive, }, #ifdef CONFIG_BFQ_CGROUP_DEBUG { diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index afe73f0b49ebf..6264db493d45d 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5497,6 +5497,10 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bool idle_timer_disabled = false; unsigned int cmd_flags; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio) + bfqg_stats_update_legacy_io(q, rq); +#endif spin_lock_irq(&bfqd->lock); if (blk_mq_sched_try_insert_merge(q, rq)) { spin_unlock_irq(&bfqd->lock); diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index de98fdfe9ea17..075182de04bca 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -809,6 +809,9 @@ struct bfq_stat { }; struct bfqg_stats { + /* basic stats */ + struct blkg_rwstat bytes; + struct blkg_rwstat ios; #ifdef CONFIG_BFQ_CGROUP_DEBUG /* number of ios merged */ struct blkg_rwstat merged; @@ -958,6 +961,7 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); /* ---------------- cgroups-support interface ---------------- */ +void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq); void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, unsigned int op); void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op); From bb3c9cc2bc6967f42b619eff91da7fe08cad675f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 Nov 2019 11:18:01 -0800 Subject: [PATCH 0027/5344] blk-throtl: stop using blkg->stat_bytes and ->stat_ios commit 7ca464383aecef5c2e32b987030187ee1e4848fb upstream. When used on cgroup1, blk-throtl uses the blkg->stat_bytes and ->stat_ios from blk-cgroup core to populate four stat knobs. blk-cgroup core is moving away from blkg_rwstat to improve scalability and won't be able to support this usage. It isn't like the sharing gains all that much. Let's break them out to dedicated rwstat counters which are updated when on cgroup1. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: Muchun Song --- block/blk-throttle.c | 70 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 61 insertions(+), 9 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index bd870f9ae4586..69356af063786 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -176,6 +176,9 @@ struct throtl_grp { unsigned int bio_cnt; /* total bios */ unsigned int bad_bio_cnt; /* bios exceeding latency threshold */ unsigned long bio_cnt_reset_time; + + struct blkg_rwstat stat_bytes; + struct blkg_rwstat stat_ios; }; /* We measure latency for request size from <= 4k to >= 1M */ @@ -489,6 +492,12 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, if (!tg) return NULL; + if (blkg_rwstat_init(&tg->stat_bytes, gfp)) + goto err_free_tg; + + if (blkg_rwstat_init(&tg->stat_ios, gfp)) + goto err_exit_stat_bytes; + throtl_service_queue_init(&tg->service_queue); for (rw = READ; rw <= WRITE; rw++) { @@ -513,6 +522,12 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD; return &tg->pd; + +err_exit_stat_bytes: + blkg_rwstat_exit(&tg->stat_bytes); +err_free_tg: + kfree(tg); + return NULL; } static void throtl_pd_init(struct blkg_policy_data *pd) @@ -611,6 +626,8 @@ static void throtl_pd_free(struct blkg_policy_data *pd) struct throtl_grp *tg = pd_to_tg(pd); del_timer_sync(&tg->service_queue.pending_timer); + blkg_rwstat_exit(&tg->stat_bytes); + blkg_rwstat_exit(&tg->stat_ios); kfree(tg); } @@ -1464,6 +1481,32 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, return tg_set_conf(of, buf, nbytes, off, false); } +static int tg_print_rwstat(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat, &blkcg_policy_throtl, + seq_cft(sf)->private, true); + return 0; +} + +static u64 tg_prfill_rwstat_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat_sample sum; + + blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_throtl, off, + &sum); + return __blkg_prfill_rwstat(sf, pd, &sum); +} + +static int tg_print_rwstat_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + tg_prfill_rwstat_recursive, &blkcg_policy_throtl, + seq_cft(sf)->private, true); + return 0; +} + static struct cftype throtl_legacy_files[] = { { .name = "throttle.read_bps_device", @@ -1491,23 +1534,23 @@ static struct cftype throtl_legacy_files[] = { }, { .name = "throttle.io_service_bytes", - .private = (unsigned long)&blkcg_policy_throtl, - .seq_show = blkg_print_stat_bytes, + .private = offsetof(struct throtl_grp, stat_bytes), + .seq_show = tg_print_rwstat, }, { .name = "throttle.io_service_bytes_recursive", - .private = (unsigned long)&blkcg_policy_throtl, - .seq_show = blkg_print_stat_bytes_recursive, + .private = offsetof(struct throtl_grp, stat_bytes), + .seq_show = tg_print_rwstat_recursive, }, { .name = "throttle.io_serviced", - .private = (unsigned long)&blkcg_policy_throtl, - .seq_show = blkg_print_stat_ios, + .private = offsetof(struct throtl_grp, stat_ios), + .seq_show = tg_print_rwstat, }, { .name = "throttle.io_serviced_recursive", - .private = (unsigned long)&blkcg_policy_throtl, - .seq_show = blkg_print_stat_ios_recursive, + .private = offsetof(struct throtl_grp, stat_ios), + .seq_show = tg_print_rwstat_recursive, }, { } /* terminate */ }; @@ -2127,7 +2170,16 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, WARN_ON_ONCE(!rcu_read_lock_held()); /* see throtl_charge_bio() */ - if (bio_flagged(bio, BIO_THROTTLED) || !tg->has_rules[rw]) + if (bio_flagged(bio, BIO_THROTTLED)) + goto out; + + if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) { + blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf, + bio->bi_iter.bi_size); + blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1); + } + + if (!tg->has_rules[rw]) goto out; spin_lock_irq(&q->queue_lock); From bf487fb83b79661bab4dfca70ae79dfec4af12a4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 Nov 2019 11:18:02 -0800 Subject: [PATCH 0028/5344] blk-cgroup: remove now unused blkg_print_stat_{bytes|ios}_recursive() commit 8a80d5d6638b7d58480a83aef49d587de63d4cbb upstream. These don't have users anymore. Remove them. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: Muchun Song --- block/blk-cgroup.c | 83 -------------------------------------- include/linux/blk-cgroup.h | 5 --- 2 files changed, 88 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index dde8d0acfb34f..42c411efa81e2 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -615,89 +615,6 @@ u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, } EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); -static u64 blkg_prfill_rwstat_field(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - struct blkg_rwstat_sample rwstat = { }; - - blkg_rwstat_read((void *)pd->blkg + off, &rwstat); - return __blkg_prfill_rwstat(sf, pd, &rwstat); -} - -/** - * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes - * @sf: seq_file to print to - * @v: unused - * - * To be used as cftype->seq_show to print blkg->stat_bytes. - * cftype->private must be set to the blkcg_policy. - */ -int blkg_print_stat_bytes(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private, - offsetof(struct blkcg_gq, stat_bytes), true); - return 0; -} -EXPORT_SYMBOL_GPL(blkg_print_stat_bytes); - -/** - * blkg_print_stat_bytes - seq_show callback for blkg->stat_ios - * @sf: seq_file to print to - * @v: unused - * - * To be used as cftype->seq_show to print blkg->stat_ios. cftype->private - * must be set to the blkcg_policy. - */ -int blkg_print_stat_ios(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private, - offsetof(struct blkcg_gq, stat_ios), true); - return 0; -} -EXPORT_SYMBOL_GPL(blkg_print_stat_ios); - -static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf, - struct blkg_policy_data *pd, - int off) -{ - struct blkg_rwstat_sample rwstat; - - blkg_rwstat_recursive_sum(pd->blkg, NULL, off, &rwstat); - return __blkg_prfill_rwstat(sf, pd, &rwstat); -} - -/** - * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes - * @sf: seq_file to print to - * @v: unused - */ -int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - blkg_prfill_rwstat_field_recursive, - (void *)seq_cft(sf)->private, - offsetof(struct blkcg_gq, stat_bytes), true); - return 0; -} -EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive); - -/** - * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios - * @sf: seq_file to print to - * @v: unused - */ -int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - blkg_prfill_rwstat_field_recursive, - (void *)seq_cft(sf)->private, - offsetof(struct blkcg_gq, stat_ios), true); - return 0; -} -EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive); - /** * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat * @blkg: blkg of interest diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index bed9e43f94260..914ce55fa8c2a 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -220,11 +220,6 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, const struct blkg_rwstat_sample *rwstat); u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, int off); -int blkg_print_stat_bytes(struct seq_file *sf, void *v); -int blkg_print_stat_ios(struct seq_file *sf, void *v); -int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v); -int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v); - void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, int off, struct blkg_rwstat_sample *sum); From 1460643fe920fceda2e3e2139e2aa730e28f41aa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 Nov 2019 11:18:03 -0800 Subject: [PATCH 0029/5344] blk-cgroup: reimplement basic IO stats using cgroup rstat commit f73316482977ac401ac37245c9df48079d4e11f3 upstream. blk-cgroup has been using blkg_rwstat to track basic IO stats. Unfortunately, reading recursive stats scales badly as itinvolves walking all descendants. On systems with a huge number of cgroups (dead or alive), this can lead to substantial CPU cost when reading IO stats. This patch reimplements basic IO stats using cgroup rstat which uses more memory but makes recursive stat reading O(# descendants which have been active since last reading) instead of O(# descendants). * blk-cgroup core no longer uses sync/async stats. Introduce new stat enums - BLKG_IOSTAT_{READ|WRITE|DISCARD}. * Add blkg_iostat[_set] which encapsulates byte and io stats, last values for propagation delta calculation and u64_stats_sync for correctness on 32bit archs. * Update the new percpu stat counters directly and implement blkcg_rstat_flush() to implement propagation. * blkg_print_stat() can now bring the stats up to date by calling cgroup_rstat_flush() and print them instead of directly summing up all descendants. * It now allocates 96 bytes per cpu. It used to be 40 bytes. Signed-off-by: Tejun Heo Cc: Dan Schatzberg Cc: Daniel Xu Signed-off-by: Jens Axboe Signed-off-by: Muchun Song --- block/blk-cgroup.c | 124 +++++++++++++++++++++++++++++-------- include/linux/blk-cgroup.h | 48 ++++++++++++-- 2 files changed, 142 insertions(+), 30 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 42c411efa81e2..8bfda04c72da0 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -80,8 +80,7 @@ static void blkg_free(struct blkcg_gq *blkg) if (blkg->pd[i]) blkcg_policy[i]->pd_free_fn(blkg->pd[i]); - blkg_rwstat_exit(&blkg->stat_ios); - blkg_rwstat_exit(&blkg->stat_bytes); + free_percpu(blkg->iostat_cpu); percpu_ref_exit(&blkg->refcnt); kfree(blkg); } @@ -146,7 +145,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, gfp_t gfp_mask) { struct blkcg_gq *blkg; - int i; + int i, cpu; /* alloc and init base part */ blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node); @@ -156,8 +155,8 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask)) goto err_free; - if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) || - blkg_rwstat_init(&blkg->stat_ios, gfp_mask)) + blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask); + if (!blkg->iostat_cpu) goto err_free; blkg->q = q; @@ -167,6 +166,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn); blkg->blkcg = blkcg; + u64_stats_init(&blkg->iostat.sync); + for_each_possible_cpu(cpu) + u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync); + for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; struct blkg_policy_data *pd; @@ -393,7 +396,6 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; - struct blkcg_gq *parent = blkg->parent; int i; lockdep_assert_held(&blkg->q->queue_lock); @@ -410,11 +412,6 @@ static void blkg_destroy(struct blkcg_gq *blkg) pol->pd_offline_fn(blkg->pd[i]); } - if (parent) { - blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes); - blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios); - } - blkg->online = false; radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); @@ -464,7 +461,7 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, { struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; - int i; + int i, cpu; mutex_lock(&blkcg_pol_mutex); spin_lock_irq(&blkcg->lock); @@ -475,8 +472,12 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, * anyway. If you get hit by a race, retry. */ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { - blkg_rwstat_reset(&blkg->stat_bytes); - blkg_rwstat_reset(&blkg->stat_ios); + for_each_possible_cpu(cpu) { + struct blkg_iostat_set *bis = + per_cpu_ptr(blkg->iostat_cpu, cpu); + memset(bis, 0, sizeof(*bis)); + } + memset(&blkg->iostat, 0, sizeof(blkg->iostat)); for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; @@ -851,16 +852,18 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct blkcg_gq *blkg; + cgroup_rstat_flush(blkcg->css.cgroup); rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + struct blkg_iostat_set *bis = &blkg->iostat; const char *dname; char *buf; - struct blkg_rwstat_sample rwstat; u64 rbytes, wbytes, rios, wios, dbytes, dios; size_t size = seq_get_buf(sf, &buf), off = 0; int i; bool has_stats = false; + unsigned seq; spin_lock_irq(&blkg->q->queue_lock); @@ -879,17 +882,16 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) */ off += scnprintf(buf+off, size-off, "%s ", dname); - blkg_rwstat_recursive_sum(blkg, NULL, - offsetof(struct blkcg_gq, stat_bytes), &rwstat); - rbytes = rwstat.cnt[BLKG_RWSTAT_READ]; - wbytes = rwstat.cnt[BLKG_RWSTAT_WRITE]; - dbytes = rwstat.cnt[BLKG_RWSTAT_DISCARD]; + do { + seq = u64_stats_fetch_begin(&bis->sync); - blkg_rwstat_recursive_sum(blkg, NULL, - offsetof(struct blkcg_gq, stat_ios), &rwstat); - rios = rwstat.cnt[BLKG_RWSTAT_READ]; - wios = rwstat.cnt[BLKG_RWSTAT_WRITE]; - dios = rwstat.cnt[BLKG_RWSTAT_DISCARD]; + rbytes = bis->cur.bytes[BLKG_IOSTAT_READ]; + wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE]; + dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD]; + rios = bis->cur.ios[BLKG_IOSTAT_READ]; + wios = bis->cur.ios[BLKG_IOSTAT_WRITE]; + dios = bis->cur.ios[BLKG_IOSTAT_DISCARD]; + } while (u64_stats_fetch_retry(&bis->sync, seq)); if (rbytes || wbytes || rios || wios) { has_stats = true; @@ -1235,6 +1237,77 @@ static int blkcg_can_attach(struct cgroup_taskset *tset) return ret; } +static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) +{ + int i; + + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] = src->bytes[i]; + dst->ios[i] = src->ios[i]; + } +} + +static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src) +{ + int i; + + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] += src->bytes[i]; + dst->ios[i] += src->ios[i]; + } +} + +static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src) +{ + int i; + + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] -= src->bytes[i]; + dst->ios[i] -= src->ios[i]; + } +} + +static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) +{ + struct blkcg *blkcg = css_to_blkcg(css); + struct blkcg_gq *blkg; + + rcu_read_lock(); + + hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + struct blkcg_gq *parent = blkg->parent; + struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu); + struct blkg_iostat cur, delta; + unsigned seq; + + /* fetch the current per-cpu values */ + do { + seq = u64_stats_fetch_begin(&bisc->sync); + blkg_iostat_set(&cur, &bisc->cur); + } while (u64_stats_fetch_retry(&bisc->sync, seq)); + + /* propagate percpu delta to global */ + u64_stats_update_begin(&blkg->iostat.sync); + blkg_iostat_set(&delta, &cur); + blkg_iostat_sub(&delta, &bisc->last); + blkg_iostat_add(&blkg->iostat.cur, &delta); + blkg_iostat_add(&bisc->last, &delta); + u64_stats_update_end(&blkg->iostat.sync); + + /* propagate global delta to parent */ + if (parent) { + u64_stats_update_begin(&parent->iostat.sync); + blkg_iostat_set(&delta, &blkg->iostat.cur); + blkg_iostat_sub(&delta, &blkg->iostat.last); + blkg_iostat_add(&parent->iostat.cur, &delta); + blkg_iostat_add(&blkg->iostat.last, &delta); + u64_stats_update_end(&parent->iostat.sync); + } + } + + rcu_read_unlock(); +} + static void blkcg_bind(struct cgroup_subsys_state *root_css) { int i; @@ -1267,6 +1340,7 @@ struct cgroup_subsys io_cgrp_subsys = { .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .can_attach = blkcg_can_attach, + .css_rstat_flush = blkcg_rstat_flush, .bind = blkcg_bind, .dfl_cftypes = blkcg_files, .legacy_cftypes = blkcg_legacy_files, diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 914ce55fa8c2a..867ab391e409c 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -15,7 +15,9 @@ */ #include +#include #include +#include #include #include #include @@ -31,6 +33,14 @@ #ifdef CONFIG_BLK_CGROUP +enum blkg_iostat_type { + BLKG_IOSTAT_READ, + BLKG_IOSTAT_WRITE, + BLKG_IOSTAT_DISCARD, + + BLKG_IOSTAT_NR, +}; + enum blkg_rwstat_type { BLKG_RWSTAT_READ, BLKG_RWSTAT_WRITE, @@ -61,6 +71,17 @@ struct blkcg { #endif }; +struct blkg_iostat { + u64 bytes[BLKG_IOSTAT_NR]; + u64 ios[BLKG_IOSTAT_NR]; +}; + +struct blkg_iostat_set { + struct u64_stats_sync sync; + struct blkg_iostat cur; + struct blkg_iostat last; +}; + /* * blkg_[rw]stat->aux_cnt is excluded for local stats but included for * recursive. Used to carry stats of dead children. @@ -127,8 +148,8 @@ struct blkcg_gq { /* is this blkg online? protected by both blkcg and q locks */ bool online; - struct blkg_rwstat stat_bytes; - struct blkg_rwstat stat_ios; + struct blkg_iostat_set __percpu *iostat_cpu; + struct blkg_iostat_set iostat; struct blkg_policy_data *pd[BLKCG_MAX_POLS]; @@ -740,15 +761,32 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, throtl = blk_throtl_bio(q, blkg, bio); if (!throtl) { + struct blkg_iostat_set *bis; + int rwd, cpu; + + if (op_is_discard(bio->bi_opf)) + rwd = BLKG_IOSTAT_DISCARD; + else if (op_is_write(bio->bi_opf)) + rwd = BLKG_IOSTAT_WRITE; + else + rwd = BLKG_IOSTAT_READ; + + cpu = get_cpu(); + bis = per_cpu_ptr(blkg->iostat_cpu, cpu); + u64_stats_update_begin(&bis->sync); + /* * If the bio is flagged with BIO_QUEUE_ENTERED it means this * is a split bio and we would have already accounted for the * size of the bio. */ if (!bio_flagged(bio, BIO_QUEUE_ENTERED)) - blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf, - bio->bi_iter.bi_size); - blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1); + bis->cur.bytes[rwd] += bio->bi_iter.bi_size; + bis->cur.ios[rwd]++; + + u64_stats_update_end(&bis->sync); + cgroup_rstat_updated(blkg->blkcg->css.cgroup, cpu); + put_cpu(); } blkcg_bio_issue_init(bio); From 151a27940bccce16d8b0ae4aba1f0807f995d6b5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 Nov 2019 11:18:04 -0800 Subject: [PATCH 0030/5344] blk-cgroup: separate out blkg_rwstat under CONFIG_BLK_CGROUP_RWSTAT commit 1d156646e0d8ec390e5d5ac288137df02d4207be upstream. blkg_rwstat is now only used by bfq-iosched and blk-throtl when on cgroup1. Let's move it into its own files and gate it behind a config option. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: Muchun Song --- block/Kconfig | 4 + block/Kconfig.iosched | 1 + block/Makefile | 1 + block/bfq-iosched.h | 2 + block/blk-cgroup-rwstat.c | 129 ++++++++++++++++++++++++++++++ block/blk-cgroup-rwstat.h | 149 ++++++++++++++++++++++++++++++++++ block/blk-cgroup.c | 97 ---------------------- block/blk-throttle.c | 1 + include/linux/blk-cgroup.h | 159 ------------------------------------- 9 files changed, 287 insertions(+), 256 deletions(-) create mode 100644 block/blk-cgroup-rwstat.c create mode 100644 block/blk-cgroup-rwstat.h diff --git a/block/Kconfig b/block/Kconfig index 41c0917ce6223..c23094a14a2be 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -32,6 +32,9 @@ config BLK_RQ_ALLOC_TIME config BLK_SCSI_REQUEST bool +config BLK_CGROUP_RWSTAT + bool + config BLK_DEV_BSG bool "Block layer SG support v4" default y @@ -86,6 +89,7 @@ config BLK_DEV_ZONED config BLK_DEV_THROTTLING bool "Block layer bio throttling support" depends on BLK_CGROUP=y + select BLK_CGROUP_RWSTAT ---help--- Block layer bio throttling support. It can be used to limit the IO rate to a device. IO rate policies are per cgroup and diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index b89310a022ad4..7df14133adc80 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -31,6 +31,7 @@ config IOSCHED_BFQ config BFQ_GROUP_IOSCHED bool "BFQ hierarchical scheduling support" depends on IOSCHED_BFQ && BLK_CGROUP + select BLK_CGROUP_RWSTAT ---help--- Enable hierarchical scheduling in BFQ, using the blkio diff --git a/block/Makefile b/block/Makefile index 9ef57ace90d46..205a5f2fef17f 100644 --- a/block/Makefile +++ b/block/Makefile @@ -16,6 +16,7 @@ obj-$(CONFIG_BLK_SCSI_REQUEST) += scsi_ioctl.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o +obj-$(CONFIG_BLK_CGROUP_RWSTAT) += blk-cgroup-rwstat.o obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 075182de04bca..2482d3229604b 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -10,6 +10,8 @@ #include #include +#include "blk-cgroup-rwstat.h" + #define BFQ_IOPRIO_CLASSES 3 #define BFQ_CL_IDLE_TIMEOUT (HZ/5) diff --git a/block/blk-cgroup-rwstat.c b/block/blk-cgroup-rwstat.c new file mode 100644 index 0000000000000..85d5790ac49b0 --- /dev/null +++ b/block/blk-cgroup-rwstat.c @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Legacy blkg rwstat helpers enabled by CONFIG_BLK_CGROUP_RWSTAT. + * Do not use in new code. + */ +#include "blk-cgroup-rwstat.h" + +int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp) +{ + int i, ret; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) { + ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp); + if (ret) { + while (--i >= 0) + percpu_counter_destroy(&rwstat->cpu_cnt[i]); + return ret; + } + atomic64_set(&rwstat->aux_cnt[i], 0); + } + return 0; +} +EXPORT_SYMBOL_GPL(blkg_rwstat_init); + +void blkg_rwstat_exit(struct blkg_rwstat *rwstat) +{ + int i; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + percpu_counter_destroy(&rwstat->cpu_cnt[i]); +} +EXPORT_SYMBOL_GPL(blkg_rwstat_exit); + +/** + * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat + * @sf: seq_file to print to + * @pd: policy private data of interest + * @rwstat: rwstat to print + * + * Print @rwstat to @sf for the device assocaited with @pd. + */ +u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + const struct blkg_rwstat_sample *rwstat) +{ + static const char *rwstr[] = { + [BLKG_RWSTAT_READ] = "Read", + [BLKG_RWSTAT_WRITE] = "Write", + [BLKG_RWSTAT_SYNC] = "Sync", + [BLKG_RWSTAT_ASYNC] = "Async", + [BLKG_RWSTAT_DISCARD] = "Discard", + }; + const char *dname = blkg_dev_name(pd->blkg); + u64 v; + int i; + + if (!dname) + return 0; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + seq_printf(sf, "%s %s %llu\n", dname, rwstr[i], + rwstat->cnt[i]); + + v = rwstat->cnt[BLKG_RWSTAT_READ] + + rwstat->cnt[BLKG_RWSTAT_WRITE] + + rwstat->cnt[BLKG_RWSTAT_DISCARD]; + seq_printf(sf, "%s Total %llu\n", dname, v); + return v; +} +EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat); + +/** + * blkg_prfill_rwstat - prfill callback for blkg_rwstat + * @sf: seq_file to print to + * @pd: policy private data of interest + * @off: offset to the blkg_rwstat in @pd + * + * prfill callback for printing a blkg_rwstat. + */ +u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + struct blkg_rwstat_sample rwstat = { }; + + blkg_rwstat_read((void *)pd + off, &rwstat); + return __blkg_prfill_rwstat(sf, pd, &rwstat); +} +EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); + +/** + * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat + * @blkg: blkg of interest + * @pol: blkcg_policy which contains the blkg_rwstat + * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg + * @sum: blkg_rwstat_sample structure containing the results + * + * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its + * online descendants and their aux counts. The caller must be holding the + * queue lock for online tests. + * + * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it + * is at @off bytes into @blkg's blkg_policy_data of the policy. + */ +void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, + int off, struct blkg_rwstat_sample *sum) +{ + struct blkcg_gq *pos_blkg; + struct cgroup_subsys_state *pos_css; + unsigned int i; + + lockdep_assert_held(&blkg->q->queue_lock); + + rcu_read_lock(); + blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { + struct blkg_rwstat *rwstat; + + if (!pos_blkg->online) + continue; + + if (pol) + rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off; + else + rwstat = (void *)pos_blkg + off; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + sum->cnt[i] = blkg_rwstat_read_counter(rwstat, i); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum); diff --git a/block/blk-cgroup-rwstat.h b/block/blk-cgroup-rwstat.h new file mode 100644 index 0000000000000..ee746919c41fc --- /dev/null +++ b/block/blk-cgroup-rwstat.h @@ -0,0 +1,149 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Legacy blkg rwstat helpers enabled by CONFIG_BLK_CGROUP_RWSTAT. + * Do not use in new code. + */ +#ifndef _BLK_CGROUP_RWSTAT_H +#define _BLK_CGROUP_RWSTAT_H + +#include + +enum blkg_rwstat_type { + BLKG_RWSTAT_READ, + BLKG_RWSTAT_WRITE, + BLKG_RWSTAT_SYNC, + BLKG_RWSTAT_ASYNC, + BLKG_RWSTAT_DISCARD, + + BLKG_RWSTAT_NR, + BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, +}; + +/* + * blkg_[rw]stat->aux_cnt is excluded for local stats but included for + * recursive. Used to carry stats of dead children. + */ +struct blkg_rwstat { + struct percpu_counter cpu_cnt[BLKG_RWSTAT_NR]; + atomic64_t aux_cnt[BLKG_RWSTAT_NR]; +}; + +struct blkg_rwstat_sample { + u64 cnt[BLKG_RWSTAT_NR]; +}; + +static inline u64 blkg_rwstat_read_counter(struct blkg_rwstat *rwstat, + unsigned int idx) +{ + return atomic64_read(&rwstat->aux_cnt[idx]) + + percpu_counter_sum_positive(&rwstat->cpu_cnt[idx]); +} + +int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp); +void blkg_rwstat_exit(struct blkg_rwstat *rwstat); +u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + const struct blkg_rwstat_sample *rwstat); +u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + int off); +void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, + int off, struct blkg_rwstat_sample *sum); + + +/** + * blkg_rwstat_add - add a value to a blkg_rwstat + * @rwstat: target blkg_rwstat + * @op: REQ_OP and flags + * @val: value to add + * + * Add @val to @rwstat. The counters are chosen according to @rw. The + * caller is responsible for synchronizing calls to this function. + */ +static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, + unsigned int op, uint64_t val) +{ + struct percpu_counter *cnt; + + if (op_is_discard(op)) + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_DISCARD]; + else if (op_is_write(op)) + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE]; + else + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ]; + + percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH); + + if (op_is_sync(op)) + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC]; + else + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC]; + + percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH); +} + +/** + * blkg_rwstat_read - read the current values of a blkg_rwstat + * @rwstat: blkg_rwstat to read + * + * Read the current snapshot of @rwstat and return it in the aux counts. + */ +static inline void blkg_rwstat_read(struct blkg_rwstat *rwstat, + struct blkg_rwstat_sample *result) +{ + int i; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + result->cnt[i] = + percpu_counter_sum_positive(&rwstat->cpu_cnt[i]); +} + +/** + * blkg_rwstat_total - read the total count of a blkg_rwstat + * @rwstat: blkg_rwstat to read + * + * Return the total count of @rwstat regardless of the IO direction. This + * function can be called without synchronization and takes care of u64 + * atomicity. + */ +static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) +{ + struct blkg_rwstat_sample tmp = { }; + + blkg_rwstat_read(rwstat, &tmp); + return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; +} + +/** + * blkg_rwstat_reset - reset a blkg_rwstat + * @rwstat: blkg_rwstat to reset + */ +static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) +{ + int i; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) { + percpu_counter_set(&rwstat->cpu_cnt[i], 0); + atomic64_set(&rwstat->aux_cnt[i], 0); + } +} + +/** + * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count + * @to: the destination blkg_rwstat + * @from: the source + * + * Add @from's count including the aux one to @to's aux count. + */ +static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to, + struct blkg_rwstat *from) +{ + u64 sum[BLKG_RWSTAT_NR]; + int i; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + sum[i] = percpu_counter_sum_positive(&from->cpu_cnt[i]); + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + atomic64_add(sum[i] + atomic64_read(&from->aux_cnt[i]), + &to->aux_cnt[i]); +} +#endif /* _BLK_CGROUP_RWSTAT_H */ diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 8bfda04c72da0..eab96a3472e8e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -561,103 +561,6 @@ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) } EXPORT_SYMBOL_GPL(__blkg_prfill_u64); -/** - * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat - * @sf: seq_file to print to - * @pd: policy private data of interest - * @rwstat: rwstat to print - * - * Print @rwstat to @sf for the device assocaited with @pd. - */ -u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, - const struct blkg_rwstat_sample *rwstat) -{ - static const char *rwstr[] = { - [BLKG_RWSTAT_READ] = "Read", - [BLKG_RWSTAT_WRITE] = "Write", - [BLKG_RWSTAT_SYNC] = "Sync", - [BLKG_RWSTAT_ASYNC] = "Async", - [BLKG_RWSTAT_DISCARD] = "Discard", - }; - const char *dname = blkg_dev_name(pd->blkg); - u64 v; - int i; - - if (!dname) - return 0; - - for (i = 0; i < BLKG_RWSTAT_NR; i++) - seq_printf(sf, "%s %s %llu\n", dname, rwstr[i], - rwstat->cnt[i]); - - v = rwstat->cnt[BLKG_RWSTAT_READ] + - rwstat->cnt[BLKG_RWSTAT_WRITE] + - rwstat->cnt[BLKG_RWSTAT_DISCARD]; - seq_printf(sf, "%s Total %llu\n", dname, v); - return v; -} -EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat); - -/** - * blkg_prfill_rwstat - prfill callback for blkg_rwstat - * @sf: seq_file to print to - * @pd: policy private data of interest - * @off: offset to the blkg_rwstat in @pd - * - * prfill callback for printing a blkg_rwstat. - */ -u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, - int off) -{ - struct blkg_rwstat_sample rwstat = { }; - - blkg_rwstat_read((void *)pd + off, &rwstat); - return __blkg_prfill_rwstat(sf, pd, &rwstat); -} -EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); - -/** - * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat - * @blkg: blkg of interest - * @pol: blkcg_policy which contains the blkg_rwstat - * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg - * @sum: blkg_rwstat_sample structure containing the results - * - * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its - * online descendants and their aux counts. The caller must be holding the - * queue lock for online tests. - * - * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it - * is at @off bytes into @blkg's blkg_policy_data of the policy. - */ -void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, - int off, struct blkg_rwstat_sample *sum) -{ - struct blkcg_gq *pos_blkg; - struct cgroup_subsys_state *pos_css; - unsigned int i; - - lockdep_assert_held(&blkg->q->queue_lock); - - rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { - struct blkg_rwstat *rwstat; - - if (!pos_blkg->online) - continue; - - if (pol) - rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off; - else - rwstat = (void *)pos_blkg + off; - - for (i = 0; i < BLKG_RWSTAT_NR; i++) - sum->cnt[i] = blkg_rwstat_read_counter(rwstat, i); - } - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum); - /* Performs queue bypass and policy enabled checks then looks up blkg. */ static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg, const struct blkcg_policy *pol, diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 69356af063786..b1729d07f1722 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -12,6 +12,7 @@ #include #include #include "blk.h" +#include "blk-cgroup-rwstat.h" /* Max dispatch from a group in 1 round */ static int throtl_grp_quantum = 8; diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 867ab391e409c..48a66738143df 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -41,17 +41,6 @@ enum blkg_iostat_type { BLKG_IOSTAT_NR, }; -enum blkg_rwstat_type { - BLKG_RWSTAT_READ, - BLKG_RWSTAT_WRITE, - BLKG_RWSTAT_SYNC, - BLKG_RWSTAT_ASYNC, - BLKG_RWSTAT_DISCARD, - - BLKG_RWSTAT_NR, - BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, -}; - struct blkcg_gq; struct blkcg { @@ -82,19 +71,6 @@ struct blkg_iostat_set { struct blkg_iostat last; }; -/* - * blkg_[rw]stat->aux_cnt is excluded for local stats but included for - * recursive. Used to carry stats of dead children. - */ -struct blkg_rwstat { - struct percpu_counter cpu_cnt[BLKG_RWSTAT_NR]; - atomic64_t aux_cnt[BLKG_RWSTAT_NR]; -}; - -struct blkg_rwstat_sample { - u64 cnt[BLKG_RWSTAT_NR]; -}; - /* * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a * request_queue (q). This is used by blkcg policies which need to track @@ -223,13 +199,6 @@ int blkcg_activate_policy(struct request_queue *q, void blkcg_deactivate_policy(struct request_queue *q, const struct blkcg_policy *pol); -static inline u64 blkg_rwstat_read_counter(struct blkg_rwstat *rwstat, - unsigned int idx) -{ - return atomic64_read(&rwstat->aux_cnt[idx]) + - percpu_counter_sum_positive(&rwstat->cpu_cnt[idx]); -} - const char *blkg_dev_name(struct blkcg_gq *blkg); void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 (*prfill)(struct seq_file *, @@ -237,12 +206,6 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, const struct blkcg_policy *pol, int data, bool show_total); u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); -u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, - const struct blkg_rwstat_sample *rwstat); -u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, - int off); -void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, - int off, struct blkg_rwstat_sample *sum); struct blkg_conf_ctx { struct gendisk *disk; @@ -594,128 +557,6 @@ static inline void blkg_put(struct blkcg_gq *blkg) if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q, false))) -static inline int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp) -{ - int i, ret; - - for (i = 0; i < BLKG_RWSTAT_NR; i++) { - ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp); - if (ret) { - while (--i >= 0) - percpu_counter_destroy(&rwstat->cpu_cnt[i]); - return ret; - } - atomic64_set(&rwstat->aux_cnt[i], 0); - } - return 0; -} - -static inline void blkg_rwstat_exit(struct blkg_rwstat *rwstat) -{ - int i; - - for (i = 0; i < BLKG_RWSTAT_NR; i++) - percpu_counter_destroy(&rwstat->cpu_cnt[i]); -} - -/** - * blkg_rwstat_add - add a value to a blkg_rwstat - * @rwstat: target blkg_rwstat - * @op: REQ_OP and flags - * @val: value to add - * - * Add @val to @rwstat. The counters are chosen according to @rw. The - * caller is responsible for synchronizing calls to this function. - */ -static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, - unsigned int op, uint64_t val) -{ - struct percpu_counter *cnt; - - if (op_is_discard(op)) - cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_DISCARD]; - else if (op_is_write(op)) - cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE]; - else - cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ]; - - percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH); - - if (op_is_sync(op)) - cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC]; - else - cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC]; - - percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH); -} - -/** - * blkg_rwstat_read - read the current values of a blkg_rwstat - * @rwstat: blkg_rwstat to read - * - * Read the current snapshot of @rwstat and return it in the aux counts. - */ -static inline void blkg_rwstat_read(struct blkg_rwstat *rwstat, - struct blkg_rwstat_sample *result) -{ - int i; - - for (i = 0; i < BLKG_RWSTAT_NR; i++) - result->cnt[i] = - percpu_counter_sum_positive(&rwstat->cpu_cnt[i]); -} - -/** - * blkg_rwstat_total - read the total count of a blkg_rwstat - * @rwstat: blkg_rwstat to read - * - * Return the total count of @rwstat regardless of the IO direction. This - * function can be called without synchronization and takes care of u64 - * atomicity. - */ -static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) -{ - struct blkg_rwstat_sample tmp = { }; - - blkg_rwstat_read(rwstat, &tmp); - return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; -} - -/** - * blkg_rwstat_reset - reset a blkg_rwstat - * @rwstat: blkg_rwstat to reset - */ -static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) -{ - int i; - - for (i = 0; i < BLKG_RWSTAT_NR; i++) { - percpu_counter_set(&rwstat->cpu_cnt[i], 0); - atomic64_set(&rwstat->aux_cnt[i], 0); - } -} - -/** - * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count - * @to: the destination blkg_rwstat - * @from: the source - * - * Add @from's count including the aux one to @to's aux count. - */ -static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to, - struct blkg_rwstat *from) -{ - u64 sum[BLKG_RWSTAT_NR]; - int i; - - for (i = 0; i < BLKG_RWSTAT_NR; i++) - sum[i] = percpu_counter_sum_positive(&from->cpu_cnt[i]); - - for (i = 0; i < BLKG_RWSTAT_NR; i++) - atomic64_add(sum[i] + atomic64_read(&from->aux_cnt[i]), - &to->aux_cnt[i]); -} - #ifdef CONFIG_BLK_DEV_THROTTLING extern bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, struct bio *bio); From 8a221c753adedf7c05fcbd6e7678767ecbd879b7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 6 Nov 2019 12:49:57 -0800 Subject: [PATCH 0031/5344] cgroup: use cgroup->last_bstat instead of cgroup->bstat_pending for consistency commit 1bb5ec2eec48dcab1d8ae3707e4a388da6a9c9dc upstream. cgroup->bstat_pending is used to determine the base stat delta to propagate to the parent. While correct, this is different from how percpu delta is determined for no good reason and the inconsistency makes the code more difficult to understand. This patch makes parent propagation delta calculation use the same method as percpu to global propagation. * cgroup_base_stat_accumulate() is renamed to cgroup_base_stat_add() and cgroup_base_stat_sub() is added. * percpu propagation calculation is updated to use the above helpers. * cgroup->bstat_pending is replaced with cgroup->last_bstat and updated to use the same calculation as percpu propagation. Signed-off-by: Tejun Heo Signed-off-by: Muchun Song --- include/linux/cgroup-defs.h | 2 +- kernel/cgroup/rstat.c | 46 ++++++++++++++++++++----------------- 2 files changed, 26 insertions(+), 22 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 1ccfa3779e18d..b22598c5a6061 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -458,7 +458,7 @@ struct cgroup { struct list_head rstat_css_list; /* cgroup basic resource statistics */ - struct cgroup_base_stat pending_bstat; /* pending from children */ + struct cgroup_base_stat last_bstat; struct cgroup_base_stat bstat; struct prev_cputime prev_cputime; /* for printing out cputime */ diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 4a942d4e9763d..41ca996568dfb 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -294,44 +294,48 @@ void __init cgroup_rstat_boot(void) * Functions for cgroup basic resource statistics implemented on top of * rstat. */ -static void cgroup_base_stat_accumulate(struct cgroup_base_stat *dst_bstat, - struct cgroup_base_stat *src_bstat) +static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat, + struct cgroup_base_stat *src_bstat) { dst_bstat->cputime.utime += src_bstat->cputime.utime; dst_bstat->cputime.stime += src_bstat->cputime.stime; dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime; } +static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, + struct cgroup_base_stat *src_bstat) +{ + dst_bstat->cputime.utime -= src_bstat->cputime.utime; + dst_bstat->cputime.stime -= src_bstat->cputime.stime; + dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime; +} + static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) { struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); - struct task_cputime *last_cputime = &rstatc->last_bstat.cputime; - struct task_cputime cputime; - struct cgroup_base_stat delta; + struct cgroup_base_stat cur, delta; unsigned seq; /* fetch the current per-cpu values */ do { seq = __u64_stats_fetch_begin(&rstatc->bsync); - cputime = rstatc->bstat.cputime; + cur.cputime = rstatc->bstat.cputime; } while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); - /* calculate the delta to propgate */ - delta.cputime.utime = cputime.utime - last_cputime->utime; - delta.cputime.stime = cputime.stime - last_cputime->stime; - delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime - - last_cputime->sum_exec_runtime; - *last_cputime = cputime; - - /* transfer the pending stat into delta */ - cgroup_base_stat_accumulate(&delta, &cgrp->pending_bstat); - memset(&cgrp->pending_bstat, 0, sizeof(cgrp->pending_bstat)); - - /* propagate delta into the global stat and the parent's pending */ - cgroup_base_stat_accumulate(&cgrp->bstat, &delta); - if (parent) - cgroup_base_stat_accumulate(&parent->pending_bstat, &delta); + /* propagate percpu delta to global */ + delta = cur; + cgroup_base_stat_sub(&delta, &rstatc->last_bstat); + cgroup_base_stat_add(&cgrp->bstat, &delta); + cgroup_base_stat_add(&rstatc->last_bstat, &delta); + + /* propagate global delta to parent */ + if (parent) { + delta = cgrp->bstat; + cgroup_base_stat_sub(&delta, &cgrp->last_bstat); + cgroup_base_stat_add(&parent->bstat, &delta); + cgroup_base_stat_add(&cgrp->last_bstat, &delta); + } } static struct cgroup_rstat_cpu * From c86e0bc20639291226dfa8303348bcd4a89d1e96 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Nov 2019 14:31:28 -0800 Subject: [PATCH 0032/5344] blk-cgroup: cgroup_rstat_updated() shouldn't be called on cgroup1 commit 496074f94b19574c77240d3b3f84cfb1097de51d upstream. Currently, cgroup rstat is supported only on cgroup2 hierarchy and rstat functions shouldn't be called on cgroup1 cgroups. While converting blk-cgroup core statistics to rstat, f73316482977 ("blk-cgroup: reimplement basic IO stats using cgroup rstat") accidentally ended up calling cgroup_rstat_updated() on cgroup1 cgroups causing crashes. Longer term, we probably should add cgroup1 support to rstat but for now let's mask the call directly. Fixes: f73316482977 ("blk-cgroup: reimplement basic IO stats using cgroup rstat") Tested-by: Faiz Abbas Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: Muchun Song --- include/linux/blk-cgroup.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 48a66738143df..19394c77ed995 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -626,7 +626,8 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, bis->cur.ios[rwd]++; u64_stats_update_end(&bis->sync); - cgroup_rstat_updated(blkg->blkcg->css.cgroup, cpu); + if (cgroup_subsys_on_dfl(io_cgrp_subsys)) + cgroup_rstat_updated(blkg->blkcg->css.cgroup, cpu); put_cpu(); } From da91a532e858d65855b2d7f79a4299a5677993d6 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Thu, 5 Dec 2019 20:53:11 +0800 Subject: [PATCH 0033/5344] bfq-iosched: Ensure bio->bi_blkg is valid before using it commit 08802ed665e47243393f43501bdafa032112eb1c upstream. bio->bi_blkg will be NULL when the issue of the request has bypassed the block layer as shown in the following oops: Internal error: Oops: 96000005 [#1] SMP CPU: 17 PID: 2996 Comm: scsi_id Not tainted 5.4.0 #4 Call trace: percpu_counter_add_batch+0x38/0x4c8 bfqg_stats_update_legacy_io+0x9c/0x280 bfq_insert_requests+0xbac/0x2190 blk_mq_sched_insert_request+0x288/0x670 blk_execute_rq_nowait+0x140/0x178 blk_execute_rq+0x8c/0x140 sg_io+0x604/0x9c0 scsi_cmd_ioctl+0xe38/0x10a8 scsi_cmd_blk_ioctl+0xac/0xe8 sd_ioctl+0xe4/0x238 blkdev_ioctl+0x590/0x20e0 block_ioctl+0x60/0x98 do_vfs_ioctl+0xe0/0x1b58 ksys_ioctl+0x80/0xd8 __arm64_sys_ioctl+0x40/0x78 el0_svc_handler+0xc4/0x270 so ensure its validity before using it. Fixes: fd41e60331b1 ("bfq-iosched: stop using blkg->stat_bytes and ->stat_ios") Signed-off-by: Hou Tao Signed-off-by: Jens Axboe Signed-off-by: Muchun Song --- block/bfq-cgroup.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 25c8201209af4..b6c8bfadcd3b3 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -351,6 +351,9 @@ void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq) { struct bfq_group *bfqg = blkg_to_bfqg(rq->bio->bi_blkg); + if (!bfqg) + return; + blkg_rwstat_add(&bfqg->stats.bytes, rq->cmd_flags, blk_rq_bytes(rq)); blkg_rwstat_add(&bfqg->stats.ios, rq->cmd_flags, 1); } From 127e727c52480fcf2b6b868ffa7c6566e91c2eac Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Mon, 1 Jun 2020 13:11:43 -0700 Subject: [PATCH 0034/5344] blk-cgroup: make iostat functions visible to stat printing commit cd1fc4b98fb5953a220d690d45b11470fd9325d6 upstream. Previously, the code which printed io.stat only needed access to the generic rstat flushing code, but since we plan to write some more specific code for preparing root cgroup stats, we need to manipulate iostat structs directly. Since declaring static functions ahead does not seem like common practice in this file, simply move the iostat functions up. We only plan to use blkg_iostat_set, but it seems better to keep them all together. Signed-off-by: Boris Burkov Acked-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: Muchun Song --- block/blk-cgroup.c | 142 ++++++++++++++++++++++----------------------- 1 file changed, 71 insertions(+), 71 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index eab96a3472e8e..b4b4255db9b0e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -750,6 +750,77 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx) } EXPORT_SYMBOL_GPL(blkg_conf_finish); +static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) +{ + int i; + + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] = src->bytes[i]; + dst->ios[i] = src->ios[i]; + } +} + +static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src) +{ + int i; + + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] += src->bytes[i]; + dst->ios[i] += src->ios[i]; + } +} + +static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src) +{ + int i; + + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] -= src->bytes[i]; + dst->ios[i] -= src->ios[i]; + } +} + +static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) +{ + struct blkcg *blkcg = css_to_blkcg(css); + struct blkcg_gq *blkg; + + rcu_read_lock(); + + hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + struct blkcg_gq *parent = blkg->parent; + struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu); + struct blkg_iostat cur, delta; + unsigned int seq; + + /* fetch the current per-cpu values */ + do { + seq = u64_stats_fetch_begin(&bisc->sync); + blkg_iostat_set(&cur, &bisc->cur); + } while (u64_stats_fetch_retry(&bisc->sync, seq)); + + /* propagate percpu delta to global */ + u64_stats_update_begin(&blkg->iostat.sync); + blkg_iostat_set(&delta, &cur); + blkg_iostat_sub(&delta, &bisc->last); + blkg_iostat_add(&blkg->iostat.cur, &delta); + blkg_iostat_add(&bisc->last, &delta); + u64_stats_update_end(&blkg->iostat.sync); + + /* propagate global delta to parent */ + if (parent) { + u64_stats_update_begin(&parent->iostat.sync); + blkg_iostat_set(&delta, &blkg->iostat.cur); + blkg_iostat_sub(&delta, &blkg->iostat.last); + blkg_iostat_add(&parent->iostat.cur, &delta); + blkg_iostat_add(&blkg->iostat.last, &delta); + u64_stats_update_end(&parent->iostat.sync); + } + } + + rcu_read_unlock(); +} + static int blkcg_print_stat(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); @@ -1140,77 +1211,6 @@ static int blkcg_can_attach(struct cgroup_taskset *tset) return ret; } -static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) -{ - int i; - - for (i = 0; i < BLKG_IOSTAT_NR; i++) { - dst->bytes[i] = src->bytes[i]; - dst->ios[i] = src->ios[i]; - } -} - -static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src) -{ - int i; - - for (i = 0; i < BLKG_IOSTAT_NR; i++) { - dst->bytes[i] += src->bytes[i]; - dst->ios[i] += src->ios[i]; - } -} - -static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src) -{ - int i; - - for (i = 0; i < BLKG_IOSTAT_NR; i++) { - dst->bytes[i] -= src->bytes[i]; - dst->ios[i] -= src->ios[i]; - } -} - -static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) -{ - struct blkcg *blkcg = css_to_blkcg(css); - struct blkcg_gq *blkg; - - rcu_read_lock(); - - hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { - struct blkcg_gq *parent = blkg->parent; - struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu); - struct blkg_iostat cur, delta; - unsigned seq; - - /* fetch the current per-cpu values */ - do { - seq = u64_stats_fetch_begin(&bisc->sync); - blkg_iostat_set(&cur, &bisc->cur); - } while (u64_stats_fetch_retry(&bisc->sync, seq)); - - /* propagate percpu delta to global */ - u64_stats_update_begin(&blkg->iostat.sync); - blkg_iostat_set(&delta, &cur); - blkg_iostat_sub(&delta, &bisc->last); - blkg_iostat_add(&blkg->iostat.cur, &delta); - blkg_iostat_add(&bisc->last, &delta); - u64_stats_update_end(&blkg->iostat.sync); - - /* propagate global delta to parent */ - if (parent) { - u64_stats_update_begin(&parent->iostat.sync); - blkg_iostat_set(&delta, &blkg->iostat.cur); - blkg_iostat_sub(&delta, &blkg->iostat.last); - blkg_iostat_add(&parent->iostat.cur, &delta); - blkg_iostat_add(&blkg->iostat.last, &delta); - u64_stats_update_end(&parent->iostat.sync); - } - } - - rcu_read_unlock(); -} - static void blkcg_bind(struct cgroup_subsys_state *root_css) { int i; From a34a32a98b3a73ac12531634ad86b6d9c70d6fa7 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Mon, 1 Jun 2020 13:12:05 -0700 Subject: [PATCH 0035/5344] blk-cgroup: show global disk stats in root cgroup io.stat commit ef45fe470e1e5410db4af87abc5d5055427945ac upstream. In order to improve consistency and usability in cgroup stat accounting, we would like to support the root cgroup's io.stat. Since the root cgroup has processes doing io even if the system has no explicitly created cgroups, we need to be careful to avoid overhead in that case. For that reason, the rstat algorithms don't handle the root cgroup, so just turning the file on wouldn't give correct statistics. To get around this, we simulate flushing the iostat struct by filling it out directly from global disk stats. The result is a root cgroup io.stat file consistent with both /proc/diskstats and io.stat. Note that in order to collect the disk stats, we needed to iterate over devices. To facilitate that, we had to change the linkage of a disk_type to external so that it can be used from blk-cgroup.c to iterate over disks. Suggested-by: Tejun Heo Signed-off-by: Boris Burkov Acked-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: Muchun Song --- Documentation/admin-guide/cgroup-v2.rst | 3 +- block/blk-cgroup.c | 57 ++++++++++++++++++++++++- block/genhd.c | 4 +- include/linux/genhd.h | 1 + 4 files changed, 58 insertions(+), 7 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 37925c07a3d2e..65a4a213bd48c 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1496,8 +1496,7 @@ IO Interface Files ~~~~~~~~~~~~~~~~~~ io.stat - A read-only nested-keyed file which exists on non-root - cgroups. + A read-only nested-keyed file. Lines are keyed by $MAJ:$MIN device numbers and not ordered. The following nested keys are defined. diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index b4b4255db9b0e..27786453d4904 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -821,12 +821,66 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) rcu_read_unlock(); } +/* + * The rstat algorithms intentionally don't handle the root cgroup to avoid + * incurring overhead when no cgroups are defined. For that reason, + * cgroup_rstat_flush in blkcg_print_stat does not actually fill out the + * iostat in the root cgroup's blkcg_gq. + * + * However, we would like to re-use the printing code between the root and + * non-root cgroups to the extent possible. For that reason, we simulate + * flushing the root cgroup's stats by explicitly filling in the iostat + * with disk level statistics. + */ +static void blkcg_fill_root_iostats(void) +{ + struct class_dev_iter iter; + struct device *dev; + + class_dev_iter_init(&iter, &block_class, NULL, &disk_type); + while ((dev = class_dev_iter_next(&iter))) { + struct gendisk *disk = dev_to_disk(dev); + struct hd_struct *part = disk_get_part(disk, 0); + struct blkcg_gq *blkg = blk_queue_root_blkg(disk->queue); + struct blkg_iostat tmp; + int cpu; + + memset(&tmp, 0, sizeof(tmp)); + for_each_possible_cpu(cpu) { + struct disk_stats *cpu_dkstats; + + cpu_dkstats = per_cpu_ptr(part->dkstats, cpu); + tmp.ios[BLKG_IOSTAT_READ] += + cpu_dkstats->ios[STAT_READ]; + tmp.ios[BLKG_IOSTAT_WRITE] += + cpu_dkstats->ios[STAT_WRITE]; + tmp.ios[BLKG_IOSTAT_DISCARD] += + cpu_dkstats->ios[STAT_DISCARD]; + // convert sectors to bytes + tmp.bytes[BLKG_IOSTAT_READ] += + cpu_dkstats->sectors[STAT_READ] << 9; + tmp.bytes[BLKG_IOSTAT_WRITE] += + cpu_dkstats->sectors[STAT_WRITE] << 9; + tmp.bytes[BLKG_IOSTAT_DISCARD] += + cpu_dkstats->sectors[STAT_DISCARD] << 9; + + u64_stats_update_begin(&blkg->iostat.sync); + blkg_iostat_set(&blkg->iostat.cur, &tmp); + u64_stats_update_end(&blkg->iostat.sync); + } + } +} + static int blkcg_print_stat(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct blkcg_gq *blkg; - cgroup_rstat_flush(blkcg->css.cgroup); + if (!seq_css(sf)->parent) + blkcg_fill_root_iostats(); + else + cgroup_rstat_flush(blkcg->css.cgroup); + rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { @@ -915,7 +969,6 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) static struct cftype blkcg_files[] = { { .name = "stat", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = blkcg_print_stat, }, { } /* terminate */ diff --git a/block/genhd.c b/block/genhd.c index ad2ac24b05e53..431fd567312f6 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -37,8 +37,6 @@ struct kobject *block_depr; static DEFINE_SPINLOCK(ext_devt_lock); static DEFINE_IDR(ext_devt_idr); -static const struct device_type disk_type; - static void disk_check_events(struct disk_events *ev, unsigned int *clearing_ptr); static void disk_alloc_events(struct gendisk *disk); @@ -1368,7 +1366,7 @@ static char *block_devnode(struct device *dev, umode_t *mode, return NULL; } -static const struct device_type disk_type = { +const struct device_type disk_type = { .name = "disk", .groups = disk_attr_groups, .release = disk_release, diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 23b845820e1dc..621c14a217217 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -26,6 +26,7 @@ #define disk_to_dev(disk) (&(disk)->part0.__dev) #define part_to_dev(part) (&((part)->__dev)) +extern const struct device_type disk_type; extern struct device_type part_type; extern struct kobject *block_depr; extern struct class block_class; From 33f9435920b4ba8eba047714fd7bdaa76454c542 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 29 Apr 2021 22:56:23 -0700 Subject: [PATCH 0036/5344] cgroup: rstat: punt root-level optimization to individual controllers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit dc26532aed0ab25c0801a34640d1f3b9b9098a48 upstream. Current users of the rstat code can source root-level statistics from the native counters of their respective subsystem, allowing them to forego aggregation at the root level. This optimization is currently implemented inside the generic rstat code, which doesn't track the root cgroup and doesn't invoke the subsystem flush callbacks on it. However, the memory controller cannot do this optimization, because cgroup1 breaks out memory specifically for the local level, including at the root level. In preparation for the memory controller switching to rstat, move the optimization from rstat core to the controllers. Afterwards, rstat will always track the root cgroup for changes and invoke the subsystem callbacks on it; and it's up to the subsystem to special-case and skip aggregation of the root cgroup if it can source this information through other, cheaper means. This is the case for the io controller and the cgroup base stats. In their respective flush callbacks, check whether the parent is the root cgroup, and if so, skip the unnecessary upward propagation. The extra cost of tracking the root cgroup is negligible: on stat changes, we actually remove a branch that checks for the root. The queueing for a flush touches only per-cpu data, and only the first stat change since a flush requires a (per-cpu) lock. Link: https://lkml.kernel.org/r/20210209163304.77088-6-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Tejun Heo Cc: Michal Hocko Cc: Michal Koutný Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Muchun Song --- block/blk-cgroup.c | 17 +++++++----- kernel/cgroup/rstat.c | 61 +++++++++++++++++++++++++------------------ 2 files changed, 47 insertions(+), 31 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 27786453d4904..586ef23e37961 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -785,6 +785,10 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; + /* Root-level stats are sourced from system-wide IO stats */ + if (!cgroup_parent(css->cgroup)) + return; + rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { @@ -807,8 +811,8 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) blkg_iostat_add(&bisc->last, &delta); u64_stats_update_end(&blkg->iostat.sync); - /* propagate global delta to parent */ - if (parent) { + /* propagate global delta to parent (unless that's root) */ + if (parent && parent->parent) { u64_stats_update_begin(&parent->iostat.sync); blkg_iostat_set(&delta, &blkg->iostat.cur); blkg_iostat_sub(&delta, &blkg->iostat.last); @@ -822,10 +826,11 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) } /* - * The rstat algorithms intentionally don't handle the root cgroup to avoid - * incurring overhead when no cgroups are defined. For that reason, - * cgroup_rstat_flush in blkcg_print_stat does not actually fill out the - * iostat in the root cgroup's blkcg_gq. + * We source root cgroup stats from the system-wide stats to avoid + * tracking the same information twice and incurring overhead when no + * cgroups are defined. For that reason, cgroup_rstat_flush in + * blkcg_print_stat does not actually fill out the iostat in the root + * cgroup's blkcg_gq. * * However, we would like to re-use the printing code between the root and * non-root cgroups to the extent possible. For that reason, we simulate diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 41ca996568dfb..047f132505e16 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -25,13 +25,8 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu) void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) { raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); - struct cgroup *parent; unsigned long flags; - /* nothing to do for root */ - if (!cgroup_parent(cgrp)) - return; - /* * Speculative already-on-list test. This may race leading to * temporary inaccuracies, which is fine. @@ -46,10 +41,10 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) raw_spin_lock_irqsave(cpu_lock, flags); /* put @cgrp and all ancestors on the corresponding updated lists */ - for (parent = cgroup_parent(cgrp); parent; - cgrp = parent, parent = cgroup_parent(cgrp)) { + while (true) { struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); - struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu); + struct cgroup *parent = cgroup_parent(cgrp); + struct cgroup_rstat_cpu *prstatc; /* * Both additions and removals are bottom-up. If a cgroup @@ -58,8 +53,17 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) if (rstatc->updated_next) break; + /* Root has no parent to link it to, but mark it busy */ + if (!parent) { + rstatc->updated_next = cgrp; + break; + } + + prstatc = cgroup_rstat_cpu(parent, cpu); rstatc->updated_next = prstatc->updated_children; prstatc->updated_children = cgrp; + + cgrp = parent; } raw_spin_unlock_irqrestore(cpu_lock, flags); @@ -114,23 +118,26 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, */ if (rstatc->updated_next) { struct cgroup *parent = cgroup_parent(pos); - struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu); - struct cgroup_rstat_cpu *nrstatc; - struct cgroup **nextp; - - nextp = &prstatc->updated_children; - while (true) { - nrstatc = cgroup_rstat_cpu(*nextp, cpu); - if (*nextp == pos) - break; - - WARN_ON_ONCE(*nextp == parent); - nextp = &nrstatc->updated_next; + + if (parent) { + struct cgroup_rstat_cpu *prstatc; + struct cgroup **nextp; + + prstatc = cgroup_rstat_cpu(parent, cpu); + nextp = &prstatc->updated_children; + while (true) { + struct cgroup_rstat_cpu *nrstatc; + + nrstatc = cgroup_rstat_cpu(*nextp, cpu); + if (*nextp == pos) + break; + WARN_ON_ONCE(*nextp == parent); + nextp = &nrstatc->updated_next; + } + *nextp = rstatc->updated_next; } - *nextp = rstatc->updated_next; rstatc->updated_next = NULL; - return pos; } @@ -312,11 +319,15 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) { - struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); + struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_base_stat cur, delta; unsigned seq; + /* Root-level stats are sourced from system-wide CPU stats */ + if (!parent) + return; + /* fetch the current per-cpu values */ do { seq = __u64_stats_fetch_begin(&rstatc->bsync); @@ -329,8 +340,8 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) cgroup_base_stat_add(&cgrp->bstat, &delta); cgroup_base_stat_add(&rstatc->last_bstat, &delta); - /* propagate global delta to parent */ - if (parent) { + /* propagate global delta to parent (unless that's root) */ + if (cgroup_parent(parent)) { delta = cgrp->bstat; cgroup_base_stat_sub(&delta, &cgrp->last_bstat); cgroup_base_stat_add(&parent->bstat, &delta); From 84fb8f5e8b5e6118bfc12dcd4c5838eecedf63d1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Jul 2021 13:12:20 -1000 Subject: [PATCH 0037/5344] cgroup: rstat: fix A-A deadlock on 32bit around u64_stats_sync commit c3df5fb57fe8756d67fd56ed29da65cdfde839f9 upstream. 0fa294fb1985 ("cgroup: Replace cgroup_rstat_mutex with a spinlock") added cgroup_rstat_flush_irqsafe() allowing flushing to happen from the irq context. However, rstat paths use u64_stats_sync to synchronize access to 64bit stat counters on 32bit machines. u64_stats_sync is implemented using seq_lock and trying to read from an irq context can lead to A-A deadlock if the irq happens to interrupt the stat update. Fix it by using the irqsafe variants - u64_stats_update_begin_irqsave() and u64_stats_update_end_irqrestore() - in the update paths. Note that none of this matters on 64bit machines. All these are just for 32bit SMP setups. Note that the interface was introduced way back, its first and currently only use was recently added by 2d146aa3aa84 ("mm: memcontrol: switch to rstat"). Stable tagging targets this commit. Signed-off-by: Tejun Heo Reported-by: Rik van Riel Fixes: 2d146aa3aa84 ("mm: memcontrol: switch to rstat") Cc: stable@vger.kernel.org # v5.13+ Signed-off-by: Muchun Song --- block/blk-cgroup.c | 14 ++++++++------ kernel/cgroup/rstat.c | 19 +++++++++++-------- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 586ef23e37961..f13d5cbc8eee9 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -795,6 +795,7 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) struct blkcg_gq *parent = blkg->parent; struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu); struct blkg_iostat cur, delta; + unsigned long flags; unsigned int seq; /* fetch the current per-cpu values */ @@ -804,21 +805,21 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) } while (u64_stats_fetch_retry(&bisc->sync, seq)); /* propagate percpu delta to global */ - u64_stats_update_begin(&blkg->iostat.sync); + flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync); blkg_iostat_set(&delta, &cur); blkg_iostat_sub(&delta, &bisc->last); blkg_iostat_add(&blkg->iostat.cur, &delta); blkg_iostat_add(&bisc->last, &delta); - u64_stats_update_end(&blkg->iostat.sync); + u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); /* propagate global delta to parent (unless that's root) */ if (parent && parent->parent) { - u64_stats_update_begin(&parent->iostat.sync); + flags = u64_stats_update_begin_irqsave(&parent->iostat.sync); blkg_iostat_set(&delta, &blkg->iostat.cur); blkg_iostat_sub(&delta, &blkg->iostat.last); blkg_iostat_add(&parent->iostat.cur, &delta); blkg_iostat_add(&blkg->iostat.last, &delta); - u64_stats_update_end(&parent->iostat.sync); + u64_stats_update_end_irqrestore(&parent->iostat.sync, flags); } } @@ -853,6 +854,7 @@ static void blkcg_fill_root_iostats(void) memset(&tmp, 0, sizeof(tmp)); for_each_possible_cpu(cpu) { struct disk_stats *cpu_dkstats; + unsigned long flags; cpu_dkstats = per_cpu_ptr(part->dkstats, cpu); tmp.ios[BLKG_IOSTAT_READ] += @@ -869,9 +871,9 @@ static void blkcg_fill_root_iostats(void) tmp.bytes[BLKG_IOSTAT_DISCARD] += cpu_dkstats->sectors[STAT_DISCARD] << 9; - u64_stats_update_begin(&blkg->iostat.sync); + flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync); blkg_iostat_set(&blkg->iostat.cur, &tmp); - u64_stats_update_end(&blkg->iostat.sync); + u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); } } } diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 047f132505e16..2e1cfe874f85d 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -350,19 +350,20 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) } static struct cgroup_rstat_cpu * -cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp) +cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags) { struct cgroup_rstat_cpu *rstatc; rstatc = get_cpu_ptr(cgrp->rstat_cpu); - u64_stats_update_begin(&rstatc->bsync); + *flags = u64_stats_update_begin_irqsave(&rstatc->bsync); return rstatc; } static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp, - struct cgroup_rstat_cpu *rstatc) + struct cgroup_rstat_cpu *rstatc, + unsigned long flags) { - u64_stats_update_end(&rstatc->bsync); + u64_stats_update_end_irqrestore(&rstatc->bsync, flags); cgroup_rstat_updated(cgrp, smp_processor_id()); put_cpu_ptr(rstatc); } @@ -370,18 +371,20 @@ static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp, void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) { struct cgroup_rstat_cpu *rstatc; + unsigned long flags; - rstatc = cgroup_base_stat_cputime_account_begin(cgrp); + rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); rstatc->bstat.cputime.sum_exec_runtime += delta_exec; - cgroup_base_stat_cputime_account_end(cgrp, rstatc); + cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags); } void __cgroup_account_cputime_field(struct cgroup *cgrp, enum cpu_usage_stat index, u64 delta_exec) { struct cgroup_rstat_cpu *rstatc; + unsigned long flags; - rstatc = cgroup_base_stat_cputime_account_begin(cgrp); + rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); switch (index) { case CPUTIME_USER: @@ -397,7 +400,7 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp, break; } - cgroup_base_stat_cputime_account_end(cgrp, rstatc); + cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags); } void cgroup_base_stat_cputime_show(struct seq_file *seq) From d4a1233e574884e2b2c49e203e33f37312bbf670 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 13 Feb 2022 16:59:02 +0800 Subject: [PATCH 0038/5344] blk-cgroup: set blkg iostat after percpu stat aggregation commit f122d103b564e5fb7c82de902c6f8f6cbdf50ec9 upstream. Don't need to do blkg_iostat_set for top blkg iostat on each CPU, so move it after percpu stat aggregation. Fixes: ef45fe470e1e ("blk-cgroup: show global disk stats in root cgroup io.stat") Signed-off-by: Chengming Zhou Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220213085902.88884-1-zhouchengming@bytedance.com Signed-off-by: Jens Axboe Signed-off-by: Muchun Song --- block/blk-cgroup.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index f13d5cbc8eee9..3358daa0ac8e3 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -850,11 +850,11 @@ static void blkcg_fill_root_iostats(void) struct blkcg_gq *blkg = blk_queue_root_blkg(disk->queue); struct blkg_iostat tmp; int cpu; + unsigned long flags; memset(&tmp, 0, sizeof(tmp)); for_each_possible_cpu(cpu) { struct disk_stats *cpu_dkstats; - unsigned long flags; cpu_dkstats = per_cpu_ptr(part->dkstats, cpu); tmp.ios[BLKG_IOSTAT_READ] += @@ -870,11 +870,11 @@ static void blkcg_fill_root_iostats(void) cpu_dkstats->sectors[STAT_WRITE] << 9; tmp.bytes[BLKG_IOSTAT_DISCARD] += cpu_dkstats->sectors[STAT_DISCARD] << 9; - - flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync); - blkg_iostat_set(&blkg->iostat.cur, &tmp); - u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); } + + flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync); + blkg_iostat_set(&blkg->iostat.cur, &tmp); + u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); } } From 8ace4579bcdea0c2d080267d5ea194bca9dcc309 Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Thu, 14 Apr 2022 13:34:26 -0700 Subject: [PATCH 0039/5344] ip6_gre: Avoid updating tunnel->tun_hlen in __gre6_xmit() commit f40c064e933d7787ca7411b699504d7a2664c1f5 upstream. Do not update tunnel->tun_hlen in data plane code. Use a local variable instead, just like "tunnel_hlen" in net/ipv4/ip_gre.c:gre_fb_xmit(). Co-developed-by: Cong Wang Signed-off-by: Cong Wang Signed-off-by: Peilin Ye Signed-off-by: David S. Miller Signed-off-by: Peilin Ye --- net/ipv6/ip6_gre.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 0cb8056d98003..40ce2f768db92 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -730,6 +730,7 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; __be16 flags; + int tun_hlen; tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || @@ -747,9 +748,9 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, dsfield = key->tos; flags = key->tun_flags & (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); - tunnel->tun_hlen = gre_calc_hlen(flags); + tun_hlen = gre_calc_hlen(flags); - gre_build_header(skb, tunnel->tun_hlen, + gre_build_header(skb, tun_hlen, flags, protocol, tunnel_id_to_key32(tun_info->key.tun_id), (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) From 154dca9e607d4d8628713ed228ca264f4854de07 Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Thu, 14 Apr 2022 13:35:40 -0700 Subject: [PATCH 0040/5344] ip6_gre: Fix skb_under_panic in __gre6_xmit() commit ab198e1d0dd8dc4bc7575fb50758e2cbd51e14e1 upstream. Feng reported an skb_under_panic BUG triggered by running test_ip6gretap() in tools/testing/selftests/bpf/test_tunnel.sh: [ 82.492551] skbuff: skb_under_panic: text:ffffffffb268bb8e len:403 put:12 head:ffff9997c5480000 data:ffff9997c547fff8 tail:0x18b end:0x2c0 dev:ip6gretap11 <...> [ 82.607380] Call Trace: [ 82.609389] [ 82.611136] skb_push.cold.109+0x10/0x10 [ 82.614289] __gre6_xmit+0x41e/0x590 [ 82.617169] ip6gre_tunnel_xmit+0x344/0x3f0 [ 82.620526] dev_hard_start_xmit+0xf1/0x330 [ 82.623882] sch_direct_xmit+0xe4/0x250 [ 82.626961] __dev_queue_xmit+0x720/0xfe0 <...> [ 82.633431] packet_sendmsg+0x96a/0x1cb0 [ 82.636568] sock_sendmsg+0x30/0x40 <...> The following sequence of events caused the BUG: 1. During ip6gretap device initialization, tunnel->tun_hlen (e.g. 4) is calculated based on old flags (see ip6gre_calc_hlen()); 2. packet_snd() reserves header room for skb A, assuming tunnel->tun_hlen is 4; 3. Later (in clsact Qdisc), the eBPF program sets a new tunnel key for skb A using bpf_skb_set_tunnel_key() (see _ip6gretap_set_tunnel()); 4. __gre6_xmit() detects the new tunnel key, and recalculates "tun_hlen" (e.g. 12) based on new flags (e.g. TUNNEL_KEY and TUNNEL_SEQ); 5. gre_build_header() calls skb_push() with insufficient reserved header room, triggering the BUG. As sugguested by Cong, fix it by moving the call to skb_cow_head() after the recalculation of tun_hlen. Reproducer: OBJ=$LINUX/tools/testing/selftests/bpf/test_tunnel_kern.o ip netns add at_ns0 ip link add veth0 type veth peer name veth1 ip link set veth0 netns at_ns0 ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0 ip netns exec at_ns0 ip link set dev veth0 up ip link set dev veth1 up mtu 1500 ip addr add dev veth1 172.16.1.200/24 ip netns exec at_ns0 ip addr add ::11/96 dev veth0 ip netns exec at_ns0 ip link set dev veth0 up ip addr add dev veth1 ::22/96 ip link set dev veth1 up ip netns exec at_ns0 \ ip link add dev ip6gretap00 type ip6gretap seq flowlabel 0xbcdef key 2 \ local ::11 remote ::22 ip netns exec at_ns0 ip addr add dev ip6gretap00 10.1.1.100/24 ip netns exec at_ns0 ip addr add dev ip6gretap00 fc80::100/96 ip netns exec at_ns0 ip link set dev ip6gretap00 up ip link add dev ip6gretap11 type ip6gretap external ip addr add dev ip6gretap11 10.1.1.200/24 ip addr add dev ip6gretap11 fc80::200/24 ip link set dev ip6gretap11 up tc qdisc add dev ip6gretap11 clsact tc filter add dev ip6gretap11 egress bpf da obj $OBJ sec ip6gretap_set_tunnel tc filter add dev ip6gretap11 ingress bpf da obj $OBJ sec ip6gretap_get_tunnel ping6 -c 3 -w 10 -q ::11 Fixes: 6712abc168eb ("ip6_gre: add ip6 gre and gretap collect_md mode") Reported-by: Feng Zhou Co-developed-by: Cong Wang Signed-off-by: Cong Wang Signed-off-by: Peilin Ye Signed-off-by: David S. Miller Signed-off-by: Peilin Ye --- net/ipv6/ip6_gre.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 40ce2f768db92..1469b8b1f8940 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -720,9 +720,6 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, else fl6->daddr = tunnel->parms.raddr; - if (skb_cow_head(skb, dev->needed_headroom ?: tunnel->hlen)) - return -ENOMEM; - /* Push GRE header. */ protocol = (dev->type == ARPHRD_ETHER) ? htons(ETH_P_TEB) : proto; @@ -750,6 +747,9 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); tun_hlen = gre_calc_hlen(flags); + if (skb_cow_head(skb, dev->needed_headroom ?: tun_hlen + tunnel->encap_hlen)) + return -ENOMEM; + gre_build_header(skb, tun_hlen, flags, protocol, tunnel_id_to_key32(tun_info->key.tun_id), @@ -760,6 +760,9 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, if (tunnel->parms.o_flags & TUNNEL_SEQ) tunnel->o_seqno++; + if (skb_cow_head(skb, dev->needed_headroom ?: tunnel->hlen)) + return -ENOMEM; + gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, protocol, tunnel->parms.o_key, htonl(tunnel->o_seqno)); From 9892f50a020a0fa443b64f6b4724d8e17d3124b0 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 22 Apr 2022 14:59:27 +0800 Subject: [PATCH 0041/5344] bytedance: config: enable CONFIG_CGROUP_HUGETLB This option add a new controller that allows us to control HugeTLB allocations. Signed-off-by: Qi Zheng --- config.aarch64 | 2 +- config.x86_64 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/config.aarch64 b/config.aarch64 index 9e939bf6057b8..95d54dd58f8cf 100644 --- a/config.aarch64 +++ b/config.aarch64 @@ -139,7 +139,7 @@ CONFIG_CFS_BANDWIDTH=y CONFIG_CGROUP_PIDS=y CONFIG_CGROUP_RDMA=y CONFIG_CGROUP_FREEZER=y -# CONFIG_CGROUP_HUGETLB is not set +CONFIG_CGROUP_HUGETLB=y CONFIG_CPUSETS=y CONFIG_PROC_PID_CPUSET=y CONFIG_CGROUP_DEVICE=y diff --git a/config.x86_64 b/config.x86_64 index 372f03b499cfb..7517987f3226e 100644 --- a/config.x86_64 +++ b/config.x86_64 @@ -157,7 +157,7 @@ CONFIG_CFS_BANDWIDTH=y CONFIG_CGROUP_PIDS=y # CONFIG_CGROUP_RDMA is not set CONFIG_CGROUP_FREEZER=y -# CONFIG_CGROUP_HUGETLB is not set +CONFIG_CGROUP_HUGETLB=y CONFIG_CPUSETS=y CONFIG_PROC_PID_CPUSET=y CONFIG_CGROUP_DEVICE=y From 6715fc847bbefdfbe7139bca6e8ad207e7106049 Mon Sep 17 00:00:00 2001 From: Feng Zhou Date: Mon, 24 Jan 2022 18:36:16 +0800 Subject: [PATCH 0042/5344] bytedance: bpf: Optimized compatibility with private helper functions Since the private helper function (unsafe_helper) doesn't merge to the kernel of the community. In order to ensure that the order of the new helper functions which backported from community kernel is consistent, Now the private helper function needs to be placed last. But it's not good enough. So set BPF_FUNC_unsafe_helper to be INT_MAX away from the __BPF_FUNC_MAX_ID. Signed-off-by: Feng Zhou --- include/uapi/linux/bpf.h | 17 ++++++++++------- kernel/bpf/verifier.c | 4 +++- tools/include/uapi/linux/bpf.h | 35 ++++++++++++++++++---------------- 3 files changed, 32 insertions(+), 24 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e119122e85731..f41386fc9041f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -10,6 +10,7 @@ #include #include +#include /* Extended instruction set based on top of classic BPF */ @@ -3737,13 +3738,11 @@ union bpf_attr { FN(bpf_per_cpu_ptr), \ FN(bpf_this_cpu_ptr), \ FN(redirect_peer), \ - FN(unsafe_helper), \ - /* - * unsafe_helper is the helper functions we implement - * ourselves. In order to ensure that the order of the new helper - * functions which backported from community kernel is consistent, the - * unsafe_helper need to be be placed last. - */ + /* */ + +#ifndef INT_MAX +#define INT_MAX 0x7fffffff +#endif /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3752,6 +3751,10 @@ union bpf_attr { enum bpf_func_id { __BPF_FUNC_MAPPER(__BPF_ENUM_FN) __BPF_FUNC_MAX_ID, + + __BPF_BYTEDANCE_FUNC_MIN_ID = INT_MAX / 2, + __BPF_ENUM_FN(unsafe_helper), + __BPF_BYTEDANCE_FUNC_MAX_ID, }; #undef __BPF_ENUM_FN diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 280a2bac4ef6f..5e2456109405a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4007,7 +4007,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn int i, err; /* find function prototype */ - if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { + if (func_id < 0 || (func_id >= __BPF_FUNC_MAX_ID && + func_id <= __BPF_BYTEDANCE_FUNC_MIN_ID) || + func_id >= __BPF_BYTEDANCE_FUNC_MAX_ID) { verbose(env, "invalid func %s#%d\n", func_id_name(func_id), func_id); return -EINVAL; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index bb6a3e932d98c..ec2e4c24a6cbd 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -10,6 +10,7 @@ #include #include +#include /* Extended instruction set based on top of classic BPF */ @@ -3737,13 +3738,11 @@ union bpf_attr { FN(bpf_per_cpu_ptr), \ FN(bpf_this_cpu_ptr), \ FN(redirect_peer), \ - FN(unsafe_helper), \ - /* - * unsafe_helper is the helper functions we implement - * ourselves. In order to ensure that the order of the new helper - * functions which backported from community kernel is consistent, the - * unsafe_helper need to be be placed last. - */ + /* */ + +#ifndef INT_MAX +#define INT_MAX 0x7fffffff +#endif /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3752,6 +3751,10 @@ union bpf_attr { enum bpf_func_id { __BPF_FUNC_MAPPER(__BPF_ENUM_FN) __BPF_FUNC_MAX_ID, + + __BPF_BYTEDANCE_FUNC_MIN_ID = INT_MAX / 2, + __BPF_ENUM_FN(unsafe_helper), + __BPF_BYTEDANCE_FUNC_MAX_ID, }; #undef __BPF_ENUM_FN @@ -4043,15 +4046,6 @@ struct bpf_sock_tuple { }; }; -struct bpf_unsafe_ctx { - void *ctx; - __u16 mod; - __u16 cmd; - __u16 flags; - __u16 data_len; - char data[]; -}; - struct bpf_xdp_sock { __u32 queue_id; }; @@ -4071,6 +4065,15 @@ enum xdp_action { XDP_REDIRECT, }; +struct bpf_unsafe_ctx { + void *ctx; + __u16 mod; + __u16 cmd; + __u16 flags; + __u16 data_len; + char data[]; +}; + /* user accessible metadata for XDP packet hook * new fields must be added to the end of this structure */ From 6a975cba3d0c3570df4983d9bfee7e32a78885f6 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 19 May 2020 11:39:53 -0700 Subject: [PATCH 0043/5344] tools/bpf: sync bpf.h commit fb53d3b63743585ce918094d6109a3865fa66e5f upstream. Sync tools/include/uapi/linux/bpf.h from include/uapi. Signed-off-by: Alexei Starovoitov Signed-off-by: Feng Zhou --- tools/include/uapi/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index ec2e4c24a6cbd..f41386fc9041f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -74,7 +74,7 @@ struct bpf_insn { /* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */ struct bpf_lpm_trie_key { __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ - __u8 data[]; /* Arbitrary size */ + __u8 data[0]; /* Arbitrary size */ }; struct bpf_cgroup_storage_key { From e3517034b8cedfa3290a6c8435353347c3a10f2e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 17 Jan 2020 11:54:35 -0800 Subject: [PATCH 0044/5344] iocost: Fix iocost_monitor.py due to helper type mismatch commit 9ea37e24d4a95dd934a0600d65caa25e409705bb upstream. iocost_monitor.py broke with recent versions of drgn due to helper being stricter about types. Fix it so that it uses the correct type. Signed-off-by: Tejun Heo Suggested-by: Omar Sandoval Signed-off-by: hanjinke --- tools/cgroup/iocost_monitor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/cgroup/iocost_monitor.py b/tools/cgroup/iocost_monitor.py index b8c082c9fd7d5..103605f5be8c0 100644 --- a/tools/cgroup/iocost_monitor.py +++ b/tools/cgroup/iocost_monitor.py @@ -72,7 +72,7 @@ def walk(self, blkcg, q_id, parent_path): name = BlkgIterator.blkcg_name(blkcg) path = parent_path + '/' + name if parent_path else name blkg = drgn.Object(prog, 'struct blkcg_gq', - address=radix_tree_lookup(blkcg.blkg_tree, q_id)) + address=radix_tree_lookup(blkcg.blkg_tree.address_of_(), q_id)) if not blkg.address_: return @@ -233,7 +233,7 @@ def table_row_str(self, path): root_iocg = None ioc = None -for i, ptr in radix_tree_for_each(blkcg_root.blkg_tree): +for i, ptr in radix_tree_for_each(blkcg_root.blkg_tree.address_of_()): blkg = drgn.Object(prog, 'struct blkcg_gq', address=ptr) try: if devname == blkg.q.kobj.parent.name.string_().decode('utf-8'): From 65fe62b7279de70f2f3b5c16f379638fddeefec3 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 28 Jun 2021 19:35:41 -0700 Subject: [PATCH 0045/5344] writeback, cgroup: do not switch inodes with I_WILL_FREE flag commit 4ade5867b4b878b00a4526b8621442f9442536ce upstream. Patch series "cgroup, blkcg: prevent dirty inodes to pin dying memory cgroups", v9. When an inode is getting dirty for the first time it's associated with a wb structure (see __inode_attach_wb()). It can later be switched to another wb (if e.g. some other cgroup is writing a lot of data to the same inode), but otherwise stays attached to the original wb until being reclaimed. The problem is that the wb structure holds a reference to the original memory and blkcg cgroups. So if an inode has been dirty once and later is actively used in read-only mode, it has a good chance to pin down the original memory and blkcg cgroups forever. This is often the case with services bringing data for other services, e.g. updating some rpm packages. In the real life it becomes a problem due to a large size of the memcg structure, which can easily be 1000x larger than an inode. Also a really large number of dying cgroups can raise different scalability issues, e.g. making the memory reclaim costly and less effective. To solve the problem inodes should be eventually detached from the corresponding writeback structure. It's inefficient to do it after every writeback completion. Instead it can be done whenever the original memory cgroup is offlined and writeback structure is getting killed. Scanning over a (potentially long) list of inodes and detach them from the writeback structure can take quite some time. To avoid scanning all inodes, attached inodes are kept on a new list (b_attached). To make it less noticeable to a user, the scanning and switching is performed from a work context. Big thanks to Jan Kara, Dennis Zhou, Hillf Danton and Tejun Heo for their ideas and contribution to this patchset. This patch (of 8): If an inode's state has I_WILL_FREE flag set, the inode will be freed soon, so there is no point in trying to switch the inode to a different cgwb. I_WILL_FREE was ignored since the introduction of the inode switching, so it looks like it doesn't lead to any noticeable issues for a user. This is why the patch is not intended for a stable backport. Link: https://lkml.kernel.org/r/20210608230225.2078447-1-guro@fb.com Link: https://lkml.kernel.org/r/20210608230225.2078447-2-guro@fb.com Signed-off-by: Roman Gushchin Suggested-by: Jan Kara Acked-by: Tejun Heo Reviewed-by: Jan Kara Acked-by: Dennis Zhou Cc: Alexander Viro Cc: Dave Chinner Cc: Jan Kara Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Qi Zheng --- fs/fs-writeback.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 22e9c88f3960a..6d46570653a6a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -389,10 +389,10 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) xa_lock_irq(&mapping->i_pages); /* - * Once I_FREEING is visible under i_lock, the eviction path owns - * the inode and we shouldn't modify ->i_io_list. + * Once I_FREEING or I_WILL_FREE are visible under i_lock, the eviction + * path owns the inode and we shouldn't modify ->i_io_list. */ - if (unlikely(inode->i_state & I_FREEING)) + if (unlikely(inode->i_state & (I_FREEING | I_WILL_FREE))) goto skip_switch; trace_inode_switch_wbs(inode, old_wb, new_wb); @@ -524,7 +524,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) /* while holding I_WB_SWITCH, no one else can update the association */ spin_lock(&inode->i_lock); if (!(inode->i_sb->s_flags & SB_ACTIVE) || - inode->i_state & (I_WB_SWITCH | I_FREEING) || + inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) || inode_to_wb(inode) == isw->new_wb) { spin_unlock(&inode->i_lock); goto out_free; From d67ad9a44f371355658a9f014d83ba0a37b09543 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 28 Jun 2021 19:35:44 -0700 Subject: [PATCH 0046/5344] writeback, cgroup: add smp_mb() to cgroup_writeback_umount() commit 592fa002180af3425ba962b8e74edd680f0ec77b upstream. A full memory barrier is required between clearing SB_ACTIVE flag in generic_shutdown_super() and checking isw_nr_in_flight in cgroup_writeback_umount(), otherwise a new switch operation might be scheduled after atomic_read(&isw_nr_in_flight) returned 0. This would result in a non-flushed isw_wq, and a potential crash. The problem hasn't yet been seen in the real life and was discovered by Jan Kara by looking into the code. Link: https://lkml.kernel.org/r/20210608230225.2078447-3-guro@fb.com Signed-off-by: Roman Gushchin Suggested-by: Jan Kara Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Dave Chinner Cc: Dennis Zhou Cc: Jan Kara Cc: Tejun Heo Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Qi Zheng --- fs/fs-writeback.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 6d46570653a6a..478e00880c43d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1006,6 +1006,12 @@ int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr, */ void cgroup_writeback_umount(void) { + /* + * SB_ACTIVE should be reliably cleared before checking + * isw_nr_in_flight, see generic_shutdown_super(). + */ + smp_mb(); + if (atomic_read(&isw_nr_in_flight)) { /* * Use rcu_barrier() to wait for all pending callbacks to From 7b9cebdbefa8e2340aba40222c2000e37571cbca Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 28 Jun 2021 19:35:50 -0700 Subject: [PATCH 0047/5344] writeback, cgroup: switch to rcu_work API in inode_switch_wbs() commit 29264d92a0f157f3147129066d912718b99fc6b0 upstream. Inode's wb switching requires two steps divided by an RCU grace period. It's currently implemented as an RCU callback inode_switch_wbs_rcu_fn(), which schedules inode_switch_wbs_work_fn() as a work. Switching to the rcu_work API allows to do the same in a cleaner and slightly shorter form. Link: https://lkml.kernel.org/r/20210608230225.2078447-5-guro@fb.com Signed-off-by: Roman Gushchin Reviewed-by: Jan Kara Acked-by: Tejun Heo Acked-by: Dennis Zhou Cc: Alexander Viro Cc: Dave Chinner Cc: Jan Kara Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Qi Zheng --- fs/fs-writeback.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 478e00880c43d..6c2966ffa51ef 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -335,8 +335,7 @@ struct inode_switch_wbs_context { struct inode *inode; struct bdi_writeback *new_wb; - struct rcu_head rcu_head; - struct work_struct work; + struct rcu_work work; }; static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) @@ -352,7 +351,7 @@ static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) static void inode_switch_wbs_work_fn(struct work_struct *work) { struct inode_switch_wbs_context *isw = - container_of(work, struct inode_switch_wbs_context, work); + container_of(to_rcu_work(work), struct inode_switch_wbs_context, work); struct inode *inode = isw->inode; struct backing_dev_info *bdi = inode_to_bdi(inode); struct address_space *mapping = inode->i_mapping; @@ -469,16 +468,6 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) atomic_dec(&isw_nr_in_flight); } -static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head) -{ - struct inode_switch_wbs_context *isw = container_of(rcu_head, - struct inode_switch_wbs_context, rcu_head); - - /* needs to grab bh-unsafe locks, bounce to work item */ - INIT_WORK(&isw->work, inode_switch_wbs_work_fn); - queue_work(isw_wq, &isw->work); -} - /** * inode_switch_wbs - change the wb association of an inode * @inode: target inode @@ -541,7 +530,8 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) * lock so that stat transfer can synchronize against them. * Let's continue after I_WB_SWITCH is guaranteed to be visible. */ - call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); + INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); + queue_rcu_work(isw_wq, &isw->work); return; out_free: From e4158cebc935aae28dfa547cee46f2bc3626c7b9 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 28 Jun 2021 19:35:53 -0700 Subject: [PATCH 0048/5344] writeback, cgroup: keep list of inodes attached to bdi_writeback commit f3b6a6df38aa514d97e8c6fcc748be1d4142bec9 upstream. Currently there is no way to iterate over inodes attached to a specific cgwb structure. It limits the ability to efficiently reclaim the writeback structure itself and associated memory and block cgroup structures without scanning all inodes belonging to a sb, which can be prohibitively expensive. While dirty/in-active-writeback an inode belongs to one of the bdi_writeback's io lists: b_dirty, b_io, b_more_io and b_dirty_time. Once cleaned up, it's removed from all io lists. So the inode->i_io_list can be reused to maintain the list of inodes, attached to a bdi_writeback structure. This patch introduces a new wb->b_attached list, which contains all inodes which were dirty at least once and are attached to the given cgwb. Inodes attached to the root bdi_writeback structures are never placed on such list. The following patch will use this list to try to release cgwbs structures more efficiently. Link: https://lkml.kernel.org/r/20210608230225.2078447-6-guro@fb.com Signed-off-by: Roman Gushchin Suggested-by: Jan Kara Reviewed-by: Jan Kara Acked-by: Tejun Heo Acked-by: Dennis Zhou Cc: Alexander Viro Cc: Dave Chinner Cc: Jan Kara Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Qi Zheng --- fs/fs-writeback.c | 93 ++++++++++++++++++++------------ include/linux/backing-dev-defs.h | 1 + mm/backing-dev.c | 2 + 3 files changed, 62 insertions(+), 34 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 6c2966ffa51ef..6c511530f744f 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -131,25 +131,6 @@ static bool inode_io_list_move_locked(struct inode *inode, return false; } -/** - * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list - * @inode: inode to be removed - * @wb: bdi_writeback @inode is being removed from - * - * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and - * clear %WB_has_dirty_io if all are empty afterwards. - */ -static void inode_io_list_del_locked(struct inode *inode, - struct bdi_writeback *wb) -{ - assert_spin_locked(&wb->list_lock); - assert_spin_locked(&inode->i_lock); - - inode->i_state &= ~I_SYNC_QUEUED; - list_del_init(&inode->i_io_list); - wb_io_lists_depopulated(wb); -} - static void wb_wakeup(struct bdi_writeback *wb) { spin_lock_bh(&wb->work_lock); @@ -278,6 +259,28 @@ void __inode_attach_wb(struct inode *inode, struct page *page) } EXPORT_SYMBOL_GPL(__inode_attach_wb); +/** + * inode_cgwb_move_to_attached - put the inode onto wb->b_attached list + * @inode: inode of interest with i_lock held + * @wb: target bdi_writeback + * + * Remove the inode from wb's io lists and if necessarily put onto b_attached + * list. Only inodes attached to cgwb's are kept on this list. + */ +static void inode_cgwb_move_to_attached(struct inode *inode, + struct bdi_writeback *wb) +{ + assert_spin_locked(&wb->list_lock); + assert_spin_locked(&inode->i_lock); + + inode->i_state &= ~I_SYNC_QUEUED; + if (wb != &wb->bdi->wb) + list_move(&inode->i_io_list, &wb->b_attached); + else + list_del_init(&inode->i_io_list); + wb_io_lists_depopulated(wb); +} + /** * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it * @inode: inode of interest with i_lock held @@ -418,21 +421,28 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) wb_get(new_wb); /* - * Transfer to @new_wb's IO list if necessary. The specific list - * @inode was on is ignored and the inode is put on ->b_dirty which - * is always correct including from ->b_dirty_time. The transfer - * preserves @inode->dirtied_when ordering. + * Transfer to @new_wb's IO list if necessary. If the @inode is dirty, + * the specific list @inode was on is ignored and the @inode is put on + * ->b_dirty which is always correct including from ->b_dirty_time. + * The transfer preserves @inode->dirtied_when ordering. If the @inode + * was clean, it means it was on the b_attached list, so move it onto + * the b_attached list of @new_wb. */ if (!list_empty(&inode->i_io_list)) { - struct inode *pos; - - inode_io_list_del_locked(inode, old_wb); inode->i_wb = new_wb; - list_for_each_entry(pos, &new_wb->b_dirty, i_io_list) - if (time_after_eq(inode->dirtied_when, - pos->dirtied_when)) - break; - inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev); + + if (inode->i_state & I_DIRTY_ALL) { + struct inode *pos; + + list_for_each_entry(pos, &new_wb->b_dirty, i_io_list) + if (time_after_eq(inode->dirtied_when, + pos->dirtied_when)) + break; + inode_io_list_move_locked(inode, new_wb, + pos->i_io_list.prev); + } else { + inode_cgwb_move_to_attached(inode, new_wb); + } } else { inode->i_wb = new_wb; } @@ -1026,6 +1036,17 @@ fs_initcall(cgroup_writeback_init); static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { } static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { } +static void inode_cgwb_move_to_attached(struct inode *inode, + struct bdi_writeback *wb) +{ + assert_spin_locked(&wb->list_lock); + assert_spin_locked(&inode->i_lock); + + inode->i_state &= ~I_SYNC_QUEUED; + list_del_init(&inode->i_io_list); + wb_io_lists_depopulated(wb); +} + static struct bdi_writeback * locked_inode_to_wb_and_lock_list(struct inode *inode) __releases(&inode->i_lock) @@ -1127,7 +1148,11 @@ void inode_io_list_del(struct inode *inode) wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); - inode_io_list_del_locked(inode, wb); + + inode->i_state &= ~I_SYNC_QUEUED; + list_del_init(&inode->i_io_list); + wb_io_lists_depopulated(wb); + spin_unlock(&inode->i_lock); spin_unlock(&wb->list_lock); } @@ -1439,7 +1464,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, inode->i_state &= ~I_SYNC_QUEUED; } else { /* The inode is clean. Remove from writeback lists. */ - inode_io_list_del_locked(inode, wb); + inode_cgwb_move_to_attached(inode, wb); } } @@ -1582,7 +1607,7 @@ static int writeback_single_inode(struct inode *inode, * touch it. See comment above for explanation. */ if (!(inode->i_state & I_DIRTY_ALL)) - inode_io_list_del_locked(inode, wb); + inode_cgwb_move_to_attached(inode, wb); spin_unlock(&wb->list_lock); inode_sync_complete(inode); out: diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 2849bdbb3acbe..95bf05d137783 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -177,6 +177,7 @@ struct bdi_writeback { struct cgroup_subsys_state *blkcg_css; /* and blkcg */ struct list_head memcg_node; /* anchored at memcg->cgwb_list */ struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */ + struct list_head b_attached; /* attached inodes, protected by list_lock */ union { struct work_struct release_work; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 3f2480e4c5af3..42bc8cf88524e 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -497,6 +497,7 @@ static void cgwb_release_workfn(struct work_struct *work) fprop_local_destroy_percpu(&wb->memcg_completions); percpu_ref_exit(&wb->refcnt); wb_exit(wb); + WARN_ON_ONCE(!list_empty(&wb->b_attached)); kfree_rcu(wb, rcu); } @@ -573,6 +574,7 @@ static int cgwb_create(struct backing_dev_info *bdi, wb->memcg_css = memcg_css; wb->blkcg_css = blkcg_css; + INIT_LIST_HEAD(&wb->b_attached); INIT_WORK(&wb->release_work, cgwb_release_workfn); set_bit(WB_registered, &wb->state); From b3167af1807b0c9690cd61b421dc40b6406af9a0 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 28 Jun 2021 19:35:56 -0700 Subject: [PATCH 0049/5344] writeback, cgroup: split out the functional part of inode_switch_wbs_work_fn() commit 72d4512e9cb14d790e361c0e085186a7ef2d2431 upstream. Split out the functional part of the inode_switch_wbs_work_fn() function as inode_do switch_wbs() to reuse it later for switching inodes attached to dying cgwbs. This commit doesn't bring any functional changes. Link: https://lkml.kernel.org/r/20210608230225.2078447-7-guro@fb.com Signed-off-by: Roman Gushchin Reviewed-by: Jan Kara Acked-by: Tejun Heo Acked-by: Dennis Zhou Cc: Alexander Viro Cc: Dave Chinner Cc: Jan Kara Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Qi Zheng --- fs/fs-writeback.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 6c511530f744f..8df8d14844cdc 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -351,15 +351,12 @@ static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) up_write(&bdi->wb_switch_rwsem); } -static void inode_switch_wbs_work_fn(struct work_struct *work) +static void inode_do_switch_wbs(struct inode *inode, + struct bdi_writeback *new_wb) { - struct inode_switch_wbs_context *isw = - container_of(to_rcu_work(work), struct inode_switch_wbs_context, work); - struct inode *inode = isw->inode; struct backing_dev_info *bdi = inode_to_bdi(inode); struct address_space *mapping = inode->i_mapping; struct bdi_writeback *old_wb = inode->i_wb; - struct bdi_writeback *new_wb = isw->new_wb; XA_STATE(xas, &mapping->i_pages, 0); struct page *page; bool switched = false; @@ -470,11 +467,17 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) wb_wakeup(new_wb); wb_put(old_wb); } - wb_put(new_wb); +} - iput(inode); - kfree(isw); +static void inode_switch_wbs_work_fn(struct work_struct *work) +{ + struct inode_switch_wbs_context *isw = + container_of(to_rcu_work(work), struct inode_switch_wbs_context, work); + inode_do_switch_wbs(isw->inode, isw->new_wb); + wb_put(isw->new_wb); + iput(isw->inode); + kfree(isw); atomic_dec(&isw_nr_in_flight); } From b8d78afcd8d2a6d058dc1f938155f7f487ee3e37 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 28 Jun 2021 19:35:59 -0700 Subject: [PATCH 0050/5344] writeback, cgroup: support switching multiple inodes at once commit f5fbe6b7ad6ef1fbdf8074a6ca9fdab739bf86d4 upstream. Currently only a single inode can be switched to another writeback structure at once. That means to switch an inode a separate inode_switch_wbs_context structure must be allocated, and a separate rcu callback and work must be scheduled. It's fine for the existing ad-hoc switching, which is not happening that often, but sub-optimal for massive switching required in order to release a writeback structure. To prepare for it, let's add a support for switching multiple inodes at once. Instead of containing a single inode pointer, inode_switch_wbs_context will contain a NULL-terminated array of inode pointers. inode_do_switch_wbs() will be called for each inode. To optimize the locking bdi->wb_switch_rwsem, old_wb's and new_wb's list_locks will be acquired and released only once altogether for all inodes. wb_wakeup() will be also be called only once. Instead of calling wb_put(old_wb) after each successful switch, wb_put_many() is introduced and used. Link: https://lkml.kernel.org/r/20210608230225.2078447-8-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Tejun Heo Reviewed-by: Jan Kara Acked-by: Dennis Zhou Cc: Alexander Viro Cc: Dave Chinner Cc: Jan Kara Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Qi Zheng --- fs/fs-writeback.c | 106 +++++++++++++++++++------------ include/linux/backing-dev-defs.h | 18 +++++- 2 files changed, 80 insertions(+), 44 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 8df8d14844cdc..cadb77f2dd5af 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -335,10 +335,18 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) } struct inode_switch_wbs_context { - struct inode *inode; - struct bdi_writeback *new_wb; - struct rcu_work work; + + /* + * Multiple inodes can be switched at once. The switching procedure + * consists of two parts, separated by a RCU grace period. To make + * sure that the second part is executed for each inode gone through + * the first part, all inode pointers are placed into a NULL-terminated + * array embedded into struct inode_switch_wbs_context. Otherwise + * an inode could be left in a non-consistent state. + */ + struct bdi_writeback *new_wb; + struct inode *inodes[]; }; static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) @@ -351,39 +359,15 @@ static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) up_write(&bdi->wb_switch_rwsem); } -static void inode_do_switch_wbs(struct inode *inode, +static bool inode_do_switch_wbs(struct inode *inode, + struct bdi_writeback *old_wb, struct bdi_writeback *new_wb) { - struct backing_dev_info *bdi = inode_to_bdi(inode); struct address_space *mapping = inode->i_mapping; - struct bdi_writeback *old_wb = inode->i_wb; XA_STATE(xas, &mapping->i_pages, 0); struct page *page; bool switched = false; - /* - * If @inode switches cgwb membership while sync_inodes_sb() is - * being issued, sync_inodes_sb() might miss it. Synchronize. - */ - down_read(&bdi->wb_switch_rwsem); - - /* - * By the time control reaches here, RCU grace period has passed - * since I_WB_SWITCH assertion and all wb stat update transactions - * between unlocked_inode_to_wb_begin/end() are guaranteed to be - * synchronizing against the i_pages lock. - * - * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock - * gives us exclusion against all wb related operations on @inode - * including IO list manipulations and stat updates. - */ - if (old_wb < new_wb) { - spin_lock(&old_wb->list_lock); - spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING); - } else { - spin_lock(&new_wb->list_lock); - spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); - } spin_lock(&inode->i_lock); xa_lock_irq(&mapping->i_pages); @@ -458,25 +442,63 @@ static void inode_do_switch_wbs(struct inode *inode, xa_unlock_irq(&mapping->i_pages); spin_unlock(&inode->i_lock); - spin_unlock(&new_wb->list_lock); - spin_unlock(&old_wb->list_lock); - - up_read(&bdi->wb_switch_rwsem); - if (switched) { - wb_wakeup(new_wb); - wb_put(old_wb); - } + return switched; } static void inode_switch_wbs_work_fn(struct work_struct *work) { struct inode_switch_wbs_context *isw = container_of(to_rcu_work(work), struct inode_switch_wbs_context, work); + struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]); + struct bdi_writeback *old_wb = isw->inodes[0]->i_wb; + struct bdi_writeback *new_wb = isw->new_wb; + unsigned long nr_switched = 0; + struct inode **inodep; + + /* + * If @inode switches cgwb membership while sync_inodes_sb() is + * being issued, sync_inodes_sb() might miss it. Synchronize. + */ + down_read(&bdi->wb_switch_rwsem); + + /* + * By the time control reaches here, RCU grace period has passed + * since I_WB_SWITCH assertion and all wb stat update transactions + * between unlocked_inode_to_wb_begin/end() are guaranteed to be + * synchronizing against the i_pages lock. + * + * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock + * gives us exclusion against all wb related operations on @inode + * including IO list manipulations and stat updates. + */ + if (old_wb < new_wb) { + spin_lock(&old_wb->list_lock); + spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING); + } else { + spin_lock(&new_wb->list_lock); + spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); + } + + for (inodep = isw->inodes; *inodep; inodep++) { + WARN_ON_ONCE((*inodep)->i_wb != old_wb); + if (inode_do_switch_wbs(*inodep, old_wb, new_wb)) + nr_switched++; + } + + spin_unlock(&new_wb->list_lock); + spin_unlock(&old_wb->list_lock); + + up_read(&bdi->wb_switch_rwsem); + + if (nr_switched) { + wb_wakeup(new_wb); + wb_put_many(old_wb, nr_switched); + } - inode_do_switch_wbs(isw->inode, isw->new_wb); - wb_put(isw->new_wb); - iput(isw->inode); + for (inodep = isw->inodes; *inodep; inodep++) + iput(*inodep); + wb_put(new_wb); kfree(isw); atomic_dec(&isw_nr_in_flight); } @@ -503,7 +525,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT) return; - isw = kzalloc(sizeof(*isw), GFP_ATOMIC); + isw = kzalloc(sizeof(*isw) + 2 * sizeof(struct inode *), GFP_ATOMIC); if (!isw) return; @@ -535,7 +557,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) __iget(inode); spin_unlock(&inode->i_lock); - isw->inode = inode; + isw->inodes[0] = inode; /* * In addition to synchronizing among switchers, I_WB_SWITCH tells diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 95bf05d137783..daab20f84dcaa 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -280,8 +280,9 @@ static inline void wb_get(struct bdi_writeback *wb) /** * wb_put - decrement a wb's refcount * @wb: bdi_writeback to put + * @nr: number of references to put */ -static inline void wb_put(struct bdi_writeback *wb) +static inline void wb_put_many(struct bdi_writeback *wb, unsigned long nr) { if (WARN_ON_ONCE(!wb->bdi)) { /* @@ -292,7 +293,16 @@ static inline void wb_put(struct bdi_writeback *wb) } if (wb != &wb->bdi->wb) - percpu_ref_put(&wb->refcnt); + percpu_ref_put_many(&wb->refcnt, nr); +} + +/** + * wb_put - decrement a wb's refcount + * @wb: bdi_writeback to put + */ +static inline void wb_put(struct bdi_writeback *wb) +{ + wb_put_many(wb, 1); } /** @@ -321,6 +331,10 @@ static inline void wb_put(struct bdi_writeback *wb) { } +static inline void wb_put_many(struct bdi_writeback *wb, unsigned long nr) +{ +} + static inline bool wb_dying(struct bdi_writeback *wb) { return false; From dbd576b79b334dc86d0d667f475723809a1c42a5 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 28 Jun 2021 19:36:03 -0700 Subject: [PATCH 0051/5344] writeback, cgroup: release dying cgwbs by switching attached inodes commit c22d70a162d3cc177282c4487be4d54876ca55c8 upstream. Asynchronously try to release dying cgwbs by switching attached inodes to the nearest living ancestor wb. It helps to get rid of per-cgroup writeback structures themselves and of pinned memory and block cgroups, which are significantly larger structures (mostly due to large per-cpu statistics data). This prevents memory waste and helps to avoid different scalability problems caused by large piles of dying cgroups. Reuse the existing mechanism of inode switching used for foreign inode detection. To speed things up batch up to 115 inode switching in a single operation (the maximum number is selected so that the resulting struct inode_switch_wbs_context can fit into 1024 bytes). Because every switching consists of two steps divided by an RCU grace period, it would be too slow without batching. Please note that the whole batch counts as a single operation (when increasing/decreasing isw_nr_in_flight). This allows to keep umounting working (flush the switching queue), however prevents cleanups from consuming the whole switching quota and effectively blocking the frn switching. A cgwb cleanup operation can fail due to different reasons (e.g. not enough memory, the cgwb has an in-flight/pending io, an attached inode in a wrong state, etc). In this case the next scheduled cleanup will make a new attempt. An attempt is made each time a new cgwb is offlined (in other words a memcg and/or a blkcg is deleted by a user). In the future an additional attempt scheduled by a timer can be implemented. [guro@fb.com: replace open-coded "115" with arithmetic] Link: https://lkml.kernel.org/r/YMEcSBcq/VXMiPPO@carbon.dhcp.thefacebook.com [guro@fb.com: add smp_mb() to inode_prepare_wbs_switch()] Link: https://lkml.kernel.org/r/YMFa+guFw7OFjf3X@carbon.dhcp.thefacebook.com [willy@infradead.org: fix documentation] Link: https://lkml.kernel.org/r/20210615200242.1716568-2-willy@infradead.org Link: https://lkml.kernel.org/r/20210608230225.2078447-9-guro@fb.com Signed-off-by: Roman Gushchin Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Tejun Heo Acked-by: Dennis Zhou Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Dave Chinner Cc: Jan Kara Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Qi Zheng --- fs/fs-writeback.c | 111 ++++++++++++++++++++++++++++--- include/linux/backing-dev-defs.h | 1 + include/linux/writeback.h | 1 + mm/backing-dev.c | 64 +++++++++++++++++- 4 files changed, 164 insertions(+), 13 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index cadb77f2dd5af..7b279e75572d2 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -225,6 +225,13 @@ void wb_wait_for_completion(struct wb_completion *done) /* one round can affect upto 5 slots */ #define WB_FRN_MAX_IN_FLIGHT 1024 /* don't queue too many concurrently */ +/* + * Maximum inodes per isw. A specific value has been chosen to make + * struct inode_switch_wbs_context fit into 1024 bytes kmalloc. + */ +#define WB_MAX_INODES_PER_ISW ((1024UL - sizeof(struct inode_switch_wbs_context)) \ + / sizeof(struct inode *)) + static atomic_t isw_nr_in_flight = ATOMIC_INIT(0); static struct workqueue_struct *isw_wq; @@ -503,6 +510,32 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) atomic_dec(&isw_nr_in_flight); } +static bool inode_prepare_wbs_switch(struct inode *inode, + struct bdi_writeback *new_wb) +{ + /* + * Paired with smp_mb() in cgroup_writeback_umount(). + * isw_nr_in_flight must be increased before checking SB_ACTIVE and + * grabbing an inode, otherwise isw_nr_in_flight can be observed as 0 + * in cgroup_writeback_umount() and the isw_wq will be not flushed. + */ + smp_mb(); + + /* while holding I_WB_SWITCH, no one else can update the association */ + spin_lock(&inode->i_lock); + if (!(inode->i_sb->s_flags & SB_ACTIVE) || + inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) || + inode_to_wb(inode) == new_wb) { + spin_unlock(&inode->i_lock); + return false; + } + inode->i_state |= I_WB_SWITCH; + __iget(inode); + spin_unlock(&inode->i_lock); + + return true; +} + /** * inode_switch_wbs - change the wb association of an inode * @inode: target inode @@ -545,17 +578,8 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) if (!isw->new_wb) goto out_free; - /* while holding I_WB_SWITCH, no one else can update the association */ - spin_lock(&inode->i_lock); - if (!(inode->i_sb->s_flags & SB_ACTIVE) || - inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) || - inode_to_wb(inode) == isw->new_wb) { - spin_unlock(&inode->i_lock); + if (!inode_prepare_wbs_switch(inode, isw->new_wb)) goto out_free; - } - inode->i_state |= I_WB_SWITCH; - __iget(inode); - spin_unlock(&inode->i_lock); isw->inodes[0] = inode; @@ -576,6 +600,73 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) kfree(isw); } +/** + * cleanup_offline_cgwb - detach associated inodes + * @wb: target wb + * + * Switch all inodes attached to @wb to a nearest living ancestor's wb in order + * to eventually release the dying @wb. Returns %true if not all inodes were + * switched and the function has to be restarted. + */ +bool cleanup_offline_cgwb(struct bdi_writeback *wb) +{ + struct cgroup_subsys_state *memcg_css; + struct inode_switch_wbs_context *isw; + struct inode *inode; + int nr; + bool restart = false; + + isw = kzalloc(sizeof(*isw) + WB_MAX_INODES_PER_ISW * + sizeof(struct inode *), GFP_KERNEL); + if (!isw) + return restart; + + atomic_inc(&isw_nr_in_flight); + + for (memcg_css = wb->memcg_css->parent; memcg_css; + memcg_css = memcg_css->parent) { + isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL); + if (isw->new_wb) + break; + } + if (unlikely(!isw->new_wb)) + isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */ + + nr = 0; + spin_lock(&wb->list_lock); + list_for_each_entry(inode, &wb->b_attached, i_io_list) { + if (!inode_prepare_wbs_switch(inode, isw->new_wb)) + continue; + + isw->inodes[nr++] = inode; + + if (nr >= WB_MAX_INODES_PER_ISW - 1) { + restart = true; + break; + } + } + spin_unlock(&wb->list_lock); + + /* no attached inodes? bail out */ + if (nr == 0) { + atomic_dec(&isw_nr_in_flight); + wb_put(isw->new_wb); + kfree(isw); + return restart; + } + + /* + * In addition to synchronizing among switchers, I_WB_SWITCH tells + * the RCU protected stat update paths to grab the i_page + * lock so that stat transfer can synchronize against them. + * Let's continue after I_WB_SWITCH is guaranteed to be visible. + */ + INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); + queue_rcu_work(isw_wq, &isw->work); + + return restart; +} + /** * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it * @wbc: writeback_control of interest diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index daab20f84dcaa..edbbca35f4c3d 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -178,6 +178,7 @@ struct bdi_writeback { struct list_head memcg_node; /* anchored at memcg->cgwb_list */ struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */ struct list_head b_attached; /* attached inodes, protected by list_lock */ + struct list_head offline_node; /* anchored at offline_cgwbs */ union { struct work_struct release_work; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a19d845dd7eb9..718966969ef61 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -220,6 +220,7 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr_pages, enum wb_reason reason, struct wb_completion *done); void cgroup_writeback_umount(void); +bool cleanup_offline_cgwb(struct bdi_writeback *wb); /** * inode_attach_wb - associate an inode with its wb diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 42bc8cf88524e..0646c04af5dae 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -385,9 +385,8 @@ static void wb_exit(struct bdi_writeback *wb) #include /* - * cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree, - * blkcg->cgwb_list, and memcg->cgwb_list. bdi->cgwb_tree is also RCU - * protected. + * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, offline_cgwbs and + * memcg->cgwb_list. bdi->cgwb_tree is also RCU protected. */ static DEFINE_SPINLOCK(cgwb_lock); static struct workqueue_struct *cgwb_release_wq; @@ -477,6 +476,9 @@ void wb_congested_put(struct bdi_writeback_congested *congested) spin_unlock_irqrestore(&cgwb_lock, flags); kfree(congested); } +static LIST_HEAD(offline_cgwbs); +static void cleanup_offline_cgwbs_workfn(struct work_struct *work); +static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn); static void cgwb_release_workfn(struct work_struct *work) { @@ -496,6 +498,11 @@ static void cgwb_release_workfn(struct work_struct *work) fprop_local_destroy_percpu(&wb->memcg_completions); percpu_ref_exit(&wb->refcnt); + + spin_lock_irq(&cgwb_lock); + list_del(&wb->offline_node); + spin_unlock_irq(&cgwb_lock); + wb_exit(wb); WARN_ON_ONCE(!list_empty(&wb->b_attached)); kfree_rcu(wb, rcu); @@ -515,6 +522,7 @@ static void cgwb_kill(struct bdi_writeback *wb) WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id)); list_del(&wb->memcg_node); list_del(&wb->blkcg_node); + list_add(&wb->offline_node, &offline_cgwbs); percpu_ref_kill(&wb->refcnt); } @@ -737,6 +745,54 @@ static void cgwb_bdi_unregister(struct backing_dev_info *bdi) mutex_unlock(&bdi->cgwb_release_mutex); } +/* + * cleanup_offline_cgwbs_workfn - try to release dying cgwbs + * + * Try to release dying cgwbs by switching attached inodes to the nearest + * living ancestor's writeback. Processed wbs are placed at the end + * of the list to guarantee the forward progress. + */ +static void cleanup_offline_cgwbs_workfn(struct work_struct *work) +{ + struct bdi_writeback *wb; + LIST_HEAD(processed); + + spin_lock_irq(&cgwb_lock); + + while (!list_empty(&offline_cgwbs)) { + wb = list_first_entry(&offline_cgwbs, struct bdi_writeback, + offline_node); + list_move(&wb->offline_node, &processed); + + /* + * If wb is dirty, cleaning up the writeback by switching + * attached inodes will result in an effective removal of any + * bandwidth restrictions, which isn't the goal. Instead, + * it can be postponed until the next time, when all io + * will be likely completed. If in the meantime some inodes + * will get re-dirtied, they should be eventually switched to + * a new cgwb. + */ + if (wb_has_dirty_io(wb)) + continue; + + if (!wb_tryget(wb)) + continue; + + spin_unlock_irq(&cgwb_lock); + while (cleanup_offline_cgwb(wb)) + cond_resched(); + spin_lock_irq(&cgwb_lock); + + wb_put(wb); + } + + if (!list_empty(&processed)) + list_splice_tail(&processed, &offline_cgwbs); + + spin_unlock_irq(&cgwb_lock); +} + /** * wb_memcg_offline - kill all wb's associated with a memcg being offlined * @memcg: memcg being offlined @@ -753,6 +809,8 @@ void wb_memcg_offline(struct mem_cgroup *memcg) cgwb_kill(wb); memcg_cgwb_list->next = NULL; /* prevent new wb's */ spin_unlock_irq(&cgwb_lock); + + queue_work(system_unbound_wq, &cleanup_offline_cgwbs_work); } /** From f4781d455f01070343f22b23a00426a8e437dc49 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 23 Jul 2021 15:50:29 -0700 Subject: [PATCH 0052/5344] writeback, cgroup: remove wb from offline list before releasing refcnt commit b43a9e76b4cc78cdaa8c809dd31cd452797b7661 upstream. Boyang reported that the commit c22d70a162d3 ("writeback, cgroup: release dying cgwbs by switching attached inodes") causes the kernel to crash while running xfstests generic/256 on ext4 on aarch64 and ppc64le. run fstests generic/256 at 2021-07-12 05:41:40 EXT4-fs (vda3): mounted filesystem with ordered data mode. Opts: . Quota mode: none. Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 Mem abort info: ESR = 0x96000005 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x05: level 1 translation fault Data abort info: ISV = 0, ISS = 0x00000005 CM = 0, WnR = 0 user pgtable: 64k pages, 48-bit VAs, pgdp=00000000b0502000 [0000000000000000] pgd=0000000000000000, p4d=0000000000000000, pud=0000000000000000 Internal error: Oops: 96000005 [#1] SMP Modules linked in: dm_flakey dm_snapshot dm_bufio dm_zero dm_mod loop tls rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace fscache netfs rfkill sunrpc ext4 vfat fat mbcache jbd2 drm fuse xfs libcrc32c crct10dif_ce ghash_ce sha2_ce sha256_arm64 sha1_ce virtio_blk virtio_net net_failover virtio_console failover virtio_mmio aes_neon_bs [last unloaded: scsi_debug] CPU: 0 PID: 408468 Comm: kworker/u8:5 Tainted: G X --------- --- 5.14.0-0.rc1.15.bx.el9.aarch64 #1 Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 Workqueue: events_unbound cleanup_offline_cgwbs_workfn pstate: 004000c5 (nzcv daIF +PAN -UAO -TCO BTYPE=--) pc : cleanup_offline_cgwbs_workfn+0x320/0x394 lr : cleanup_offline_cgwbs_workfn+0xe0/0x394 sp : ffff80001554fd10 x29: ffff80001554fd10 x28: 0000000000000000 x27: 0000000000000001 x26: 0000000000000000 x25: 00000000000000e0 x24: ffffd2a2fbe671a8 x23: ffff80001554fd88 x22: ffffd2a2fbe67198 x21: ffffd2a2fc25a730 x20: ffff210412bc3000 x19: ffff210412bc3280 x18: 0000000000000000 x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000 x14: 0000000000000000 x13: 0000000000000030 x12: 0000000000000040 x11: ffff210481572238 x10: ffff21048157223a x9 : ffffd2a2fa276c60 x8 : ffff210484106b60 x7 : 0000000000000000 x6 : 000000000007d18a x5 : ffff210416a86400 x4 : ffff210412bc0280 x3 : 0000000000000000 x2 : ffff80001554fd88 x1 : ffff210412bc0280 x0 : 0000000000000003 Call trace: cleanup_offline_cgwbs_workfn+0x320/0x394 process_one_work+0x1f4/0x4b0 worker_thread+0x184/0x540 kthread+0x114/0x120 ret_from_fork+0x10/0x18 Code: d63f0020 97f99963 17ffffa6 f8588263 (f9400061) ---[ end trace e250fe289272792a ]--- Kernel panic - not syncing: Oops: Fatal exception SMP: stopping secondary CPUs SMP: failed to stop secondary CPUs 0-2 Kernel Offset: 0x52a2e9fa0000 from 0xffff800010000000 PHYS_OFFSET: 0xfff0defca0000000 CPU features: 0x00200251,23200840 Memory Limit: none ---[ end Kernel panic - not syncing: Oops: Fatal exception ]--- The problem happens when cgwb_release_workfn() races with cleanup_offline_cgwbs_workfn(): wb_tryget() in cleanup_offline_cgwbs_workfn() can be called after percpu_ref_exit() is cgwb_release_workfn(), which is basically a use-after-free error. Fix the problem by making removing the writeback structure from the offline list before releasing the percpu reference counter. It will guarantee that cleanup_offline_cgwbs_workfn() will not see and not access writeback structures which are about to be released. Link: https://lkml.kernel.org/r/20210716201039.3762203-1-guro@fb.com Fixes: c22d70a162d3 ("writeback, cgroup: release dying cgwbs by switching attached inodes") Signed-off-by: Roman Gushchin Reported-by: Boyang Xue Suggested-by: Jan Kara Tested-by: Darrick J. Wong Cc: Will Deacon Cc: Dave Chinner Cc: Murphy Zhou Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Qi Zheng --- mm/backing-dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 0646c04af5dae..44c9abf56c9d1 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -497,12 +497,12 @@ static void cgwb_release_workfn(struct work_struct *work) blkcg_cgwb_put(blkcg); fprop_local_destroy_percpu(&wb->memcg_completions); - percpu_ref_exit(&wb->refcnt); spin_lock_irq(&cgwb_lock); list_del(&wb->offline_node); spin_unlock_irq(&cgwb_lock); + percpu_ref_exit(&wb->refcnt); wb_exit(wb); WARN_ON_ONCE(!list_empty(&wb->b_attached)); kfree_rcu(wb, rcu); From 7183208ec18b6b8152ddcfbfc57dceb4619174cb Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 23 Jul 2021 15:50:32 -0700 Subject: [PATCH 0053/5344] writeback, cgroup: do not reparent dax inodes commit 593311e85b26ecc6e4d45b6fb81b942b6672df09 upstream. The inode switching code is not suited for dax inodes. An attempt to switch a dax inode to a parent writeback structure (as a part of a writeback cleanup procedure) results in a panic like this: run fstests generic/270 at 2021-07-15 05:54:02 XFS (pmem0p2): EXPERIMENTAL big timestamp feature in use. Use at your own risk! XFS (pmem0p2): DAX enabled. Warning: EXPERIMENTAL, use at your own risk XFS (pmem0p2): EXPERIMENTAL inode btree counters feature in use. Use at your own risk! XFS (pmem0p2): Mounting V5 Filesystem XFS (pmem0p2): Ending clean mount XFS (pmem0p2): Quotacheck needed: Please wait. XFS (pmem0p2): Quotacheck: Done. XFS (pmem0p2): xlog_verify_grant_tail: space > BBTOB(tail_blocks) XFS (pmem0p2): xlog_verify_grant_tail: space > BBTOB(tail_blocks) XFS (pmem0p2): xlog_verify_grant_tail: space > BBTOB(tail_blocks) BUG: unable to handle page fault for address: 0000000005b0f669 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] SMP PTI CPU: 13 PID: 10479 Comm: kworker/13:16 Not tainted 5.14.0-rc1-master-8096acd7442e+ #8 Hardware name: HP ProLiant DL360 Gen9/ProLiant DL360 Gen9, BIOS P89 09/13/2016 Workqueue: inode_switch_wbs inode_switch_wbs_work_fn RIP: 0010:inode_do_switch_wbs+0xaf/0x470 Code: 00 30 0f 85 c1 03 00 00 0f 1f 44 00 00 31 d2 48 c7 c6 ff ff ff ff 48 8d 7c 24 08 e8 eb 49 1a 00 48 85 c0 74 4a bb ff ff ff ff <48> 8b 50 08 48 8d 4a ff 83 e2 01 48 0f 45 c1 48 8b 00 a8 08 0f 85 RSP: 0018:ffff9c66691abdc8 EFLAGS: 00010002 RAX: 0000000005b0f661 RBX: 00000000ffffffff RCX: ffff89e6a21382b0 RDX: 0000000000000001 RSI: ffff89e350230248 RDI: ffffffffffffffff RBP: ffff89e681d19400 R08: 0000000000000000 R09: 0000000000000228 R10: ffffffffffffffff R11: ffffffffffffffc0 R12: ffff89e6a2138130 R13: ffff89e316af7400 R14: ffff89e316af6e78 R15: ffff89e6a21382b0 FS: 0000000000000000(0000) GS:ffff89ee5fb40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000005b0f669 CR3: 0000000cb2410004 CR4: 00000000001706e0 Call Trace: inode_switch_wbs_work_fn+0xb6/0x2a0 process_one_work+0x1e6/0x380 worker_thread+0x53/0x3d0 kthread+0x10f/0x130 ret_from_fork+0x22/0x30 Modules linked in: xt_CHECKSUM xt_MASQUERADE xt_conntrack ipt_REJECT nf_reject_ipv4 nft_compat nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 nft_counter nf_tables nfnetlink bridge stp llc rfkill sunrpc intel_rapl_msr intel_rapl_common sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel ipmi_ssif kvm mgag200 i2c_algo_bit iTCO_wdt irqbypass drm_kms_helper iTCO_vendor_support acpi_ipmi rapl syscopyarea sysfillrect intel_cstate ipmi_si sysimgblt ioatdma dax_pmem_compat fb_sys_fops ipmi_devintf device_dax i2c_i801 pcspkr intel_uncore hpilo nd_pmem cec dax_pmem_core dca i2c_smbus acpi_tad lpc_ich ipmi_msghandler acpi_power_meter drm fuse xfs libcrc32c sd_mod t10_pi crct10dif_pclmul crc32_pclmul crc32c_intel tg3 ghash_clmulni_intel serio_raw hpsa hpwdt scsi_transport_sas wmi dm_mirror dm_region_hash dm_log dm_mod CR2: 0000000005b0f669 ---[ end trace ed2105faff8384f3 ]--- RIP: 0010:inode_do_switch_wbs+0xaf/0x470 Code: 00 30 0f 85 c1 03 00 00 0f 1f 44 00 00 31 d2 48 c7 c6 ff ff ff ff 48 8d 7c 24 08 e8 eb 49 1a 00 48 85 c0 74 4a bb ff ff ff ff <48> 8b 50 08 48 8d 4a ff 83 e2 01 48 0f 45 c1 48 8b 00 a8 08 0f 85 RSP: 0018:ffff9c66691abdc8 EFLAGS: 00010002 RAX: 0000000005b0f661 RBX: 00000000ffffffff RCX: ffff89e6a21382b0 RDX: 0000000000000001 RSI: ffff89e350230248 RDI: ffffffffffffffff RBP: ffff89e681d19400 R08: 0000000000000000 R09: 0000000000000228 R10: ffffffffffffffff R11: ffffffffffffffc0 R12: ffff89e6a2138130 R13: ffff89e316af7400 R14: ffff89e316af6e78 R15: ffff89e6a21382b0 FS: 0000000000000000(0000) GS:ffff89ee5fb40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000005b0f669 CR3: 0000000cb2410004 CR4: 00000000001706e0 Kernel panic - not syncing: Fatal exception Kernel Offset: 0x15200000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) ---[ end Kernel panic - not syncing: Fatal exception ]--- The crash happens on an attempt to iterate over attached pagecache pages and check the dirty flag: a dax inode's xarray contains pfn's instead of generic struct page pointers. This happens for DAX and not for other kinds of non-page entries in the inodes because it's a tagged iteration, and shadow/swap entries are never tagged; only DAX entries get tagged. Fix the problem by bailing out (with the false return value) of inode_prepare_sbs_switch() if a dax inode is passed. [willy@infradead.org: changelog addition] Link: https://lkml.kernel.org/r/20210719171350.3876830-1-guro@fb.com Fixes: c22d70a162d3 ("writeback, cgroup: release dying cgwbs by switching attached inodes") Signed-off-by: Roman Gushchin Reported-by: Murphy Zhou Reported-by: Darrick J. Wong Tested-by: Darrick J. Wong Tested-by: Murphy Zhou Acked-by: Matthew Wilcox (Oracle) Cc: Jan Kara Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Qi Zheng --- fs/fs-writeback.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 7b279e75572d2..1e65cf323faf5 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -521,6 +521,9 @@ static bool inode_prepare_wbs_switch(struct inode *inode, */ smp_mb(); + if (IS_DAX(inode)) + return false; + /* while holding I_WB_SWITCH, no one else can update the association */ spin_lock(&inode->i_lock); if (!(inode->i_sb->s_flags & SB_ACTIVE) || From 2532b11ff056b1a46cc5f8877cdbf63e859cef6f Mon Sep 17 00:00:00 2001 From: Luca Stefani Date: Wed, 5 Aug 2020 11:57:08 +0200 Subject: [PATCH 0054/5344] RAS/CEC: Fix cec_init() prototype commit 85e6084e0b436cabe9c909e679937998ffbf9c9d upstream. late_initcall() expects a function that returns an integer. Update the function signature to match. [ bp: Massage commit message into proper sentences. ] Fixes: 9554bfe403bd ("x86/mce: Convert the CEC to use the MCE notifier") Signed-off-by: Luca Stefani Signed-off-by: Borislav Petkov Reviewed-by: Sami Tolvanen Tested-by: Sami Tolvanen Link: https://lkml.kernel.org/r/20200805095708.83939-1-luca.stefani.ge1@gmail.com Signed-off-by: Muchun Song --- drivers/ras/cec.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c index 20191e785467a..5352b7f335e45 100644 --- a/drivers/ras/cec.c +++ b/drivers/ras/cec.c @@ -562,20 +562,20 @@ static struct notifier_block cec_nb = { .priority = MCE_PRIO_CEC, }; -static void __init cec_init(void) +static int __init cec_init(void) { if (ce_arr.disabled) - return; + return -ENODEV; ce_arr.array = (void *)get_zeroed_page(GFP_KERNEL); if (!ce_arr.array) { pr_err("Error allocating CE array page!\n"); - return; + return -ENOMEM; } if (create_debugfs_nodes()) { free_page((unsigned long)ce_arr.array); - return; + return -ENOMEM; } INIT_DELAYED_WORK(&cec_work, cec_work_fn); @@ -584,6 +584,7 @@ static void __init cec_init(void) mce_register_decode_chain(&cec_nb); pr_info("Correctable Errors collector initialized.\n"); + return 0; } late_initcall(cec_init); From 7236d2e8377b8cd8aa8b192d154ea42feae22de9 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 15 Apr 2020 12:58:26 -0700 Subject: [PATCH 0055/5344] x86/mce: Drop bogus comment about mce.kflags commit f82cdff1aa7f6dcf6625c7219342a6e5c977098d upstream. The bit definitions for kflags are for internal use only. A late edit moved them from uapi/asm/mce.h to the internal x86 , but the comment saying "See below" was accidentally left here. Delete "See below". Just labelling this field as internal kernel use is sufficient. Fixes: 1de08dccd383 ("x86/mce: Add a struct mce.kflags field") Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200415195826.GA13681@agluck-desk2.amr.corp.intel.com Signed-off-by: Muchun Song --- arch/x86/include/uapi/asm/mce.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h index 5b59d80f1d4e1..db9adc081c5af 100644 --- a/arch/x86/include/uapi/asm/mce.h +++ b/arch/x86/include/uapi/asm/mce.h @@ -35,7 +35,7 @@ struct mce { __u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */ __u64 ppin; /* Protected Processor Inventory Number */ __u32 microcode; /* Microcode revision */ - __u64 kflags; /* Internal kernel use. See below */ + __u64 kflags; /* Internal kernel use */ }; #define MCE_GET_RECORD_LEN _IOR('M', 1, int) From 0673dc122b3d712b4fd91d48808ff3f9f02a0a73 Mon Sep 17 00:00:00 2001 From: Zhengjun Xing Date: Thu, 23 Dec 2021 22:48:26 +0800 Subject: [PATCH 0056/5344] perf/x86/intel/uncore: Fix CAS_COUNT_WRITE issue for ICX MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 96fd2e89fba1aaada6f4b1e5d25a9d9ecbe1943d upstream. The user recently report a perf issue in the ICX platform, when test by perf event “uncore_imc_x/cas_count_write”,the write bandwidth is always very small (only 0.38MB/s), it is caused by the wrong "umask" for the "cas_count_write" event. When double-checking, find "cas_count_read" also is wrong. The public document for ICX uncore: 3rd Gen Intel® Xeon® Processor Scalable Family, Codename Ice Lake,Uncore Performance Monitoring Reference Manual, Revision 1.00, May 2021 On 2.4.7, it defines Unit Masks for CAS_COUNT: RD b00001111 WR b00110000 So corrected both "cas_count_read" and "cas_count_write" for ICX. Old settings: hswep_uncore_imc_events INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03") INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c") New settings: snr_uncore_imc_events INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x0f") INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x30") Fixes: 2b3b76b5ec67 ("perf/x86/intel/uncore: Add Ice Lake server uncore support") Signed-off-by: Zhengjun Xing Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Adrian Hunter Reviewed-by: Kan Liang Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20211223144826.841267-1-zhengjun.xing@linux.intel.com Signed-off-by: Muchun Song --- arch/x86/events/intel/uncore_snbep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 0aa40798252b6..684dd0a1831ea 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -4972,7 +4972,7 @@ static struct intel_uncore_type icx_uncore_imc = { .fixed_ctr_bits = 48, .fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR, .fixed_ctl = SNR_IMC_MMIO_PMON_FIXED_CTL, - .event_descs = hswep_uncore_imc_events, + .event_descs = snr_uncore_imc_events, .perf_ctr = SNR_IMC_MMIO_PMON_CTR0, .event_ctl = SNR_IMC_MMIO_PMON_CTL0, .event_mask = SNBEP_PMON_RAW_EVENT_MASK, From 7acdd0672da7d80b91bbfaffd569c55e759d20c5 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 26 Aug 2021 08:32:39 -0700 Subject: [PATCH 0057/5344] perf/x86/intel/uncore: Fix Intel ICX IIO event constraints commit f42e8a603c88f72bf047a710b9fc1d3579f31e71 upstream. According to the latest uncore document, both NUM_OUTSTANDING_REQ_OF_CPU (0x88) event and COMP_BUF_OCCUPANCY(0xd5) event also have constraints. Add them into the event constraints table. Fixes: 2b3b76b5ec67 ("perf/x86/intel/uncore: Add Ice Lake server uncore support") Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1629991963-102621-4-git-send-email-kan.liang@linux.intel.com Signed-off-by: Muchun Song --- arch/x86/events/intel/uncore_snbep.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 684dd0a1831ea..d543ec7897ecf 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -4638,8 +4638,10 @@ static struct event_constraint icx_uncore_iio_constraints[] = { UNCORE_EVENT_CONSTRAINT(0x02, 0x3), UNCORE_EVENT_CONSTRAINT(0x03, 0x3), UNCORE_EVENT_CONSTRAINT(0x83, 0x3), + UNCORE_EVENT_CONSTRAINT(0x88, 0xc), UNCORE_EVENT_CONSTRAINT(0xc0, 0xc), UNCORE_EVENT_CONSTRAINT(0xc5, 0xc), + UNCORE_EVENT_CONSTRAINT(0xd5, 0xc), EVENT_CONSTRAINT_END }; From ce41823aa5e6f7a1636c244e7d1ffc50b239f3a4 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 26 Aug 2021 08:32:37 -0700 Subject: [PATCH 0058/5344] perf/x86/intel/uncore: Support extra IMC channel on Ice Lake server commit 496a18f09374ad89b3ab4366019bc3975db90234 upstream. There are three channels on a Ice Lake server, but only two channels will ever be active. Current perf only enables two channels. Support the extra IMC channel, which may be activated on some Ice Lake machines. For a non-activated channel, the SW can still access it. The write will be ignored by the HW. 0 is always returned for the reading. Fixes: 2b3b76b5ec67 ("perf/x86/intel/uncore: Add Ice Lake server uncore support") Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1629991963-102621-2-git-send-email-kan.liang@linux.intel.com Signed-off-by: Muchun Song --- arch/x86/events/intel/uncore_snbep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index d543ec7897ecf..ed967c47fd9a4 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -415,7 +415,7 @@ #define ICX_M3UPI_PCI_PMON_BOX_CTL 0xa0 /* ICX IMC */ -#define ICX_NUMBER_IMC_CHN 2 +#define ICX_NUMBER_IMC_CHN 3 #define ICX_IMC_MEM_STRIDE 0x4 DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); @@ -4969,7 +4969,7 @@ static struct intel_uncore_ops icx_uncore_mmio_ops = { static struct intel_uncore_type icx_uncore_imc = { .name = "imc", .num_counters = 4, - .num_boxes = 8, + .num_boxes = 12, .perf_ctr_bits = 48, .fixed_ctr_bits = 48, .fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR, From 9875f4af38c7d2eb622b2e53b194f1da2c771821 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 1 Jun 2021 06:09:03 -0700 Subject: [PATCH 0059/5344] perf/x86/intel/uncore: Fix M2M event umask for Ice Lake server commit 848ff3768684701a4ce73a2ec0e5d438d4e2b0da upstream. Perf tool errors out with the latest event list for the Ice Lake server. event syntax error: 'unc_m2m_imc_reads.to_pmm' \___ value too big for format, maximum is 255 The same as the Snow Ridge server, the M2M uncore unit in the Ice Lake server has the unit mask extension field as well. Fixes: 2b3b76b5ec67 ("perf/x86/intel/uncore: Add Ice Lake server uncore support") Reported-by: Jin Yao Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1622552943-119174-1-git-send-email-kan.liang@linux.intel.com Signed-off-by: Muchun Song --- arch/x86/events/intel/uncore_snbep.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index ed967c47fd9a4..93d729726f303 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -4811,9 +4811,10 @@ static struct intel_uncore_type icx_uncore_m2m = { .perf_ctr = SNR_M2M_PCI_PMON_CTR0, .event_ctl = SNR_M2M_PCI_PMON_CTL0, .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .event_mask_ext = SNR_M2M_PCI_PMON_UMASK_EXT, .box_ctl = SNR_M2M_PCI_PMON_BOX_CTL, .ops = &snr_m2m_uncore_pci_ops, - .format_group = &skx_uncore_format_group, + .format_group = &snr_m2m_uncore_format_group, }; static struct attribute *icx_upi_uncore_formats_attr[] = { From 75468ee5f9abc37463e1020af864842bd106fca6 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Sun, 25 Oct 2020 00:29:53 +0800 Subject: [PATCH 0060/5344] intel_idle: Fix max_cstate for processor models without C-state tables commit 4e0ba5577dba686f96c1c10ef4166380667fdec7 upstream. Currently intel_idle driver gets the c-state information from ACPI _CST if the processor model is not recognized by it. However the c-state in _CST starts with index 1 which is different from the index in intel_idle driver's internal c-state table. While intel_idle_max_cstate_reached() was previously introduced to deal with intel_idle driver's internal c-state table, re-using this function directly on _CST is incorrect. Fix this by subtracting 1 from the index when checking max_cstate in the _CST case. For example, append intel_idle.max_cstate=1 in boot command line, Before the patch: grep . /sys/devices/system/cpu/cpu0/cpuidle/state*/name POLL After the patch: grep . /sys/devices/system/cpu/cpu0/cpuidle/state*/name /sys/devices/system/cpu/cpu0/cpuidle/state0/name:POLL /sys/devices/system/cpu/cpu0/cpuidle/state1/name:C1_ACPI Fixes: 18734958e9bf ("intel_idle: Use ACPI _CST for processor models without C-state tables") Reported-by: Pengfei Xu Cc: 5.6+ # 5.6+ Signed-off-by: Chen Yu [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki Signed-off-by: Muchun Song --- drivers/idle/intel_idle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 4b0ad1864d76e..74a76614df23d 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1280,7 +1280,7 @@ static void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) struct acpi_processor_cx *cx; struct cpuidle_state *state; - if (intel_idle_max_cstate_reached(cstate)) + if (intel_idle_max_cstate_reached(cstate - 1)) break; cx = &acpi_state_table.states[cstate]; From 18155b31a51e63f900e5500f2df04fdf01707e60 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 16 Oct 2020 17:28:32 +0200 Subject: [PATCH 0061/5344] intel_idle: Ignore _CST if control cannot be taken from the platform commit 75af76d0a34e048651a6af311781d7206b6964c7 upstream. e6d4f08a6776 ("intel_idle: Use ACPI _CST on server systems") avoids enabling c-states that have been disabled by the platform with the exception of C1E. Unfortunately, BIOS implementations are not always consistent in terms of how capabilities are advertised and control cannot always be handed over. If control cannot be handed over then intel_idle reports that "ACPI _CST not found or not usable" but does not clear acpi_state_table.count meaning the information is still partially used. This patch ignores ACPI information if CST control cannot be requested from the platform. This was only observed on a number of Haswell platforms that had identical CPUs but not identical BIOS versions. While this problem may be rare overall, 24 separate test cases bisected to this specific commit across 4 separate test machines and is worth addressing. If the situation occurs, the kernel behaves as it did before commit e6d4f08a6776 and uses any c-states that are discovered. The affected test cases were all ones that involved a small number of processes -- exec microbenchmark, pipe microbenchmark, git test suite, netperf, tbench with one client and system call microbenchmark. Each case benefits from being able to use turboboost which is prevented if the lower c-states are unavailable. This may mask real regressions specific to older hardware so it is worth addressing. C-state status before and after the patch 5.9.0-vanilla POLL latency:0 disabled:0 default:enabled 5.9.0-vanilla C1 latency:2 disabled:0 default:enabled 5.9.0-vanilla C1E latency:10 disabled:0 default:enabled 5.9.0-vanilla C3 latency:33 disabled:1 default:disabled 5.9.0-vanilla C6 latency:133 disabled:1 default:disabled 5.9.0-ignore-cst-v1r1 POLL latency:0 disabled:0 default:enabled 5.9.0-ignore-cst-v1r1 C1 latency:2 disabled:0 default:enabled 5.9.0-ignore-cst-v1r1 C1E latency:10 disabled:0 default:enabled 5.9.0-ignore-cst-v1r1 C3 latency:33 disabled:0 default:enabled 5.9.0-ignore-cst-v1r1 C6 latency:133 disabled:0 default:enabled Patch enables C3/C6. Netperf UDP_STREAM netperf-udp 5.5.0 5.9.0 vanilla ignore-cst-v1r1 Hmean send-64 193.41 ( 0.00%) 226.54 * 17.13%* Hmean send-128 392.16 ( 0.00%) 450.54 * 14.89%* Hmean send-256 769.94 ( 0.00%) 881.85 * 14.53%* Hmean send-1024 2994.21 ( 0.00%) 3468.95 * 15.85%* Hmean send-2048 5725.60 ( 0.00%) 6628.99 * 15.78%* Hmean send-3312 8468.36 ( 0.00%) 10288.02 * 21.49%* Hmean send-4096 10135.46 ( 0.00%) 12387.57 * 22.22%* Hmean send-8192 17142.07 ( 0.00%) 19748.11 * 15.20%* Hmean send-16384 28539.71 ( 0.00%) 30084.45 * 5.41%* Fixes: e6d4f08a6776 ("intel_idle: Use ACPI _CST on server systems") Signed-off-by: Mel Gorman Cc: 5.6+ # 5.6+ Signed-off-by: Rafael J. Wysocki Signed-off-by: Muchun Song --- drivers/idle/intel_idle.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 74a76614df23d..9d029d8979e94 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1256,14 +1256,13 @@ static bool intel_idle_acpi_cst_extract(void) if (!intel_idle_cst_usable()) continue; - if (!acpi_processor_claim_cst_control()) { - acpi_state_table.count = 0; - return false; - } + if (!acpi_processor_claim_cst_control()) + break; return true; } + acpi_state_table.count = 0; pr_debug("ACPI _CST not found or not usable\n"); return false; } From 44ac6f614f9d7019d7eadec91bb0427d6396ebb4 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 28 Sep 2020 06:32:40 -0700 Subject: [PATCH 0062/5344] perf/x86/intel/uncore: Fix the scale of the IMC free-running events commit 8191016a026b8dfbb14dea64efc8e723ee99fe65 upstream. The "MiB" result of the IMC free-running bandwidth events, uncore_imc_free_running/read/ and uncore_imc_free_running/write/ are 16 times too small. The "MiB" value equals the raw IMC free-running bandwidth counter value times a "scale" which is inaccurate. The IMC free-running bandwidth events should be incremented per 64B cache line, not DWs (4 bytes). The "scale" should be 6.103515625e-5. Fix the "scale" for both Snow Ridge and Ice Lake. Fixes: 2b3b76b5ec67 ("perf/x86/intel/uncore: Add Ice Lake server uncore support") Fixes: ee49532b38dd ("perf/x86/intel/uncore: Add IMC uncore support for Snow Ridge") Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200928133240.12977-1-kan.liang@linux.intel.com Signed-off-by: Muchun Song --- arch/x86/events/intel/uncore_snbep.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 93d729726f303..998cfae77a707 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -4541,10 +4541,10 @@ static struct uncore_event_desc snr_uncore_imc_freerunning_events[] = { INTEL_UNCORE_EVENT_DESC(dclk, "event=0xff,umask=0x10"), INTEL_UNCORE_EVENT_DESC(read, "event=0xff,umask=0x20"), - INTEL_UNCORE_EVENT_DESC(read.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(read.scale, "6.103515625e-5"), INTEL_UNCORE_EVENT_DESC(read.unit, "MiB"), INTEL_UNCORE_EVENT_DESC(write, "event=0xff,umask=0x21"), - INTEL_UNCORE_EVENT_DESC(write.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(write.scale, "6.103515625e-5"), INTEL_UNCORE_EVENT_DESC(write.unit, "MiB"), { /* end: all zeroes */ }, }; @@ -5003,17 +5003,17 @@ static struct uncore_event_desc icx_uncore_imc_freerunning_events[] = { INTEL_UNCORE_EVENT_DESC(dclk, "event=0xff,umask=0x10"), INTEL_UNCORE_EVENT_DESC(read, "event=0xff,umask=0x20"), - INTEL_UNCORE_EVENT_DESC(read.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(read.scale, "6.103515625e-5"), INTEL_UNCORE_EVENT_DESC(read.unit, "MiB"), INTEL_UNCORE_EVENT_DESC(write, "event=0xff,umask=0x21"), - INTEL_UNCORE_EVENT_DESC(write.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(write.scale, "6.103515625e-5"), INTEL_UNCORE_EVENT_DESC(write.unit, "MiB"), INTEL_UNCORE_EVENT_DESC(ddrt_read, "event=0xff,umask=0x30"), - INTEL_UNCORE_EVENT_DESC(ddrt_read.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(ddrt_read.scale, "6.103515625e-5"), INTEL_UNCORE_EVENT_DESC(ddrt_read.unit, "MiB"), INTEL_UNCORE_EVENT_DESC(ddrt_write, "event=0xff,umask=0x31"), - INTEL_UNCORE_EVENT_DESC(ddrt_write.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(ddrt_write.scale, "6.103515625e-5"), INTEL_UNCORE_EVENT_DESC(ddrt_write.unit, "MiB"), { /* end: all zeroes */ }, }; From 46433cd94711bf8e4d7a41a1d0e74f8b0f9f8607 Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Thu, 7 Nov 2019 22:48:55 +0000 Subject: [PATCH 0063/5344] scsi: qla2xxx: initialize fc4_type_priority commit f5a2b219a7897751f9bb1c8d7308933e33000f2f upstream. ha->fc4_type_priority is currently initialized only in qla81xx_nvram_config(). That makes it default to NVMe for other adapters. Fix it. Fixes: 84ed362ac40c ("scsi: qla2xxx: Dual FCP-NVMe target port support") Link: https://lore.kernel.org/r/20191107224839.32417-2-martin.wilck@suse.com Tested-by: David Bond Acked-by: Himanshu Madhani Reviewed-by: Bart Van Assche Signed-off-by: Martin Wilck Signed-off-by: Martin K. Petersen Signed-off-by: Gang Li --- drivers/scsi/qla2xxx/qla_init.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c index 643b8ae36cbeb..e59fb9429033c 100644 --- a/drivers/scsi/qla2xxx/qla_init.c +++ b/drivers/scsi/qla2xxx/qla_init.c @@ -2264,8 +2264,18 @@ qla2x00_initialize_adapter(scsi_qla_host_t *vha) ql_dbg(ql_dbg_init, vha, 0x0061, "Configure NVRAM parameters...\n"); + /* Let priority default to FCP, can be overridden by nvram_config */ + ha->fc4_type_priority = FC4_PRIORITY_FCP; + ha->isp_ops->nvram_config(vha); + if (ha->fc4_type_priority != FC4_PRIORITY_FCP && + ha->fc4_type_priority != FC4_PRIORITY_NVME) + ha->fc4_type_priority = FC4_PRIORITY_FCP; + + ql_log(ql_log_info, vha, 0xffff, "FC4 priority set to %s\n", + ha->fc4_type_priority == FC4_PRIORITY_FCP ? "FCP" : "NVMe"); + if (ha->flags.disable_serdes) { /* Mask HBA via NVRAM settings? */ ql_log(ql_log_info, vha, 0x0077, @@ -8565,8 +8575,6 @@ qla81xx_nvram_config(scsi_qla_host_t *vha) /* Determine NVMe/FCP priority for target ports */ ha->fc4_type_priority = qla2xxx_get_fc4_priority(vha); - ql_log(ql_log_info, vha, 0xffff, "FC4 priority set to %s\n", - ha->fc4_type_priority & BIT_0 ? "FCP" : "NVMe"); if (rval) { ql_log(ql_log_warn, vha, 0x0076, From 940ab59505ad6014256c4368c7749ab1ad0582cf Mon Sep 17 00:00:00 2001 From: Chen Wandun Date: Fri, 20 Dec 2019 08:07:31 -0800 Subject: [PATCH 0064/5344] xfs: Make the symbol 'xfs_rtalloc_log_count' static commit 5084bf6b2006fcd46f1e44e3c51b687507b362e2 upstream. Fix the following sparse warning: fs/xfs/libxfs/xfs_trans_resv.c:206:1: warning: symbol 'xfs_rtalloc_log_count' was not declared. Should it be static? Fixes: b1de6fc7520f ("xfs: fix log reservation overflows when allocating large rt extents") Signed-off-by: Chen Wandun Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Gang Li --- fs/xfs/libxfs/xfs_trans_resv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index b3584cd2cc164..a9b32879a9d15 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -202,7 +202,7 @@ xfs_calc_inode_chunk_res( * blocks as needed to mark inuse MAXEXTLEN blocks' worth of realtime extents, * as well as the realtime summary block. */ -unsigned int +static unsigned int xfs_rtalloc_log_count( struct xfs_mount *mp, unsigned int num_ops) From 46fd1c3b2e6cb177863cf7a4137b4d7469893253 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 27 Nov 2019 06:20:02 +0000 Subject: [PATCH 0065/5344] ubi: wl: Remove set but not used variable 'prev_e' commit 770aa73d8965b6fe7b08b0e5a8f4cceb4cb17f12 upstream. Fixes gcc '-Wunused-but-set-variable' warning: drivers/mtd/ubi/wl.c: In function 'find_wl_entry': drivers/mtd/ubi/wl.c:322:27: warning: variable 'prev_e' set but not used [-Wunused-but-set-variable] It's not used any more now, so remove it. Fixes: f9c34bb52997 ("ubi: Fix producing anchor PEBs") Signed-off-by: YueHaibing Signed-off-by: Richard Weinberger Signed-off-by: Gang Li --- drivers/mtd/ubi/wl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c index 7def041bbe484..7bbaa49d94815 100644 --- a/drivers/mtd/ubi/wl.c +++ b/drivers/mtd/ubi/wl.c @@ -319,7 +319,7 @@ static struct ubi_wl_entry *find_wl_entry(struct ubi_device *ubi, struct rb_root *root, int diff) { struct rb_node *p; - struct ubi_wl_entry *e, *prev_e = NULL; + struct ubi_wl_entry *e; int max; e = rb_entry(rb_first(root), struct ubi_wl_entry, u.rb); @@ -334,7 +334,6 @@ static struct ubi_wl_entry *find_wl_entry(struct ubi_device *ubi, p = p->rb_left; else { p = p->rb_right; - prev_e = e; e = e1; } } From 489df241941b13b5a36af6cd7f5e9bef815659e0 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 6 Apr 2020 08:12:47 -0400 Subject: [PATCH 0066/5344] vhost: disable for OABI commit d085eb8ce727e581abf8145244eaa3339021be2f upstream. vhost is currently broken on the some ARM configs. The reason is that the ring element addresses are passed between components with different alignments assumptions. Thus, if guest selects a pointer and host then gets and dereferences it, then alignment assumed by the host's compiler might be greater than the actual alignment of the pointer. compiler on the host from assuming pointer is aligned. This actually triggers on ARM with -mabi=apcs-gnu - which is a deprecated configuration. With this OABI, compiler assumes that all structures are 4 byte aligned - which is stronger than virtio guarantees for available and used rings, which are merely 2 bytes. Thus a guest without -mabi=apcs-gnu running on top of host with -mabi=apcs-gnu will be broken. The correct fix is to force alignment of structures - however that is an intrusive fix that's best deferred until the next release. We didn't previously support such ancient systems at all - this surfaced after vdpa support prompted removing dependency of vhost on VIRTULIZATION. So for now, let's just add something along the lines of depends on !ARM || AEABI to the virtio Kconfig declaration, and add a comment that it has to do with struct member alignment. Note: we can't make VHOST and VHOST_RING themselves have a dependency since these are selected. Add a new symbol for that. We should be able to drop this dependency down the road. Fixes: 20c384f1ea1a0bc7 ("vhost: refine vhost and vringh kconfig") Suggested-by: Ard Biesheuvel Suggested-by: Richard Earnshaw Signed-off-by: Michael S. Tsirkin Signed-off-by: Gang Li --- drivers/misc/mic/Kconfig | 2 +- drivers/net/caif/Kconfig | 2 +- drivers/vdpa/Kconfig | 2 +- drivers/vhost/Kconfig | 17 +++++++++++++---- 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/drivers/misc/mic/Kconfig b/drivers/misc/mic/Kconfig index 90137f2bc8e1d..d308e40921d79 100644 --- a/drivers/misc/mic/Kconfig +++ b/drivers/misc/mic/Kconfig @@ -132,7 +132,7 @@ comment "VOP Driver" config VOP tristate "VOP Driver" - depends on VOP_BUS + depends on VOP_BUS && VHOST_DPN select VHOST_RING select VIRTIO help diff --git a/drivers/net/caif/Kconfig b/drivers/net/caif/Kconfig index 50cc051b90d84..45b6ee7e4dd0e 100644 --- a/drivers/net/caif/Kconfig +++ b/drivers/net/caif/Kconfig @@ -44,7 +44,7 @@ config CAIF_HSI config CAIF_VIRTIO tristate "CAIF virtio transport driver" - depends on CAIF && HAS_DMA + depends on CAIF && HAS_DMA && VHOST_DPN select VHOST_RING select VIRTIO select GENERIC_ALLOCATOR diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig index 1192765845b2d..c0556378b0376 100644 --- a/drivers/vdpa/Kconfig +++ b/drivers/vdpa/Kconfig @@ -11,7 +11,7 @@ if VDPA config VDPA_SIM tristate "vDPA device simulator core" - depends on RUNTIME_TESTING_MENU && HAS_DMA + depends on RUNTIME_TESTING_MENU && HAS_DMA && VHOST_DPN select VHOST_RING select IOMMU_IOVA help diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig index bf85b61cfb9c9..ff9cb43bd989f 100644 --- a/drivers/vhost/Kconfig +++ b/drivers/vhost/Kconfig @@ -11,6 +11,15 @@ config VHOST_RING This option is selected by any driver which needs to access the host side of a virtio ring. +config VHOST_DPN + bool + depends on !ARM || AEABI + default y + help + Anything selecting VHOST or VHOST_RING must depend on VHOST_DPN. + This excludes the deprecated ARM ABI since that forces a 4 byte + alignment on all structs - incompatible with virtio spec requirements. + config VHOST tristate select VHOST_IOTLB @@ -26,7 +35,7 @@ if VHOST_MENU config VHOST_NET tristate "Host kernel accelerator for virtio net" - depends on NET && EVENTFD && (TUN || !TUN) && (TAP || !TAP) + depends on NET && EVENTFD && (TUN || !TUN) && (TAP || !TAP) && VHOST_DPN select VHOST ---help--- This kernel module can be loaded in host kernel to accelerate @@ -38,7 +47,7 @@ config VHOST_NET config VHOST_SCSI tristate "VHOST_SCSI TCM fabric driver" - depends on TARGET_CORE && EVENTFD + depends on TARGET_CORE && EVENTFD && VHOST_DPN select VHOST default n ---help--- @@ -47,7 +56,7 @@ config VHOST_SCSI config VHOST_VSOCK tristate "vhost virtio-vsock driver" - depends on VSOCKETS && EVENTFD + depends on VSOCKETS && EVENTFD && VHOST_DPN select VHOST select VIRTIO_VSOCKETS_COMMON default n @@ -61,7 +70,7 @@ config VHOST_VSOCK config VHOST_VDPA tristate "Vhost driver for vDPA-based backend" - depends on EVENTFD + depends on EVENTFD && VHOST_DPN select VHOST select IRQ_BYPASS_MANAGER depends on VDPA From 3e4ec77f0117619aef72d88effaa9229434cf9f1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 29 Apr 2020 15:27:32 +0200 Subject: [PATCH 0067/5344] btrfs: fix gcc-4.8 build warning for struct initializer commit 9c6c723f48f5f05eab133b4fee8f2a2b7ec57a15 upstream. Some older compilers like gcc-4.8 warn about mismatched curly braces in a initializer: fs/btrfs/backref.c: In function 'is_shared_data_backref': fs/btrfs/backref.c:394:9: error: missing braces around initializer [-Werror=missing-braces] struct prelim_ref target = {0}; ^ fs/btrfs/backref.c:394:9: error: (near initialization for 'target.rbnode') [-Werror=missing-braces] Use the GNU empty initializer extension to avoid this. Fixes: ed58f2e66e84 ("btrfs: backref, don't add refs from shared block when resolving normal backref") Reviewed-by: Qu Wenruo Signed-off-by: Arnd Bergmann Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Gang Li --- fs/btrfs/backref.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 7f644a58db511..0cd83234353e1 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -391,7 +391,7 @@ static int is_shared_data_backref(struct preftrees *preftrees, u64 bytenr) struct rb_node **p = &preftrees->direct.root.rb_root.rb_node; struct rb_node *parent = NULL; struct prelim_ref *ref = NULL; - struct prelim_ref target = {0}; + struct prelim_ref target = {}; int result; target.parent = bytenr; From 774feb033d9598eb7e4d471619cc87e7c2e9ffc4 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 8 Jun 2020 08:42:09 -0400 Subject: [PATCH 0068/5344] vhost/test: fix up after API change commit 044e4b09223039e571e6ec540e25552054208765 upstream. Pass a flag to request kernel thread use. Fixes: 01fcb1cbc88e ("vhost: allow device that does not depend on vhost worker") Signed-off-by: Michael S. Tsirkin Signed-off-by: Gang Li --- drivers/vhost/test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c index 10897b29e616f..cca0a2b6b0b00 100644 --- a/drivers/vhost/test.c +++ b/drivers/vhost/test.c @@ -120,7 +120,7 @@ static int vhost_test_open(struct inode *inode, struct file *f) vqs[VHOST_TEST_VQ] = &n->vqs[VHOST_TEST_VQ]; n->vqs[VHOST_TEST_VQ].handle_kick = handle_vq_kick; vhost_dev_init(dev, vqs, VHOST_TEST_VQ_MAX, UIO_MAXIOV, - VHOST_TEST_PKT_WEIGHT, VHOST_TEST_WEIGHT); + VHOST_TEST_PKT_WEIGHT, VHOST_TEST_WEIGHT, true, NULL); f->private_data = n; From a4f1ae2ad9734f8cc23c96f8846d5a37eab68487 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 1 Jun 2020 12:00:49 +0200 Subject: [PATCH 0069/5344] Documentation/CodingStyle: Fix duplicate "are" typo commit 77d22a4388d33a76180cad69a4309d6636d30855 upstream. The improved paragraph about line lengths contains a sentence with a duplicate word: there is one "are" at the end of a line, followed by a second one at the beginning of the next line. Drop the first one, as that one is part of the longest line. Fixes: bdc48fa11e46f867 ("checkpatch/coding-style: deprecate 80-column warning") Signed-off-by: Geert Uytterhoeven Signed-off-by: Linus Torvalds Signed-off-by: Gang Li --- Documentation/process/coding-style.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/process/coding-style.rst b/Documentation/process/coding-style.rst index 8a3b6485b33be..b2eaad23e4715 100644 --- a/Documentation/process/coding-style.rst +++ b/Documentation/process/coding-style.rst @@ -90,7 +90,7 @@ Statements longer than 80 columns should be broken into sensible chunks, unless exceeding 80 columns significantly increases readability and does not hide information. -Descendants are always substantially shorter than the parent and are +Descendants are always substantially shorter than the parent and are placed substantially to the right. A very commonly used style is to align descendants to a function open parenthesis. From 6525a9b2d55f3a0063ca2d31e344d0a362142ad8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 14 Jun 2020 20:22:19 -0700 Subject: [PATCH 0070/5344] Documentation: fix filesystems/locking.rst malformed table warnings commit 6cbef2adb730f5adb4c44fe6da8f20653fed9317 upstream. Fix Sphinx malformed table warnings in filesystems/locking.rst: lnx-58-rc1/Documentation/filesystems/locking.rst:443: WARNING: Malformed table. Text in column margin in table line 8. lnx-58-rc1/Documentation/filesystems/locking.rst:620: WARNING: Malformed table. Text in column margin in table line 2. Fixes: ec23eb54fbc7 ("docs: fs: convert docs without extension to ReST") Fixes: c1e8d7c6a7a6 ("mmap locking API: convert mmap_sem comments") Signed-off-by: Randy Dunlap Acked-by: Michel Lespinasse Link: https://lore.kernel.org/r/12c2afd1-2dcf-2ea0-02aa-bc2759729c77@infradead.org Signed-off-by: Jonathan Corbet Signed-off-by: Gang Li --- Documentation/filesystems/locking.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index fc3a0704553cf..fdaeda5fc20c8 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -428,14 +428,14 @@ prototypes:: locking rules: -========== ============= ================= ========= +====================== ============= ================= ========= ops inode->i_lock blocked_lock_lock may block -========== ============= ================= ========= +====================== ============= ================= ========= lm_notify: yes yes no lm_grant: no no no lm_break: yes no no lm_change yes no no -========== ============= ================= ========= +====================== ============= ================= ========= buffer_head =========== @@ -610,9 +610,9 @@ prototypes:: locking rules: -============= ======== =========================== +============= ========= =========================== ops mmap_sem PageLocked(page) -============= ======== =========================== +============= ========= =========================== open: yes close: yes fault: yes can return with page locked @@ -620,7 +620,7 @@ map_pages: yes page_mkwrite: yes can return with page locked pfn_mkwrite: yes access: yes -============= ======== =========================== +============= ========= =========================== ->fault() is called when a previously not present pte is about to be faulted in. The filesystem must find and return the page associated From a9fc6c49309ecc2cc140d306c4c7fb6af2b4a5c0 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 24 Jun 2020 16:06:06 -0600 Subject: [PATCH 0071/5344] wil6210: account for napi_gro_receive never returning GRO_DROP commit 045790b7bc66a75070c112a61558c639cef2263e upstream. The napi_gro_receive function no longer returns GRO_DROP ever, making handling GRO_DROP dead code. This commit removes that dead code. Further, it's not even clear that device drivers have any business in taking action after passing off received packets; that's arguably out of their hands. In this case, too, the non-gro path didn't bother checking the return value. Plus, this had some clunky debugging functions that duplicated code from elsewhere and was generally pretty messy. So, this commit cleans that all up too. Fixes: 6570bc79c0df ("net: core: use listified Rx for GRO_NORMAL in napi_gro_receive()") Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller Signed-off-by: Gang Li --- drivers/net/wireless/ath/wil6210/txrx.c | 39 +++++++------------------ 1 file changed, 11 insertions(+), 28 deletions(-) diff --git a/drivers/net/wireless/ath/wil6210/txrx.c b/drivers/net/wireless/ath/wil6210/txrx.c index 598c1fba9dac4..579bfa2670416 100644 --- a/drivers/net/wireless/ath/wil6210/txrx.c +++ b/drivers/net/wireless/ath/wil6210/txrx.c @@ -907,7 +907,6 @@ static void wil_rx_handle_eapol(struct wil6210_vif *vif, struct sk_buff *skb) void wil_netif_rx(struct sk_buff *skb, struct net_device *ndev, int cid, struct wil_net_stats *stats, bool gro) { - gro_result_t rc = GRO_NORMAL; struct wil6210_vif *vif = ndev_to_vif(ndev); struct wil6210_priv *wil = ndev_to_wil(ndev); struct wireless_dev *wdev = vif_to_wdev(vif); @@ -918,22 +917,16 @@ void wil_netif_rx(struct sk_buff *skb, struct net_device *ndev, int cid, */ int mcast = is_multicast_ether_addr(da); struct sk_buff *xmit_skb = NULL; - static const char * const gro_res_str[] = { - [GRO_MERGED] = "GRO_MERGED", - [GRO_MERGED_FREE] = "GRO_MERGED_FREE", - [GRO_HELD] = "GRO_HELD", - [GRO_NORMAL] = "GRO_NORMAL", - [GRO_DROP] = "GRO_DROP", - [GRO_CONSUMED] = "GRO_CONSUMED", - }; if (wdev->iftype == NL80211_IFTYPE_STATION) { sa = wil_skb_get_sa(skb); if (mcast && ether_addr_equal(sa, ndev->dev_addr)) { /* mcast packet looped back to us */ - rc = GRO_DROP; dev_kfree_skb(skb); - goto stats; + ndev->stats.rx_dropped++; + stats->rx_dropped++; + wil_dbg_txrx(wil, "Rx drop %d bytes\n", len); + return; } } else if (wdev->iftype == NL80211_IFTYPE_AP && !vif->ap_isolate) { if (mcast) { @@ -977,26 +970,16 @@ void wil_netif_rx(struct sk_buff *skb, struct net_device *ndev, int cid, wil_rx_handle_eapol(vif, skb); if (gro) - rc = napi_gro_receive(&wil->napi_rx, skb); + napi_gro_receive(&wil->napi_rx, skb); else netif_rx_ni(skb); - wil_dbg_txrx(wil, "Rx complete %d bytes => %s\n", - len, gro_res_str[rc]); - } -stats: - /* statistics. rc set to GRO_NORMAL for AP bridging */ - if (unlikely(rc == GRO_DROP)) { - ndev->stats.rx_dropped++; - stats->rx_dropped++; - wil_dbg_txrx(wil, "Rx drop %d bytes\n", len); - } else { - ndev->stats.rx_packets++; - stats->rx_packets++; - ndev->stats.rx_bytes += len; - stats->rx_bytes += len; - if (mcast) - ndev->stats.multicast++; } + ndev->stats.rx_packets++; + stats->rx_packets++; + ndev->stats.rx_bytes += len; + stats->rx_bytes += len; + if (mcast) + ndev->stats.multicast++; } void wil_netif_rx_any(struct sk_buff *skb, struct net_device *ndev) From c5e417d2c7baa4db91915fe2ba6123737db923ce Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Thu, 25 Jun 2020 20:29:17 -0700 Subject: [PATCH 0072/5344] openrisc: fix boot oops when DEBUG_VM is enabled commit 313a5257b84c26b7f080c5d294aabe7d38ca439c upstream. Since v5.8-rc1 OpenRISC Linux fails to boot when DEBUG_VM is enabled. This has been bisected to commit 42fc541404f2 ("mmap locking API: add mmap_assert_locked() and mmap_assert_write_locked()"). The added locking checks exposed the issue that OpenRISC was not taking this mmap lock when during page walks for DMA operations. This patch locks and unlocks the mmap lock for page walking. Link: http://lkml.kernel.org/r/20200617090247.1680188-1-shorne@gmail.com Fixes: 42fc541404f2 ("mmap locking API: add mmap_assert_locked() and mmap_assert_write_locked()" Signed-off-by: Stafford Horne Reviewed-by: Michel Lespinasse Cc: Jonas Bonn Cc: Stefan Kristiansson Cc: Jason Gunthorpe Cc: Steven Price Cc: Thomas Hellstrom Cc: Robin Murphy Cc: Vlastimil Babka Cc: Daniel Jordan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Gang Li --- arch/openrisc/kernel/dma.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c index 4d5b8bd1d7956..bbff5510b3484 100644 --- a/arch/openrisc/kernel/dma.c +++ b/arch/openrisc/kernel/dma.c @@ -103,11 +103,14 @@ arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, * We need to iterate through the pages, clearing the dcache for * them and setting the cache-inhibit bit. */ + mmap_read_lock(&init_mm); if (walk_page_range(&init_mm, va, va + size, &set_nocache_walk_ops, NULL)) { + mmap_read_unlock(&init_mm); free_pages_exact(page, size); return NULL; } + mmap_read_unlock(&init_mm); return (void *)va; } @@ -119,8 +122,10 @@ arch_dma_free(struct device *dev, size_t size, void *vaddr, unsigned long va = (unsigned long)vaddr; /* walk_page_range shouldn't be able to fail here */ + mmap_read_lock(&init_mm); WARN_ON(walk_page_range(&init_mm, va, va + size, &clear_nocache_walk_ops, NULL)); + mmap_read_unlock(&init_mm); free_pages_exact(vaddr, size); } From 3ddd22b2b5221fa2eef289aa511c0da15f48f7ee Mon Sep 17 00:00:00 2001 From: Phil Auld Date: Wed, 5 Aug 2020 16:31:38 -0400 Subject: [PATCH 0073/5344] sched: Fix use of count for nr_running tracepoint commit a1bd06853ee478d37fae9435c5521e301de94c67 upstream. The count field is meant to tell if an update to nr_running is an add or a subtract. Make it do so by adding the missing minus sign. Fixes: 9d246053a691 ("sched: Add a tracepoint to track rq->nr_running") Signed-off-by: Phil Auld Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200805203138.1411-1-pauld@redhat.com Signed-off-by: Gang Li --- kernel/sched/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ba0a7d00155e0..42ed210b9552f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1958,7 +1958,7 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) { rq->nr_running -= count; if (trace_sched_update_nr_running_tp_enabled()) { - call_trace_sched_update_nr_running(rq, count); + call_trace_sched_update_nr_running(rq, -count); } /* Check if we still need preemption */ From 943795daf32081ebfa8b37dacc11d86169126914 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 17 Aug 2020 11:08:18 -0600 Subject: [PATCH 0074/5344] vfio-pci: Avoid recursive read-lock usage commit bc93b9ae0151ae5ad5b8504cdc598428ea99570b upstream. A down_read on memory_lock is held when performing read/write accesses to MMIO BAR space, including across the copy_to/from_user() callouts which may fault. If the user buffer for these copies resides in an mmap of device MMIO space, the mmap fault handler will acquire a recursive read-lock on memory_lock. Avoid this by reducing the lock granularity. Sequential accesses requiring multiple ioread/iowrite cycles are expected to be rare, therefore typical accesses should not see additional overhead. VGA MMIO accesses are expected to be non-fatal regardless of the PCI memory enable bit to allow legacy probing, this behavior remains with a comment added. ioeventfds are now included in memory access testing, with writes dropped while memory space is disabled. Fixes: abafbc551fdd ("vfio-pci: Invalidate mmaps and block MMIO access on disabled memory") Reported-by: Zhiyi Guo Tested-by: Zhiyi Guo Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson Signed-off-by: Gang Li --- drivers/vfio/pci/vfio_pci_private.h | 2 + drivers/vfio/pci/vfio_pci_rdwr.c | 120 ++++++++++++++++++++++------ 2 files changed, 98 insertions(+), 24 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 987b4d311fde9..de7ac0cd6c5c2 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -31,12 +31,14 @@ struct vfio_pci_ioeventfd { struct list_head next; + struct vfio_pci_device *vdev; struct virqfd *virqfd; void __iomem *addr; uint64_t data; loff_t pos; int bar; int count; + bool test_mem; }; struct vfio_pci_irq_ctx { diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 83f81d24df78e..eaa1975ebae2c 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -37,17 +37,70 @@ #define vfio_ioread8 ioread8 #define vfio_iowrite8 iowrite8 +#define VFIO_IOWRITE(size) \ +static int vfio_pci_iowrite##size(struct vfio_pci_device *vdev, \ + bool test_mem, u##size val, void __iomem *io) \ +{ \ + if (test_mem) { \ + down_read(&vdev->memory_lock); \ + if (!__vfio_pci_memory_enabled(vdev)) { \ + up_read(&vdev->memory_lock); \ + return -EIO; \ + } \ + } \ + \ + vfio_iowrite##size(val, io); \ + \ + if (test_mem) \ + up_read(&vdev->memory_lock); \ + \ + return 0; \ +} + +VFIO_IOWRITE(8) +VFIO_IOWRITE(16) +VFIO_IOWRITE(32) +#ifdef iowrite64 +VFIO_IOWRITE(64) +#endif + +#define VFIO_IOREAD(size) \ +static int vfio_pci_ioread##size(struct vfio_pci_device *vdev, \ + bool test_mem, u##size *val, void __iomem *io) \ +{ \ + if (test_mem) { \ + down_read(&vdev->memory_lock); \ + if (!__vfio_pci_memory_enabled(vdev)) { \ + up_read(&vdev->memory_lock); \ + return -EIO; \ + } \ + } \ + \ + *val = vfio_ioread##size(io); \ + \ + if (test_mem) \ + up_read(&vdev->memory_lock); \ + \ + return 0; \ +} + +VFIO_IOREAD(8) +VFIO_IOREAD(16) +VFIO_IOREAD(32) + /* * Read or write from an __iomem region (MMIO or I/O port) with an excluded * range which is inaccessible. The excluded range drops writes and fills * reads with -1. This is intended for handling MSI-X vector tables and * leftover space for ROM BARs. */ -static ssize_t do_io_rw(void __iomem *io, char __user *buf, +static ssize_t do_io_rw(struct vfio_pci_device *vdev, bool test_mem, + void __iomem *io, char __user *buf, loff_t off, size_t count, size_t x_start, size_t x_end, bool iswrite) { ssize_t done = 0; + int ret; while (count) { size_t fillable, filled; @@ -66,9 +119,15 @@ static ssize_t do_io_rw(void __iomem *io, char __user *buf, if (copy_from_user(&val, buf, 4)) return -EFAULT; - vfio_iowrite32(val, io + off); + ret = vfio_pci_iowrite32(vdev, test_mem, + val, io + off); + if (ret) + return ret; } else { - val = vfio_ioread32(io + off); + ret = vfio_pci_ioread32(vdev, test_mem, + &val, io + off); + if (ret) + return ret; if (copy_to_user(buf, &val, 4)) return -EFAULT; @@ -82,9 +141,15 @@ static ssize_t do_io_rw(void __iomem *io, char __user *buf, if (copy_from_user(&val, buf, 2)) return -EFAULT; - vfio_iowrite16(val, io + off); + ret = vfio_pci_iowrite16(vdev, test_mem, + val, io + off); + if (ret) + return ret; } else { - val = vfio_ioread16(io + off); + ret = vfio_pci_ioread16(vdev, test_mem, + &val, io + off); + if (ret) + return ret; if (copy_to_user(buf, &val, 2)) return -EFAULT; @@ -98,9 +163,15 @@ static ssize_t do_io_rw(void __iomem *io, char __user *buf, if (copy_from_user(&val, buf, 1)) return -EFAULT; - vfio_iowrite8(val, io + off); + ret = vfio_pci_iowrite8(vdev, test_mem, + val, io + off); + if (ret) + return ret; } else { - val = vfio_ioread8(io + off); + ret = vfio_pci_ioread8(vdev, test_mem, + &val, io + off); + if (ret) + return ret; if (copy_to_user(buf, &val, 1)) return -EFAULT; @@ -178,14 +249,6 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf, count = min(count, (size_t)(end - pos)); - if (res->flags & IORESOURCE_MEM) { - down_read(&vdev->memory_lock); - if (!__vfio_pci_memory_enabled(vdev)) { - up_read(&vdev->memory_lock); - return -EIO; - } - } - if (bar == PCI_ROM_RESOURCE) { /* * The ROM can fill less space than the BAR, so we start the @@ -213,7 +276,8 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf, x_end = vdev->msix_offset + vdev->msix_size; } - done = do_io_rw(io, buf, pos, count, x_start, x_end, iswrite); + done = do_io_rw(vdev, res->flags & IORESOURCE_MEM, io, buf, pos, + count, x_start, x_end, iswrite); if (done >= 0) *ppos += done; @@ -221,9 +285,6 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf, if (bar == PCI_ROM_RESOURCE) pci_unmap_rom(pdev, io); out: - if (res->flags & IORESOURCE_MEM) - up_read(&vdev->memory_lock); - return done; } @@ -278,7 +339,12 @@ ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf, return ret; } - done = do_io_rw(iomem, buf, off, count, 0, 0, iswrite); + /* + * VGA MMIO is a legacy, non-BAR resource that hopefully allows + * probing, so we don't currently worry about access in relation + * to the memory enable bit in the command register. + */ + done = do_io_rw(vdev, false, iomem, buf, off, count, 0, 0, iswrite); vga_put(vdev->pdev, rsrc); @@ -296,17 +362,21 @@ static int vfio_pci_ioeventfd_handler(void *opaque, void *unused) switch (ioeventfd->count) { case 1: - vfio_iowrite8(ioeventfd->data, ioeventfd->addr); + vfio_pci_iowrite8(ioeventfd->vdev, ioeventfd->test_mem, + ioeventfd->data, ioeventfd->addr); break; case 2: - vfio_iowrite16(ioeventfd->data, ioeventfd->addr); + vfio_pci_iowrite16(ioeventfd->vdev, ioeventfd->test_mem, + ioeventfd->data, ioeventfd->addr); break; case 4: - vfio_iowrite32(ioeventfd->data, ioeventfd->addr); + vfio_pci_iowrite32(ioeventfd->vdev, ioeventfd->test_mem, + ioeventfd->data, ioeventfd->addr); break; #ifdef iowrite64 case 8: - vfio_iowrite64(ioeventfd->data, ioeventfd->addr); + vfio_pci_iowrite64(ioeventfd->vdev, ioeventfd->test_mem, + ioeventfd->data, ioeventfd->addr); break; #endif } @@ -378,11 +448,13 @@ long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset, goto out_unlock; } + ioeventfd->vdev = vdev; ioeventfd->addr = vdev->barmap[bar] + pos; ioeventfd->data = data; ioeventfd->pos = pos; ioeventfd->bar = bar; ioeventfd->count = count; + ioeventfd->test_mem = vdev->pdev->resource[bar].flags & IORESOURCE_MEM; ret = vfio_virqfd_enable(ioeventfd, vfio_pci_ioeventfd_handler, NULL, NULL, &ioeventfd->virqfd, fd); From 41ee6767fff1e41e2713a3113953df66e2d1526d Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Mon, 24 Aug 2020 12:55:37 +0200 Subject: [PATCH 0075/5344] drm/etnaviv: always start/stop scheduler in timeout processing commit 50248a3ec0f5e5debd18033eb2a29f0b793a7000 upstream. The drm scheduler currently expects that the stop/start sequence is always executed in the timeout handling, as the job at the head of the hardware execution list is always removed from the ring mirror before the driver function is called and only inserted back into the list when starting the scheduler. This adds some unnecessary overhead if the timeout handler determines that the GPU is still executing jobs normally and just wished to extend the timeout, but a better solution requires a major rearchitecture of the scheduler, which is not applicable as a fix. Fixes: 135517d3565b ("drm/scheduler: Avoid accessing freed bad job.") Signed-off-by: Lucas Stach Tested-by: Russell King Signed-off-by: Gang Li --- drivers/gpu/drm/etnaviv/etnaviv_sched.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c index 4e3e95dce6d87..cd46c882269cc 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c @@ -89,12 +89,15 @@ static void etnaviv_sched_timedout_job(struct drm_sched_job *sched_job) u32 dma_addr; int change; + /* block scheduler */ + drm_sched_stop(&gpu->sched, sched_job); + /* * If the GPU managed to complete this jobs fence, the timout is * spurious. Bail out. */ if (dma_fence_is_signaled(submit->out_fence)) - return; + goto out_no_timeout; /* * If the GPU is still making forward progress on the front-end (which @@ -105,12 +108,9 @@ static void etnaviv_sched_timedout_job(struct drm_sched_job *sched_job) change = dma_addr - gpu->hangcheck_dma_addr; if (change < 0 || change > 16) { gpu->hangcheck_dma_addr = dma_addr; - return; + goto out_no_timeout; } - /* block scheduler */ - drm_sched_stop(&gpu->sched, sched_job); - if(sched_job) drm_sched_increase_karma(sched_job); @@ -120,6 +120,7 @@ static void etnaviv_sched_timedout_job(struct drm_sched_job *sched_job) drm_sched_resubmit_jobs(&gpu->sched); +out_no_timeout: /* restart scheduler after GPU is usable again */ drm_sched_start(&gpu->sched, true); } From 09fa96af5d7d713992b42c598ce4906f64fb6975 Mon Sep 17 00:00:00 2001 From: Alex Dewar Date: Fri, 28 Aug 2020 14:55:23 +0100 Subject: [PATCH 0076/5344] netlabel: remove unused param from audit_log_format() commit 0f091e43310f5c292b7094f9f115e651358e8053 upstream. Commit d3b990b7f327 ("netlabel: fix problems with mapping removal") added a check to return an error if ret_val != 0, before ret_val is later used in a log message. Now it will unconditionally print "... res=1". So just drop the check. Addresses-Coverity: ("Dead code") Fixes: d3b990b7f327 ("netlabel: fix problems with mapping removal") Signed-off-by: Alex Dewar Acked-by: Paul Moore Signed-off-by: David S. Miller Signed-off-by: Gang Li --- net/netlabel/netlabel_domainhash.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c index 12aa803b2f689..1ad55f27c582b 100644 --- a/net/netlabel/netlabel_domainhash.c +++ b/net/netlabel/netlabel_domainhash.c @@ -611,9 +611,8 @@ int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry, audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_DEL, audit_info); if (audit_buf != NULL) { audit_log_format(audit_buf, - " nlbl_domain=%s res=%u", - entry->domain ? entry->domain : "(default)", - ret_val == 0 ? 1 : 0); + " nlbl_domain=%s res=1", + entry->domain ? entry->domain : "(default)"); audit_log_end(audit_buf); } From f8e0f25a0996b7283c628aae16c0963b062e12ca Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 31 Aug 2020 13:03:06 +0530 Subject: [PATCH 0077/5344] opp: Don't drop reference for an OPP table that was never parsed commit 922ff0759a16299e24cacfc981ac07914d8f1826 upstream. dev_pm_opp_remove_table() should drop a reference to the OPP table only if the DT OPP table was parsed earlier with a call to dev_pm_opp_of_add_table() earlier. Else it may end up dropping the reference to the OPP table, which was added as a result of other calls like dev_pm_opp_set_clkname(). And would hence result in undesirable behavior later on when caller would try to free the resource again. Fixes: 03758d60265c ("opp: Replace list_kref with a local counter") Reported-by: Naresh Kamboju Reported-by: Anders Roxell Tested-by: Naresh Kamboju Signed-off-by: Viresh Kumar Signed-off-by: Gang Li --- drivers/opp/core.c | 22 ++++++++++++++++------ drivers/opp/opp.h | 2 +- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index 088c93dc0085c..e98b295b20484 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -1177,13 +1177,19 @@ void dev_pm_opp_remove(struct device *dev, unsigned long freq) } EXPORT_SYMBOL_GPL(dev_pm_opp_remove); -void _opp_remove_all_static(struct opp_table *opp_table) +bool _opp_remove_all_static(struct opp_table *opp_table) { struct dev_pm_opp *opp, *tmp; + bool ret = true; mutex_lock(&opp_table->lock); - if (!opp_table->parsed_static_opps || --opp_table->parsed_static_opps) + if (!opp_table->parsed_static_opps) { + ret = false; + goto unlock; + } + + if (--opp_table->parsed_static_opps) goto unlock; list_for_each_entry_safe(opp, tmp, &opp_table->opp_list, node) { @@ -1193,6 +1199,8 @@ void _opp_remove_all_static(struct opp_table *opp_table) unlock: mutex_unlock(&opp_table->lock); + + return ret; } /** @@ -2206,13 +2214,15 @@ void _dev_pm_opp_find_and_remove_table(struct device *dev) return; } - _opp_remove_all_static(opp_table); + /* + * Drop the extra reference only if the OPP table was successfully added + * with dev_pm_opp_of_add_table() earlier. + **/ + if (_opp_remove_all_static(opp_table)) + dev_pm_opp_put_opp_table(opp_table); /* Drop reference taken by _find_opp_table() */ dev_pm_opp_put_opp_table(opp_table); - - /* Drop reference taken while the OPP table was added */ - dev_pm_opp_put_opp_table(opp_table); } /** diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h index d14e27102730c..469af6954686f 100644 --- a/drivers/opp/opp.h +++ b/drivers/opp/opp.h @@ -203,7 +203,7 @@ struct opp_table { /* Routines internal to opp core */ void dev_pm_opp_get(struct dev_pm_opp *opp); -void _opp_remove_all_static(struct opp_table *opp_table); +bool _opp_remove_all_static(struct opp_table *opp_table); void _get_opp_table_kref(struct opp_table *opp_table); int _get_opp_count(struct opp_table *opp_table); struct opp_table *_find_opp_table(struct device *dev); From 3ccbe9597123e2adb9024f3cef67dfede60be346 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 28 Aug 2020 12:01:50 -0700 Subject: [PATCH 0078/5344] nvme: Revert: Fix controller creation races with teardown flow commit b63de8400a6e1001b5732286cf6f5ec27799b7b4 upstream. The indicated patch introduced a barrier in the sysfs_delete attribute for the controller that rejects the request if the controller isn't created. "Created" is defined as at least 1 call to nvme_start_ctrl(). This is problematic in error-injection testing. If an error occurs on the initial attempt to create an association and the controller enters reconnect(s) attempts, the admin cannot delete the controller until either there is a successful association created or ctrl_loss_tmo times out. Where this issue is particularly hurtful is when the "admin" is the nvme-cli, it is performing a connection to a discovery controller, and it is initiated via auto-connect scripts. With the FC transport, if the first connection attempt fails, the controller enters a normal reconnect state but returns control to the cli thread that created the controller. In this scenario, the cli attempts to read the discovery log via ioctl, which fails, causing the cli to see it as an empty log and then proceeds to delete the discovery controller. The delete is rejected and the controller is left live. If the discovery controller reconnect then succeeds, there is no action to delete it, and it sits live doing nothing. Cc: # v5.7+ Fixes: ce1518139e69 ("nvme: Fix controller creation races with teardown flow") Signed-off-by: James Smart CC: Israel Rukshin CC: Max Gurtovoy CC: Christoph Hellwig CC: Keith Busch CC: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Gang Li --- drivers/nvme/host/core.c | 5 ----- drivers/nvme/host/nvme.h | 1 - 2 files changed, 6 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a5b5a2305791d..7f0ad9aa28fcc 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3237,10 +3237,6 @@ static ssize_t nvme_sysfs_delete(struct device *dev, { struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - /* Can't delete non-created controllers */ - if (!ctrl->created) - return -EBUSY; - if (device_remove_file_self(dev, attr)) nvme_delete_ctrl_sync(ctrl); return count; @@ -4041,7 +4037,6 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl) nvme_queue_scan(ctrl); nvme_start_queues(ctrl); } - ctrl->created = true; } EXPORT_SYMBOL_GPL(nvme_start_ctrl); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 2df90d4355b9a..01e739ffedbf0 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -253,7 +253,6 @@ struct nvme_ctrl { struct nvme_command ka_cmd; struct work_struct fw_act_work; unsigned long events; - bool created; #ifdef CONFIG_NVME_MULTIPATH /* asymmetric namespace access: */ From 5ec90676b1315440dd484004f7059773bc6101cf Mon Sep 17 00:00:00 2001 From: Alok Prasad Date: Thu, 1 Oct 2020 10:09:59 +0000 Subject: [PATCH 0079/5344] RDMA/qedr: Endianness warnings cleanup commit f45271acdf9eeb003296862b017806d41ec4ec55 upstream. Making a change to fix following sparse warnings reported by kbuild bot. CHECK drivers/infiniband/hw/qedr/verbs.c drivers/infiniband/hw/qedr/verbs.c:3872:59: warning: incorrect type in assignment (different base types) drivers/infiniband/hw/qedr/verbs.c:3872:59: expected restricted __le32 [usertype] sge_prod drivers/infiniband/hw/qedr/verbs.c:3872:59: got unsigned int [usertype] sge_prod drivers/infiniband/hw/qedr/verbs.c:3875:59: warning: incorrect type in assignment (different base types) drivers/infiniband/hw/qedr/verbs.c:3875:59: expected restricted __le32 [usertype] wqe_prod drivers/infiniband/hw/qedr/verbs.c:3875:59: got unsigned int [usertype] wqe_prod Link: https://lore.kernel.org/r/20201001100959.19940-1-palok@marvell.com Reported-by: kbuild test robot Fixes: acca72e2b031 ("RDMA/qedr: SRQ's bug fixes") Signed-off-by: Igor Russkikh Signed-off-by: Michal Kalderon Signed-off-by: Alok Prasad Signed-off-by: Jason Gunthorpe Signed-off-by: Gang Li --- drivers/infiniband/hw/qedr/verbs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 4408d33646647..aaea511fd42e3 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -3533,10 +3533,10 @@ int qedr_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, * in first 4 bytes and need to update WQE producer in * next 4 bytes. */ - srq->hw_srq.virt_prod_pair_addr->sge_prod = hw_srq->sge_prod; + srq->hw_srq.virt_prod_pair_addr->sge_prod = cpu_to_le32(hw_srq->sge_prod); /* Make sure sge producer is updated first */ dma_wmb(); - srq->hw_srq.virt_prod_pair_addr->wqe_prod = hw_srq->wqe_prod; + srq->hw_srq.virt_prod_pair_addr->wqe_prod = cpu_to_le32(hw_srq->wqe_prod); wr = wr->next; } From 21e8b092ef28cf8737147458c8c79c9fa4c3bc3c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 13 Oct 2020 16:48:27 -0700 Subject: [PATCH 0080/5344] fs/xattr.c: fix kernel-doc warnings for setxattr & removexattr commit da5c1c0bb316e3a0454d2c6980e9c2b618c149b0 upstream. Fix kernel-doc warnings in fs/xattr.c: ../fs/xattr.c:251: warning: Function parameter or member 'dentry' not described in '__vfs_setxattr_locked' ../fs/xattr.c:251: warning: Function parameter or member 'name' not described in '__vfs_setxattr_locked' ../fs/xattr.c:251: warning: Function parameter or member 'value' not described in '__vfs_setxattr_locked' ../fs/xattr.c:251: warning: Function parameter or member 'size' not described in '__vfs_setxattr_locked' ../fs/xattr.c:251: warning: Function parameter or member 'flags' not described in '__vfs_setxattr_locked' ../fs/xattr.c:251: warning: Function parameter or member 'delegated_inode' not described in '__vfs_setxattr_locked' ../fs/xattr.c:458: warning: Function parameter or member 'dentry' not described in '__vfs_removexattr_locked' ../fs/xattr.c:458: warning: Function parameter or member 'name' not described in '__vfs_removexattr_locked' ../fs/xattr.c:458: warning: Function parameter or member 'delegated_inode' not described in '__vfs_removexattr_locked' Fixes: 08b5d5014a27 ("xattr: break delegations in {set,remove}xattr") Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Cc: Al Viro Cc: Frank van der Linden Cc: Chuck Lever Link: http://lkml.kernel.org/r/7a3dd5a2-5787-adf3-d525-c203f9910ec4@infradead.org Signed-off-by: Linus Torvalds Signed-off-by: Gang Li --- fs/xattr.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/xattr.c b/fs/xattr.c index f2854570d4119..ad59c9e2d0cf8 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -205,15 +205,15 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, } /** - * __vfs_setxattr_locked: set an extended attribute while holding the inode + * __vfs_setxattr_locked - set an extended attribute while holding the inode * lock * - * @dentry - object to perform setxattr on - * @name - xattr name to set - * @value - value to set @name to - * @size - size of @value - * @flags - flags to pass into filesystem operations - * @delegated_inode - on return, will contain an inode pointer that + * @dentry: object to perform setxattr on + * @name: xattr name to set + * @value: value to set @name to + * @size: size of @value + * @flags: flags to pass into filesystem operations + * @delegated_inode: on return, will contain an inode pointer that * a delegation was broken on, NULL if none. */ int @@ -416,12 +416,12 @@ __vfs_removexattr(struct dentry *dentry, const char *name) EXPORT_SYMBOL(__vfs_removexattr); /** - * __vfs_removexattr_locked: set an extended attribute while holding the inode + * __vfs_removexattr_locked - set an extended attribute while holding the inode * lock * - * @dentry - object to perform setxattr on - * @name - name of xattr to remove - * @delegated_inode - on return, will contain an inode pointer that + * @dentry: object to perform setxattr on + * @name: name of xattr to remove + * @delegated_inode: on return, will contain an inode pointer that * a delegation was broken on, NULL if none. */ int From 1496f253addd974cca7fd9409a87957bfbcd256d Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 2 Nov 2020 15:02:00 -0700 Subject: [PATCH 0081/5344] vfio/pci: Implement ioeventfd thread handler for contended memory lock commit 38565c93c8a1306dc5f245572a545fbea908ac41 upstream. The ioeventfd is called under spinlock with interrupts disabled, therefore if the memory lock is contended defer code that might sleep to a thread context. Fixes: bc93b9ae0151 ("vfio-pci: Avoid recursive read-lock usage") Link: https://bugzilla.kernel.org/show_bug.cgi?id=209253#c1 Reported-by: Ian Pilcher Tested-by: Ian Pilcher Tested-by: Justin Gatzen Signed-off-by: Alex Williamson Signed-off-by: Gang Li --- drivers/vfio/pci/vfio_pci_rdwr.c | 43 ++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index eaa1975ebae2c..ea755e02a23bc 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -356,34 +356,60 @@ ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf, return done; } -static int vfio_pci_ioeventfd_handler(void *opaque, void *unused) +static void vfio_pci_ioeventfd_do_write(struct vfio_pci_ioeventfd *ioeventfd, + bool test_mem) { - struct vfio_pci_ioeventfd *ioeventfd = opaque; - switch (ioeventfd->count) { case 1: - vfio_pci_iowrite8(ioeventfd->vdev, ioeventfd->test_mem, + vfio_pci_iowrite8(ioeventfd->vdev, test_mem, ioeventfd->data, ioeventfd->addr); break; case 2: - vfio_pci_iowrite16(ioeventfd->vdev, ioeventfd->test_mem, + vfio_pci_iowrite16(ioeventfd->vdev, test_mem, ioeventfd->data, ioeventfd->addr); break; case 4: - vfio_pci_iowrite32(ioeventfd->vdev, ioeventfd->test_mem, + vfio_pci_iowrite32(ioeventfd->vdev, test_mem, ioeventfd->data, ioeventfd->addr); break; #ifdef iowrite64 case 8: - vfio_pci_iowrite64(ioeventfd->vdev, ioeventfd->test_mem, + vfio_pci_iowrite64(ioeventfd->vdev, test_mem, ioeventfd->data, ioeventfd->addr); break; #endif } +} + +static int vfio_pci_ioeventfd_handler(void *opaque, void *unused) +{ + struct vfio_pci_ioeventfd *ioeventfd = opaque; + struct vfio_pci_device *vdev = ioeventfd->vdev; + + if (ioeventfd->test_mem) { + if (!down_read_trylock(&vdev->memory_lock)) + return 1; /* Lock contended, use thread */ + if (!__vfio_pci_memory_enabled(vdev)) { + up_read(&vdev->memory_lock); + return 0; + } + } + + vfio_pci_ioeventfd_do_write(ioeventfd, false); + + if (ioeventfd->test_mem) + up_read(&vdev->memory_lock); return 0; } +static void vfio_pci_ioeventfd_thread(void *opaque, void *unused) +{ + struct vfio_pci_ioeventfd *ioeventfd = opaque; + + vfio_pci_ioeventfd_do_write(ioeventfd, ioeventfd->test_mem); +} + long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset, uint64_t data, int count, int fd) { @@ -457,7 +483,8 @@ long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset, ioeventfd->test_mem = vdev->pdev->resource[bar].flags & IORESOURCE_MEM; ret = vfio_virqfd_enable(ioeventfd, vfio_pci_ioeventfd_handler, - NULL, NULL, &ioeventfd->virqfd, fd); + vfio_pci_ioeventfd_thread, NULL, + &ioeventfd->virqfd, fd); if (ret) { kfree(ioeventfd); goto out_unlock; From 162bda6884f7ea6b4c0e194fed074073736160b3 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Fri, 6 Nov 2020 22:12:16 +0800 Subject: [PATCH 0082/5344] x86/mce: Correct the detection of invalid notifier priorities commit 15af36596ae305aefc8c502c2d3e8c58221709eb upstream. Commit c9c6d216ed28 ("x86/mce: Rename "first" function as "early"") changed the enumeration of MCE notifier priorities. Correct the check for notifier priorities to cover the new range. [ bp: Rewrite commit message, remove superfluous brackets in conditional. ] Fixes: c9c6d216ed28 ("x86/mce: Rename "first" function as "early"") Signed-off-by: Zhen Lei Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20201106141216.2062-2-thunder.leizhen@huawei.com Signed-off-by: Gang Li --- arch/x86/include/asm/mce.h | 3 ++- arch/x86/kernel/cpu/mce/core.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index dd2e21b130eb6..b2ec08746245d 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -174,7 +174,8 @@ enum mce_notifier_prios { MCE_PRIO_EXTLOG, MCE_PRIO_UC, MCE_PRIO_EARLY, - MCE_PRIO_CEC + MCE_PRIO_CEC, + MCE_PRIO_HIGHEST = MCE_PRIO_CEC }; struct notifier_block; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 6f55ed1cab6fc..147f3ffa0fa38 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -171,7 +171,8 @@ EXPORT_SYMBOL_GPL(mce_inject_log); void mce_register_decode_chain(struct notifier_block *nb) { - if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC)) + if (WARN_ON(nb->priority < MCE_PRIO_LOWEST || + nb->priority > MCE_PRIO_HIGHEST)) return; blocking_notifier_chain_register(&x86_mce_decoder_chain, nb); From 7b2c85e9c028ce450a280c12401f5ad9c3c0276f Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 9 Dec 2020 16:00:04 +0200 Subject: [PATCH 0083/5344] vdpa/mlx5: Use write memory barrier after updating CQ index commit 83ef73b27eb2363f44faf9c3ee28a3fe752cfd15 upstream. Make sure to put dma write memory barrier after updating CQ consumer index so the hardware knows that there are available CQE slots in the queue. Failure to do this can cause the update of the RX doorbell record to get updated before the CQ consumer index resulting in CQ overrun. Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices") Signed-off-by: Eli Cohen Link: https://lore.kernel.org/r/20201209140004.15892-1-elic@nvidia.com Signed-off-by: Michael S. Tsirkin Signed-off-by: Gang Li --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 92d20f680e1ce..e66763ada976e 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -464,6 +464,11 @@ static int mlx5_vdpa_poll_one(struct mlx5_vdpa_cq *vcq) static void mlx5_vdpa_handle_completions(struct mlx5_vdpa_virtqueue *mvq, int num) { mlx5_cq_set_ci(&mvq->cq.mcq); + + /* make sure CQ cosumer update is visible to the hardware before updating + * RX doorbell record. + */ + dma_wmb(); rx_post(&mvq->vqqp, num); if (mvq->event_cb.callback) mvq->event_cb.callback(mvq->event_cb.private); From 67e98aaf9e72a10b7245fb22f46ceceb107dea6f Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 7 Jan 2021 09:18:45 +0200 Subject: [PATCH 0084/5344] vdpa/mlx5: Fix memory key MTT population commit 710eb8e32d04714452759f2b66884bfa7e97d495 upstream. map_direct_mr() assumed that the number of scatter/gather entries returned by dma_map_sg_attrs() was equal to the number of segments in the sgl list. This led to wrong population of the mkey object. Fix this by properly referring to the returned value. The hardware expects each MTT entry to contain the DMA address of a contiguous block of memory of size (1 << mr->log_size) bytes. dma_map_sg_attrs() can coalesce several sg entries into a single scatter/gather entry of contiguous DMA range so we need to scan the list and refer to the size of each s/g entry. In addition, get rid of fill_sg() which effect is overwritten by populate_mtts(). Fixes: 94abbccdf291 ("vdpa/mlx5: Add shared memory registration code") Signed-off-by: Eli Cohen Link: https://lore.kernel.org/r/20210107071845.GA224876@mtl-vdi-166.wap.labs.mlnx Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Signed-off-by: Gang Li --- drivers/vdpa/mlx5/core/mlx5_vdpa.h | 1 + drivers/vdpa/mlx5/core/mr.c | 28 ++++++++++++---------------- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h b/drivers/vdpa/mlx5/core/mlx5_vdpa.h index 5c92a576edae8..08f742fd24099 100644 --- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h +++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h @@ -15,6 +15,7 @@ struct mlx5_vdpa_direct_mr { struct sg_table sg_head; int log_size; int nsg; + int nent; struct list_head list; u64 offset; }; diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c index 4b6195666c589..d300f799efcd1 100644 --- a/drivers/vdpa/mlx5/core/mr.c +++ b/drivers/vdpa/mlx5/core/mr.c @@ -25,17 +25,6 @@ static int get_octo_len(u64 len, int page_shift) return (npages + 1) / 2; } -static void fill_sg(struct mlx5_vdpa_direct_mr *mr, void *in) -{ - struct scatterlist *sg; - __be64 *pas; - int i; - - pas = MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); - for_each_sg(mr->sg_head.sgl, sg, mr->nsg, i) - (*pas) = cpu_to_be64(sg_dma_address(sg)); -} - static void mlx5_set_access_mode(void *mkc, int mode) { MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3); @@ -45,10 +34,18 @@ static void mlx5_set_access_mode(void *mkc, int mode) static void populate_mtts(struct mlx5_vdpa_direct_mr *mr, __be64 *mtt) { struct scatterlist *sg; + int nsg = mr->nsg; + u64 dma_addr; + u64 dma_len; + int j = 0; int i; - for_each_sg(mr->sg_head.sgl, sg, mr->nsg, i) - mtt[i] = cpu_to_be64(sg_dma_address(sg)); + for_each_sg(mr->sg_head.sgl, sg, mr->nent, i) { + for (dma_addr = sg_dma_address(sg), dma_len = sg_dma_len(sg); + nsg && dma_len; + nsg--, dma_addr += BIT(mr->log_size), dma_len -= BIT(mr->log_size)) + mtt[j++] = cpu_to_be64(dma_addr); + } } static int create_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr *mr) @@ -64,7 +61,6 @@ static int create_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct return -ENOMEM; MLX5_SET(create_mkey_in, in, uid, mvdev->res.uid); - fill_sg(mr, in); mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); MLX5_SET(mkc, mkc, lw, !!(mr->perm & VHOST_MAP_WO)); MLX5_SET(mkc, mkc, lr, !!(mr->perm & VHOST_MAP_RO)); @@ -276,8 +272,8 @@ static int map_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr done: mr->log_size = log_entity_size; mr->nsg = nsg; - err = dma_map_sg_attrs(dma, mr->sg_head.sgl, mr->nsg, DMA_BIDIRECTIONAL, 0); - if (!err) + mr->nent = dma_map_sg_attrs(dma, mr->sg_head.sgl, mr->nsg, DMA_BIDIRECTIONAL, 0); + if (!mr->nent) goto err_map; err = create_direct_mr(mvdev, mr); From 52baa3f2e4bc5458720b9a1efa51fb27efd89cda Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Tue, 19 Jan 2021 10:53:26 +0100 Subject: [PATCH 0085/5344] arch/Kconfig: update a broken file reference commit ba1a297d78f47a23b138fb5cad953ae4a3fa4929 upstream. Commit adab66b71abf ("Revert: "ring-buffer: Remove HAVE_64BIT_ALIGNED_ACCESS"") added the config HAVE_64BIT_ALIGNED_ACCESS back into arch/Kconfig with this revert. In the meantime, commit c9b54d6f362c ("docs: move other kAPI documents to core-api") changed ./Documentation/unaligned-memory-access.txt to ./Documentation/core-api/unaligned-memory-access.rst. Fortunately, ./scripts/documentation-file-ref-check detects this and warns about this broken reference. Update the file reference in arch/Kconfig. Fixes: adab66b71abf ("Revert: "ring-buffer: Remove HAVE_64BIT_ALIGNED_ACCESS"") Signed-off-by: Lukas Bulwahn Link: https://lore.kernel.org/r/20210119095326.13896-1-lukas.bulwahn@gmail.com Signed-off-by: Jonathan Corbet Signed-off-by: Gang Li --- arch/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index bb67f906faa22..cf4c96c20d6d5 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -147,8 +147,8 @@ config HAVE_64BIT_ALIGNED_ACCESS accesses are required to be 64 bit aligned in this way even though it is not a 64 bit architecture. - See Documentation/unaligned-memory-access.txt for more - information on the topic of unaligned memory accesses. + See Documentation/core-api/unaligned-memory-access.rst for + more information on the topic of unaligned memory accesses. config HAVE_EFFICIENT_UNALIGNED_ACCESS bool From be56a29b8b9eeaaf47eabcfad2bbba2b97784000 Mon Sep 17 00:00:00 2001 From: Jens Wiklander Date: Mon, 25 Jan 2021 10:36:57 +0100 Subject: [PATCH 0086/5344] tee: optee: remove need_resched() before cond_resched() commit 958567600517fd15b7f35ca1a8be0104f0eb0686 upstream. Testing need_resched() before cond_resched() is not needed as an equivalent test is done internally in cond_resched(). So drop the need_resched() test. Fixes: dcb3b06d9c34 ("tee: optee: replace might_sleep with cond_resched") Reviewed-by: Rouven Czerwinski Tested-by: Rouven Czerwinski Tested-by: Sumit Garg Acked-by: Arnd Bergmann Signed-off-by: Jens Wiklander Signed-off-by: Gang Li --- drivers/tee/optee/call.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/tee/optee/call.c b/drivers/tee/optee/call.c index f58f383e8ef21..ba19876368dd8 100644 --- a/drivers/tee/optee/call.c +++ b/drivers/tee/optee/call.c @@ -149,8 +149,7 @@ u32 optee_do_call_with_arg(struct tee_context *ctx, phys_addr_t parg) */ optee_cq_wait_for_completion(&optee->call_queue, &w); } else if (OPTEE_SMC_RETURN_IS_RPC(res.a0)) { - if (need_resched()) - cond_resched(); + cond_resched(); param.a0 = res.a0; param.a1 = res.a1; param.a2 = res.a2; From 69fdea647a1216ae19f094b211a69272568412a1 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 4 Feb 2021 09:36:18 +0200 Subject: [PATCH 0087/5344] vdpa/mlx5: Restore the hardware used index after change map commit b35ccebe3ef76168aa2edaa35809c0232cb3578e upstream. When a change of memory map occurs, the hardware resources are destroyed and then re-created again with the new memory map. In such case, we need to restore the hardware available and used indices. The driver failed to restore the used index which is added here. Also, since the driver also fails to reset the available and used indices upon device reset, fix this here to avoid regression caused by the fact that used index may not be zero upon device reset. Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices") Signed-off-by: Eli Cohen Link: https://lore.kernel.org/r/20210204073618.36336-1-elic@nvidia.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Signed-off-by: Gang Li --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index e66763ada976e..9aff4d2716f43 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -77,6 +77,7 @@ struct mlx5_vq_restore_info { u64 device_addr; u64 driver_addr; u16 avail_index; + u16 used_index; bool ready; struct vdpa_callback cb; bool restore; @@ -111,6 +112,7 @@ struct mlx5_vdpa_virtqueue { u32 virtq_id; struct mlx5_vdpa_net *ndev; u16 avail_idx; + u16 used_idx; int fw_state; /* keep last in the struct */ @@ -789,6 +791,7 @@ static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtque obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in, obj_context); MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx); + MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx); MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_12_3, get_features_12_3(ndev->mvdev.actual_features)); vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context); @@ -1007,6 +1010,7 @@ static int connect_qps(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *m struct mlx5_virtq_attr { u8 state; u16 available_index; + u16 used_index; }; static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, @@ -1037,6 +1041,7 @@ static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueu memset(attr, 0, sizeof(*attr)); attr->state = MLX5_GET(virtio_net_q_object, obj_context, state); attr->available_index = MLX5_GET(virtio_net_q_object, obj_context, hw_available_index); + attr->used_index = MLX5_GET(virtio_net_q_object, obj_context, hw_used_index); kfree(out); return 0; @@ -1520,6 +1525,16 @@ static void teardown_virtqueues(struct mlx5_vdpa_net *ndev) } } +static void clear_virtqueues(struct mlx5_vdpa_net *ndev) +{ + int i; + + for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) { + ndev->vqs[i].avail_idx = 0; + ndev->vqs[i].used_idx = 0; + } +} + /* TODO: cross-endian support */ static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev) { @@ -1595,6 +1610,7 @@ static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu return err; ri->avail_index = attr.available_index; + ri->used_index = attr.used_index; ri->ready = mvq->ready; ri->num_ent = mvq->num_ent; ri->desc_addr = mvq->desc_addr; @@ -1639,6 +1655,7 @@ static void restore_channels_info(struct mlx5_vdpa_net *ndev) continue; mvq->avail_idx = ri->avail_index; + mvq->used_idx = ri->used_index; mvq->ready = ri->ready; mvq->num_ent = ri->num_ent; mvq->desc_addr = ri->desc_addr; @@ -1753,6 +1770,7 @@ static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status) if (!status) { mlx5_vdpa_info(mvdev, "performing device reset\n"); teardown_driver(ndev); + clear_virtqueues(ndev); mlx5_vdpa_destroy_mr(&ndev->mvdev); ndev->mvdev.status = 0; ndev->mvdev.mlx_features = 0; From 06be964a06f3a32a31ce60dcda52503a224be9a5 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 12 Mar 2021 05:21:38 -0800 Subject: [PATCH 0088/5344] perf/x86/intel: Fix unchecked MSR access error caused by VLBR_EVENT commit 2dc0572f2cef87425147658698dce2600b799bd3 upstream. On a Haswell machine, the perf_fuzzer managed to trigger this message: [117248.075892] unchecked MSR access error: WRMSR to 0x3f1 (tried to write 0x0400000000000000) at rIP: 0xffffffff8106e4f4 (native_write_msr+0x4/0x20) [117248.089957] Call Trace: [117248.092685] intel_pmu_pebs_enable_all+0x31/0x40 [117248.097737] intel_pmu_enable_all+0xa/0x10 [117248.102210] __perf_event_task_sched_in+0x2df/0x2f0 [117248.107511] finish_task_switch.isra.0+0x15f/0x280 [117248.112765] schedule_tail+0xc/0x40 [117248.116562] ret_from_fork+0x8/0x30 A fake event called VLBR_EVENT may use the bit 58 of the PEBS_ENABLE, if the precise_ip is set. The bit 58 is reserved by the HW. Accessing the bit causes the unchecked MSR access error. The fake event doesn't support PEBS. The case should be rejected. Fixes: 097e4311cda9 ("perf/x86: Add constraint to create guest LBR event without hw counter") Reported-by: Vince Weaver Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1615555298-140216-2-git-send-email-kan.liang@linux.intel.com Signed-off-by: Gang Li --- arch/x86/events/intel/core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index f0ac975f3d223..318a89bf00d0b 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3543,6 +3543,9 @@ static int intel_pmu_hw_config(struct perf_event *event) return ret; if (event->attr.precise_ip) { + if ((event->attr.config & INTEL_ARCH_EVENT_MASK) == INTEL_FIXED_VLBR_EVENT) + return -EINVAL; + if (!(event->attr.freq || (event->attr.wakeup_events && !event->attr.watermark))) { event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; if (!(event->attr.sample_type & From 8f394b48a9336da7392b9d9ae504f716b228fbd2 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 8 Apr 2021 11:22:09 -0400 Subject: [PATCH 0089/5344] PM: runtime: Add documentation for pm_runtime_resume_and_get() commit 2c412337cfe655bcc6adff8904fc653a1678f70f upstream. Commit dd8088d5a896 ("PM: runtime: Add pm_runtime_resume_and_get to deal with usage counter") added a new runtime-PM API function without updating the documentation in runtime_pm.rst. Add the missing documentation. Signed-off-by: Alan Stern Fixes: dd8088d5a896 ("PM: runtime: Add pm_runtime_resume_and_get to deal with usage counter") Signed-off-by: Rafael J. Wysocki Signed-off-by: Gang Li --- Documentation/power/runtime_pm.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/power/runtime_pm.rst b/Documentation/power/runtime_pm.rst index 2c2ec99b50886..a3a1d04526c24 100644 --- a/Documentation/power/runtime_pm.rst +++ b/Documentation/power/runtime_pm.rst @@ -339,6 +339,10 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h: checked additionally, and -EACCES means that 'power.disable_depth' is different from 0 + `int pm_runtime_resume_and_get(struct device *dev);` + - run pm_runtime_resume(dev) and if successful, increment the device's + usage counter; return the result of pm_runtime_resume + `int pm_request_idle(struct device *dev);` - submit a request to execute the subsystem-level idle callback for the device (the request is represented by a work item in pm_wq); returns 0 on From d0a8ea97709528e8a33c7aa6df7fd1d103ec90bd Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 8 Apr 2021 12:10:46 +0300 Subject: [PATCH 0090/5344] vdpa/mlx5: Fix wrong use of bit numbers commit 4b454a82418dd76d8c0590bb3f7a99a63ea57dc5 upstream. VIRTIO_F_VERSION_1 is a bit number. Use BIT_ULL() with mask conditionals. Also, in mlx5_vdpa_is_little_endian() use BIT_ULL for consistency with the rest of the code. Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices") Signed-off-by: Eli Cohen Link: https://lore.kernel.org/r/20210408091047.4269-5-elic@nvidia.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Signed-off-by: Gang Li --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 9aff4d2716f43..8f51a8f081a05 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -805,7 +805,7 @@ static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtque MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, mvq->fwqp.mqp.qpn); MLX5_SET(virtio_q, vq_ctx, queue_size, mvq->num_ent); MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0, - !!(ndev->mvdev.actual_features & VIRTIO_F_VERSION_1)); + !!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_F_VERSION_1))); MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr); MLX5_SET64(virtio_q, vq_ctx, used_addr, mvq->device_addr); MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr); @@ -1539,7 +1539,7 @@ static void clear_virtqueues(struct mlx5_vdpa_net *ndev) static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev) { return virtio_legacy_is_little_endian() || - (mvdev->actual_features & (1ULL << VIRTIO_F_VERSION_1)); + (mvdev->actual_features & BIT_ULL(VIRTIO_F_VERSION_1)); } static __virtio16 cpu_to_mlx5vdpa16(struct mlx5_vdpa_dev *mvdev, u16 val) From 944077a79529cc0271b9ec684f1087225e510b44 Mon Sep 17 00:00:00 2001 From: Si-Wei Liu Date: Thu, 8 Apr 2021 12:10:43 +0300 Subject: [PATCH 0091/5344] vdpa/mlx5: should exclude header length and fcs from mtu commit d084d996aaf53c0cc583dc75a4fc2a67fe485846 upstream. When feature VIRTIO_NET_F_MTU is negotiated on mlx5_vdpa, 22 extra bytes worth of MTU length is shown in guest. This is because the mlx5_query_port_max_mtu API returns the "hardware" MTU value, which does not just contain the Ethernet payload, but includes extra lengths starting from the Ethernet header up to the FCS altogether. Fix the MTU so packets won't get dropped silently. Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices") Signed-off-by: Si-Wei Liu Acked-by: Jason Wang Acked-by: Eli Cohen Link: https://lore.kernel.org/r/20210408091047.4269-2-elic@nvidia.com Signed-off-by: Michael S. Tsirkin Signed-off-by: Gang Li --- drivers/vdpa/mlx5/core/mlx5_vdpa.h | 4 ++++ drivers/vdpa/mlx5/net/mlx5_vnet.c | 15 ++++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h b/drivers/vdpa/mlx5/core/mlx5_vdpa.h index 08f742fd24099..b6cc53ba980cc 100644 --- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h +++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h @@ -4,9 +4,13 @@ #ifndef __MLX5_VDPA_H__ #define __MLX5_VDPA_H__ +#include +#include #include #include +#define MLX5V_ETH_HARD_MTU (ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN) + struct mlx5_vdpa_direct_mr { u64 start; u64 end; diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 8f51a8f081a05..2b986e0e843e6 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1898,6 +1898,19 @@ static const struct vdpa_config_ops mlx5_vdpa_ops = { .free = mlx5_vdpa_free, }; +static int query_mtu(struct mlx5_core_dev *mdev, u16 *mtu) +{ + u16 hw_mtu; + int err; + + err = mlx5_query_nic_vport_mtu(mdev, &hw_mtu); + if (err) + return err; + + *mtu = hw_mtu - MLX5V_ETH_HARD_MTU; + return 0; +} + static int alloc_resources(struct mlx5_vdpa_net *ndev) { struct mlx5_vdpa_net_resources *res = &ndev->res; @@ -1980,7 +1993,7 @@ void *mlx5_vdpa_add_dev(struct mlx5_core_dev *mdev) init_mvqs(ndev); mutex_init(&ndev->reslock); config = &ndev->config; - err = mlx5_query_nic_vport_mtu(mdev, &ndev->mtu); + err = query_mtu(mdev, &ndev->mtu); if (err) goto err_mtu; From b12dfe834492674fd2b5f1aaff0d665524a728f1 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 8 Apr 2021 12:10:47 +0300 Subject: [PATCH 0092/5344] vdpa/mlx5: Fix suspend/resume index restoration commit bc04d93ea30a0a8eb2a2648b848cef35d1f6f798 upstream. When we suspend the VM, the VDPA interface will be reset. When the VM is resumed again, clear_virtqueues() will clear the available and used indices resulting in hardware virqtqueue objects becoming out of sync. We can avoid this function alltogether since qemu will clear them if required, e.g. when the VM went through a reboot. Moreover, since the hw available and used indices should always be identical on query and should be restored to the same value same value for virtqueues that complete in order, we set the single value provided by set_vq_state(). In get_vq_state() we return the value of hardware used index. Fixes: b35ccebe3ef7 ("vdpa/mlx5: Restore the hardware used index after change map") Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices") Signed-off-by: Eli Cohen Link: https://lore.kernel.org/r/20210408091047.4269-6-elic@nvidia.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Signed-off-by: Gang Li --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 2b986e0e843e6..75ca629e1d349 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1154,6 +1154,7 @@ static void suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *m return; } mvq->avail_idx = attr.available_index; + mvq->used_idx = attr.used_index; } static void suspend_vqs(struct mlx5_vdpa_net *ndev) @@ -1411,6 +1412,7 @@ static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx, return -EINVAL; } + mvq->used_idx = state->avail_index; mvq->avail_idx = state->avail_index; return 0; } @@ -1428,7 +1430,11 @@ static int mlx5_vdpa_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa * that cares about emulating the index after vq is stopped. */ if (!mvq->initialized) { - state->avail_index = mvq->avail_idx; + /* Firmware returns a wrong value for the available index. + * Since both values should be identical, we take the value of + * used_idx which is reported correctly. + */ + state->avail_index = mvq->used_idx; return 0; } @@ -1437,7 +1443,7 @@ static int mlx5_vdpa_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa mlx5_vdpa_warn(mvdev, "failed to query virtqueue\n"); return err; } - state->avail_index = attr.available_index; + state->avail_index = attr.used_index; return 0; } @@ -1525,16 +1531,6 @@ static void teardown_virtqueues(struct mlx5_vdpa_net *ndev) } } -static void clear_virtqueues(struct mlx5_vdpa_net *ndev) -{ - int i; - - for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) { - ndev->vqs[i].avail_idx = 0; - ndev->vqs[i].used_idx = 0; - } -} - /* TODO: cross-endian support */ static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev) { @@ -1770,7 +1766,6 @@ static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status) if (!status) { mlx5_vdpa_info(mvdev, "performing device reset\n"); teardown_driver(ndev); - clear_virtqueues(ndev); mlx5_vdpa_destroy_mr(&ndev->mvdev); ndev->mvdev.status = 0; ndev->mvdev.mlx_features = 0; From 96d19454bb4c12ee961986b5f8dd8fb7f76060d6 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Sun, 11 Apr 2021 11:36:46 +0300 Subject: [PATCH 0093/5344] vdpa/mlx5: Set err = -ENOMEM in case dma_map_sg_attrs fails commit be286f84e33da1a7f83142b64dbd86f600e73363 upstream. Set err = -ENOMEM if dma_map_sg_attrs() fails so the function reutrns error. Fixes: 94abbccdf291 ("vdpa/mlx5: Add shared memory registration code") Signed-off-by: Eli Cohen Reported-by: kernel test robot Reported-by: Dan Carpenter Link: https://lore.kernel.org/r/20210411083646.910546-1-elic@nvidia.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Reviewed-by: Stefano Garzarella Signed-off-by: Gang Li --- drivers/vdpa/mlx5/core/mr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c index d300f799efcd1..aa656f57bf5b7 100644 --- a/drivers/vdpa/mlx5/core/mr.c +++ b/drivers/vdpa/mlx5/core/mr.c @@ -273,8 +273,10 @@ static int map_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr mr->log_size = log_entity_size; mr->nsg = nsg; mr->nent = dma_map_sg_attrs(dma, mr->sg_head.sgl, mr->nsg, DMA_BIDIRECTIONAL, 0); - if (!mr->nent) + if (!mr->nent) { + err = -ENOMEM; goto err_map; + } err = create_direct_mr(mvdev, mr); if (err) From 84a45ae7078adbcb048c1a1940df6a6cd226a2d0 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Thu, 22 Apr 2021 21:41:51 +0800 Subject: [PATCH 0094/5344] net: sock: remove the unnecessary check in proto_register commit ed744d819379ddeec5744b0bfc7eb6d0a8ac4e46 upstream. tw_prot_cleanup will check the twsk_prot. Fixes: 0f5907af3913 ("net: Fix potential memory leak in proto_register()") Cc: Miaohe Lin Signed-off-by: Tonghao Zhang Signed-off-by: David S. Miller Signed-off-by: Gang Li --- net/core/sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/sock.c b/net/core/sock.c index c77298b2d9a1b..b9c8f7c08ce16 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3447,7 +3447,7 @@ int proto_register(struct proto *prot, int alloc_slab) return ret; out_free_timewait_sock_slab: - if (alloc_slab && prot->twsk_prot) + if (alloc_slab) tw_prot_cleanup(prot->twsk_prot); out_free_request_sock_slab: if (alloc_slab) { From bcd2f15db4d958657161afa5d8a2888f9c28c611 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 13 Apr 2021 17:15:57 +0800 Subject: [PATCH 0095/5344] vhost-vdpa: fix vm_flags for virtqueue doorbell mapping commit 3a3e0fad16d40a2aa68ddf7eea4acdf48b22dd44 upstream. The virtqueue doorbell is usually implemented via registeres but we don't provide the necessary vma->flags like VM_PFNMAP. This may cause several issues e.g when userspace tries to map the doorbell via vhost IOTLB, kernel may panic due to the page is not backed by page structure. This patch fixes this by setting the necessary vm_flags. With this patch, try to map doorbell via IOTLB will fail with bad address. Cc: stable@vger.kernel.org Fixes: ddd89d0a059d ("vhost_vdpa: support doorbell mapping via mmap") Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210413091557.29008-1-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin Signed-off-by: Gang Li --- drivers/vhost/vdpa.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 4eae2fcc04dc1..157bcd08d11a0 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -1081,6 +1081,7 @@ static int vhost_vdpa_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_end - vma->vm_start != notify.size) return -ENOTSUPP; + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &vhost_vdpa_vm_ops; return 0; } From 86502cc6463791cffcab8797d4958a9109595614 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Tue, 4 May 2021 01:52:36 -0500 Subject: [PATCH 0096/5344] x86/events/amd/iommu: Fix invalid Perf result due to IOMMU PMC power-gating commit e10de314287c2c14b0e6f0e3e961975ce2f4a83d upstream. On certain AMD platforms, when the IOMMU performance counter source (csource) field is zero, power-gating for the counter is enabled, which prevents write access and returns zero for read access. This can cause invalid perf result especially when event multiplexing is needed (i.e. more number of events than available counters) since the current logic keeps track of the previously read counter value, and subsequently re-program the counter to continue counting the event. With power-gating enabled, we cannot gurantee successful re-programming of the counter. Workaround this issue by : 1. Modifying the ordering of setting/reading counters and enabing/ disabling csources to only access the counter when the csource is set to non-zero. 2. Since AMD IOMMU PMU does not support interrupt mode, the logic can be simplified to always start counting with value zero, and accumulate the counter value when stopping without the need to keep track and reprogram the counter with the previously read counter value. This has been tested on systems with and without power-gating. Fixes: 994d6608efe4 ("iommu/amd: Remove performance counter pre-initialization test") Suggested-by: Alexander Monakov Signed-off-by: Suravee Suthikulpanit Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210504065236.4415-1-suravee.suthikulpanit@amd.com Signed-off-by: Gang Li --- arch/x86/events/amd/iommu.c | 47 ++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c index 6a98a76516214..2da6139b0977f 100644 --- a/arch/x86/events/amd/iommu.c +++ b/arch/x86/events/amd/iommu.c @@ -18,8 +18,6 @@ #include "../perf_event.h" #include "iommu.h" -#define COUNTER_SHIFT 16 - /* iommu pmu conf masks */ #define GET_CSOURCE(x) ((x)->conf & 0xFFULL) #define GET_DEVID(x) (((x)->conf >> 8) & 0xFFFFULL) @@ -285,22 +283,31 @@ static void perf_iommu_start(struct perf_event *event, int flags) WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); hwc->state = 0; + /* + * To account for power-gating, which prevents write to + * the counter, we need to enable the counter + * before setting up counter register. + */ + perf_iommu_enable_event(event); + if (flags & PERF_EF_RELOAD) { - u64 prev_raw_count = local64_read(&hwc->prev_count); + u64 count = 0; struct amd_iommu *iommu = perf_event_2_iommu(event); + /* + * Since the IOMMU PMU only support counting mode, + * the counter always start with value zero. + */ amd_iommu_pc_set_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr, - IOMMU_PC_COUNTER_REG, &prev_raw_count); + IOMMU_PC_COUNTER_REG, &count); } - perf_iommu_enable_event(event); perf_event_update_userpage(event); - } static void perf_iommu_read(struct perf_event *event) { - u64 count, prev, delta; + u64 count; struct hw_perf_event *hwc = &event->hw; struct amd_iommu *iommu = perf_event_2_iommu(event); @@ -311,14 +318,11 @@ static void perf_iommu_read(struct perf_event *event) /* IOMMU pc counter register is only 48 bits */ count &= GENMASK_ULL(47, 0); - prev = local64_read(&hwc->prev_count); - if (local64_cmpxchg(&hwc->prev_count, prev, count) != prev) - return; - - /* Handle 48-bit counter overflow */ - delta = (count << COUNTER_SHIFT) - (prev << COUNTER_SHIFT); - delta >>= COUNTER_SHIFT; - local64_add(delta, &event->count); + /* + * Since the counter always start with value zero, + * simply just accumulate the count for the event. + */ + local64_add(count, &event->count); } static void perf_iommu_stop(struct perf_event *event, int flags) @@ -328,15 +332,16 @@ static void perf_iommu_stop(struct perf_event *event, int flags) if (hwc->state & PERF_HES_UPTODATE) return; + /* + * To account for power-gating, in which reading the counter would + * return zero, we need to read the register before disabling. + */ + perf_iommu_read(event); + hwc->state |= PERF_HES_UPTODATE; + perf_iommu_disable_event(event); WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); hwc->state |= PERF_HES_STOPPED; - - if (hwc->state & PERF_HES_UPTODATE) - return; - - perf_iommu_read(event); - hwc->state |= PERF_HES_UPTODATE; } static int perf_iommu_add(struct perf_event *event, int flags) From 0ccc778ee6164383918e8a9a0afa2fb76f351766 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 14 May 2021 12:02:50 +0100 Subject: [PATCH 0097/5344] io_uring: fix ltout double free on completion race commit 447c19f3b5074409c794b350b10306e1da1ef4ba upstream. Always remove linked timeout on io_link_timeout_fn() from the master request link list, otherwise we may get use-after-free when first io_link_timeout_fn() puts linked timeout in the fail path, and then will be found and put on master's free. Cc: stable@vger.kernel.org # 5.10+ Fixes: 90cd7e424969d ("io_uring: track link timeout's master explicitly") Reported-and-tested-by: syzbot+5a864149dd970b546223@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/69c46bf6ce37fec4fdcd98f0882e18eb07ce693a.1620990121.git.asml.silence@gmail.com Signed-off-by: Jens Axboe Signed-off-by: Gang Li --- fs/io_uring.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 682a6e5cbb7dd..0ef1067ba8077 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6520,10 +6520,11 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) * We don't expect the list to be empty, that will only happen if we * race with the completion of the linked work. */ - if (prev && refcount_inc_not_zero(&prev->refs)) + if (prev) { io_remove_next_linked(prev); - else - prev = NULL; + if (!refcount_inc_not_zero(&prev->refs)) + prev = NULL; + } spin_unlock_irqrestore(&ctx->completion_lock, flags); if (prev) { From 079cf57328adfa92b8f757cbb2c5c3201f94d040 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 22 Apr 2021 15:48:10 +0300 Subject: [PATCH 0098/5344] {net,vdpa}/mlx5: Configure interface MAC into mpfs L2 table commit 7c9f131f366ab414691907fa0407124ea2b2f3bc upstream. net/mlx5: Expose MPFS configuration API MPFS is the multi physical function switch that bridges traffic between the physical port and any physical functions associated with it. The driver is required to add or remove MAC entries to properly forward incoming traffic to the correct physical function. We export the API to control MPFS so that other drivers, such as mlx5_vdpa are able to add MAC addresses of their network interfaces. The MAC address of the vdpa interface must be configured into the MPFS L2 address. Failing to do so could cause, in some NIC configurations, failure to forward packets to the vdpa network device instance. Fix this by adding calls to update the MPFS table. CC: CC: CC: Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices") Signed-off-by: Eli Cohen Signed-off-by: Saeed Mahameed Signed-off-by: Gang Li --- .../net/ethernet/mellanox/mlx5/core/en_fs.c | 1 + .../net/ethernet/mellanox/mlx5/core/eswitch.c | 1 + .../ethernet/mellanox/mlx5/core/lib/mpfs.c | 3 +++ .../ethernet/mellanox/mlx5/core/lib/mpfs.h | 5 +---- drivers/vdpa/mlx5/net/mlx5_vnet.c | 19 ++++++++++++++++++- include/linux/mlx5/mpfs.h | 18 ++++++++++++++++++ 6 files changed, 42 insertions(+), 5 deletions(-) create mode 100644 include/linux/mlx5/mpfs.h diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c index c4ac7a9968d16..72711fc4bbbbc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "en.h" #include "lib/mpfs.h" diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 791fc14cc6d65..c12073d35e277 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "mlx5_core.h" #include "lib/eq.h" #include "eswitch.h" diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c index 3118e8d664072..91315b0d48fe1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include "mlx5_core.h" #include "lib/mpfs.h" @@ -177,6 +178,7 @@ int mlx5_mpfs_add_mac(struct mlx5_core_dev *dev, u8 *mac) mutex_unlock(&mpfs->lock); return err; } +EXPORT_SYMBOL(mlx5_mpfs_add_mac); int mlx5_mpfs_del_mac(struct mlx5_core_dev *dev, u8 *mac) { @@ -208,3 +210,4 @@ int mlx5_mpfs_del_mac(struct mlx5_core_dev *dev, u8 *mac) mutex_unlock(&mpfs->lock); return err; } +EXPORT_SYMBOL(mlx5_mpfs_del_mac); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.h index 4a7b2c3203a7e..4a293542a7aa1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.h @@ -84,12 +84,9 @@ struct l2addr_node { #ifdef CONFIG_MLX5_MPFS int mlx5_mpfs_init(struct mlx5_core_dev *dev); void mlx5_mpfs_cleanup(struct mlx5_core_dev *dev); -int mlx5_mpfs_add_mac(struct mlx5_core_dev *dev, u8 *mac); -int mlx5_mpfs_del_mac(struct mlx5_core_dev *dev, u8 *mac); #else /* #ifndef CONFIG_MLX5_MPFS */ static inline int mlx5_mpfs_init(struct mlx5_core_dev *dev) { return 0; } static inline void mlx5_mpfs_cleanup(struct mlx5_core_dev *dev) {} -static inline int mlx5_mpfs_add_mac(struct mlx5_core_dev *dev, u8 *mac) { return 0; } -static inline int mlx5_mpfs_del_mac(struct mlx5_core_dev *dev, u8 *mac) { return 0; } #endif + #endif diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 75ca629e1d349..9d4b95f8de9ec 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "mlx5_vnet.h" #include "mlx5_vdpa_ifc.h" #include "mlx5_vdpa.h" @@ -1844,11 +1845,16 @@ static int mlx5_vdpa_set_map(struct vdpa_device *vdev, struct vhost_iotlb *iotlb static void mlx5_vdpa_free(struct vdpa_device *vdev) { struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + struct mlx5_core_dev *pfmdev; struct mlx5_vdpa_net *ndev; ndev = to_mlx5_vdpa_ndev(mvdev); free_resources(ndev); + if (!is_zero_ether_addr(ndev->config.mac)) { + pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev)); + mlx5_mpfs_del_mac(pfmdev, ndev->config.mac); + } mlx5_vdpa_free_resources(&ndev->mvdev); mutex_destroy(&ndev->reslock); } @@ -1968,6 +1974,7 @@ static void init_mvqs(struct mlx5_vdpa_net *ndev) void *mlx5_vdpa_add_dev(struct mlx5_core_dev *mdev) { struct virtio_net_config *config; + struct mlx5_core_dev *pfmdev; struct mlx5_vdpa_dev *mvdev; struct mlx5_vdpa_net *ndev; u32 max_vqs; @@ -1996,10 +2003,17 @@ void *mlx5_vdpa_add_dev(struct mlx5_core_dev *mdev) if (err) goto err_mtu; + if (!is_zero_ether_addr(config->mac)) { + pfmdev = pci_get_drvdata(pci_physfn(mdev->pdev)); + err = mlx5_mpfs_add_mac(pfmdev, config->mac); + if (err) + goto err_mtu; + } + mvdev->vdev.dma_dev = mdev->device; err = mlx5_vdpa_alloc_resources(&ndev->mvdev); if (err) - goto err_mtu; + goto err_mpfs; err = alloc_resources(ndev); if (err) @@ -2015,6 +2029,9 @@ void *mlx5_vdpa_add_dev(struct mlx5_core_dev *mdev) free_resources(ndev); err_res: mlx5_vdpa_free_resources(&ndev->mvdev); +err_mpfs: + if (!is_zero_ether_addr(config->mac)) + mlx5_mpfs_del_mac(pfmdev, config->mac); err_mtu: mutex_destroy(&ndev->reslock); put_device(&mvdev->vdev.dev); diff --git a/include/linux/mlx5/mpfs.h b/include/linux/mlx5/mpfs.h new file mode 100644 index 0000000000000..bf700c8d55164 --- /dev/null +++ b/include/linux/mlx5/mpfs.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB + * Copyright (c) 2021 Mellanox Technologies Ltd. + */ + +#ifndef _MLX5_MPFS_ +#define _MLX5_MPFS_ + +struct mlx5_core_dev; + +#ifdef CONFIG_MLX5_MPFS +int mlx5_mpfs_add_mac(struct mlx5_core_dev *dev, u8 *mac); +int mlx5_mpfs_del_mac(struct mlx5_core_dev *dev, u8 *mac); +#else /* #ifndef CONFIG_MLX5_MPFS */ +static inline int mlx5_mpfs_add_mac(struct mlx5_core_dev *dev, u8 *mac) { return 0; } +static inline int mlx5_mpfs_del_mac(struct mlx5_core_dev *dev, u8 *mac) { return 0; } +#endif + +#endif From b5b0949ed604d3340bb5d5f23ee75e76737ba683 Mon Sep 17 00:00:00 2001 From: kernel test robot Date: Sun, 16 May 2021 07:03:58 +0800 Subject: [PATCH 0099/5344] scsi: target: tcmu: Fix boolreturn.cocci warnings commit 824731258b65f58764786f8d776c2007b084e12c upstream. drivers/target/target_core_user.c:1424:9-10: WARNING: return of 0/1 in function 'tcmu_handle_completions' with return type bool Return statements in functions returning bool should use true/false instead of 1/0. Generated by: scripts/coccinelle/misc/boolreturn.cocci Link: https://lore.kernel.org/r/20210515230358.GA97544@60d1edce16e0 Fixes: 9814b55cde05 ("scsi: target: tcmu: Return from tcmu_handle_completions() if cmd_id not found") CC: Bodo Stroesser Reported-by: kernel test robot Acked-by: Bodo Stroesser Signed-off-by: kernel test robot Signed-off-by: Martin K. Petersen Signed-off-by: Gang Li --- drivers/target/target_core_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 71144e33272a3..e8e5b5be932c7 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -1213,7 +1213,7 @@ static bool tcmu_handle_completions(struct tcmu_dev *udev) if (test_bit(TCMU_DEV_BIT_BROKEN, &udev->flags)) { pr_err("ring broken, not handling completions\n"); - return 0; + return false; } mb = udev->mb_addr; From 2b6d003bd6cea61b2be676c5c57625739ff3fc67 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Wed, 26 May 2021 22:22:19 +0800 Subject: [PATCH 0100/5344] ath10k: remove unused more_frags variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit e0a6120f6816ddd366530ce7ae5cb001a5e819dd upstream. Fix the following W=1 build warning: drivers/net/wireless/ath/ath10k/htt_rx.c:1790:7: warning: variable ‘more_frags’ set but not used [-Wunused-but-set-variable] 1790 | bool more_frags; | ^~~~~~~~~~ Fixes: a1166b2653db ("ath10k: add CCMP PN replay protection for fragmented frames for PCIe") Reported-by: Hulk Robot Signed-off-by: Yang Yingliang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210526142219.2542528-1-yangyingliang@huawei.com Signed-off-by: Gang Li --- drivers/net/wireless/ath/ath10k/htt_rx.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/wireless/ath/ath10k/htt_rx.c b/drivers/net/wireless/ath/ath10k/htt_rx.c index 760d24a28f392..420d5fa360927 100644 --- a/drivers/net/wireless/ath/ath10k/htt_rx.c +++ b/drivers/net/wireless/ath/ath10k/htt_rx.c @@ -1780,7 +1780,6 @@ static bool ath10k_htt_rx_h_frag_pn_check(struct ath10k *ar, struct ath10k_peer *peer; union htt_rx_pn_t *last_pn, new_pn = {0}; struct ieee80211_hdr *hdr; - bool more_frags; u8 tid, frag_number; u32 seq; @@ -1798,7 +1797,6 @@ static bool ath10k_htt_rx_h_frag_pn_check(struct ath10k *ar, last_pn = &peer->frag_tids_last_pn[tid]; new_pn.pn48 = ath10k_htt_rx_h_get_pn(ar, skb, offset, enctype); - more_frags = ieee80211_has_morefrags(hdr->frame_control); frag_number = le16_to_cpu(hdr->seq_ctrl) & IEEE80211_SCTL_FRAG; seq = (__le16_to_cpu(hdr->seq_ctrl) & IEEE80211_SCTL_SEQ) >> 4; From e8079436edcd4ccf56f73e0e1737f44eb3ad12e9 Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Fri, 11 Jun 2021 17:58:27 +1000 Subject: [PATCH 0101/5344] powerpc/tau: Remove superfluous parameter in alloc_workqueue() call commit ddf4a7bcd09439e82c4d6f959f4ff6c53f07466f upstream. This avoids an (optional) compiler warning: arch/powerpc/kernel/tau_6xx.c: In function 'TAU_init': arch/powerpc/kernel/tau_6xx.c:204:30: error: too many arguments for format [-Werror=format-extra-args] tau_workq = alloc_workqueue("tau", WQ_UNBOUND, 1, 0); Fixes: b1c6a0a10bfa ("powerpc/tau: Convert from timer to workqueue") Reported-by: Naresh Kamboju Signed-off-by: Finn Thain Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/a1456e8bbd33ef702e3ff6f14b1bf3919241c62b.1623398307.git.fthain@linux-m68k.org Signed-off-by: Gang Li --- arch/powerpc/kernel/tau_6xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/tau_6xx.c b/arch/powerpc/kernel/tau_6xx.c index 0b4694b8d2482..8ece45c2a1f64 100644 --- a/arch/powerpc/kernel/tau_6xx.c +++ b/arch/powerpc/kernel/tau_6xx.c @@ -203,7 +203,7 @@ static int __init TAU_init(void) tau_int_enable = IS_ENABLED(CONFIG_TAU_INT) && !strcmp(cur_cpu_spec->platform, "ppc750"); - tau_workq = alloc_workqueue("tau", WQ_UNBOUND, 1, 0); + tau_workq = alloc_workqueue("tau", WQ_UNBOUND, 1); if (!tau_workq) return -ENOMEM; From 3d77b68080b758758af02b3429d8e7eb056681e4 Mon Sep 17 00:00:00 2001 From: Esben Haabendal Date: Mon, 21 Jun 2021 10:20:08 +0200 Subject: [PATCH 0102/5344] net: ll_temac: Remove left-over debug message commit ce03b94ba682a67e8233c9ee3066071656ded58f upstream. Fixes: f63963411942 ("net: ll_temac: Avoid ndo_start_xmit returning NETDEV_TX_BUSY") Signed-off-by: Esben Haabendal Signed-off-by: David S. Miller Signed-off-by: Gang Li --- drivers/net/ethernet/xilinx/ll_temac_main.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c index 9a7af7dda70dc..bddd64e918ce0 100644 --- a/drivers/net/ethernet/xilinx/ll_temac_main.c +++ b/drivers/net/ethernet/xilinx/ll_temac_main.c @@ -939,10 +939,8 @@ temac_start_xmit(struct sk_buff *skb, struct net_device *ndev) wmb(); lp->dma_out(lp, TX_TAILDESC_PTR, tail_p); /* DMA start */ - if (temac_check_tx_bd_space(lp, MAX_SKB_FRAGS + 1)) { - netdev_info(ndev, "%s -> netif_stop_queue\n", __func__); + if (temac_check_tx_bd_space(lp, MAX_SKB_FRAGS + 1)) netif_stop_queue(ndev); - } return NETDEV_TX_OK; } From 4c5d0536717b336b292d3ce72660a669368959d4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 18 Jun 2021 16:45:14 +0000 Subject: [PATCH 0103/5344] scsi: bnx2fc: Remove meaningless bnx2fc_abts_cleanup() return value assignment commit 73b306a2bcb75e37b8065aa714ad2c6949c90ebf upstream. Commit 122c81c563b0 ("scsi: bnx2fc: Return failure if io_req is already in ABTS processing") made bnx2fc_eh_abort() return FAILED when io_req was alrady in ABTS processing, regardless of the return value of bnx2fc_abts_cleanup(). However, the change left the assignment of the return value of bnx2fc_abts_cleanup(). Remove this. This issue was discovered and resolved using Coverity Static Analysis Security Testing (SAST) by Synopsys, Inc. Link: https://lore.kernel.org/r/20210618164514.6299-1-sj38.park@gmail.com Fixes: 122c81c563b0 ("scsi: bnx2fc: Return failure if io_req is already in ABTS processing") Acked-by: Saurav Kashyap Signed-off-by: SeongJae Park Signed-off-by: Martin K. Petersen Signed-off-by: Gang Li --- drivers/scsi/bnx2fc/bnx2fc_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/bnx2fc/bnx2fc_io.c b/drivers/scsi/bnx2fc/bnx2fc_io.c index a0e7764148899..06133d7c2a559 100644 --- a/drivers/scsi/bnx2fc/bnx2fc_io.c +++ b/drivers/scsi/bnx2fc/bnx2fc_io.c @@ -1212,7 +1212,7 @@ int bnx2fc_eh_abort(struct scsi_cmnd *sc_cmd) * cleanup the command and return that I/O was successfully * aborted. */ - rc = bnx2fc_abts_cleanup(io_req); + bnx2fc_abts_cleanup(io_req); /* This only occurs when an task abort was requested while ABTS is in progress. Setting the IO_CLEANUP flag will skip the RRQ process in the case when the fw generated SCSI_CMD cmpl From 869e1acbe1dc5cf1728de221cee5c4be13ea1496 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Sun, 30 May 2021 12:03:49 +0300 Subject: [PATCH 0104/5344] vdpa/mlx5: Fix possible failure in umem size calculation commit 71ab6a7cfbae27f86a3901daab10bfe13b3a1e3a upstream. umem size is a 32 bit unsigned value so assigning it to an int could cause false failures. Set the calculated value inside the function and modify function name to reflect the fact it updates the size. This bug was found during code review but never had real impact to this date. Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices") Signed-off-by: Eli Cohen Link: https://lore.kernel.org/r/20210530090349.8360-1-elic@nvidia.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Signed-off-by: Gang Li --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 9d4b95f8de9ec..f5b7a0815b98d 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -596,8 +596,8 @@ static void cq_destroy(struct mlx5_vdpa_net *ndev, u16 idx) mlx5_db_free(ndev->mvdev.mdev, &vcq->db); } -static int umem_size(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num, - struct mlx5_vdpa_umem **umemp) +static void set_umem_size(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num, + struct mlx5_vdpa_umem **umemp) { struct mlx5_core_dev *mdev = ndev->mvdev.mdev; int p_a; @@ -620,7 +620,7 @@ static int umem_size(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq *umemp = &mvq->umem3; break; } - return p_a * mvq->num_ent + p_b; + (*umemp)->size = p_a * mvq->num_ent + p_b; } static void umem_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem) @@ -636,15 +636,10 @@ static int create_umem(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *m void *in; int err; __be64 *pas; - int size; struct mlx5_vdpa_umem *umem; - size = umem_size(ndev, mvq, num, &umem); - if (size < 0) - return size; - - umem->size = size; - err = umem_frag_buf_alloc(ndev, umem, size); + set_umem_size(ndev, mvq, num, &umem); + err = umem_frag_buf_alloc(ndev, umem, umem->size); if (err) return err; From f2b455fd734c54384a27df8d48a4c773f40402eb Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Sun, 30 May 2021 12:03:17 +0300 Subject: [PATCH 0105/5344] vdpa/mlx5: Fix umem sizes assignments on VQ create commit e3011776af16caf423f2c36d0047acd624c274fa upstream. Fix copy paste bug assigning umem1 size to umem2 and umem3. The issue was discovered when trying to use a 1:1 MR that covers the entire address space where firmware complained that provided sizes are not large enough. 1:1 MRs are required to support virtio_vdpa. Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices") Signed-off-by: Eli Cohen Link: https://lore.kernel.org/r/20210530090317.8284-1-elic@nvidia.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Signed-off-by: Gang Li --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index f5b7a0815b98d..b049db9997de8 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -809,9 +809,9 @@ static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtque MLX5_SET(virtio_q, vq_ctx, umem_1_id, mvq->umem1.id); MLX5_SET(virtio_q, vq_ctx, umem_1_size, mvq->umem1.size); MLX5_SET(virtio_q, vq_ctx, umem_2_id, mvq->umem2.id); - MLX5_SET(virtio_q, vq_ctx, umem_2_size, mvq->umem1.size); + MLX5_SET(virtio_q, vq_ctx, umem_2_size, mvq->umem2.size); MLX5_SET(virtio_q, vq_ctx, umem_3_id, mvq->umem3.id); - MLX5_SET(virtio_q, vq_ctx, umem_3_size, mvq->umem1.size); + MLX5_SET(virtio_q, vq_ctx, umem_3_size, mvq->umem3.size); MLX5_SET(virtio_q, vq_ctx, pd, ndev->mvdev.res.pdn); if (MLX5_CAP_DEV_VDPA_EMULATION(ndev->mvdev.mdev, eth_frame_offload_type)) MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0, 1); From 1fab74b61b3eac00cf4724b15143c47fd7508e6e Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Sun, 6 Jun 2021 08:31:28 +0300 Subject: [PATCH 0106/5344] vdpa/mlx5: Clear vq ready indication upon device reset commit e3aadf2e1614174dc81d52cbb9dabb77913b11c6 upstream. After device reset, the virtqueues are not ready so clear the ready field. Failing to do so can result in virtio_vdpa failing to load if the device was previously used by vhost_vdpa and the old values are ready. virtio_vdpa expects to find VQs in "not ready" state. Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices") Signed-off-by: Eli Cohen Link: https://lore.kernel.org/r/20210606053128.170399-1-elic@nvidia.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Signed-off-by: Gang Li --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index b049db9997de8..8256d05936cf7 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1752,6 +1752,14 @@ static void teardown_driver(struct mlx5_vdpa_net *ndev) mutex_unlock(&ndev->reslock); } +static void clear_vqs_ready(struct mlx5_vdpa_net *ndev) +{ + int i; + + for (i = 0; i < ndev->mvdev.max_vqs; i++) + ndev->vqs[i].ready = false; +} + static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status) { struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); @@ -1762,6 +1770,7 @@ static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status) if (!status) { mlx5_vdpa_info(mvdev, "performing device reset\n"); teardown_driver(ndev); + clear_vqs_ready(ndev); mlx5_vdpa_destroy_mr(&ndev->mvdev); ndev->mvdev.status = 0; ndev->mvdev.mlx_features = 0; From 35e77b108e2171da07a2909ffe9e4e0ebabfaf28 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 28 Jun 2021 14:05:20 -0700 Subject: [PATCH 0107/5344] spi: : add missing struct kernel-doc entry commit 8dd591ad0104593f315b6b2ab636a18c002f7d86 upstream. Fix kernel-doc warning in spi.h by adding the missing kernel-doc entry and also correct the original comment so that they both indicate the correct polarity of the flag. ../include/linux/spi/spi.h:673: warning: Function parameter or member 'devm_allocated' not described in 'spi_controller' Fixes: 794aaf01444d ("spi: Fix use-after-free with devm_spi_alloc_*") Signed-off-by: Randy Dunlap Cc: William A. Kennington III Cc: Mark Brown Cc: linux-spi@vger.kernel.org Cc: Lukas Wunner Reviewed-by: Lukas Wunner Link: https://lore.kernel.org/r/20210628210520.5712-1-rdunlap@infradead.org Signed-off-by: Mark Brown Signed-off-by: Gang Li --- include/linux/spi/spi.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 7067f85cef0bf..fed5b3ac7f44d 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -319,6 +319,7 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv) * @max_speed_hz: Highest supported transfer speed * @flags: other constraints relevant to this driver * @slave: indicates that this is an SPI slave controller + * @devm_allocated: whether the allocation of this struct is devres-managed * @max_transfer_size: function that returns the max transfer size for * a &spi_device; may be %NULL, so the default %SIZE_MAX will be used. * @max_message_size: function that returns the max message size for @@ -466,7 +467,7 @@ struct spi_controller { #define SPI_MASTER_GPIO_SS BIT(5) /* GPIO CS must select slave */ - /* flag indicating this is a non-devres managed controller */ + /* flag indicating if the allocation of this struct is devres-managed */ bool devm_allocated; /* flag indicating this is an SPI slave controller */ From 596e073c79df3b1798ae59822455e8de4ac0bfc8 Mon Sep 17 00:00:00 2001 From: Jia He Date: Tue, 20 Jul 2021 21:26:55 +0800 Subject: [PATCH 0108/5344] Revert "qed: fix possible unpaired spin_{un}lock_bh in _qed_mcp_cmd_and_union()" commit 91bed5565bba03b2a9f7334b58ae4be9df7c3840 upstream. This reverts commit 6206b7981a36476f4695d661ae139f7db36a802d. That patch added additional spin_{un}lock_bh(), which was harmless but pointless. The orginal code path has guaranteed the pair of spin_{un}lock_bh(). We'd better revert it before we find the exact root cause of the bug_on mentioned in that patch. Fixes: 6206b7981a36 ("qed: fix possible unpaired spin_{un}lock_bh in _qed_mcp_cmd_and_union()") Cc: David S. Miller Cc: Prabhakar Kushwaha Signed-off-by: Jia He Signed-off-by: David S. Miller Signed-off-by: Gang Li --- drivers/net/ethernet/qlogic/qed/qed_mcp.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c index 5d85ae59bc51e..9401b49275f0a 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c @@ -498,18 +498,14 @@ _qed_mcp_cmd_and_union(struct qed_hwfn *p_hwfn, spin_lock_bh(&p_hwfn->mcp_info->cmd_lock); - if (!qed_mcp_has_pending_cmd(p_hwfn)) { - spin_unlock_bh(&p_hwfn->mcp_info->cmd_lock); + if (!qed_mcp_has_pending_cmd(p_hwfn)) break; - } rc = qed_mcp_update_pending_cmd(p_hwfn, p_ptt); - if (!rc) { - spin_unlock_bh(&p_hwfn->mcp_info->cmd_lock); + if (!rc) break; - } else if (rc != -EAGAIN) { + else if (rc != -EAGAIN) goto err; - } spin_unlock_bh(&p_hwfn->mcp_info->cmd_lock); @@ -526,8 +522,6 @@ _qed_mcp_cmd_and_union(struct qed_hwfn *p_hwfn, return -EAGAIN; } - spin_lock_bh(&p_hwfn->mcp_info->cmd_lock); - /* Send the mailbox command */ qed_mcp_reread_offsets(p_hwfn, p_ptt); seq_num = ++p_hwfn->mcp_info->drv_mb_seq; @@ -554,18 +548,14 @@ _qed_mcp_cmd_and_union(struct qed_hwfn *p_hwfn, spin_lock_bh(&p_hwfn->mcp_info->cmd_lock); - if (p_cmd_elem->b_is_completed) { - spin_unlock_bh(&p_hwfn->mcp_info->cmd_lock); + if (p_cmd_elem->b_is_completed) break; - } rc = qed_mcp_update_pending_cmd(p_hwfn, p_ptt); - if (!rc) { - spin_unlock_bh(&p_hwfn->mcp_info->cmd_lock); + if (!rc) break; - } else if (rc != -EAGAIN) { + else if (rc != -EAGAIN) goto err; - } spin_unlock_bh(&p_hwfn->mcp_info->cmd_lock); } while (++cnt < max_retries); @@ -586,7 +576,6 @@ _qed_mcp_cmd_and_union(struct qed_hwfn *p_hwfn, return -EAGAIN; } - spin_lock_bh(&p_hwfn->mcp_info->cmd_lock); qed_mcp_cmd_del_elem(p_hwfn, p_cmd_elem); spin_unlock_bh(&p_hwfn->mcp_info->cmd_lock); From fa7a85af45119125622dcd077a161fba91e2212b Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Mon, 21 Jun 2021 07:07:28 +0200 Subject: [PATCH 0109/5344] media: dvb-usb: Fix error handling in dvb_usb_i2c_init commit 131ae388b88e3daf4cb0721ed4b4cb8bfc201465 upstream. In dvb_usb_i2c_init, if i2c_add_adapter fails, it only prints an error message, and then continues to set DVB_USB_STATE_I2C. This affects the logic of dvb_usb_i2c_exit, which leads to that, the deletion of i2c_adap even if the i2c_add_adapter fails. Fix this by returning at the failure of i2c_add_adapter and then move dvb_usb_i2c_exit out of the error handling code of dvb_usb_i2c_init. Fixes: 13a79f14ab28 ("media: dvb-usb: Fix memory leak at error in dvb_usb_device_init()") Signed-off-by: Dongliang Mu Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Gang Li --- drivers/media/usb/dvb-usb/dvb-usb-i2c.c | 9 +++++++-- drivers/media/usb/dvb-usb/dvb-usb-init.c | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/media/usb/dvb-usb/dvb-usb-i2c.c b/drivers/media/usb/dvb-usb/dvb-usb-i2c.c index 2e07106f46803..bc4b2abdde1a4 100644 --- a/drivers/media/usb/dvb-usb/dvb-usb-i2c.c +++ b/drivers/media/usb/dvb-usb/dvb-usb-i2c.c @@ -17,7 +17,8 @@ int dvb_usb_i2c_init(struct dvb_usb_device *d) if (d->props.i2c_algo == NULL) { err("no i2c algorithm specified"); - return -EINVAL; + ret = -EINVAL; + goto err; } strscpy(d->i2c_adap.name, d->desc->name, sizeof(d->i2c_adap.name)); @@ -27,11 +28,15 @@ int dvb_usb_i2c_init(struct dvb_usb_device *d) i2c_set_adapdata(&d->i2c_adap, d); - if ((ret = i2c_add_adapter(&d->i2c_adap)) < 0) + ret = i2c_add_adapter(&d->i2c_adap); + if (ret < 0) { err("could not add i2c adapter"); + goto err; + } d->state |= DVB_USB_STATE_I2C; +err: return ret; } diff --git a/drivers/media/usb/dvb-usb/dvb-usb-init.c b/drivers/media/usb/dvb-usb/dvb-usb-init.c index f57c4627624f5..e7720ff11d3d9 100644 --- a/drivers/media/usb/dvb-usb/dvb-usb-init.c +++ b/drivers/media/usb/dvb-usb/dvb-usb-init.c @@ -194,8 +194,8 @@ static int dvb_usb_init(struct dvb_usb_device *d, short *adapter_nums) err_adapter_init: dvb_usb_adapter_exit(d); -err_i2c_init: dvb_usb_i2c_exit(d); +err_i2c_init: if (d->priv && d->props.priv_destroy) d->props.priv_destroy(d); err_priv_init: From 2ae54453905c53339a0679f444c31063de0d5dc3 Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Thu, 29 Jul 2021 20:01:03 +0200 Subject: [PATCH 0110/5344] powerpc/stacktrace: Include linux/delay.h commit a6cae77f1bc89368a4e2822afcddc45c3062d499 upstream. commit 7c6986ade69e ("powerpc/stacktrace: Fix spurious "stale" traces in raise_backtrace_ipi()") introduces udelay() call without including the linux/delay.h header. This may happen to work on master but the header that declares the functionshould be included nonetheless. Fixes: 7c6986ade69e ("powerpc/stacktrace: Fix spurious "stale" traces in raise_backtrace_ipi()") Signed-off-by: Michal Suchanek Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210729180103.15578-1-msuchanek@suse.de Signed-off-by: Gang Li --- arch/powerpc/kernel/stacktrace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c index b13c6213b0d9b..890f95151fb44 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -8,6 +8,7 @@ * Copyright 2018 Nick Piggin, Michael Ellerman, IBM Corp. */ +#include #include #include #include From f09289a1c2a28129f3e0362de2f40890017ce9a1 Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Wed, 7 Jul 2021 11:34:09 +0200 Subject: [PATCH 0111/5344] media: em28xx-input: fix refcount bug in em28xx_usb_disconnect commit 6fa54bc713c262e1cfbc5613377ef52280d7311f upstream. If em28xx_ir_init fails, it would decrease the refcount of dev. However, in the em28xx_ir_fini, when ir is NULL, it goes to ref_put and decrease the refcount of dev. This will lead to a refcount bug. Fix this bug by removing the kref_put in the error handling code of em28xx_ir_init. refcount_t: underflow; use-after-free. WARNING: CPU: 0 PID: 7 at lib/refcount.c:28 refcount_warn_saturate+0x18e/0x1a0 lib/refcount.c:28 Modules linked in: CPU: 0 PID: 7 Comm: kworker/0:1 Not tainted 5.13.0 #3 Workqueue: usb_hub_wq hub_event RIP: 0010:refcount_warn_saturate+0x18e/0x1a0 lib/refcount.c:28 Call Trace: kref_put.constprop.0+0x60/0x85 include/linux/kref.h:69 em28xx_usb_disconnect.cold+0xd7/0xdc drivers/media/usb/em28xx/em28xx-cards.c:4150 usb_unbind_interface+0xbf/0x3a0 drivers/usb/core/driver.c:458 __device_release_driver drivers/base/dd.c:1201 [inline] device_release_driver_internal+0x22a/0x230 drivers/base/dd.c:1232 bus_remove_device+0x108/0x160 drivers/base/bus.c:529 device_del+0x1fe/0x510 drivers/base/core.c:3540 usb_disable_device+0xd1/0x1d0 drivers/usb/core/message.c:1419 usb_disconnect+0x109/0x330 drivers/usb/core/hub.c:2221 hub_port_connect drivers/usb/core/hub.c:5151 [inline] hub_port_connect_change drivers/usb/core/hub.c:5440 [inline] port_event drivers/usb/core/hub.c:5586 [inline] hub_event+0xf81/0x1d40 drivers/usb/core/hub.c:5668 process_one_work+0x2c9/0x610 kernel/workqueue.c:2276 process_scheduled_works kernel/workqueue.c:2338 [inline] worker_thread+0x333/0x5b0 kernel/workqueue.c:2424 kthread+0x188/0x1d0 kernel/kthread.c:319 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 Reported-by: Dongliang Mu Fixes: ac5688637144 ("media: em28xx: Fix possible memory leak of em28xx struct") Signed-off-by: Dongliang Mu Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Gang Li --- drivers/media/usb/em28xx/em28xx-input.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/media/usb/em28xx/em28xx-input.c b/drivers/media/usb/em28xx/em28xx-input.c index 59529cbf9cd0b..0b6d77c3bec86 100644 --- a/drivers/media/usb/em28xx/em28xx-input.c +++ b/drivers/media/usb/em28xx/em28xx-input.c @@ -842,7 +842,6 @@ static int em28xx_ir_init(struct em28xx *dev) kfree(ir); ref_put: em28xx_shutdown_buttons(dev); - kref_put(&dev->ref, em28xx_free_device); return err; } From 8e5eb6124a3804649a3fd2f39b57d537e3fa715f Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Sun, 11 Apr 2021 15:32:55 +0300 Subject: [PATCH 0112/5344] net/mlx5: Synchronize correct IRQ when destroying CQ commit 563476ae0c5e48a028cbfa38fa9d2fc0418eb88f upstream. The CQ destroy is performed based on the IRQ number that is stored in cq->irqn. That number wasn't set explicitly during CQ creation and as expected some of the API users of mlx5_core_create_cq() forgot to update it. This caused to wrong synchronization call of the wrong IRQ with a number 0 instead of the real one. As a fix, set the IRQ number directly in the mlx5_core_create_cq() and update all users accordingly. Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices") Fixes: ef1659ade359 ("IB/mlx5: Add DEVX support for CQ events") Signed-off-by: Shay Drory Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Gang Li --- drivers/infiniband/hw/mlx5/cq.c | 4 +--- drivers/infiniband/hw/mlx5/devx.c | 3 +-- drivers/net/ethernet/mellanox/mlx5/core/cq.c | 1 + .../net/ethernet/mellanox/mlx5/core/en_main.c | 13 ++---------- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 20 +++++++++++++++---- .../ethernet/mellanox/mlx5/core/fpga/conn.c | 4 +--- .../net/ethernet/mellanox/mlx5/core/lib/eq.h | 2 ++ .../mellanox/mlx5/core/steering/dr_send.c | 4 +--- drivers/vdpa/mlx5/net/mlx5_vnet.c | 3 +-- include/linux/mlx5/driver.h | 3 +-- 10 files changed, 27 insertions(+), 30 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 23ce0126b268e..c68a2faa82f31 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -921,7 +921,6 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, u32 *cqb = NULL; void *cqc; int cqe_size; - unsigned int irqn; int eqn; int err; @@ -960,7 +959,7 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, INIT_WORK(&cq->notify_work, notify_soft_wc_handler); } - err = mlx5_vector2eqn(dev->mdev, vector, &eqn, &irqn); + err = mlx5_vector2eqn(dev->mdev, vector, &eqn); if (err) goto err_cqb; @@ -983,7 +982,6 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, goto err_cqb; mlx5_ib_dbg(dev, "cqn 0x%x\n", cq->mcq.cqn); - cq->mcq.irqn = irqn; if (udata) cq->mcq.tasklet_ctx.comp = mlx5_ib_cq_comp; else diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 664e0f374ac00..df2fc856995ed 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -957,7 +957,6 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_QUERY_EQN)( struct mlx5_ib_dev *dev; int user_vector; int dev_eqn; - unsigned int irqn; int err; if (uverbs_copy_from(&user_vector, attrs, @@ -969,7 +968,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_QUERY_EQN)( return PTR_ERR(c); dev = to_mdev(c->ibucontext.device); - err = mlx5_vector2eqn(dev->mdev, user_vector, &dev_eqn, &irqn); + err = mlx5_vector2eqn(dev->mdev, user_vector, &dev_eqn); if (err < 0) return err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c index 818edc63e428f..c8f26480d1456 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c @@ -136,6 +136,7 @@ int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, cq->cqn); cq->uar = dev->priv.uar; + cq->irqn = eq->core.irqn; return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 24c49a84947f3..622c2d26543d3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -1582,15 +1582,9 @@ static int mlx5e_alloc_cq_common(struct mlx5_core_dev *mdev, struct mlx5e_cq *cq) { struct mlx5_core_cq *mcq = &cq->mcq; - int eqn_not_used; - unsigned int irqn; int err; u32 i; - err = mlx5_vector2eqn(mdev, param->eq_ix, &eqn_not_used, &irqn); - if (err) - return err; - err = mlx5_cqwq_create(mdev, ¶m->wq, param->cqc, &cq->wq, &cq->wq_ctrl); if (err) @@ -1604,7 +1598,6 @@ static int mlx5e_alloc_cq_common(struct mlx5_core_dev *mdev, mcq->vector = param->eq_ix; mcq->comp = mlx5e_completion_event; mcq->event = mlx5e_cq_error_event; - mcq->irqn = irqn; for (i = 0; i < mlx5_cqwq_get_size(&cq->wq); i++) { struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(&cq->wq, i); @@ -1650,11 +1643,10 @@ static int mlx5e_create_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param) void *in; void *cqc; int inlen; - unsigned int irqn_not_used; int eqn; int err; - err = mlx5_vector2eqn(mdev, param->eq_ix, &eqn, &irqn_not_used); + err = mlx5_vector2eqn(mdev, param->eq_ix, &eqn); if (err) return err; @@ -2018,9 +2010,8 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, struct mlx5e_channel *c; unsigned int irq; int err; - int eqn; - err = mlx5_vector2eqn(priv->mdev, ix, &eqn, &irq); + err = mlx5_vector2irqn(priv->mdev, ix, &irq); if (err) return err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 30a2ee3c40a00..e2d7f821c956b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -859,8 +859,8 @@ static int create_comp_eqs(struct mlx5_core_dev *dev) return err; } -int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn, - unsigned int *irqn) +static int vector2eqnirqn(struct mlx5_core_dev *dev, int vector, int *eqn, + unsigned int *irqn) { struct mlx5_eq_table *table = dev->priv.eq_table; struct mlx5_eq_comp *eq, *n; @@ -869,8 +869,10 @@ int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn, list_for_each_entry_safe(eq, n, &table->comp_eqs_list, list) { if (i++ == vector) { - *eqn = eq->core.eqn; - *irqn = eq->core.irqn; + if (irqn) + *irqn = eq->core.irqn; + if (eqn) + *eqn = eq->core.eqn; err = 0; break; } @@ -878,8 +880,18 @@ int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn, return err; } + +int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn) +{ + return vector2eqnirqn(dev, vector, eqn, NULL); +} EXPORT_SYMBOL(mlx5_vector2eqn); +int mlx5_vector2irqn(struct mlx5_core_dev *dev, int vector, unsigned int *irqn) +{ + return vector2eqnirqn(dev, vector, NULL, irqn); +} + unsigned int mlx5_comp_vectors_count(struct mlx5_core_dev *dev) { return dev->priv.eq_table->num_comp_eqs; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c index 61021133029e6..6819d8c3979ba 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c @@ -434,7 +434,6 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size) struct mlx5_wq_param wqp; struct mlx5_cqe64 *cqe; int inlen, err, eqn; - unsigned int irqn; void *cqc, *in; __be64 *pas; u32 i; @@ -463,7 +462,7 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size) goto err_cqwq; } - err = mlx5_vector2eqn(mdev, smp_processor_id(), &eqn, &irqn); + err = mlx5_vector2eqn(mdev, smp_processor_id(), &eqn); if (err) { kvfree(in); goto err_cqwq; @@ -494,7 +493,6 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size) conn->cq.mcq.vector = 0; conn->cq.mcq.comp = mlx5_fpga_conn_cq_complete; conn->cq.mcq.event = mlx5_fpga_conn_cq_event; - conn->cq.mcq.irqn = irqn; conn->cq.mcq.uar = fdev->conn_res.uar; tasklet_init(&conn->cq.tasklet, mlx5_fpga_conn_cq_tasklet, (unsigned long)conn); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h index 9aaf0eab7c2e1..0f7d501e2dbd9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h @@ -99,4 +99,6 @@ void mlx5_core_eq_free_irqs(struct mlx5_core_dev *dev); struct cpu_rmap *mlx5_eq_table_get_rmap(struct mlx5_core_dev *dev); #endif +int mlx5_vector2irqn(struct mlx5_core_dev *dev, int vector, unsigned int *irqn); + #endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c index f012aac83b10e..fae3c89649631 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c @@ -705,7 +705,6 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, struct mlx5_cqe64 *cqe; struct mlx5dr_cq *cq; int inlen, err, eqn; - unsigned int irqn; void *cqc, *in; __be64 *pas; int vector; @@ -738,7 +737,7 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, goto err_cqwq; vector = raw_smp_processor_id() % mlx5_comp_vectors_count(mdev); - err = mlx5_vector2eqn(mdev, vector, &eqn, &irqn); + err = mlx5_vector2eqn(mdev, vector, &eqn); if (err) { kvfree(in); goto err_cqwq; @@ -775,7 +774,6 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, *cq->mcq.arm_db = cpu_to_be32(2 << 28); cq->mcq.vector = 0; - cq->mcq.irqn = irqn; cq->mcq.uar = uar; return cq; diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 8256d05936cf7..53ae6b3dff2c9 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -511,7 +511,6 @@ static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent) void __iomem *uar_page = ndev->mvdev.res.uar->map; u32 out[MLX5_ST_SZ_DW(create_cq_out)]; struct mlx5_vdpa_cq *vcq = &mvq->cq; - unsigned int irqn; __be64 *pas; int inlen; void *cqc; @@ -551,7 +550,7 @@ static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent) /* Use vector 0 by default. Consider adding code to choose least used * vector. */ - err = mlx5_vector2eqn(mdev, 0, &eqn, &irqn); + err = mlx5_vector2eqn(mdev, 0, &eqn); if (err) goto err_vec; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index dc93ed6c7096f..e2c296d11f978 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -995,8 +995,7 @@ void mlx5_unregister_debugfs(void); void mlx5_fill_page_array(struct mlx5_frag_buf *buf, __be64 *pas); void mlx5_fill_page_frag_array_perm(struct mlx5_frag_buf *buf, __be64 *pas, u8 perm); void mlx5_fill_page_frag_array(struct mlx5_frag_buf *frag_buf, __be64 *pas); -int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn, - unsigned int *irqn); +int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn); int mlx5_core_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn); int mlx5_core_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn); From ae0a2d9ffb31deb2f825ca4858c5f5f617d7d7f9 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Wed, 28 Jul 2021 21:07:55 +0800 Subject: [PATCH 0113/5344] vhost-vdpa: Fix integer overflow in vhost_vdpa_process_iotlb_update() commit 0e398290cff997610b66e73573faaee70c9a700e upstream. The "msg->iova + msg->size" addition can have an integer overflow if the iotlb message is from a malicious user space application. So let's fix it. Fixes: 1b48dc03e575 ("vhost: vdpa: report iova range") Reported-by: Dan Carpenter Signed-off-by: Xie Yongji Acked-by: Jason Wang Link: https://lore.kernel.org/r/20210728130756.97-1-xieyongji@bytedance.com Signed-off-by: Michael S. Tsirkin Signed-off-by: Gang Li --- drivers/vhost/vdpa.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 157bcd08d11a0..582dfe886541a 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -809,7 +809,8 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, struct vdpa_device *vdpa = v->vdpa; struct vhost_iotlb *iotlb = dev->iotlb; - if (msg->iova < v->range.first || + if (msg->iova < v->range.first || !msg->size || + msg->iova > U64_MAX - msg->size + 1 || msg->iova + msg->size - 1 > v->range.last) return -EINVAL; From 40fb758a5d1080d0818db092d85f827cc1c3aa06 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Thu, 15 Jul 2021 16:00:25 +0800 Subject: [PATCH 0114/5344] vDPA/ifcvf: Fix return value check for vdpa_alloc_device() commit 1057afa0121db8bd3ca4718c8e0ca12388ab7759 upstream. The vdpa_alloc_device() returns an error pointer upon failure, not NULL. To handle the failure correctly, this replaces NULL check with IS_ERR() check and propagate the error upwards. Fixes: 5a2414bc454e ("virtio: Intel IFC VF driver for VDPA") Reported-by: Dan Carpenter Signed-off-by: Xie Yongji Link: https://lore.kernel.org/r/20210715080026.242-3-xieyongji@bytedance.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Reviewed-by: Stefano Garzarella Signed-off-by: Gang Li --- drivers/vdpa/ifcvf/ifcvf_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index 167a20fbb1494..e19dee2a19149 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -438,9 +438,9 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id) adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa, dev, &ifc_vdpa_ops, NULL, false); - if (adapter == NULL) { + if (IS_ERR(adapter)) { IFCVF_ERR(pdev, "Failed to allocate vDPA structure"); - return -ENOMEM; + return PTR_ERR(adapter); } pci_set_master(pdev); From 012a53326e343e0e314d2842e1f60b7f9570222a Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Thu, 15 Jul 2021 16:00:23 +0800 Subject: [PATCH 0115/5344] vdpa_sim: Fix return value check for vdpa_alloc_device() commit 2b847f21145d84e2e1dde99d3e2c00a5468f02e4 upstream. The vdpa_alloc_device() returns an error pointer upon failure, not NULL. To handle the failure correctly, this replaces NULL check with IS_ERR() check and propagate the error upwards. Fixes: 2c53d0f64c06 ("vdpasim: vDPA device simulator") Reported-by: Dan Carpenter Signed-off-by: Xie Yongji Link: https://lore.kernel.org/r/20210715080026.242-1-xieyongji@bytedance.com Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefano Garzarella Signed-off-by: Gang Li --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 2aba1c076d8e6..afde4313b0f17 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -251,8 +251,10 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr) vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops, dev_attr->name, false); - if (!vdpasim) + if (IS_ERR(vdpasim)) { + ret = PTR_ERR(vdpasim); goto err_alloc; + } vdpasim->dev_attr = *dev_attr; INIT_WORK(&vdpasim->work, dev_attr->work_fn); From 7aefc703f09b090cfb36ce90ecf4920709d930a0 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 11 Aug 2021 08:37:13 +0300 Subject: [PATCH 0116/5344] vdpa/mlx5: Avoid destroying MR on empty iotlb commit 08dbd5660232bede7916d8568003012c1182cc9a upstream. The current code treats an empty iotlb provdied in set_map() as a special case and destroy the memory region object. This must not be done since the virtqueue objects reference this MR. Doing so will cause the driver unload to emit errors and log timeouts caused by the firmware complaining on busy resources. This patch treats an empty iotlb as any other change of mapping. In this case, mlx5_vdpa_create_mr() will fail and the entire set_map() call to fail. This issue has not been encountered before but was seen to occur in a non-official version of qemu. Since qemu is a userspace program, the driver must protect against such case. Fixes: 94abbccdf291 ("vdpa/mlx5: Add shared memory registration code") Signed-off-by: Eli Cohen Link: https://lore.kernel.org/r/20210811053713.66658-1-elic@nvidia.com Signed-off-by: Michael S. Tsirkin Signed-off-by: Gang Li --- drivers/vdpa/mlx5/core/mr.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c index aa656f57bf5b7..32c9925de4736 100644 --- a/drivers/vdpa/mlx5/core/mr.c +++ b/drivers/vdpa/mlx5/core/mr.c @@ -454,11 +454,6 @@ void mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev) mutex_unlock(&mr->mkey_mtx); } -static bool map_empty(struct vhost_iotlb *iotlb) -{ - return !vhost_iotlb_itree_first(iotlb, 0, U64_MAX); -} - int mlx5_vdpa_handle_set_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb, bool *change_map) { @@ -466,10 +461,6 @@ int mlx5_vdpa_handle_set_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *io int err = 0; *change_map = false; - if (map_empty(iotlb)) { - mlx5_vdpa_destroy_mr(mvdev); - return 0; - } mutex_lock(&mr->mkey_mtx); if (mr->initialized) { mlx5_vdpa_info(mvdev, "memory map update\n"); From e72d0ece53ebcb8072fa3f30224879a31715af3c Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 11 Aug 2021 08:37:59 +0300 Subject: [PATCH 0117/5344] vdpa/mlx5: Fix queue type selection logic commit 879753c816dbbdb2a9a395aa4448d29feee92d1a upstream. get_queue_type() comments that splict virtqueue is preferred, however, the actual logic preferred packed virtqueues. Since firmware has not supported packed virtqueues we ended up using split virtqueues as was desired. Since we do not advertise support for packed virtqueues, we add a check to verify split virtqueues are indeed supported. Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices") Signed-off-by: Eli Cohen Link: https://lore.kernel.org/r/20210811053759.66752-1-elic@nvidia.com Signed-off-by: Michael S. Tsirkin Signed-off-by: Gang Li --- drivers/vdpa/mlx5/core/mlx5_vdpa_ifc.h | 10 ++++++---- drivers/vdpa/mlx5/net/mlx5_vnet.c | 14 ++++++++++---- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa_ifc.h b/drivers/vdpa/mlx5/core/mlx5_vdpa_ifc.h index f6f57a29b38ef..3c3bae2a3b6e2 100644 --- a/drivers/vdpa/mlx5/core/mlx5_vdpa_ifc.h +++ b/drivers/vdpa/mlx5/core/mlx5_vdpa_ifc.h @@ -13,13 +13,15 @@ enum { }; enum { - MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT = 0x1, // do I check this caps? - MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED = 0x2, + MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT = 0, + MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED = 1, }; enum { - MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT = 0, - MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED = 1, + MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT = + BIT(MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT), + MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED = + BIT(MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED), }; struct mlx5_ifc_virtio_q_bits { diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 53ae6b3dff2c9..42e07847c1daa 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -737,12 +737,12 @@ static int get_queue_type(struct mlx5_vdpa_net *ndev) type_mask = MLX5_CAP_DEV_VDPA_EMULATION(ndev->mvdev.mdev, virtio_queue_type); /* prefer split queue */ - if (type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED) - return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED; + if (type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT) + return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT; - WARN_ON(!(type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)); + WARN_ON(!(type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED)); - return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT; + return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED; } static bool vq_is_tx(u16 idx) @@ -1983,6 +1983,12 @@ void *mlx5_vdpa_add_dev(struct mlx5_core_dev *mdev) u32 max_vqs; int err; + if (!(MLX5_CAP_DEV_VDPA_EMULATION(mdev, virtio_queue_type) & + MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)) { + dev_warn(mdev->device, "missing support for split virtqueues\n"); + return -EOPNOTSUPP; + } + /* we save one virtqueue for control virtqueue should we require it */ max_vqs = MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues); max_vqs = min_t(u32, max_vqs, MLX5_MAX_SUPPORTED_VQS); From acdcaffd9d760bf3ffdcffbabd44e7e973075d06 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 14 Aug 2021 09:04:40 -0600 Subject: [PATCH 0118/5344] io_uring: only assign io_uring_enter() SQPOLL error in actual error case commit 21f965221e7c42609521342403e8fb91b8b3e76e upstream. If an SQPOLL based ring is newly created and an application issues an io_uring_enter(2) system call on it, then we can return a spurious -EOWNERDEAD error. This happens because there's nothing to submit, and if the caller doesn't specify any other action, the initial error assignment of -EOWNERDEAD never gets overwritten. This causes us to return it directly, even if it isn't valid. Move the error assignment into the actual failure case instead. Cc: stable@vger.kernel.org Fixes: d9d05217cb69 ("io_uring: stop SQPOLL submit on creator's death") Reported-by: Sherlock Holo sherlockya@gmail.com Link: https://github.com/axboe/liburing/issues/413 Signed-off-by: Jens Axboe Signed-off-by: Gang Li --- fs/io_uring.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 0ef1067ba8077..7ecd782d037bd 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -9393,9 +9393,10 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (ctx->flags & IORING_SETUP_SQPOLL) { io_cqring_overflow_flush(ctx, false, NULL, NULL); - ret = -EOWNERDEAD; - if (unlikely(ctx->sqo_dead)) + if (unlikely(ctx->sqo_dead)) { + ret = -EOWNERDEAD; goto out; + } if (flags & IORING_ENTER_SQ_WAKEUP) wake_up(&ctx->sq_data->wait); if (flags & IORING_ENTER_SQ_WAIT) { From b89bcdc90c14a5c37236f9874e7beb11633d1bac Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:27:37 +0200 Subject: [PATCH 0119/5344] locking/local_lock: Add missing owner initialization commit d8bbd97ad0b99a9394f2cd8410b884c48e218cf0 upstream. If CONFIG_DEBUG_LOCK_ALLOC=y is enabled then local_lock_t has an 'owner' member which is checked for consistency, but nothing initialized it to zero explicitly. The static initializer does so implicit, and the run time allocated per CPU storage is usually zero initialized as well, but relying on that is not really good practice. Fixes: 91710728d172 ("locking: Introduce local_lock()") Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211301.969975279@linutronix.de Signed-off-by: Gang Li --- include/linux/local_lock_internal.h | 39 +++++++++++++++++------------ 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h index 4a8795b21d774..3f02b818625ef 100644 --- a/include/linux/local_lock_internal.h +++ b/include/linux/local_lock_internal.h @@ -14,26 +14,14 @@ typedef struct { } local_lock_t; #ifdef CONFIG_DEBUG_LOCK_ALLOC -# define LL_DEP_MAP_INIT(lockname) \ +# define LOCAL_LOCK_DEBUG_INIT(lockname) \ .dep_map = { \ .name = #lockname, \ .wait_type_inner = LD_WAIT_CONFIG, \ - } -#else -# define LL_DEP_MAP_INIT(lockname) -#endif - -#define INIT_LOCAL_LOCK(lockname) { LL_DEP_MAP_INIT(lockname) } + .lock_type = LD_LOCK_PERCPU, \ + }, \ + .owner = NULL, -#define __local_lock_init(lock) \ -do { \ - static struct lock_class_key __key; \ - \ - debug_check_no_locks_freed((void *)lock, sizeof(*lock));\ - lockdep_init_map_wait(&(lock)->dep_map, #lock, &__key, 0, LD_WAIT_CONFIG);\ -} while (0) - -#ifdef CONFIG_DEBUG_LOCK_ALLOC static inline void local_lock_acquire(local_lock_t *l) { lock_map_acquire(&l->dep_map); @@ -48,11 +36,30 @@ static inline void local_lock_release(local_lock_t *l) lock_map_release(&l->dep_map); } +static inline void local_lock_debug_init(local_lock_t *l) +{ + l->owner = NULL; +} #else /* CONFIG_DEBUG_LOCK_ALLOC */ +# define LOCAL_LOCK_DEBUG_INIT(lockname) static inline void local_lock_acquire(local_lock_t *l) { } static inline void local_lock_release(local_lock_t *l) { } +static inline void local_lock_debug_init(local_lock_t *l) { } #endif /* !CONFIG_DEBUG_LOCK_ALLOC */ +#define INIT_LOCAL_LOCK(lockname) { LOCAL_LOCK_DEBUG_INIT(lockname) } + +#define __local_lock_init(lock) \ +do { \ + static struct lock_class_key __key; \ + \ + debug_check_no_locks_freed((void *)lock, sizeof(*lock));\ + lockdep_init_map_type(&(lock)->dep_map, #lock, &__key, \ + 0, LD_WAIT_CONFIG, LD_WAIT_INV, \ + LD_LOCK_PERCPU); \ + local_lock_debug_init(lock); \ +} while (0) + #define __local_lock(lock) \ do { \ preempt_disable(); \ From 51467df54484996c08cbf717fa4987db48b03f3a Mon Sep 17 00:00:00 2001 From: Jim Broadus Date: Sun, 8 Aug 2021 23:55:05 -0700 Subject: [PATCH 0120/5344] HID: i2c-hid: Fix Elan touchpad regression commit 786537063bbfb3a7ebc6fc21b2baf37fb91df401 upstream. A quirk was recently added for Elan devices that has same device match as an entry earlier in the list. The i2c_hid_lookup_quirk function will always return the last match in the list, so the new entry shadows the old entry. The quirk in the previous entry, I2C_HID_QUIRK_BOGUS_IRQ, silenced a flood of messages which have reappeared in the 5.13 kernel. This change moves the two quirk flags into the same entry. Fixes: ca66a6770bd9 (HID: i2c-hid: Skip ELAN power-on command after reset) Signed-off-by: Jim Broadus Signed-off-by: Jiri Kosina Signed-off-by: Gang Li --- drivers/hid/i2c-hid/i2c-hid-core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/hid/i2c-hid/i2c-hid-core.c b/drivers/hid/i2c-hid/i2c-hid-core.c index 6f7a3702b5fba..ac076ac73de5d 100644 --- a/drivers/hid/i2c-hid/i2c-hid-core.c +++ b/drivers/hid/i2c-hid/i2c-hid-core.c @@ -178,8 +178,6 @@ static const struct i2c_hid_quirks { I2C_HID_QUIRK_NO_IRQ_AFTER_RESET }, { I2C_VENDOR_ID_RAYDIUM, I2C_PRODUCT_ID_RAYDIUM_3118, I2C_HID_QUIRK_NO_IRQ_AFTER_RESET }, - { USB_VENDOR_ID_ELAN, HID_ANY_ID, - I2C_HID_QUIRK_BOGUS_IRQ }, { USB_VENDOR_ID_ALPS_JP, HID_ANY_ID, I2C_HID_QUIRK_RESET_ON_RESUME }, { I2C_VENDOR_ID_SYNAPTICS, I2C_PRODUCT_ID_SYNAPTICS_SYNA2393, @@ -190,7 +188,8 @@ static const struct i2c_hid_quirks { * Sending the wakeup after reset actually break ELAN touchscreen controller */ { USB_VENDOR_ID_ELAN, HID_ANY_ID, - I2C_HID_QUIRK_NO_WAKEUP_AFTER_RESET }, + I2C_HID_QUIRK_NO_WAKEUP_AFTER_RESET | + I2C_HID_QUIRK_BOGUS_IRQ }, { 0, 0 } }; From 3e9dd076586327436a62cd1d9474be2e9fec8ed1 Mon Sep 17 00:00:00 2001 From: Desmond Cheong Zhi Xi Date: Sat, 21 Aug 2021 01:50:40 +0800 Subject: [PATCH 0121/5344] btrfs: reset replace target device to allocation state on close commit 0d977e0eba234e01a60bdde27314dc21374201b3 upstream. This crash was observed with a failed assertion on device close: BTRFS: Transaction aborted (error -28) WARNING: CPU: 1 PID: 3902 at fs/btrfs/extent-tree.c:2150 btrfs_run_delayed_refs+0x1d2/0x1e0 [btrfs] Modules linked in: btrfs blake2b_generic libcrc32c crc32c_intel xor zstd_decompress zstd_compress xxhash lzo_compress lzo_decompress raid6_pq loop CPU: 1 PID: 3902 Comm: kworker/u8:4 Not tainted 5.14.0-rc5-default+ #1532 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014 Workqueue: events_unbound btrfs_async_reclaim_metadata_space [btrfs] RIP: 0010:btrfs_run_delayed_refs+0x1d2/0x1e0 [btrfs] RSP: 0018:ffffb7a5452d7d80 EFLAGS: 00010282 RAX: 0000000000000000 RBX: 0000000000000003 RCX: 0000000000000000 RDX: 0000000000000001 RSI: ffffffffabee13c4 RDI: 00000000ffffffff RBP: ffff97834176a378 R08: 0000000000000001 R09: 0000000000000001 R10: 0000000000000000 R11: 0000000000000001 R12: ffff97835195d388 R13: 0000000005b08000 R14: ffff978385484000 R15: 000000000000016c FS: 0000000000000000(0000) GS:ffff9783bd800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000056190d003fe8 CR3: 000000002a81e005 CR4: 0000000000170ea0 Call Trace: flush_space+0x197/0x2f0 [btrfs] btrfs_async_reclaim_metadata_space+0x139/0x300 [btrfs] process_one_work+0x262/0x5e0 worker_thread+0x4c/0x320 ? process_one_work+0x5e0/0x5e0 kthread+0x144/0x170 ? set_kthread_struct+0x40/0x40 ret_from_fork+0x1f/0x30 irq event stamp: 19334989 hardirqs last enabled at (19334997): [] console_unlock+0x2b7/0x400 hardirqs last disabled at (19335006): [] console_unlock+0x33d/0x400 softirqs last enabled at (19334900): [] __do_softirq+0x30d/0x574 softirqs last disabled at (19334893): [] irq_exit_rcu+0x12c/0x140 ---[ end trace 45939e308e0dd3c7 ]--- BTRFS: error (device vdd) in btrfs_run_delayed_refs:2150: errno=-28 No space left BTRFS info (device vdd): forced readonly BTRFS warning (device vdd): failed setting block group ro: -30 BTRFS info (device vdd): suspending dev_replace for unmount assertion failed: !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state), in fs/btrfs/volumes.c:1150 ------------[ cut here ]------------ kernel BUG at fs/btrfs/ctree.h:3431! invalid opcode: 0000 [#1] PREEMPT SMP CPU: 1 PID: 3982 Comm: umount Tainted: G W 5.14.0-rc5-default+ #1532 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014 RIP: 0010:assertfail.constprop.0+0x18/0x1a [btrfs] RSP: 0018:ffffb7a5454c7db8 EFLAGS: 00010246 RAX: 0000000000000068 RBX: ffff978364b91c00 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffffffabee13c4 RDI: 00000000ffffffff RBP: ffff9783523a4c00 R08: 0000000000000001 R09: 0000000000000001 R10: 0000000000000000 R11: 0000000000000001 R12: ffff9783523a4d18 R13: 0000000000000000 R14: 0000000000000004 R15: 0000000000000003 FS: 00007f61c8f42800(0000) GS:ffff9783bd800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000056190cffa810 CR3: 0000000030b96002 CR4: 0000000000170ea0 Call Trace: btrfs_close_one_device.cold+0x11/0x55 [btrfs] close_fs_devices+0x44/0xb0 [btrfs] btrfs_close_devices+0x48/0x160 [btrfs] generic_shutdown_super+0x69/0x100 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 [btrfs] deactivate_locked_super+0x2c/0xa0 cleanup_mnt+0x144/0x1b0 task_work_run+0x59/0xa0 exit_to_user_mode_loop+0xe7/0xf0 exit_to_user_mode_prepare+0xaf/0xf0 syscall_exit_to_user_mode+0x19/0x50 do_syscall_64+0x4a/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae This happens when close_ctree is called while a dev_replace hasn't completed. In close_ctree, we suspend the dev_replace, but keep the replace target around so that we can resume the dev_replace procedure when we mount the root again. This is the call trace: close_ctree(): btrfs_dev_replace_suspend_for_unmount(); btrfs_close_devices(): btrfs_close_fs_devices(): btrfs_close_one_device(): ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); However, since the replace target sticks around, there is a device with BTRFS_DEV_STATE_REPLACE_TGT set on close, and we fail the assertion in btrfs_close_one_device. To fix this, if we come across the replace target device when closing, we should properly reset it back to allocation state. This fix also ensures that if a non-target device has a corrupted state and has the BTRFS_DEV_STATE_REPLACE_TGT bit set, the assertion will still catch the error. Reported-by: David Sterba Fixes: b2a616676839 ("btrfs: fix rw device counting in __btrfs_free_extra_devids") CC: stable@vger.kernel.org # 4.19+ Reviewed-by: Anand Jain Signed-off-by: Desmond Cheong Zhi Xi Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Gang Li --- fs/btrfs/volumes.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3e3529c600cb7..94bc3e280b009 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1311,6 +1311,9 @@ static void btrfs_close_one_device(struct btrfs_device *device) fs_devices->rw_devices--; } + if (device->devid == BTRFS_DEV_REPLACE_DEVID) + clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) fs_devices->missing_devices--; From 8ebbed37acbe7d64efc198834501bd7f22b5d56c Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 24 Aug 2021 14:19:26 +0200 Subject: [PATCH 0122/5344] Revert "USB: serial: ch341: fix character loss at high transfer rates" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit df7b16d1c00ecb3da3a30c999cdb39f273c99a2f upstream. This reverts commit 3c18e9baee0ef97510dcda78c82285f52626764b. These devices do not appear to send a zero-length packet when the transfer size is a multiple of the bulk-endpoint max-packet size. This means that incoming data may not be processed by the driver until a short packet is received or the receive buffer is full. Revert back to using endpoint-sized receive buffers to avoid stalled reads. Reported-by: Paul Größel Link: https://bugzilla.kernel.org/show_bug.cgi?id=214131 Fixes: 3c18e9baee0e ("USB: serial: ch341: fix character loss at high transfer rates") Cc: stable@vger.kernel.org Cc: Willy Tarreau Link: https://lore.kernel.org/r/20210824121926.19311-1-johan@kernel.org Signed-off-by: Johan Hovold Signed-off-by: Gang Li --- drivers/usb/serial/ch341.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/usb/serial/ch341.c b/drivers/usb/serial/ch341.c index e9c39a41faae9..a82ba9cc0c724 100644 --- a/drivers/usb/serial/ch341.c +++ b/drivers/usb/serial/ch341.c @@ -678,7 +678,6 @@ static struct usb_serial_driver ch341_device = { .owner = THIS_MODULE, .name = "ch341-uart", }, - .bulk_in_size = 512, .id_table = id_table, .num_ports = 1, .open = ch341_open, From e680f18b58ccd3c3b18f32cf51568b1306912944 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= Date: Thu, 26 Aug 2021 19:03:42 +0200 Subject: [PATCH 0123/5344] PCI/MSI: Skip masking MSI-X on Xen PV MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 1a519dc7a73c977547d8b5108d98c6e769c89f4b upstream. When running as Xen PV guest, masking MSI-X is a responsibility of the hypervisor. The guest has no write access to the relevant BAR at all - when it tries to, it results in a crash like this: BUG: unable to handle page fault for address: ffffc9004069100c #PF: supervisor write access in kernel mode #PF: error_code(0x0003) - permissions violation RIP: e030:__pci_enable_msix_range.part.0+0x26b/0x5f0 e1000e_set_interrupt_capability+0xbf/0xd0 [e1000e] e1000_probe+0x41f/0xdb0 [e1000e] local_pci_probe+0x42/0x80 (...) The recently introduced function msix_mask_all() does not check the global variable pci_msi_ignore_mask which is set by XEN PV to bypass the masking of MSI[-X] interrupts. Add the check to make this function XEN PV compatible. Fixes: 7d5ec3d36123 ("PCI/MSI: Mask all unused MSI-X entries") Signed-off-by: Marek Marczykowski-Górecki Signed-off-by: Thomas Gleixner Acked-by: Bjorn Helgaas Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210826170342.135172-1-marmarek@invisiblethingslab.com Signed-off-by: Gang Li --- drivers/pci/msi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 2163ae3380d5e..971080232afc7 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -782,6 +782,9 @@ static void msix_mask_all(void __iomem *base, int tsize) u32 ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT; int i; + if (pci_msi_ignore_mask) + return; + for (i = 0; i < tsize; i++, base += PCI_MSIX_ENTRY_SIZE) writel(ctrl, base + PCI_MSIX_ENTRY_VECTOR_CTRL); } From 5c0b85458c3540e969cee440a212c96a956366df Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 21 Aug 2021 23:07:51 +0800 Subject: [PATCH 0124/5344] io_uring: retry in case of short read on block device commit 7db304375e11741e5940f9bc549155035bfb4dc1 upstream. In case of buffered reading from block device, when short read happens, we should retry to read more, otherwise the IO will be completed partially, for example, the following fio expects to read 2MB, but it can only read 1M or less bytes: fio --name=onessd --filename=/dev/nvme0n1 --filesize=2M \ --rw=randread --bs=2M --direct=0 --overwrite=0 --numjobs=1 \ --iodepth=1 --time_based=0 --runtime=2 --ioengine=io_uring \ --registerfiles --fixedbufs --gtod_reduce=1 --group_reporting Fix the issue by allowing short read retry for block device, which sets FMODE_BUF_RASYNC really. Fixes: 9a173346bd9e ("io_uring: fix short read retries for non-reg files") Cc: Pavel Begunkov Signed-off-by: Ming Lei Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/20210821150751.1290434-1-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Gang Li --- fs/io_uring.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7ecd782d037bd..ecbc378afeafa 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3503,6 +3503,12 @@ static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) return -EINVAL; } +static bool need_read_all(struct io_kiocb *req) +{ + return req->flags & REQ_F_ISREG || + S_ISBLK(file_inode(req->file)->i_mode); +} + static int io_read(struct io_kiocb *req, bool force_nonblock, struct io_comp_state *cs) { @@ -3563,7 +3569,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, /* read it all, or we did blocking attempt. no retry. */ if (!iov_iter_count(iter) || !force_nonblock || - (req->file->f_flags & O_NONBLOCK) || !(req->flags & REQ_F_ISREG)) + (req->file->f_flags & O_NONBLOCK) || !need_read_all(req)) goto done; io_size -= ret; From 7384683056a9065d012ac0f7724f8a8b9229cb5c Mon Sep 17 00:00:00 2001 From: Lukas Hannen Date: Wed, 25 Aug 2021 10:12:43 +0000 Subject: [PATCH 0125/5344] time: Handle negative seconds correctly in timespec64_to_ns() commit 39ff83f2f6cc5cc1458dfcea9697f96338210beb upstream. timespec64_ns() prevents multiplication overflows by comparing the seconds value of the timespec to KTIME_SEC_MAX. If the value is greater or equal it returns KTIME_MAX. But that check casts the signed seconds value to unsigned which makes the comparision true for all negative values and therefore return wrongly KTIME_MAX. Negative second values are perfectly valid and required in some places, e.g. ptp_clock_adjtime(). Remove the cast and add a check for the negative boundary which is required to prevent undefined behaviour due to multiplication underflow. Fixes: cb47755725da ("time: Prevent undefined behaviour in timespec64_to_ns()")' Signed-off-by: Lukas Hannen Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/AM6PR01MB541637BD6F336B8FFB72AF80EEC69@AM6PR01MB5416.eurprd01.prod.exchangelabs.com Signed-off-by: Gang Li --- include/linux/time64.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/time64.h b/include/linux/time64.h index 5eab3f2635186..f6059c505986b 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -33,7 +33,9 @@ struct itimerspec64 { #define TIME64_MIN (-TIME64_MAX - 1) #define KTIME_MAX ((s64)~((u64)1 << 63)) +#define KTIME_MIN (-KTIME_MAX - 1) #define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) +#define KTIME_SEC_MIN (KTIME_MIN / NSEC_PER_SEC) /* * Limits for settimeofday(): @@ -132,10 +134,13 @@ static inline bool timespec64_valid_settod(const struct timespec64 *ts) */ static inline s64 timespec64_to_ns(const struct timespec64 *ts) { - /* Prevent multiplication overflow */ - if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX) + /* Prevent multiplication overflow / underflow */ + if (ts->tv_sec >= KTIME_SEC_MAX) return KTIME_MAX; + if (ts->tv_sec <= KTIME_SEC_MIN) + return KTIME_MIN; + return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec; } From 06c0f260c65ab00aff0ccfdca51d1b65e8b368e3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 8 Sep 2021 17:00:29 -0700 Subject: [PATCH 0126/5344] net/af_unix: fix a data-race in unix_dgram_poll commit 04f08eb44b5011493d77b602fdec29ff0f5c6cd5 upstream. syzbot reported another data-race in af_unix [1] Lets change __skb_insert() to use WRITE_ONCE() when changing skb head qlen. Also, change unix_dgram_poll() to use lockless version of unix_recvq_full() It is verry possible we can switch all/most unix_recvq_full() to the lockless version, this will be done in a future kernel version. [1] HEAD commit: 8596e589b787732c8346f0482919e83cc9362db1 BUG: KCSAN: data-race in skb_queue_tail / unix_dgram_poll write to 0xffff88814eeb24e0 of 4 bytes by task 25815 on cpu 0: __skb_insert include/linux/skbuff.h:1938 [inline] __skb_queue_before include/linux/skbuff.h:2043 [inline] __skb_queue_tail include/linux/skbuff.h:2076 [inline] skb_queue_tail+0x80/0xa0 net/core/skbuff.c:3264 unix_dgram_sendmsg+0xff2/0x1600 net/unix/af_unix.c:1850 sock_sendmsg_nosec net/socket.c:703 [inline] sock_sendmsg net/socket.c:723 [inline] ____sys_sendmsg+0x360/0x4d0 net/socket.c:2392 ___sys_sendmsg net/socket.c:2446 [inline] __sys_sendmmsg+0x315/0x4b0 net/socket.c:2532 __do_sys_sendmmsg net/socket.c:2561 [inline] __se_sys_sendmmsg net/socket.c:2558 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2558 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3d/0x90 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff88814eeb24e0 of 4 bytes by task 25834 on cpu 1: skb_queue_len include/linux/skbuff.h:1869 [inline] unix_recvq_full net/unix/af_unix.c:194 [inline] unix_dgram_poll+0x2bc/0x3e0 net/unix/af_unix.c:2777 sock_poll+0x23e/0x260 net/socket.c:1288 vfs_poll include/linux/poll.h:90 [inline] ep_item_poll fs/eventpoll.c:846 [inline] ep_send_events fs/eventpoll.c:1683 [inline] ep_poll fs/eventpoll.c:1798 [inline] do_epoll_wait+0x6ad/0xf00 fs/eventpoll.c:2226 __do_sys_epoll_wait fs/eventpoll.c:2238 [inline] __se_sys_epoll_wait fs/eventpoll.c:2233 [inline] __x64_sys_epoll_wait+0xf6/0x120 fs/eventpoll.c:2233 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3d/0x90 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x0000001b -> 0x00000001 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 25834 Comm: syz-executor.1 Tainted: G W 5.14.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 86b18aaa2b5b ("skbuff: fix a data race in skb_queue_len()") Cc: Qian Cai Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Gang Li --- include/linux/skbuff.h | 2 +- net/unix/af_unix.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9b1f3cff27032..7fe336949a261 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1893,7 +1893,7 @@ static inline void __skb_insert(struct sk_buff *newsk, WRITE_ONCE(newsk->prev, prev); WRITE_ONCE(next->prev, newsk); WRITE_ONCE(prev->next, newsk); - list->qlen++; + WRITE_ONCE(list->qlen, list->qlen + 1); } static inline void __skb_queue_splice(const struct sk_buff_head *list, diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 52ee3a9bb7093..3098710c9c344 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2734,7 +2734,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, other = unix_peer(sk); if (other && unix_peer(other) != sk && - unix_recvq_full(other) && + unix_recvq_full_lockless(other) && unix_dgram_peer_wake_me(sk, other)) writable = 0; From 13574f640c80744c63bcb6982b3dd37797c6d451 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 9 Sep 2021 10:56:12 +0200 Subject: [PATCH 0127/5344] thermal/drivers/int340x: Do not set a wrong tcc offset on resume commit 8b4bd256674720709a9d858a219fcac6f2f253b5 upstream. After upgrading to Linux 5.13.3 I noticed my laptop would shutdown due to overheat (when it should not). It turned out this was due to commit fe6a6de6692e ("thermal/drivers/int340x/processor_thermal: Fix tcc setting"). What happens is this drivers uses a global variable to keep track of the tcc offset (tcc_offset_save) and uses it on resume. The issue is this variable is initialized to 0, but is only set in tcc_offset_degree_celsius_store, i.e. when the tcc offset is explicitly set by userspace. If that does not happen, the resume path will set the offset to 0 (in my case the h/w default being 3, the offset would become too low after a suspend/resume cycle). The issue did not arise before commit fe6a6de6692e, as the function setting the offset would return if the offset was 0. This is no longer the case (rightfully). Fix this by not applying the offset if it wasn't saved before, reverting back to the old logic. A better approach will come later, but this will be easier to apply to stable kernels. The logic to restore the offset after a resume was there long before commit fe6a6de6692e, but as a value of 0 was considered invalid I'm referencing the commit that made the issue possible in the Fixes tag instead. Fixes: fe6a6de6692e ("thermal/drivers/int340x/processor_thermal: Fix tcc setting") Cc: stable@vger.kernel.org Cc: Srinivas Pandruvada Signed-off-by: Antoine Tenart Signed-off-by: Daniel Lezcano Reviewed-by: Srinivas Pandruvada Tested-by: Srinivas Pandruvada Link: https://lore.kernel.org/r/20210909085613.5577-2-atenart@kernel.org Signed-off-by: Gang Li --- .../thermal/intel/int340x_thermal/processor_thermal_device.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_device.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_device.c index 576523d0326c8..a902e2a053ee3 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_device.c +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_device.c @@ -179,7 +179,7 @@ static int tcc_offset_update(unsigned int tcc) return 0; } -static unsigned int tcc_offset_save; +static int tcc_offset_save = -1; static ssize_t tcc_offset_degree_celsius_store(struct device *dev, struct device_attribute *attr, const char *buf, @@ -703,7 +703,8 @@ static int proc_thermal_resume(struct device *dev) proc_dev = dev_get_drvdata(dev); proc_thermal_read_ppcc(proc_dev); - tcc_offset_update(tcc_offset_save); + if (tcc_offset_save >= 0) + tcc_offset_update(tcc_offset_save); return 0; } From 54b9ce40ee3fe5678a9f62573ff5c662710dcb89 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sun, 12 Sep 2021 22:05:23 +0300 Subject: [PATCH 0128/5344] bnx2x: Fix enabling network interfaces without VFs commit 52ce14c134a003fee03d8fc57442c05a55b53715 upstream. This function is called to enable SR-IOV when available, not enabling interfaces without VFs was a regression. Fixes: 65161c35554f ("bnx2x: Fix missing error code in bnx2x_iov_init_one()") Signed-off-by: Adrian Bunk Reported-by: YunQiang Su Tested-by: YunQiang Su Cc: stable@vger.kernel.org Acked-by: Shai Malin Link: https://lore.kernel.org/r/20210912190523.27991-1-bunk@kernel.org Signed-off-by: Jakub Kicinski Signed-off-by: Gang Li --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c index cf39623b828b7..4630998d47fd4 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c @@ -1246,7 +1246,7 @@ int bnx2x_iov_init_one(struct bnx2x *bp, int int_mode_param, /* SR-IOV capability was enabled but there are no VFs*/ if (iov->total == 0) { - err = -EINVAL; + err = 0; goto failed; } From ef9c4ce87e068c0031eac860f42b4d36526a920b Mon Sep 17 00:00:00 2001 From: Martin Fuzzey Date: Mon, 30 Aug 2021 17:26:45 +0200 Subject: [PATCH 0129/5344] rsi: fix key enabled check causing unwanted encryption for vap_id > 0 commit 99ac6018821253ec67f466086afb63fc18ea48e2 upstream. My previous patch checked if encryption should be enabled by directly checking info->control.hw_key (like the downstream driver). However that missed that the control and driver_info members of struct ieee80211_tx_info are union fields. Due to this when rsi_core_xmit() updates fields in "tx_params" (driver_info) it can overwrite the control.hw_key, causing the result of the later test to be incorrect. With the current structure layout the first byte of control.hw_key is overlayed with the vap_id so, since we only test if control.hw_key is NULL / non NULL, a non zero vap_id will incorrectly enable encryption. In basic STA and AP modes the vap_id is always zero so it works but in P2P client mode a second VIF is created causing vap_id to be non zero and hence encryption to be enabled before keys have been set. Fix this by extracting the key presence flag to a new field in the driver private tx_params structure and populating it first. Fixes: 314538041b56 ("rsi: fix AP mode with WPA failure due to encrypted EAPOL") Signed-off-by: Martin Fuzzey CC: stable@vger.kernel.org Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1630337206-12410-3-git-send-email-martin.fuzzey@flowbird.group Signed-off-by: Gang Li --- drivers/net/wireless/rsi/rsi_91x_core.c | 2 ++ drivers/net/wireless/rsi/rsi_91x_hal.c | 2 +- drivers/net/wireless/rsi/rsi_main.h | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/rsi/rsi_91x_core.c b/drivers/net/wireless/rsi/rsi_91x_core.c index 3644d7d994638..c6c29034b2ead 100644 --- a/drivers/net/wireless/rsi/rsi_91x_core.c +++ b/drivers/net/wireless/rsi/rsi_91x_core.c @@ -400,6 +400,8 @@ void rsi_core_xmit(struct rsi_common *common, struct sk_buff *skb) info = IEEE80211_SKB_CB(skb); tx_params = (struct skb_info *)info->driver_data; + /* info->driver_data and info->control part of union so make copy */ + tx_params->have_key = !!info->control.hw_key; wh = (struct ieee80211_hdr *)&skb->data[0]; tx_params->sta_id = 0; diff --git a/drivers/net/wireless/rsi/rsi_91x_hal.c b/drivers/net/wireless/rsi/rsi_91x_hal.c index 03791f3fe480c..0523ccee9b12c 100644 --- a/drivers/net/wireless/rsi/rsi_91x_hal.c +++ b/drivers/net/wireless/rsi/rsi_91x_hal.c @@ -203,7 +203,7 @@ int rsi_prepare_data_desc(struct rsi_common *common, struct sk_buff *skb) wh->frame_control |= cpu_to_le16(RSI_SET_PS_ENABLE); if ((!(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT)) && - info->control.hw_key) { + tx_params->have_key) { if (rsi_is_cipher_wep(common)) ieee80211_size += 4; else diff --git a/drivers/net/wireless/rsi/rsi_main.h b/drivers/net/wireless/rsi/rsi_main.h index b3e25bc28682c..1820015dc74d9 100644 --- a/drivers/net/wireless/rsi/rsi_main.h +++ b/drivers/net/wireless/rsi/rsi_main.h @@ -139,6 +139,7 @@ struct skb_info { u8 internal_hdr_size; struct ieee80211_vif *vif; u8 vap_id; + bool have_key; }; enum edca_queue { From 6ff22d81d76bc0f3f4f7ad75367d261eb35f4957 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 24 Sep 2021 08:43:54 -0600 Subject: [PATCH 0130/5344] io_uring: don't punt files update to io-wq unconditionally commit cdb31c29d397a8076d81fd1458d091c647ef94ba upstream. There's no reason to punt it unconditionally, we just need to ensure that the submit lock grabbing is conditional. Fixes: 05f3fb3c5397 ("io_uring: avoid ring quiesce for fixed file set unregister and update") Signed-off-by: Jens Axboe Signed-off-by: Gang Li --- fs/io_uring.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index ecbc378afeafa..cd32da26cc4b1 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6043,15 +6043,12 @@ static int io_files_update(struct io_kiocb *req, bool force_nonblock, struct io_uring_files_update up; int ret; - if (force_nonblock) - return -EAGAIN; - up.offset = req->files_update.offset; up.fds = req->files_update.arg; - mutex_lock(&ctx->uring_lock); + io_ring_submit_lock(ctx, !force_nonblock); ret = __io_sqe_files_update(ctx, &up, req->files_update.nr_args); - mutex_unlock(&ctx->uring_lock); + io_ring_submit_unlock(ctx, !force_nonblock); if (ret < 0) req_set_fail_links(req); From 27ab09a8c51d174211b9d4e8c7a867052344cd89 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 27 Sep 2021 11:58:39 +0200 Subject: [PATCH 0131/5344] mac80211: fix use-after-free in CCMP/GCMP RX commit 94513069eb549737bcfc3d988d6ed4da948a2de8 upstream. When PN checking is done in mac80211, for fragmentation we need to copy the PN to the RX struct so we can later use it to do a comparison, since commit bf30ca922a0c ("mac80211: check defrag PN against current frame"). Unfortunately, in that commit I used the 'hdr' variable without it being necessarily valid, so use-after-free could occur if it was necessary to reallocate (parts of) the frame. Fix this by reloading the variable after the code that results in the reallocations, if any. This fixes https://bugzilla.kernel.org/show_bug.cgi?id=214401. Cc: stable@vger.kernel.org Fixes: bf30ca922a0c ("mac80211: check defrag PN against current frame") Link: https://lore.kernel.org/r/20210927115838.12b9ac6bb233.I1d066acd5408a662c3b6e828122cd314fcb28cdb@changeid Signed-off-by: Johannes Berg Signed-off-by: Gang Li --- net/mac80211/wpa.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index bca47fad5a162..4eed23e276104 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -520,6 +520,9 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx, return RX_DROP_UNUSABLE; } + /* reload hdr - skb might have been reallocated */ + hdr = (void *)rx->skb->data; + data_len = skb->len - hdrlen - IEEE80211_CCMP_HDR_LEN - mic_len; if (!rx->sta || data_len < 0) return RX_DROP_UNUSABLE; @@ -749,6 +752,9 @@ ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx) return RX_DROP_UNUSABLE; } + /* reload hdr - skb might have been reallocated */ + hdr = (void *)rx->skb->data; + data_len = skb->len - hdrlen - IEEE80211_GCMP_HDR_LEN - mic_len; if (!rx->sta || data_len < 0) return RX_DROP_UNUSABLE; From 5af990b01cc15256af361f199b6c64ded99d6f23 Mon Sep 17 00:00:00 2001 From: Patrick Ho Date: Sat, 21 Aug 2021 02:56:26 -0400 Subject: [PATCH 0132/5344] nfsd: fix error handling of register_pernet_subsys() in init_nfsd() commit 1d625050c7c2dd877e108e382b8aaf1ae3cfe1f4 upstream. init_nfsd() should not unregister pernet subsys if the register fails but should instead unwind from the last successful operation which is register_filesystem(). Unregistering a failed register_pernet_subsys() call can result in a kernel GPF as revealed by programmatically injecting an error in register_pernet_subsys(). Verified the fix handled failure gracefully with no lingering nfsd entry in /proc/filesystems. This change was introduced by the commit bd5ae9288d64 ("nfsd: register pernet ops last, unregister first"), the original error handling logic was correct. Fixes: bd5ae9288d64 ("nfsd: register pernet ops last, unregister first") Cc: stable@vger.kernel.org Signed-off-by: Patrick Ho Acked-by: J. Bruce Fields Signed-off-by: Chuck Lever Signed-off-by: Gang Li --- fs/nfsd/nfsctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 075ef83ae53c2..afd42b4870e34 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1550,7 +1550,7 @@ static int __init init_nfsd(void) goto out_free_all; return 0; out_free_all: - unregister_pernet_subsys(&nfsd_net_ops); + unregister_filesystem(&nfsd_fs_type); out_free_exports: remove_proc_entry("fs/nfs/exports", NULL); remove_proc_entry("fs/nfs", NULL); From 177a91df0d57a9a343bc2585aa1a36a7fcc132c1 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:40:27 +0900 Subject: [PATCH 0133/5344] ia64: kprobes: Fix to pass correct trampoline address to the handler commit a7fe2378454cf46cd5e2776d05e72bbe8f0a468c upstream. The following commit: Commit e792ff804f49 ("ia64: kprobes: Use generic kretprobe trampoline handler") Passed the wrong trampoline address to __kretprobe_trampoline_handler(): it passes the descriptor address instead of function entry address. Pass the right parameter. Also use correct symbol dereference function to get the function address from 'kretprobe_trampoline' - an IA64 special. Link: https://lkml.kernel.org/r/163163042696.489837.12551102356265354730.stgit@devnote2 Fixes: e792ff804f49 ("ia64: kprobes: Use generic kretprobe trampoline handler") Cc: Josh Poimboeuf Cc: Ingo Molnar Cc: X86 ML Cc: Daniel Xu Cc: Thomas Gleixner Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Abhishek Sagar Cc: Andrii Nakryiko Cc: Paul McKenney Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Gang Li --- arch/ia64/kernel/kprobes.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index b3dc39050c1ad..8a223d0e4918c 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -398,7 +398,8 @@ static void kretprobe_trampoline(void) int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) { - regs->cr_iip = __kretprobe_trampoline_handler(regs, kretprobe_trampoline, NULL); + regs->cr_iip = __kretprobe_trampoline_handler(regs, + dereference_function_descriptor(kretprobe_trampoline), NULL); /* * By returning a non-zero value, we are telling * kprobe_handler() that we don't want the post_handler @@ -414,7 +415,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, ri->fp = NULL; /* Replace the return addr with trampoline addr */ - regs->b0 = ((struct fnptr *)kretprobe_trampoline)->ip; + regs->b0 = (unsigned long)dereference_function_descriptor(kretprobe_trampoline); } /* Check the instruction in the slot is break */ @@ -918,14 +919,14 @@ static struct kprobe trampoline_p = { int __init arch_init_kprobes(void) { trampoline_p.addr = - (kprobe_opcode_t *)((struct fnptr *)kretprobe_trampoline)->ip; + dereference_function_descriptor(kretprobe_trampoline); return register_kprobe(&trampoline_p); } int __kprobes arch_trampoline_kprobe(struct kprobe *p) { if (p->addr == - (kprobe_opcode_t *)((struct fnptr *)kretprobe_trampoline)->ip) + dereference_function_descriptor(kretprobe_trampoline)) return 1; return 0; From e93cbeebd605c9948153b7c612fdea0fc069b8e1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Oct 2021 12:55:22 -0700 Subject: [PATCH 0134/5344] net/sched: sch_taprio: properly cancel timer from taprio_destroy() commit a56d447f196fa9973c568f54c0d76d5391c3b0c0 upstream. There is a comment in qdisc_create() about us not calling ops->reset() in some cases. err_out4: /* * Any broken qdiscs that would require a ops->reset() here? * The qdisc was never in action so it shouldn't be necessary. */ As taprio sets a timer before actually receiving a packet, we need to cancel it from ops->destroy, just in case ops->reset has not been called. syzbot reported: ODEBUG: free active (active state 0) object type: hrtimer hint: advance_sched+0x0/0x9a0 arch/x86/include/asm/atomic64_64.h:22 WARNING: CPU: 0 PID: 8441 at lib/debugobjects.c:505 debug_print_object+0x16e/0x250 lib/debugobjects.c:505 Modules linked in: CPU: 0 PID: 8441 Comm: syz-executor813 Not tainted 5.14.0-rc6-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:debug_print_object+0x16e/0x250 lib/debugobjects.c:505 Code: ff df 48 89 fa 48 c1 ea 03 80 3c 02 00 0f 85 af 00 00 00 48 8b 14 dd e0 d3 e3 89 4c 89 ee 48 c7 c7 e0 c7 e3 89 e8 5b 86 11 05 <0f> 0b 83 05 85 03 92 09 01 48 83 c4 18 5b 5d 41 5c 41 5d 41 5e c3 RSP: 0018:ffffc9000130f330 EFLAGS: 00010282 RAX: 0000000000000000 RBX: 0000000000000003 RCX: 0000000000000000 RDX: ffff88802baeb880 RSI: ffffffff815d87b5 RDI: fffff52000261e58 RBP: 0000000000000001 R08: 0000000000000000 R09: 0000000000000000 R10: ffffffff815d25ee R11: 0000000000000000 R12: ffffffff898dd020 R13: ffffffff89e3ce20 R14: ffffffff81653630 R15: dffffc0000000000 FS: 0000000000f0d300(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ffb64b3e000 CR3: 0000000036557000 CR4: 00000000001506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __debug_check_no_obj_freed lib/debugobjects.c:987 [inline] debug_check_no_obj_freed+0x301/0x420 lib/debugobjects.c:1018 slab_free_hook mm/slub.c:1603 [inline] slab_free_freelist_hook+0x171/0x240 mm/slub.c:1653 slab_free mm/slub.c:3213 [inline] kfree+0xe4/0x540 mm/slub.c:4267 qdisc_create+0xbcf/0x1320 net/sched/sch_api.c:1299 tc_modify_qdisc+0x4c8/0x1a60 net/sched/sch_api.c:1663 rtnetlink_rcv_msg+0x413/0xb80 net/core/rtnetlink.c:5571 netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2504 netlink_unicast_kernel net/netlink/af_netlink.c:1314 [inline] netlink_unicast+0x533/0x7d0 net/netlink/af_netlink.c:1340 netlink_sendmsg+0x86d/0xdb0 net/netlink/af_netlink.c:1929 sock_sendmsg_nosec net/socket.c:704 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:724 ____sys_sendmsg+0x6e8/0x810 net/socket.c:2403 ___sys_sendmsg+0xf3/0x170 net/socket.c:2457 __sys_sendmsg+0xe5/0x1b0 net/socket.c:2486 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 Fixes: 44d4775ca518 ("net/sched: sch_taprio: reset child qdiscs before freeing them") Signed-off-by: Eric Dumazet Cc: Davide Caratti Reported-by: syzbot Acked-by: Vinicius Costa Gomes Acked-by: Davide Caratti Signed-off-by: David S. Miller Signed-off-by: Gang Li --- net/sched/sch_taprio.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index a4de4853c79de..53f181206eef4 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1628,6 +1628,10 @@ static void taprio_destroy(struct Qdisc *sch) list_del(&q->taprio_list); spin_unlock(&taprio_list_lock); + /* Note that taprio_reset() might not be called if an error + * happens in qdisc_create(), after taprio_init() has been called. + */ + hrtimer_cancel(&q->advance_timer); taprio_disable_offload(dev, q, NULL); From a294afc1bc7b9ca5ab5c97aae70117fac065f838 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 29 Sep 2021 11:09:37 +0200 Subject: [PATCH 0135/5344] USB: cdc-acm: fix break reporting commit 58fc1daa4d2e9789b9ffc880907c961ea7c062cc upstream. A recent change that started reporting break events forgot to push the event to the line discipline, which meant that a detected break would not be reported until further characters had been receive (the port could even have been closed and reopened in between). Fixes: 08dff274edda ("cdc-acm: fix BREAK rx code path adding necessary calls") Cc: stable@vger.kernel.org Acked-by: Oliver Neukum Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20210929090937.7410-3-johan@kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Gang Li --- drivers/usb/class/cdc-acm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index c0604c60ebd01..a2b04e098782a 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -339,6 +339,9 @@ static void acm_process_notification(struct acm *acm, unsigned char *buf) acm->iocount.overrun++; spin_unlock_irqrestore(&acm->read_lock, flags); + if (newctrl & ACM_CTRL_BRK) + tty_flip_buffer_push(&acm->port); + if (difference) wake_up_all(&acm->wioctl); From 58dfb62aae086afd44384fd2a3e3e51008c56017 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 29 Sep 2021 11:09:36 +0200 Subject: [PATCH 0136/5344] USB: cdc-acm: fix racy tty buffer accesses commit 65a205e6113506e69a503b61d97efec43fc10fd7 upstream. A recent change that started reporting break events to the line discipline caused the tty-buffer insertions to no longer be serialised by inserting events also from the completion handler for the interrupt endpoint. Completion calls for distinct endpoints are not guaranteed to be serialised. For example, in case a host-controller driver uses bottom-half completion, the interrupt and bulk-in completion handlers can end up running in parallel on two CPUs (high-and low-prio tasklets, respectively) thereby breaking the tty layer's single producer assumption. Fix this by holding the read lock also when inserting characters from the bulk endpoint. Fixes: 08dff274edda ("cdc-acm: fix BREAK rx code path adding necessary calls") Cc: stable@vger.kernel.org Acked-by: Oliver Neukum Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20210929090937.7410-2-johan@kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Gang Li --- drivers/usb/class/cdc-acm.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index a2b04e098782a..10cfa7d089e9f 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -477,11 +477,16 @@ static int acm_submit_read_urbs(struct acm *acm, gfp_t mem_flags) static void acm_process_read_urb(struct acm *acm, struct urb *urb) { + unsigned long flags; + if (!urb->actual_length) return; + spin_lock_irqsave(&acm->read_lock, flags); tty_insert_flip_string(&acm->port, urb->transfer_buffer, urb->actual_length); + spin_unlock_irqrestore(&acm->read_lock, flags); + tty_flip_buffer_push(&acm->port); } From 096ea1aaef39b0dd97b41c8b21d0118efa819814 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Tue, 28 Sep 2021 09:36:52 +0200 Subject: [PATCH 0137/5344] mmc: meson-gx: do not use memcpy_to/fromio for dram-access-quirk commit 8a38a4d51c5055d0201542e5ea3c0cb287f6e223 upstream. The memory at the end of the controller only accepts 32bit read/write accesses, but the arm64 memcpy_to/fromio implementation only uses 64bit (which will be split into two 32bit access) and 8bit leading to incomplete copies to/from this memory when the buffer is not multiple of 8bytes. Add a local copy using writel/readl accesses to make sure we use the right memory access width. The switch to memcpy_to/fromio was done because of 285133040e6c ("arm64: Import latest memcpy()/memmove() implementation"), but using memcpy worked before since it mainly used 32bit memory acceses. Fixes: 103a5348c22c ("mmc: meson-gx: use memcpy_to/fromio for dram-access-quirk") Reported-by: Christian Hewitt Suggested-by: Martin Blumenstingl Signed-off-by: Neil Armstrong Tested-by: Martin Blumenstingl Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210928073652.434690-1-narmstrong@baylibre.com Signed-off-by: Ulf Hansson Signed-off-by: Gang Li --- drivers/mmc/host/meson-gx-mmc.c | 73 ++++++++++++++++++++++++++------- 1 file changed, 59 insertions(+), 14 deletions(-) diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c index a3e3b274f0ea3..cdd57ce55b2fa 100644 --- a/drivers/mmc/host/meson-gx-mmc.c +++ b/drivers/mmc/host/meson-gx-mmc.c @@ -738,7 +738,7 @@ static void meson_mmc_desc_chain_transfer(struct mmc_host *mmc, u32 cmd_cfg) writel(start, host->regs + SD_EMMC_START); } -/* local sg copy to buffer version with _to/fromio usage for dram_access_quirk */ +/* local sg copy for dram_access_quirk */ static void meson_mmc_copy_buffer(struct meson_host *host, struct mmc_data *data, size_t buflen, bool to_buffer) { @@ -756,21 +756,27 @@ static void meson_mmc_copy_buffer(struct meson_host *host, struct mmc_data *data sg_miter_start(&miter, sgl, nents, sg_flags); while ((offset < buflen) && sg_miter_next(&miter)) { - unsigned int len; + unsigned int buf_offset = 0; + unsigned int len, left; + u32 *buf = miter.addr; len = min(miter.length, buflen - offset); + left = len; - /* When dram_access_quirk, the bounce buffer is a iomem mapping */ - if (host->dram_access_quirk) { - if (to_buffer) - memcpy_toio(host->bounce_iomem_buf + offset, miter.addr, len); - else - memcpy_fromio(miter.addr, host->bounce_iomem_buf + offset, len); + if (to_buffer) { + do { + writel(*buf++, host->bounce_iomem_buf + offset + buf_offset); + + buf_offset += 4; + left -= 4; + } while (left); } else { - if (to_buffer) - memcpy(host->bounce_buf + offset, miter.addr, len); - else - memcpy(miter.addr, host->bounce_buf + offset, len); + do { + *buf++ = readl(host->bounce_iomem_buf + offset + buf_offset); + + buf_offset += 4; + left -= 4; + } while (left); } offset += len; @@ -822,7 +828,11 @@ static void meson_mmc_start_cmd(struct mmc_host *mmc, struct mmc_command *cmd) if (data->flags & MMC_DATA_WRITE) { cmd_cfg |= CMD_CFG_DATA_WR; WARN_ON(xfer_bytes > host->bounce_buf_size); - meson_mmc_copy_buffer(host, data, xfer_bytes, true); + if (host->dram_access_quirk) + meson_mmc_copy_buffer(host, data, xfer_bytes, true); + else + sg_copy_to_buffer(data->sg, data->sg_len, + host->bounce_buf, xfer_bytes); dma_wmb(); } @@ -841,12 +851,43 @@ static void meson_mmc_start_cmd(struct mmc_host *mmc, struct mmc_command *cmd) writel(cmd->arg, host->regs + SD_EMMC_CMD_ARG); } +static int meson_mmc_validate_dram_access(struct mmc_host *mmc, struct mmc_data *data) +{ + struct scatterlist *sg; + int i; + + /* Reject request if any element offset or size is not 32bit aligned */ + for_each_sg(data->sg, sg, data->sg_len, i) { + if (!IS_ALIGNED(sg->offset, sizeof(u32)) || + !IS_ALIGNED(sg->length, sizeof(u32))) { + dev_err(mmc_dev(mmc), "unaligned sg offset %u len %u\n", + data->sg->offset, data->sg->length); + return -EINVAL; + } + } + + return 0; +} + static void meson_mmc_request(struct mmc_host *mmc, struct mmc_request *mrq) { struct meson_host *host = mmc_priv(mmc); bool needs_pre_post_req = mrq->data && !(mrq->data->host_cookie & SD_EMMC_PRE_REQ_DONE); + /* + * The memory at the end of the controller used as bounce buffer for + * the dram_access_quirk only accepts 32bit read/write access, + * check the aligment and length of the data before starting the request. + */ + if (host->dram_access_quirk && mrq->data) { + mrq->cmd->error = meson_mmc_validate_dram_access(mmc, mrq->data); + if (mrq->cmd->error) { + mmc_request_done(mmc, mrq); + return; + } + } + if (needs_pre_post_req) { meson_mmc_get_transfer_mode(mmc, mrq); if (!meson_mmc_desc_chain_mode(mrq->data)) @@ -991,7 +1032,11 @@ static irqreturn_t meson_mmc_irq_thread(int irq, void *dev_id) if (meson_mmc_bounce_buf_read(data)) { xfer_bytes = data->blksz * data->blocks; WARN_ON(xfer_bytes > host->bounce_buf_size); - meson_mmc_copy_buffer(host, data, xfer_bytes, false); + if (host->dram_access_quirk) + meson_mmc_copy_buffer(host, data, xfer_bytes, false); + else + sg_copy_from_buffer(data->sg, data->sg_len, + host->bounce_buf, xfer_bytes); } next_cmd = meson_mmc_get_next_command(cmd); From f8feb2c247e8b87abf8e9abed5e95b6155ab6872 Mon Sep 17 00:00:00 2001 From: Cindy Lu Date: Wed, 29 Sep 2021 17:09:33 +0800 Subject: [PATCH 0138/5344] vhost-vdpa: Fix the wrong input in config_cb commit bcef9356fc2e1302daf373c83c826aa27954d128 upstream. Fix the wrong input in for config_cb. In function vhost_vdpa_config_cb, the input cb.private was used as struct vhost_vdpa, so the input was wrong here, fix this issue Fixes: 776f395004d8 ("vhost_vdpa: Support config interrupt in vdpa") Signed-off-by: Cindy Lu Link: https://lore.kernel.org/r/20210929090933.20465-1-lulu@redhat.com Signed-off-by: Michael S. Tsirkin Signed-off-by: Gang Li --- drivers/vhost/vdpa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 582dfe886541a..b44b2892241cc 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -317,7 +317,7 @@ static long vhost_vdpa_set_config_call(struct vhost_vdpa *v, u32 __user *argp) struct eventfd_ctx *ctx; cb.callback = vhost_vdpa_config_cb; - cb.private = v->vdpa; + cb.private = v; if (copy_from_user(&fd, argp, sizeof(fd))) return -EFAULT; From fcd8ff4d091ca2f694928bfd336e925f6763fd4c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 18 Oct 2021 15:15:39 -0700 Subject: [PATCH 0139/5344] ocfs2: fix data corruption after conversion from inline format commit 5314454ea3ff6fc746eaf71b9a7ceebed52888fa upstream. Commit 6dbf7bb55598 ("fs: Don't invalidate page buffers in block_write_full_page()") uncovered a latent bug in ocfs2 conversion from inline inode format to a normal inode format. The code in ocfs2_convert_inline_data_to_extents() attempts to zero out the whole cluster allocated for file data by grabbing, zeroing, and dirtying all pages covering this cluster. However these pages are beyond i_size, thus writeback code generally ignores these dirty pages and no blocks were ever actually zeroed on the disk. This oversight was fixed by commit 693c241a5f6a ("ocfs2: No need to zero pages past i_size.") for standard ocfs2 write path, inline conversion path was apparently forgotten; the commit log also has a reasoning why the zeroing actually is not needed. After commit 6dbf7bb55598, things became worse as writeback code stopped invalidating buffers on pages beyond i_size and thus these pages end up with clean PageDirty bit but with buffers attached to these pages being still dirty. So when a file is converted from inline format, then writeback triggers, and then the file is grown so that these pages become valid, the invalid dirtiness state is preserved, mark_buffer_dirty() does nothing on these pages (buffers are already dirty) but page is never written back because it is clean. So data written to these pages is lost once pages are reclaimed. Simple reproducer for the problem is: xfs_io -f -c "pwrite 0 2000" -c "pwrite 2000 2000" -c "fsync" \ -c "pwrite 4000 2000" ocfs2_file After unmounting and mounting the fs again, you can observe that end of 'ocfs2_file' has lost its contents. Fix the problem by not doing the pointless zeroing during conversion from inline format similarly as in the standard write path. [akpm@linux-foundation.org: fix whitespace, per Joseph] Link: https://lkml.kernel.org/r/20210930095405.21433-1-jack@suse.cz Fixes: 6dbf7bb55598 ("fs: Don't invalidate page buffers in block_write_full_page()") Signed-off-by: Jan Kara Reviewed-by: Joseph Qi Tested-by: Joseph Qi Acked-by: Gang He Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Cc: "Markov, Andrey" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Gang Li --- fs/ocfs2/alloc.c | 46 ++++++++++++---------------------------------- 1 file changed, 12 insertions(+), 34 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 0a6fe7d5aba74..4db87b26cf7b2 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -7048,7 +7048,7 @@ void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di) int ocfs2_convert_inline_data_to_extents(struct inode *inode, struct buffer_head *di_bh) { - int ret, i, has_data, num_pages = 0; + int ret, has_data, num_pages = 0; int need_free = 0; u32 bit_off, num; handle_t *handle; @@ -7057,26 +7057,17 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct ocfs2_alloc_context *data_ac = NULL; - struct page **pages = NULL; - loff_t end = osb->s_clustersize; + struct page *page = NULL; struct ocfs2_extent_tree et; int did_quota = 0; has_data = i_size_read(inode) ? 1 : 0; if (has_data) { - pages = kcalloc(ocfs2_pages_per_cluster(osb->sb), - sizeof(struct page *), GFP_NOFS); - if (pages == NULL) { - ret = -ENOMEM; - mlog_errno(ret); - return ret; - } - ret = ocfs2_reserve_clusters(osb, 1, &data_ac); if (ret) { mlog_errno(ret); - goto free_pages; + goto out; } } @@ -7096,7 +7087,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, } if (has_data) { - unsigned int page_end; + unsigned int page_end = min_t(unsigned, PAGE_SIZE, + osb->s_clustersize); u64 phys; ret = dquot_alloc_space_nodirty(inode, @@ -7120,15 +7112,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, */ block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off); - /* - * Non sparse file systems zero on extend, so no need - * to do that now. - */ - if (!ocfs2_sparse_alloc(osb) && - PAGE_SIZE < osb->s_clustersize) - end = PAGE_SIZE; - - ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages); + ret = ocfs2_grab_eof_pages(inode, 0, page_end, &page, + &num_pages); if (ret) { mlog_errno(ret); need_free = 1; @@ -7139,20 +7124,15 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, * This should populate the 1st page for us and mark * it up to date. */ - ret = ocfs2_read_inline_data(inode, pages[0], di_bh); + ret = ocfs2_read_inline_data(inode, page, di_bh); if (ret) { mlog_errno(ret); need_free = 1; goto out_unlock; } - page_end = PAGE_SIZE; - if (PAGE_SIZE > osb->s_clustersize) - page_end = osb->s_clustersize; - - for (i = 0; i < num_pages; i++) - ocfs2_map_and_dirty_page(inode, handle, 0, page_end, - pages[i], i > 0, &phys); + ocfs2_map_and_dirty_page(inode, handle, 0, page_end, page, 0, + &phys); } spin_lock(&oi->ip_lock); @@ -7183,8 +7163,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, } out_unlock: - if (pages) - ocfs2_unlock_and_free_pages(pages, num_pages); + if (page) + ocfs2_unlock_and_free_pages(&page, num_pages); out_commit: if (ret < 0 && did_quota) @@ -7208,8 +7188,6 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, out: if (data_ac) ocfs2_free_alloc_context(data_ac); -free_pages: - kfree(pages); return ret; } From 0b50cfbcc3c98b35f1197dac796ed2373b6c3049 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 28 Aug 2021 10:57:47 -0700 Subject: [PATCH 0140/5344] clocksource/drivers/timer-ti-dm: Select TIMER_OF commit eda9a4f7af6ee47e9e131f20e4f8a41a97379293 upstream. When building OMAP_DM_TIMER without TIMER_OF, there are orphan sections due to the use of TIMER_OF_DELCARE() without CONFIG_TIMER_OF. Select CONFIG_TIMER_OF when enaling OMAP_DM_TIMER: arm-linux-gnueabi-ld: warning: orphan section `__timer_of_table' from `drivers/clocksource/timer-ti-dm-systimer.o' being placed in section `__timer_of_table' Reported-by: kernel test robot Link: https://lore.kernel.org/lkml/202108282255.tkdt4ani-lkp@intel.com/ Cc: Tony Lindgren Cc: Daniel Lezcano Cc: Keerthy Cc: Sebastian Reichel Cc: Ladislav Michl Cc: Grygorii Strashko Cc: linux-omap@vger.kernel.org Fixes: 52762fbd1c47 ("clocksource/drivers/timer-ti-dm: Add clockevent and clocksource support") Signed-off-by: Kees Cook Acked-by: Tony Lindgren Link: https://lore.kernel.org/r/20210828175747.3777891-1-keescook@chromium.org Signed-off-by: Daniel Lezcano Signed-off-by: Gang Li --- drivers/clocksource/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index 3bb5625504e2f..9bfe4c5af87e3 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -24,6 +24,7 @@ config I8253_LOCK config OMAP_DM_TIMER bool + select TIMER_OF config CLKBLD_I8253 def_bool y if CLKSRC_I8253 || CLKEVT_I8253 || I8253_LOCK From 40c40706f9d2fc08d76d4c598473b6002bbd0835 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 26 Oct 2021 21:56:40 -0400 Subject: [PATCH 0141/5344] NFSv4: Fix a regression in nfs_set_open_stateid_locked() commit 01d29f87fcfef38d51ce2b473981a5c1e861ac0a upstream. If we already hold open state on the client, yet the server gives us a completely different stateid to the one we already hold, then we currently treat it as if it were an out-of-sequence update, and wait for 5 seconds for other updates to come in. This commit fixes the behaviour so that we immediately start processing of the new stateid, and then leave it to the call to nfs4_test_and_free_stateid() to decide what to do with the old stateid. Fixes: b4868b44c562 ("NFSv4: Wait for stateid updates after CLOSE/OPEN_DOWNGRADE") Signed-off-by: Trond Myklebust Signed-off-by: Gang Li --- fs/nfs/nfs4proc.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 5ecaf7b6b0fa1..fb3d1532f11dd 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1549,15 +1549,16 @@ static bool nfs_stateid_is_sequential(struct nfs4_state *state, { if (test_bit(NFS_OPEN_STATE, &state->flags)) { /* The common case - we're updating to a new sequence number */ - if (nfs4_stateid_match_other(stateid, &state->open_stateid) && - nfs4_stateid_is_next(&state->open_stateid, stateid)) { - return true; + if (nfs4_stateid_match_other(stateid, &state->open_stateid)) { + if (nfs4_stateid_is_next(&state->open_stateid, stateid)) + return true; + return false; } - } else { - /* This is the first OPEN in this generation */ - if (stateid->seqid == cpu_to_be32(1)) - return true; + /* The server returned a new stateid */ } + /* This is the first OPEN in this generation */ + if (stateid->seqid == cpu_to_be32(1)) + return true; return false; } From f78d9988093b27eba3ea24c7c1a77fbc32d15d52 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Fri, 5 Nov 2021 18:00:36 +0200 Subject: [PATCH 0142/5344] xhci: Fix USB 3.1 enumeration issues by increasing roothub power-on-good delay commit e1959faf085b004e6c3afaaaa743381f00e7c015 upstream. Some USB 3.1 enumeration issues were reported after the hub driver removed the minimum 100ms limit for the power-on-good delay. Since commit 90d28fb53d4a ("usb: core: reduce power-on-good delay time of root hub") the hub driver sets the power-on-delay based on the bPwrOn2PwrGood value in the hub descriptor. xhci driver has a 20ms bPwrOn2PwrGood value for both roothubs based on xhci spec section 5.4.8, but it's clearly not enough for the USB 3.1 devices, causing enumeration issues. Tests indicate full 100ms delay is needed. Reported-by: Walt Jr. Brake Signed-off-by: Mathias Nyman Fixes: 90d28fb53d4a ("usb: core: reduce power-on-good delay time of root hub") Cc: stable Link: https://lore.kernel.org/r/20211105160036.549516-1-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Gang Li --- drivers/usb/host/xhci-hub.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c index 6358d4e0653ed..9a4260927ce31 100644 --- a/drivers/usb/host/xhci-hub.c +++ b/drivers/usb/host/xhci-hub.c @@ -171,7 +171,6 @@ static void xhci_common_hub_descriptor(struct xhci_hcd *xhci, { u16 temp; - desc->bPwrOn2PwrGood = 10; /* xhci section 5.4.9 says 20ms max */ desc->bHubContrCurrent = 0; desc->bNbrPorts = ports; @@ -206,6 +205,7 @@ static void xhci_usb2_hub_descriptor(struct usb_hcd *hcd, struct xhci_hcd *xhci, desc->bDescriptorType = USB_DT_HUB; temp = 1 + (ports / 8); desc->bDescLength = USB_DT_HUB_NONVAR_SIZE + 2 * temp; + desc->bPwrOn2PwrGood = 10; /* xhci section 5.4.8 says 20ms */ /* The Device Removable bits are reported on a byte granularity. * If the port doesn't exist within that byte, the bit is set to 0. @@ -258,6 +258,7 @@ static void xhci_usb3_hub_descriptor(struct usb_hcd *hcd, struct xhci_hcd *xhci, xhci_common_hub_descriptor(xhci, desc, ports); desc->bDescriptorType = USB_DT_SS_HUB; desc->bDescLength = USB_DT_SS_HUB_SIZE; + desc->bPwrOn2PwrGood = 50; /* usb 3.1 may fail if less than 100ms */ /* header decode latency should be zero for roothubs, * see section 4.23.5.2. From 880eee0ac979a062c31cd2bef937251b14e58b7e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 9 Nov 2021 19:09:44 -0800 Subject: [PATCH 0143/5344] MIPS: boot/compressed/: add __bswapdi2() to target for ZSTD decompression commit e2f4b3be1d3c73176db734565b160250cc1300dd upstream. For MIPS pre-boot, when CONFIG_KERNEL_ZSTD=y, the decompressor function uses __bswapdi2(), so this object file should be added to the target object file. Fixes these build errors: mips-linux-ld: arch/mips/boot/compressed/decompress.o: in function `xxh64': decompress.c:(.text+0x8be0): undefined reference to `__bswapdi2' mips-linux-ld: decompress.c:(.text+0x8c78): undefined reference to `__bswapdi2' mips-linux-ld: decompress.c:(.text+0x8d04): undefined reference to `__bswapdi2' mips-linux-ld: arch/mips/boot/compressed/decompress.o:decompress.c:(.text+0xa010): more undefined references to `__bswapdi2' follow Fixes: 0652035a5794 ("asm-generic: unaligned: remove byteshift helpers") Fixes: cddc40f5617e ("mips: always link byteswap helpers into decompressor") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Arnd Bergmann Cc: Thomas Bogendoerfer Cc: linux-mips@vger.kernel.org Acked-by: Arnd Bergmann Signed-off-by: Thomas Bogendoerfer Signed-off-by: Gang Li --- arch/mips/boot/compressed/Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/mips/boot/compressed/Makefile b/arch/mips/boot/compressed/Makefile index 1d6ebbc2a5d02..7ad20e00c398d 100644 --- a/arch/mips/boot/compressed/Makefile +++ b/arch/mips/boot/compressed/Makefile @@ -49,6 +49,8 @@ $(obj)/uart-ath79.c: $(srctree)/arch/mips/ath79/early_printk.c vmlinuzobjs-$(CONFIG_KERNEL_XZ) += $(obj)/ashldi3.o +vmlinuzobjs-$(CONFIG_KERNEL_ZSTD) += $(obj)/bswapdi.o + extra-y += ashldi3.c $(obj)/ashldi3.c: $(obj)/%.c: $(srctree)/lib/%.c FORCE $(call if_changed,shipped) @@ -57,6 +59,10 @@ extra-y += bswapsi.c $(obj)/bswapsi.c: $(obj)/%.c: $(srctree)/arch/mips/lib/%.c FORCE $(call if_changed,shipped) +extra-y += bswapdi.c +$(obj)/bswapdi.c: $(obj)/%.c: $(srctree)/arch/mips/lib/%.c FORCE + $(call if_changed,shipped) + targets := $(notdir $(vmlinuzobjs-y)) targets += vmlinux.bin From 4855d5b59a31c610f09646567ab15e2c25e334e1 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 3 Nov 2021 17:17:16 +0800 Subject: [PATCH 0144/5344] perf/x86/vlbr: Add c->flags to vlbr event constraints commit 5863702561e625903ec678551cb056a4b19e0b8a upstream. Just like what we do in the x86_get_event_constraints(), the PERF_X86_EVENT_LBR_SELECT flag should also be propagated to event->hw.flags so that the host lbr driver can save/restore MSR_LBR_SELECT for the special vlbr event created by KVM or BPF. Fixes: 097e4311cda9 ("perf/x86: Add constraint to create guest LBR event without hw counter") Reported-by: Wanpeng Li Signed-off-by: Like Xu Signed-off-by: Peter Zijlstra (Intel) Tested-by: Wanpeng Li Link: https://lore.kernel.org/r/20211103091716.59906-1-likexu@tencent.com Signed-off-by: Gang Li --- arch/x86/events/intel/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 318a89bf00d0b..9faccd187b9e7 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2859,8 +2859,10 @@ intel_vlbr_constraints(struct perf_event *event) { struct event_constraint *c = &vlbr_constraint; - if (unlikely(constraint_match(c, event->hw.config))) + if (unlikely(constraint_match(c, event->hw.config))) { + event->hw.flags |= c->flags; return c; + } return NULL; } From eb648016a4b4d41b777fd42205913a85538551e1 Mon Sep 17 00:00:00 2001 From: James Clark Date: Thu, 28 Oct 2021 14:48:26 +0100 Subject: [PATCH 0145/5344] perf tests: Remove bash construct from record+zstd_comp_decomp.sh commit a9cdc1c5e3700a5200e5ca1f90b6958b6483845b upstream. Commit 463538a383a2 ("perf tests: Fix test 68 zstd compression for s390") inadvertently removed the -g flag from all platforms rather than just s390, because the [[ ]] construct fails in sh. Changing to single brackets restores testing of call graphs and removes the following error from the output: $ ./perf test -v 85 85: Zstd perf.data compression/decompression : --- start --- test child forked, pid 50643 Collecting compressed record file: ./tests/shell/record+zstd_comp_decomp.sh: 15: [[: not found Fixes: 463538a383a2 ("perf tests: Fix test 68 zstd compression for s390") Signed-off-by: James Clark Cc: Alexander Shishkin Cc: Florian Fainelli Cc: Ian Rogers Cc: Jiri Olsa Cc: John Fastabend Cc: KP Singh Cc: Mark Rutland Cc: Martin KaFai Lau Cc: Namhyung Kim Cc: Song Liu Cc: Sumanth Korikkar Cc: Thomas Richter Cc: Yonghong Song Cc: bpf@vger.kernel.org Cc: netdev@vger.kernel.org Link: https://lore.kernel.org/r/20211028134828.65774-3-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Gang Li --- tools/perf/tests/shell/record+zstd_comp_decomp.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/tests/shell/record+zstd_comp_decomp.sh b/tools/perf/tests/shell/record+zstd_comp_decomp.sh index 045723b3d9928..c62af807198de 100755 --- a/tools/perf/tests/shell/record+zstd_comp_decomp.sh +++ b/tools/perf/tests/shell/record+zstd_comp_decomp.sh @@ -12,7 +12,7 @@ skip_if_no_z_record() { collect_z_record() { echo "Collecting compressed record file:" - [[ "$(uname -m)" != s390x ]] && gflag='-g' + [ "$(uname -m)" != s390x ] && gflag='-g' $perf_tool record -o $trace_file $gflag -z -F 5000 -- \ dd count=500 if=/dev/urandom of=/dev/null } From ed7446de1366232b7d9371c9d2157beace56c3c8 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Wed, 17 Nov 2021 23:22:09 +0100 Subject: [PATCH 0146/5344] PCI: pciehp: Fix infinite loop in IRQ handler upon power fault commit 23584c1ed3e15a6f4bfab8dc5a88d94ab929ee12 upstream. The Power Fault Detected bit in the Slot Status register differs from all other hotplug events in that it is sticky: It can only be cleared after turning off slot power. Per PCIe r5.0, sec. 6.7.1.8: If a power controller detects a main power fault on the hot-plug slot, it must automatically set its internal main power fault latch [...]. The main power fault latch is cleared when software turns off power to the hot-plug slot. The stickiness used to cause interrupt storms and infinite loops which were fixed in 2009 by commits 5651c48cfafe ("PCI pciehp: fix power fault interrupt storm problem") and 99f0169c17f3 ("PCI: pciehp: enable software notification on empty slots"). Unfortunately in 2020 the infinite loop issue was inadvertently reintroduced by commit 8edf5332c393 ("PCI: pciehp: Fix MSI interrupt race"): The hardirq handler pciehp_isr() clears the PFD bit until pciehp's power_fault_detected flag is set. That happens in the IRQ thread pciehp_ist(), which never learns of the event because the hardirq handler is stuck in an infinite loop. Fix by setting the power_fault_detected flag already in the hardirq handler. Link: https://bugzilla.kernel.org/show_bug.cgi?id=214989 Link: https://lore.kernel.org/linux-pci/DM8PR11MB5702255A6A92F735D90A4446868B9@DM8PR11MB5702.namprd11.prod.outlook.com Fixes: 8edf5332c393 ("PCI: pciehp: Fix MSI interrupt race") Link: https://lore.kernel.org/r/66eaeef31d4997ceea357ad93259f290ededecfd.1637187226.git.lukas@wunner.de Reported-by: Joseph Bao Tested-by: Joseph Bao Signed-off-by: Lukas Wunner Signed-off-by: Bjorn Helgaas Cc: stable@vger.kernel.org # v4.19+ Cc: Stuart Hayes Signed-off-by: Gang Li --- drivers/pci/hotplug/pciehp_hpc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c index 88b996764ff95..907b8be86ce04 100644 --- a/drivers/pci/hotplug/pciehp_hpc.c +++ b/drivers/pci/hotplug/pciehp_hpc.c @@ -577,6 +577,8 @@ static irqreturn_t pciehp_isr(int irq, void *dev_id) */ if (ctrl->power_fault_detected) status &= ~PCI_EXP_SLTSTA_PFD; + else if (status & PCI_EXP_SLTSTA_PFD) + ctrl->power_fault_detected = true; events |= status; if (!events) { @@ -586,7 +588,7 @@ static irqreturn_t pciehp_isr(int irq, void *dev_id) } if (status) { - pcie_capability_write_word(pdev, PCI_EXP_SLTSTA, events); + pcie_capability_write_word(pdev, PCI_EXP_SLTSTA, status); /* * In MSI mode, all event bits must be zero before the port @@ -660,8 +662,7 @@ static irqreturn_t pciehp_ist(int irq, void *dev_id) } /* Check Power Fault Detected */ - if ((events & PCI_EXP_SLTSTA_PFD) && !ctrl->power_fault_detected) { - ctrl->power_fault_detected = 1; + if (events & PCI_EXP_SLTSTA_PFD) { ctrl_err(ctrl, "Slot(%s): Power fault\n", slot_name(ctrl)); pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_OFF, PCI_EXP_SLTCTL_ATTN_IND_ON); From 3314f0f53e38841edeb5aa67e167df69de256951 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 19 Nov 2021 16:43:52 -0800 Subject: [PATCH 0147/5344] mm/damon/dbgfs: fix missed use of damon_dbgfs_lock commit d78f3853f831eee46c6dbe726debf3be9e9c0d05 upstream. DAMON debugfs is supposed to protect dbgfs_ctxs, dbgfs_nr_ctxs, and dbgfs_dirs using damon_dbgfs_lock. However, some of the code is accessing the variables without the protection. This fixes it by protecting all such accesses. Link: https://lkml.kernel.org/r/20211110145758.16558-3-sj@kernel.org Fixes: 75c1c2b53c78 ("mm/damon/dbgfs: support multiple contexts") Signed-off-by: SeongJae Park Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Gang Li --- mm/damon/dbgfs.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 2741ff79e8e84..f94d19a690dfa 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -538,12 +538,14 @@ static ssize_t dbgfs_monitor_on_write(struct file *file, return -EINVAL; } + mutex_lock(&damon_dbgfs_lock); if (!strncmp(kbuf, "on", count)) err = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs); else if (!strncmp(kbuf, "off", count)) err = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs); else err = -EINVAL; + mutex_unlock(&damon_dbgfs_lock); if (err) ret = err; @@ -596,15 +598,16 @@ static int __init __damon_dbgfs_init(void) static int __init damon_dbgfs_init(void) { - int rc; + int rc = -ENOMEM; + mutex_lock(&damon_dbgfs_lock); dbgfs_ctxs = kmalloc(sizeof(*dbgfs_ctxs), GFP_KERNEL); if (!dbgfs_ctxs) - return -ENOMEM; + goto out; dbgfs_ctxs[0] = dbgfs_new_ctx(); if (!dbgfs_ctxs[0]) { kfree(dbgfs_ctxs); - return -ENOMEM; + goto out; } dbgfs_nr_ctxs = 1; @@ -615,6 +618,8 @@ static int __init damon_dbgfs_init(void) pr_err("%s: dbgfs init failed\n", __func__); } +out: + mutex_unlock(&damon_dbgfs_lock); return rc; } From fef57e73dfa1f413ab542e4c725dbc0b0474f24f Mon Sep 17 00:00:00 2001 From: Longpeng Date: Wed, 24 Nov 2021 09:52:15 +0800 Subject: [PATCH 0148/5344] vdpa_sim: avoid putting an uninitialized iova_domain commit bb93ce4b150dde79f58e34103cbd1fe829796649 upstream. The system will crash if we put an uninitialized iova_domain, this could happen when an error occurs before initializing the iova_domain in vdpasim_create(). BUG: kernel NULL pointer dereference, address: 0000000000000000 ... RIP: 0010:__cpuhp_state_remove_instance+0x96/0x1c0 ... Call Trace: put_iova_domain+0x29/0x220 vdpasim_free+0xd1/0x120 [vdpa_sim] vdpa_release_dev+0x21/0x40 [vdpa] device_release+0x33/0x90 kobject_release+0x63/0x160 vdpasim_create+0x127/0x2a0 [vdpa_sim] vdpasim_net_dev_add+0x7d/0xfe [vdpa_sim_net] vdpa_nl_cmd_dev_add_set_doit+0xe1/0x1a0 [vdpa] genl_family_rcv_msg_doit+0x112/0x140 genl_rcv_msg+0xdf/0x1d0 ... So we must make sure the iova_domain is already initialized before put it. In addition, we may get the following warning in this case: WARNING: ... drivers/iommu/iova.c:344 iova_cache_put+0x58/0x70 So we must make sure the iova_cache_put() is invoked only if the iova_cache_get() is already invoked. Let's fix it together. Cc: stable@vger.kernel.org Fixes: 4080fc106750 ("vdpa_sim: use iova module to allocate IOVA addresses") Signed-off-by: Longpeng Acked-by: Jason Wang Reviewed-by: Stefano Garzarella Link: https://lore.kernel.org/r/20211124015215.119-1-longpeng2@huawei.com Signed-off-by: Michael S. Tsirkin Signed-off-by: Gang Li --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index afde4313b0f17..872ba1c73138c 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -580,8 +580,11 @@ static void vdpasim_free(struct vdpa_device *vdpa) vringh_kiov_cleanup(&vdpasim->vqs[i].in_iov); } - put_iova_domain(&vdpasim->iova); - iova_cache_put(); + if (vdpa_get_dma_dev(vdpa)) { + put_iova_domain(&vdpasim->iova); + iova_cache_put(); + } + kvfree(vdpasim->buffer); if (vdpasim->iommu) vhost_iotlb_free(vdpasim->iommu); From 0b31f18effc77bb3590a72def59131a9cb0687de Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Mon, 29 Nov 2021 00:08:13 -0800 Subject: [PATCH 0149/5344] Input: elantech - fix stack out of bound access in elantech_change_report_id() commit 1d72d9f960ccf1052a0630a68c3d358791dbdaaa upstream. The array param[] in elantech_change_report_id() must be at least 3 bytes, because elantech_read_reg_params() is calling ps2_command() with PSMOUSE_CMD_GETINFO, that is going to access 3 bytes from param[], but it's defined in the stack as an array of 2 bytes, therefore we have a potential stack out-of-bounds access here, also confirmed by KASAN: [ 6.512374] BUG: KASAN: stack-out-of-bounds in __ps2_command+0x372/0x7e0 [ 6.512397] Read of size 1 at addr ffff8881024d77c2 by task kworker/2:1/118 [ 6.512416] CPU: 2 PID: 118 Comm: kworker/2:1 Not tainted 5.13.0-22-generic #22+arighi20211110 [ 6.512428] Hardware name: LENOVO 20T8000QGE/20T8000QGE, BIOS R1AET32W (1.08 ) 08/14/2020 [ 6.512436] Workqueue: events_long serio_handle_event [ 6.512453] Call Trace: [ 6.512462] show_stack+0x52/0x58 [ 6.512474] dump_stack+0xa1/0xd3 [ 6.512487] print_address_description.constprop.0+0x1d/0x140 [ 6.512502] ? __ps2_command+0x372/0x7e0 [ 6.512516] __kasan_report.cold+0x7d/0x112 [ 6.512527] ? _raw_write_lock_irq+0x20/0xd0 [ 6.512539] ? __ps2_command+0x372/0x7e0 [ 6.512552] kasan_report+0x3c/0x50 [ 6.512564] __asan_load1+0x6a/0x70 [ 6.512575] __ps2_command+0x372/0x7e0 [ 6.512589] ? ps2_drain+0x240/0x240 [ 6.512601] ? dev_printk_emit+0xa2/0xd3 [ 6.512612] ? dev_vprintk_emit+0xc5/0xc5 [ 6.512621] ? __kasan_check_write+0x14/0x20 [ 6.512634] ? mutex_lock+0x8f/0xe0 [ 6.512643] ? __mutex_lock_slowpath+0x20/0x20 [ 6.512655] ps2_command+0x52/0x90 [ 6.512670] elantech_ps2_command+0x4f/0xc0 [psmouse] [ 6.512734] elantech_change_report_id+0x1e6/0x256 [psmouse] [ 6.512799] ? elantech_report_trackpoint.constprop.0.cold+0xd/0xd [psmouse] [ 6.512863] ? ps2_command+0x7f/0x90 [ 6.512877] elantech_query_info.cold+0x6bd/0x9ed [psmouse] [ 6.512943] ? elantech_setup_ps2+0x460/0x460 [psmouse] [ 6.513005] ? psmouse_reset+0x69/0xb0 [psmouse] [ 6.513064] ? psmouse_attr_set_helper+0x2a0/0x2a0 [psmouse] [ 6.513122] ? phys_pmd_init+0x30e/0x521 [ 6.513137] elantech_init+0x8a/0x200 [psmouse] [ 6.513200] ? elantech_init_ps2+0xf0/0xf0 [psmouse] [ 6.513249] ? elantech_query_info+0x440/0x440 [psmouse] [ 6.513296] ? synaptics_send_cmd+0x60/0x60 [psmouse] [ 6.513342] ? elantech_query_info+0x440/0x440 [psmouse] [ 6.513388] ? psmouse_try_protocol+0x11e/0x170 [psmouse] [ 6.513432] psmouse_extensions+0x65d/0x6e0 [psmouse] [ 6.513476] ? psmouse_try_protocol+0x170/0x170 [psmouse] [ 6.513519] ? mutex_unlock+0x22/0x40 [ 6.513526] ? ps2_command+0x7f/0x90 [ 6.513536] ? psmouse_probe+0xa3/0xf0 [psmouse] [ 6.513580] psmouse_switch_protocol+0x27d/0x2e0 [psmouse] [ 6.513624] psmouse_connect+0x272/0x530 [psmouse] [ 6.513669] serio_driver_probe+0x55/0x70 [ 6.513679] really_probe+0x190/0x720 [ 6.513689] driver_probe_device+0x160/0x1f0 [ 6.513697] device_driver_attach+0x119/0x130 [ 6.513705] ? device_driver_attach+0x130/0x130 [ 6.513713] __driver_attach+0xe7/0x1a0 [ 6.513720] ? device_driver_attach+0x130/0x130 [ 6.513728] bus_for_each_dev+0xfb/0x150 [ 6.513738] ? subsys_dev_iter_exit+0x10/0x10 [ 6.513748] ? _raw_write_unlock_bh+0x30/0x30 [ 6.513757] driver_attach+0x2d/0x40 [ 6.513764] serio_handle_event+0x199/0x3d0 [ 6.513775] process_one_work+0x471/0x740 [ 6.513785] worker_thread+0x2d2/0x790 [ 6.513794] ? process_one_work+0x740/0x740 [ 6.513802] kthread+0x1b4/0x1e0 [ 6.513809] ? set_kthread_struct+0x80/0x80 [ 6.513816] ret_from_fork+0x22/0x30 [ 6.513832] The buggy address belongs to the page: [ 6.513838] page:00000000bc35e189 refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x1024d7 [ 6.513847] flags: 0x17ffffc0000000(node=0|zone=2|lastcpupid=0x1fffff) [ 6.513860] raw: 0017ffffc0000000 dead000000000100 dead000000000122 0000000000000000 [ 6.513867] raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 [ 6.513872] page dumped because: kasan: bad access detected [ 6.513879] addr ffff8881024d77c2 is located in stack of task kworker/2:1/118 at offset 34 in frame: [ 6.513887] elantech_change_report_id+0x0/0x256 [psmouse] [ 6.513941] this frame has 1 object: [ 6.513947] [32, 34) 'param' [ 6.513956] Memory state around the buggy address: [ 6.513962] ffff8881024d7680: f2 f2 f2 f2 f2 00 00 f3 f3 00 00 00 00 00 00 00 [ 6.513969] ffff8881024d7700: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 6.513976] >ffff8881024d7780: 00 00 00 00 f1 f1 f1 f1 02 f3 f3 f3 00 00 00 00 [ 6.513982] ^ [ 6.513988] ffff8881024d7800: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 6.513995] ffff8881024d7880: 00 f1 f1 f1 f1 03 f2 03 f2 03 f3 f3 f3 00 00 00 [ 6.514000] ================================================================== Define param[] in elantech_change_report_id() as an array of 3 bytes to prevent the out-of-bounds access in the stack. Fixes: e4c9062717fe ("Input: elantech - fix protocol errors for some trackpoints in SMBus mode") BugLink: https://bugs.launchpad.net/bugs/1945590 Signed-off-by: Andrea Righi Reviewed-by: Wolfram Sang Link: https://lore.kernel.org/r/20211116095559.24395-1-andrea.righi@canonical.com Signed-off-by: Dmitry Torokhov Signed-off-by: Gang Li --- drivers/input/mouse/elantech.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/input/mouse/elantech.c b/drivers/input/mouse/elantech.c index 053fe2da1e08f..8533f1f625942 100644 --- a/drivers/input/mouse/elantech.c +++ b/drivers/input/mouse/elantech.c @@ -1575,7 +1575,13 @@ static const struct dmi_system_id no_hw_res_dmi_table[] = { */ static int elantech_change_report_id(struct psmouse *psmouse) { - unsigned char param[2] = { 0x10, 0x03 }; + /* + * NOTE: the code is expecting to receive param[] as an array of 3 + * items (see __ps2_command()), even if in this case only 2 are + * actually needed. Make sure the array size is 3 to avoid potential + * stack out-of-bound accesses. + */ + unsigned char param[3] = { 0x10, 0x03 }; if (elantech_write_reg_params(psmouse, 0x7, param) || elantech_read_reg_params(psmouse, 0x7, param) || From 33db8dab0a1b008ae56d29d15fa6cfe1f5020a4f Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Thu, 2 Dec 2021 11:20:33 +0000 Subject: [PATCH 0150/5344] sched/uclamp: Fix rq->uclamp_max not set on first enqueue commit 315c4f884800c45cb6bd8c90422fad554a8b9588 upstream. Commit d81ae8aac85c ("sched/uclamp: Fix initialization of struct uclamp_rq") introduced a bug where uclamp_max of the rq is not reset to match the woken up task's uclamp_max when the rq is idle. The code was relying on rq->uclamp_max initialized to zero, so on first enqueue static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, enum uclamp_id clamp_id) { ... if (uc_se->value > READ_ONCE(uc_rq->value)) WRITE_ONCE(uc_rq->value, uc_se->value); } was actually resetting it. But since commit d81ae8aac85c changed the default to 1024, this no longer works. And since rq->uclamp_flags is also initialized to 0, neither above code path nor uclamp_idle_reset() update the rq->uclamp_max on first wake up from idle. This is only visible from first wake up(s) until the first dequeue to idle after enabling the static key. And it only matters if the uclamp_max of this task is < 1024 since only then its uclamp_max will be effectively ignored. Fix it by properly initializing rq->uclamp_flags = UCLAMP_FLAG_IDLE to ensure uclamp_idle_reset() is called which then will update the rq uclamp_max value as expected. Fixes: d81ae8aac85c ("sched/uclamp: Fix initialization of struct uclamp_rq") Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Tested-by: Dietmar Eggemann Link: https://lkml.kernel.org/r/20211202112033.1705279-1-qais.yousef@arm.com Signed-off-by: Gang Li --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5558ee69e0cbb..6e124673c6d38 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1330,7 +1330,7 @@ static void __init init_uclamp_rq(struct rq *rq) }; } - rq->uclamp_flags = 0; + rq->uclamp_flags = UCLAMP_FLAG_IDLE; } static void __init init_uclamp(void) From fa782a845ecc620e930a2f9171ae0da2bec581c8 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 8 Dec 2021 13:33:37 +0300 Subject: [PATCH 0151/5344] vdpa: check that offsets are within bounds commit 3ed21c1451a14d139e1ceb18f2fa70865ce3195a upstream. In this function "c->off" is a u32 and "size" is a long. On 64bit systems if "c->off" is greater than "size" then "size - c->off" is a negative and we always return -E2BIG. But on 32bit systems the subtraction is type promoted to a high positive u32 value and basically any "c->len" is accepted. Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend") Reported-by: Xie Yongji Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/20211208103337.GA4047@kili Signed-off-by: Michael S. Tsirkin Cc: stable@vger.kernel.org Signed-off-by: Gang Li --- drivers/vhost/vdpa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index b44b2892241cc..d3ae221b20eb6 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -192,7 +192,7 @@ static int vhost_vdpa_config_validate(struct vhost_vdpa *v, struct vdpa_device *vdpa = v->vdpa; long size = vdpa->config->get_config_size(vdpa); - if (c->len == 0) + if (c->len == 0 || c->off > size) return -EINVAL; if (c->len > size - c->off) From 95dbd2bd3c970da74503ea583afff734052250ca Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 30 Nov 2021 06:29:49 +0200 Subject: [PATCH 0152/5344] vdpa: Consider device id larger than 31 commit bb47620be322c5e9e372536cb6b54e17b3a00258 upstream. virtio device id value can be more than 31. Hence, use BIT_ULL in assignment. Fixes: 33b347503f01 ("vdpa: Define vdpa mgmt device, ops and a netlink interface") Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Parav Pandit Acked-by: Jason Wang Link: https://lore.kernel.org/r/20211130042949.88958-1-parav@nvidia.com Signed-off-by: Michael S. Tsirkin Signed-off-by: Gang Li --- drivers/vdpa/vdpa.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index 8c99f130a6278..2f88ce3a573bc 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -353,7 +353,8 @@ static int vdpa_mgmtdev_fill(const struct vdpa_mgmt_dev *mdev, struct sk_buff *m goto msg_err; while (mdev->id_table[i].device) { - supported_classes |= BIT(mdev->id_table[i].device); + if (mdev->id_table[i].device <= 63) + supported_classes |= BIT_ULL(mdev->id_table[i].device); i++; } From 92071d3ec7bbd35b169c1780955dbc6b0d6b36a3 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 10 Dec 2021 14:46:34 -0800 Subject: [PATCH 0153/5344] mm/damon/core: remove unnecessary error messages commit 1afaf5cb687de85c5e00ac70f6eea5597077cbc5 upstream. DAMON core prints error messages when damon_target object creation is failed or wrong monitoring attributes are given. Because appropriate error code is returned for each case, the messages are not essential. Also, because the code path can be triggered with user-specified input, this could result in kernel log mistakenly being messy. To avoid the case, this commit removes the messages. Link: https://lkml.kernel.org/r/20211201150440.1088-4-sj@kernel.org Fixes: 4bc05954d007 ("mm/damon: implement a debugfs-based user space interface") Fixes: b9a6ac4e4ede ("mm/damon: adaptively adjust regions") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: kernel test robot Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Gang Li --- mm/damon/core.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 30e9211f494a7..96b590ca0f975 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -197,7 +197,6 @@ int damon_set_targets(struct damon_ctx *ctx, for (i = 0; i < nr_ids; i++) { t = damon_new_target(ids[i]); if (!t) { - pr_err("Failed to alloc damon_target\n"); /* The caller should do cleanup of the ids itself */ damon_for_each_target_safe(t, next, ctx) damon_destroy_target(t); @@ -227,16 +226,10 @@ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, unsigned long aggr_int, unsigned long primitive_upd_int, unsigned long min_nr_reg, unsigned long max_nr_reg) { - if (min_nr_reg < 3) { - pr_err("min_nr_regions (%lu) must be at least 3\n", - min_nr_reg); + if (min_nr_reg < 3) return -EINVAL; - } - if (min_nr_reg > max_nr_reg) { - pr_err("invalid nr_regions. min (%lu) > max (%lu)\n", - min_nr_reg, max_nr_reg); + if (min_nr_reg > max_nr_reg) return -EINVAL; - } ctx->sample_interval = sample_int; ctx->aggr_interval = aggr_int; From 77a5f77c425fe4790c41eeb6e0d62821cb2050e9 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 10 Dec 2021 14:46:25 -0800 Subject: [PATCH 0154/5344] mm/damon/core: fix fake load reports due to uninterruptible sleeps commit 70e9274805fccfd175d0431a947bfd11ee7df40e upstream. Because DAMON sleeps in uninterruptible mode, /proc/loadavg reports fake load while DAMON is turned on, though it is doing nothing. This can confuse users[1]. To avoid the case, this commit makes DAMON sleeps in idle mode. [1] https://lore.kernel.org/all/11868371.O9o76ZdvQC@natalenko.name/ Link: https://lkml.kernel.org/r/20211126145015.15862-3-sj@kernel.org Fixes: 2224d8485492 ("mm: introduce Data Access MONitor (DAMON)") Reported-by: Oleksandr Natalenko Signed-off-by: SeongJae Park Tested-by: Oleksandr Natalenko Cc: John Stultz Cc: Thomas Gleixner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Gang Li --- mm/damon/core.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 96b590ca0f975..d820569992b6e 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -627,6 +627,14 @@ static bool kdamond_need_stop(struct damon_ctx *ctx) return true; } +static void kdamond_usleep(unsigned long usecs) +{ + if (usecs > 100 * 1000) + schedule_timeout_idle(usecs_to_jiffies(usecs)); + else + usleep_idle_range(usecs, usecs + 1); +} + static void set_kdamond_stop(struct damon_ctx *ctx) { mutex_lock(&ctx->kdamond_lock); @@ -663,7 +671,7 @@ static int kdamond_fn(void *data) ctx->callback.after_sampling(ctx)) set_kdamond_stop(ctx); - usleep_range(ctx->sample_interval, ctx->sample_interval + 1); + kdamond_usleep(ctx->sample_interval); if (ctx->primitive.check_accesses) max_nr_accesses = ctx->primitive.check_accesses(ctx); From 3b805877e98f3c19f63a2e24eb453b3cda99e778 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 10 Dec 2021 14:46:40 -0800 Subject: [PATCH 0155/5344] mm/damon/vaddr-test: split a test function having >1024 bytes frame size commit 044cd9750fe010170f5dc812e4824d98f5ea928c upstream. On some configuration[1], 'damon_test_split_evenly()' kunit test function has >1024 bytes frame size, so below build warning is triggered: CC mm/damon/vaddr.o In file included from mm/damon/vaddr.c:672: mm/damon/vaddr-test.h: In function 'damon_test_split_evenly': mm/damon/vaddr-test.h:309:1: warning: the frame size of 1064 bytes is larger than 1024 bytes [-Wframe-larger-than=] 309 | } | ^ This commit fixes the warning by separating the common logic in the function. [1] https://lore.kernel.org/linux-mm/202111182146.OV3C4uGr-lkp@intel.com/ Link: https://lkml.kernel.org/r/20211201150440.1088-6-sj@kernel.org Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests") Signed-off-by: SeongJae Park Reported-by: kernel test robot Cc: Brendan Higgins Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Gang Li --- mm/damon/vaddr-test.h | 77 ++++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 37 deletions(-) diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h index 1f5c13257dbaf..95ec362cdc37c 100644 --- a/mm/damon/vaddr-test.h +++ b/mm/damon/vaddr-test.h @@ -252,59 +252,62 @@ static void damon_test_apply_three_regions4(struct kunit *test) new_three_regions, expected, ARRAY_SIZE(expected)); } -static void damon_test_split_evenly(struct kunit *test) +static void damon_test_split_evenly_fail(struct kunit *test, + unsigned long start, unsigned long end, unsigned int nr_pieces) { - struct damon_ctx *c = damon_new_ctx(); - struct damon_target *t; - struct damon_region *r; - unsigned long i; - - KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(NULL, NULL, 5), - -EINVAL); - - t = damon_new_target(42); - r = damon_new_region(0, 100); - KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 0), -EINVAL); + struct damon_target *t = damon_new_target(42); + struct damon_region *r = damon_new_region(start, end); damon_add_region(r, t); - KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 10), 0); - KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 10u); + KUNIT_EXPECT_EQ(test, + damon_va_evenly_split_region(t, r, nr_pieces), -EINVAL); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1u); - i = 0; damon_for_each_region(r, t) { - KUNIT_EXPECT_EQ(test, r->ar.start, i++ * 10); - KUNIT_EXPECT_EQ(test, r->ar.end, i * 10); + KUNIT_EXPECT_EQ(test, r->ar.start, start); + KUNIT_EXPECT_EQ(test, r->ar.end, end); } + damon_free_target(t); +} + +static void damon_test_split_evenly_succ(struct kunit *test, + unsigned long start, unsigned long end, unsigned int nr_pieces) +{ + struct damon_target *t = damon_new_target(42); + struct damon_region *r = damon_new_region(start, end); + unsigned long expected_width = (end - start) / nr_pieces; + unsigned long i = 0; - t = damon_new_target(42); - r = damon_new_region(5, 59); damon_add_region(r, t); - KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 5), 0); - KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 5u); + KUNIT_EXPECT_EQ(test, + damon_va_evenly_split_region(t, r, nr_pieces), 0); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), nr_pieces); - i = 0; damon_for_each_region(r, t) { - if (i == 4) + if (i == nr_pieces - 1) break; - KUNIT_EXPECT_EQ(test, r->ar.start, 5 + 10 * i++); - KUNIT_EXPECT_EQ(test, r->ar.end, 5 + 10 * i); + KUNIT_EXPECT_EQ(test, + r->ar.start, start + i++ * expected_width); + KUNIT_EXPECT_EQ(test, r->ar.end, start + i * expected_width); } - KUNIT_EXPECT_EQ(test, r->ar.start, 5 + 10 * i); - KUNIT_EXPECT_EQ(test, r->ar.end, 59ul); + KUNIT_EXPECT_EQ(test, r->ar.start, start + i * expected_width); + KUNIT_EXPECT_EQ(test, r->ar.end, end); damon_free_target(t); +} - t = damon_new_target(42); - r = damon_new_region(5, 6); - damon_add_region(r, t); - KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 2), -EINVAL); - KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1u); +static void damon_test_split_evenly(struct kunit *test) +{ + struct damon_ctx *c = damon_new_ctx(); + + KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(NULL, NULL, 5), + -EINVAL); + + damon_test_split_evenly_fail(test, 0, 100, 0); + damon_test_split_evenly_succ(test, 0, 100, 10); + damon_test_split_evenly_succ(test, 5, 59, 5); + damon_test_split_evenly_fail(test, 5, 6, 2); - damon_for_each_region(r, t) { - KUNIT_EXPECT_EQ(test, r->ar.start, 5ul); - KUNIT_EXPECT_EQ(test, r->ar.end, 6ul); - } - damon_free_target(t); damon_destroy_ctx(c); } From ae94c2c9bad79b3386f87db2ed6db5a492851c38 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 10 Dec 2021 14:47:05 -0800 Subject: [PATCH 0156/5344] mm/memcg: relocate mod_objcg_mlstate(), get_obj_stock() and put_obj_stock() commit a7ebf564de325e1d7d52cd85b12721c424338bcc upstream. All the calls to mod_objcg_mlstate(), get_obj_stock() and put_obj_stock() are done by functions defined within the same "#ifdef CONFIG_MEMCG_KMEM" compilation block. When CONFIG_MEMCG_KMEM isn't defined, the following compilation warnings will be issued [1] and [2]. mm/memcontrol.c:785:20: warning: unused function 'mod_objcg_mlstate' mm/memcontrol.c:2113:33: warning: unused function 'get_obj_stock' Fix these warning by moving those functions to under the same CONFIG_MEMCG_KMEM compilation block. There is no functional change. [1] https://lore.kernel.org/lkml/202111272014.WOYNLUV6-lkp@intel.com/ [2] https://lore.kernel.org/lkml/202111280551.LXsWYt1T-lkp@intel.com/ Link: https://lkml.kernel.org/r/20211129161140.306488-1-longman@redhat.com Fixes: 559271146efc ("mm/memcg: optimize user context object stock access") Fixes: 68ac5b3c8db2 ("mm/memcg: cache vmstat data in percpu memcg_stock_pcp") Signed-off-by: Waiman Long Reported-by: kernel test robot Reviewed-by: Shakeel Butt Acked-by: Roman Gushchin Reviewed-by: Muchun Song Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Gang Li --- mm/memcontrol.c | 106 ++++++++++++++++++++++++------------------------ 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 307e5283c6722..4231d9ecf179d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -914,24 +914,6 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) rcu_read_unlock(); } -/* - * mod_objcg_mlstate() may be called with irq enabled, so - * mod_memcg_lruvec_state() should be used. - */ -static inline void mod_objcg_mlstate(struct obj_cgroup *objcg, - struct pglist_data *pgdat, - enum node_stat_item idx, int nr) -{ - struct mem_cgroup *memcg; - struct lruvec *lruvec; - - rcu_read_lock(); - memcg = obj_cgroup_memcg(objcg); - lruvec = mem_cgroup_lruvec(memcg, pgdat); - mod_memcg_lruvec_state(lruvec, idx, nr); - rcu_read_unlock(); -} - /** * __count_memcg_events - account VM events in a cgroup * @memcg: the memory cgroup @@ -2292,41 +2274,6 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, } #endif -/* - * Most kmem_cache_alloc() calls are from user context. The irq disable/enable - * sequence used in this case to access content from object stock is slow. - * To optimize for user context access, there are now two object stocks for - * task context and interrupt context access respectively. - * - * The task context object stock can be accessed by disabling preemption only - * which is cheap in non-preempt kernel. The interrupt context object stock - * can only be accessed after disabling interrupt. User context code can - * access interrupt object stock, but not vice versa. - */ -static inline struct obj_stock *get_obj_stock(unsigned long *pflags) -{ - struct memcg_stock_pcp *stock; - - if (likely(in_task())) { - *pflags = 0UL; - preempt_disable(); - stock = this_cpu_ptr(&memcg_stock); - return &stock->task_obj; - } - - local_irq_save(*pflags); - stock = this_cpu_ptr(&memcg_stock); - return &stock->irq_obj; -} - -static inline void put_obj_stock(unsigned long flags) -{ - if (likely(in_task())) - preempt_enable(); - else - local_irq_restore(flags); -} - /** * consume_stock: Try to consume stocked charge on this cpu. * @memcg: memcg to consume from. @@ -3056,6 +3003,59 @@ static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) */ #define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT) +/* + * Most kmem_cache_alloc() calls are from user context. The irq disable/enable + * sequence used in this case to access content from object stock is slow. + * To optimize for user context access, there are now two object stocks for + * task context and interrupt context access respectively. + * + * The task context object stock can be accessed by disabling preemption only + * which is cheap in non-preempt kernel. The interrupt context object stock + * can only be accessed after disabling interrupt. User context code can + * access interrupt object stock, but not vice versa. + */ +static inline struct obj_stock *get_obj_stock(unsigned long *pflags) +{ + struct memcg_stock_pcp *stock; + + if (likely(in_task())) { + *pflags = 0UL; + preempt_disable(); + stock = this_cpu_ptr(&memcg_stock); + return &stock->task_obj; + } + + local_irq_save(*pflags); + stock = this_cpu_ptr(&memcg_stock); + return &stock->irq_obj; +} + +static inline void put_obj_stock(unsigned long flags) +{ + if (likely(in_task())) + preempt_enable(); + else + local_irq_restore(flags); +} + +/* + * mod_objcg_mlstate() may be called with irq enabled, so + * mod_memcg_lruvec_state() should be used. + */ +static inline void mod_objcg_mlstate(struct obj_cgroup *objcg, + struct pglist_data *pgdat, + enum node_stat_item idx, int nr) +{ + struct mem_cgroup *memcg; + struct lruvec *lruvec; + + rcu_read_lock(); + memcg = obj_cgroup_memcg(objcg); + lruvec = mem_cgroup_lruvec(memcg, pgdat); + mod_memcg_lruvec_state(lruvec, idx, nr); + rcu_read_unlock(); +} + int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, gfp_t gfp, bool new_page) { From 546f46751249635a50bb3dcb7efcbc46d13572e7 Mon Sep 17 00:00:00 2001 From: Fabien Dessenne Date: Wed, 15 Dec 2021 10:58:08 +0100 Subject: [PATCH 0157/5344] pinctrl: stm32: consider the GPIO offset to expose all the GPIO lines commit b67210cc217f9ca1c576909454d846970c13dfd4 upstream. Consider the GPIO controller offset (from "gpio-ranges") to compute the maximum GPIO line number. This fixes an issue where gpio-ranges uses a non-null offset. e.g.: gpio-ranges = <&pinctrl 6 86 10> In that case the last valid GPIO line is not 9 but 15 (6 + 10 - 1) Cc: stable@vger.kernel.org Fixes: 67e2996f72c7 ("pinctrl: stm32: fix the reported number of GPIO lines per bank") Reported-by: Christoph Fritz Signed-off-by: Fabien Dessenne Link: https://lore.kernel.org/r/20211215095808.621716-1-fabien.dessenne@foss.st.com Signed-off-by: Linus Walleij Signed-off-by: Gang Li --- drivers/pinctrl/stm32/pinctrl-stm32.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/pinctrl/stm32/pinctrl-stm32.c b/drivers/pinctrl/stm32/pinctrl-stm32.c index bac1d040bacab..3c2a60f7b4172 100644 --- a/drivers/pinctrl/stm32/pinctrl-stm32.c +++ b/drivers/pinctrl/stm32/pinctrl-stm32.c @@ -1186,10 +1186,10 @@ static int stm32_gpiolib_register_bank(struct stm32_pinctrl *pctl, bank_nr = args.args[1] / STM32_GPIO_PINS_PER_BANK; bank->gpio_chip.base = args.args[1]; - npins = args.args[2]; - while (!of_parse_phandle_with_fixed_args(np, "gpio-ranges", 3, - ++i, &args)) - npins += args.args[2]; + /* get the last defined gpio line (offset + nb of pins) */ + npins = args.args[0] + args.args[2]; + while (!of_parse_phandle_with_fixed_args(np, "gpio-ranges", 3, ++i, &args)) + npins = max(npins, (int)(args.args[0] + args.args[2])); } else { bank_nr = pctl->nbanks; bank->gpio_chip.base = bank_nr * STM32_GPIO_PINS_PER_BANK; From 6390ab905c832c930de1c9f2e792ca3cc9c309c6 Mon Sep 17 00:00:00 2001 From: Antony Antony Date: Sun, 12 Dec 2021 11:34:30 +0100 Subject: [PATCH 0158/5344] xfrm: interface with if_id 0 should return error commit 8dce43919566f06e865f7e8949f5c10d8c2493f5 upstream. xfrm interface if_id = 0 would cause xfrm policy lookup errors since Commit 9f8550e4bd9d. Now explicitly fail to create an xfrm interface when if_id = 0 With this commit: ip link add ipsec0 type xfrm dev lo if_id 0 Error: if_id must be non zero. v1->v2 change: - add Fixes: tag Fixes: 9f8550e4bd9d ("xfrm: fix disable_xfrm sysctl when used on xfrm interfaces") Signed-off-by: Antony Antony Reviewed-by: Eyal Birger Signed-off-by: Steffen Klassert Signed-off-by: Gang Li --- net/xfrm/xfrm_interface.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 74e90d78c3b46..08343201513a9 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -659,11 +659,16 @@ static int xfrmi_newlink(struct net *src_net, struct net_device *dev, struct netlink_ext_ack *extack) { struct net *net = dev_net(dev); - struct xfrm_if_parms p; + struct xfrm_if_parms p = {}; struct xfrm_if *xi; int err; xfrmi_netlink_parms(data, &p); + if (!p.if_id) { + NL_SET_ERR_MSG(extack, "if_id must be non zero"); + return -EINVAL; + } + xi = xfrmi_locate(net, &p); if (xi) return -EEXIST; @@ -688,7 +693,12 @@ static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], { struct xfrm_if *xi = netdev_priv(dev); struct net *net = xi->net; - struct xfrm_if_parms p; + struct xfrm_if_parms p = {}; + + if (!p.if_id) { + NL_SET_ERR_MSG(extack, "if_id must be non zero"); + return -EINVAL; + } xfrmi_netlink_parms(data, &p); xi = xfrmi_locate(net, &p); From 2863dda214c0410a3809b0fb667c5e9cd5f4104e Mon Sep 17 00:00:00 2001 From: Antony Antony Date: Sun, 12 Dec 2021 11:35:00 +0100 Subject: [PATCH 0159/5344] xfrm: state and policy should fail if XFRMA_IF_ID 0 commit 68ac0f3810e76a853b5f7b90601a05c3048b8b54 upstream. xfrm ineterface does not allow xfrm if_id = 0 fail to create or update xfrm state and policy. With this commit: ip xfrm policy add src 192.0.2.1 dst 192.0.2.2 dir out if_id 0 RTNETLINK answers: Invalid argument ip xfrm state add src 192.0.2.1 dst 192.0.2.2 proto esp spi 1 \ reqid 1 mode tunnel aead 'rfc4106(gcm(aes))' \ 0x1111111111111111111111111111111111111111 96 if_id 0 RTNETLINK answers: Invalid argument v1->v2 change: - add Fixes: tag Fixes: 9f8550e4bd9d ("xfrm: fix disable_xfrm sysctl when used on xfrm interfaces") Signed-off-by: Antony Antony Signed-off-by: Steffen Klassert Signed-off-by: Gang Li --- net/xfrm/xfrm_user.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 0cee2d3c6e452..038b89db1ac73 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -621,8 +621,13 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, xfrm_smark_init(attrs, &x->props.smark); - if (attrs[XFRMA_IF_ID]) + if (attrs[XFRMA_IF_ID]) { x->if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + if (!x->if_id) { + err = -EINVAL; + goto error; + } + } err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV]); if (err) @@ -1328,8 +1333,13 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, mark = xfrm_mark_get(attrs, &m); - if (attrs[XFRMA_IF_ID]) + if (attrs[XFRMA_IF_ID]) { if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + if (!if_id) { + err = -EINVAL; + goto out_noput; + } + } if (p->info.seq) { x = xfrm_find_acq_byseq(net, mark, p->info.seq); @@ -1631,8 +1641,13 @@ static struct xfrm_policy *xfrm_policy_construct(struct net *net, struct xfrm_us xfrm_mark_get(attrs, &xp->mark); - if (attrs[XFRMA_IF_ID]) + if (attrs[XFRMA_IF_ID]) { xp->if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + if (!xp->if_id) { + err = -EINVAL; + goto error; + } + } return xp; error: From 3869298f7b87724048255243a6c92168f4fcc550 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 20 Dec 2021 09:49:01 -0500 Subject: [PATCH 0160/5344] net: accept UFOv6 packages in virtio_net_hdr_to_skb commit 7e5cced9ca84df52d874aca6b632f930b3dc5bc6 upstream. Skb with skb->protocol 0 at the time of virtio_net_hdr_to_skb may have a protocol inferred from virtio_net_hdr with virtio_net_hdr_set_proto. Unlike TCP, UDP does not have separate types for IPv4 and IPv6. Type VIRTIO_NET_HDR_GSO_UDP is guessed to be IPv4/UDP. As of the below commit, UFOv6 packets are dropped due to not matching the protocol as obtained from dev_parse_header_protocol. Invert the test to take that L2 protocol field as starting point and pass both UFOv4 and UFOv6 for VIRTIO_NET_HDR_GSO_UDP. Fixes: 924a9bc362a5 ("net: check if protocol extracted by virtio_net_hdr_set_proto is correct") Link: https://lore.kernel.org/netdev/CABcq3pG9GRCYqFDBAJ48H1vpnnX=41u+MhQnayF1ztLH4WX0Fw@mail.gmail.com/ Reported-by: Andrew Melnichenko Signed-off-by: Willem de Bruijn Link: https://lore.kernel.org/r/20211220144901.2784030-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Gang Li --- include/linux/virtio_net.h | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index b465f8f3e554f..8ca4a776b4c6a 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -7,6 +7,21 @@ #include #include +static inline bool virtio_net_hdr_match_proto(__be16 protocol, __u8 gso_type) +{ + switch (gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { + case VIRTIO_NET_HDR_GSO_TCPV4: + return protocol == cpu_to_be16(ETH_P_IP); + case VIRTIO_NET_HDR_GSO_TCPV6: + return protocol == cpu_to_be16(ETH_P_IPV6); + case VIRTIO_NET_HDR_GSO_UDP: + return protocol == cpu_to_be16(ETH_P_IP) || + protocol == cpu_to_be16(ETH_P_IPV6); + default: + return false; + } +} + static inline int virtio_net_hdr_set_proto(struct sk_buff *skb, const struct virtio_net_hdr *hdr) { @@ -88,9 +103,12 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, if (!skb->protocol) { __be16 protocol = dev_parse_header_protocol(skb); - virtio_net_hdr_set_proto(skb, hdr); - if (protocol && protocol != skb->protocol) + if (!protocol) + virtio_net_hdr_set_proto(skb, hdr); + else if (!virtio_net_hdr_match_proto(protocol, hdr->gso_type)) return -EINVAL; + else + skb->protocol = protocol; } retry: if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys, From 0ec3be3d0913ab8f3cc038a4e1e714d7d3994b87 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 1 Dec 2021 14:25:25 +0100 Subject: [PATCH 0161/5344] firmware: qemu_fw_cfg: fix NULL-pointer deref on duplicate entries commit a57ac7acdcc1665662e369993898194def56e888 upstream. Commit fe3c60684377 ("firmware: Fix a reference count leak.") "fixed" a kobject leak in the file registration helper by properly calling kobject_put() for the entry in case registration of the object fails (e.g. due to a name collision). This would however result in a NULL pointer dereference when the release function tries to remove the never added entry from the fw_cfg_entry_cache list. Fix this by moving the list-removal out of the release function. Note that the offending commit was one of the benign looking umn.edu fixes which was reviewed but not reverted. [1][2] [1] https://lore.kernel.org/r/202105051005.49BFABCE@keescook [2] https://lore.kernel.org/all/YIg7ZOZvS3a8LjSv@kroah.com Fixes: fe3c60684377 ("firmware: Fix a reference count leak.") Cc: stable@vger.kernel.org # 5.8 Cc: Qiushi Wu Cc: Kees Cook Cc: Greg Kroah-Hartman Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20211201132528.30025-2-johan@kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Gang Li --- drivers/firmware/qemu_fw_cfg.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/firmware/qemu_fw_cfg.c b/drivers/firmware/qemu_fw_cfg.c index 59db70fb45614..27c614dab50a0 100644 --- a/drivers/firmware/qemu_fw_cfg.c +++ b/drivers/firmware/qemu_fw_cfg.c @@ -385,9 +385,7 @@ static void fw_cfg_sysfs_cache_cleanup(void) struct fw_cfg_sysfs_entry *entry, *next; list_for_each_entry_safe(entry, next, &fw_cfg_entry_cache, list) { - /* will end up invoking fw_cfg_sysfs_cache_delist() - * via each object's release() method (i.e. destructor) - */ + fw_cfg_sysfs_cache_delist(entry); kobject_put(&entry->kobj); } } @@ -445,7 +443,6 @@ static void fw_cfg_sysfs_release_entry(struct kobject *kobj) { struct fw_cfg_sysfs_entry *entry = to_entry(kobj); - fw_cfg_sysfs_cache_delist(entry); kfree(entry); } From b2c96b7006ad2a9c1b75f9f73c1411a2699ed91a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 22 Dec 2021 20:26:56 -0700 Subject: [PATCH 0162/5344] io_uring: zero iocb->ki_pos for stream file types commit 7b9762a5e8837b92a027d58d396a9d27f6440c36 upstream. io_uring supports using offset == -1 for using the current file position, and we read that in as part of read/write command setup. For the non-iter read/write types we pass in NULL for the position pointer, but for the iter types we should not be passing any anything but 0 for the position for a stream. Clear kiocb->ki_pos if the file is a stream, don't leave it as -1. If we do, then the request will error with -ESPIPE. Fixes: ba04291eb66e ("io_uring: allow use of offset == -1 to mean file position") Link: https://github.com/axboe/liburing/discussions/501 Reported-by: Samuel Williams Signed-off-by: Jens Axboe Signed-off-by: Gang Li --- fs/io_uring.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index cd32da26cc4b1..fa10f4469d18b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2914,9 +2914,13 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->flags |= REQ_F_ISREG; kiocb->ki_pos = READ_ONCE(sqe->off); - if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) { - req->flags |= REQ_F_CUR_POS; - kiocb->ki_pos = req->file->f_pos; + if (kiocb->ki_pos == -1) { + if (!(req->file->f_mode & FMODE_STREAM)) { + req->flags |= REQ_F_CUR_POS; + kiocb->ki_pos = req->file->f_pos; + } else { + kiocb->ki_pos = 0; + } } kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); kiocb->ki_flags = iocb_flags(kiocb->ki_filp); From c1194b1a30ce40f7e81a5ed4c4ee94867b30263b Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 17 Dec 2021 10:03:30 +0100 Subject: [PATCH 0163/5344] crypto: omap-aes - Fix broken pm_runtime_and_get() usage commit c2aec59be093bd44627bc4f6bc67e4614a93a7b6 upstream. This fix is basically the same as 3d6b661330a7 ("crypto: stm32 - Revert broken pm_runtime_resume_and_get changes"), just for the omap driver. If the return value isn't used, then pm_runtime_get_sync() has to be used for ensuring that the usage count is balanced. Fixes: 1f34cc4a8da3 ("crypto: omap-aes - Fix PM reference leak on omap-aes.c") Cc: stable@vger.kernel.org Signed-off-by: Heiner Kallweit Signed-off-by: Herbert Xu Signed-off-by: Gang Li --- drivers/crypto/omap-aes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/omap-aes.c b/drivers/crypto/omap-aes.c index 72edb10181b86..41ffb088831de 100644 --- a/drivers/crypto/omap-aes.c +++ b/drivers/crypto/omap-aes.c @@ -1318,7 +1318,7 @@ static int omap_aes_suspend(struct device *dev) static int omap_aes_resume(struct device *dev) { - pm_runtime_resume_and_get(dev); + pm_runtime_get_sync(dev); return 0; } #endif From 960283514cfa756767aeb58749e06d2a5c30bd83 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 24 Dec 2021 21:12:54 -0800 Subject: [PATCH 0164/5344] mm/damon/dbgfs: protect targets destructions with kdamond_lock commit 34796417964b8d0aef45a99cf6c2d20cebe33733 upstream. DAMON debugfs interface iterates current monitoring targets in 'dbgfs_target_ids_read()' while holding the corresponding 'kdamond_lock'. However, it also destructs the monitoring targets in 'dbgfs_before_terminate()' without holding the lock. This can result in a use_after_free bug. This commit avoids the race by protecting the destruction with the corresponding 'kdamond_lock'. Link: https://lkml.kernel.org/r/20211221094447.2241-1-sj@kernel.org Reported-by: Sangwoo Bae Fixes: 4bc05954d007 ("mm/damon: implement a debugfs-based user space interface") Signed-off-by: SeongJae Park Cc: [5.15.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Gang Li --- mm/damon/dbgfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index f94d19a690dfa..d3bc110430f9d 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -309,10 +309,12 @@ static int dbgfs_before_terminate(struct damon_ctx *ctx) if (!targetid_is_pid(ctx)) return 0; + mutex_lock(&ctx->kdamond_lock); damon_for_each_target_safe(t, next, ctx) { put_pid((struct pid *)t->id); damon_destroy_target(t); } + mutex_unlock(&ctx->kdamond_lock); return 0; } From 0cb280da6ce69492e9561b5ad9550d76e8de9dd4 Mon Sep 17 00:00:00 2001 From: Laurence de Bruxelles Date: Sat, 1 Jan 2022 15:41:49 +0000 Subject: [PATCH 0165/5344] rtc: pxa: fix null pointer dereference commit 34127b3632b21e5c391756e724b1198eb9917981 upstream. With the latest stable kernel versions the rtc on the PXA based Zaurus does not work, when booting I see the following kernel messages: pxa-rtc pxa-rtc: failed to find rtc clock source pxa-rtc pxa-rtc: Unable to init SA1100 RTC sub-device pxa-rtc: probe of pxa-rtc failed with error -2 hctosys: unable to open rtc device (rtc0) I think this is because commit f2997775b111 ("rtc: sa1100: fix possible race condition") moved the allocation of the rtc_device struct out of sa1100_rtc_init and into sa1100_rtc_probe. This means that pxa_rtc_probe also needs to do allocation for the rtc_device struct, otherwise sa1100_rtc_init will try to dereference a null pointer. This patch adds that allocation by copying how sa1100_rtc_probe in drivers/rtc/rtc-sa1100.c does it; after the IRQs are set up a managed rtc_device is allocated. I've tested this patch with `qemu-system-arm -machine akita` and with a real Zaurus SL-C1000 applied to 4.19, 5.4, and 5.10. Signed-off-by: Laurence de Bruxelles Fixes: f2997775b111 ("rtc: sa1100: fix possible race condition") Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220101154149.12026-1-lfdebrux@gmail.com Signed-off-by: Gang Li --- drivers/rtc/rtc-pxa.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/rtc/rtc-pxa.c b/drivers/rtc/rtc-pxa.c index d2f1d8f754bf3..cf8119b6d3204 100644 --- a/drivers/rtc/rtc-pxa.c +++ b/drivers/rtc/rtc-pxa.c @@ -330,6 +330,10 @@ static int __init pxa_rtc_probe(struct platform_device *pdev) if (sa1100_rtc->irq_alarm < 0) return -ENXIO; + sa1100_rtc->rtc = devm_rtc_allocate_device(&pdev->dev); + if (IS_ERR(sa1100_rtc->rtc)) + return PTR_ERR(sa1100_rtc->rtc); + pxa_rtc->base = devm_ioremap(dev, pxa_rtc->ress->start, resource_size(pxa_rtc->ress)); if (!pxa_rtc->base) { From ff018103ae78ed41f5f51f687c30d5cf1f91b4ca Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Sun, 5 Dec 2021 12:07:49 +0200 Subject: [PATCH 0166/5344] net/mlx5: Set command entry semaphore up once got index free commit 8e715cd613a1e872b9d918e912d90b399785761a upstream. Avoid a race where command work handler may fail to allocate command entry index, by holding the command semaphore down till command entry index is being freed. Fixes: 410bd754cd73 ("net/mlx5: Add retry mechanism to the command entry index allocation") Signed-off-by: Moshe Shemesh Reviewed-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed Signed-off-by: Gang Li --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 76547d35cd0e1..60a0b3a04e730 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -147,8 +147,12 @@ static void cmd_ent_put(struct mlx5_cmd_work_ent *ent) if (!refcount_dec_and_test(&ent->refcnt)) return; - if (ent->idx >= 0) - cmd_free_index(ent->cmd, ent->idx); + if (ent->idx >= 0) { + struct mlx5_cmd *cmd = ent->cmd; + + cmd_free_index(cmd, ent->idx); + up(ent->page_queue ? &cmd->pages_sem : &cmd->sem); + } cmd_free_ent(ent); } @@ -1577,8 +1581,6 @@ static void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool force vector = vec & 0xffffffff; for (i = 0; i < (1 << cmd->log_sz); i++) { if (test_bit(i, &vector)) { - struct semaphore *sem; - ent = cmd->ent_arr[i]; /* if we already completed the command, ignore it */ @@ -1601,10 +1603,6 @@ static void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool force dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) cmd_ent_put(ent); - if (ent->page_queue) - sem = &cmd->pages_sem; - else - sem = &cmd->sem; ent->ts2 = ktime_get_ns(); memcpy(ent->out->first.data, ent->lay->out, sizeof(ent->lay->out)); dump_command(dev, ent, 0); @@ -1658,7 +1656,6 @@ static void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool force */ complete(&ent->done); } - up(sem); } } } From fd94fbe73ef252b04948c48e717bd98818b15a3e Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 6 Jan 2022 10:46:11 +0100 Subject: [PATCH 0167/5344] x86/hyperv: Properly deal with empty cpumasks in hyperv_flush_tlb_multi() commit 51500b71d500f251037ed339047a4d9e7d7e295b upstream. KASAN detected the following issue: BUG: KASAN: slab-out-of-bounds in hyperv_flush_tlb_multi+0xf88/0x1060 Read of size 4 at addr ffff8880011ccbc0 by task kcompactd0/33 CPU: 1 PID: 33 Comm: kcompactd0 Not tainted 5.14.0-39.el9.x86_64+debug #1 Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.0 12/17/2019 Call Trace: dump_stack_lvl+0x57/0x7d print_address_description.constprop.0+0x1f/0x140 ? hyperv_flush_tlb_multi+0xf88/0x1060 __kasan_report.cold+0x7f/0x11e ? hyperv_flush_tlb_multi+0xf88/0x1060 kasan_report+0x38/0x50 hyperv_flush_tlb_multi+0xf88/0x1060 flush_tlb_mm_range+0x1b1/0x200 ptep_clear_flush+0x10e/0x150 ... Allocated by task 0: kasan_save_stack+0x1b/0x40 __kasan_kmalloc+0x7c/0x90 hv_common_init+0xae/0x115 hyperv_init+0x97/0x501 apic_intr_mode_init+0xb3/0x1e0 x86_late_time_init+0x92/0xa2 start_kernel+0x338/0x3eb secondary_startup_64_no_verify+0xc2/0xcb The buggy address belongs to the object at ffff8880011cc800 which belongs to the cache kmalloc-1k of size 1024 The buggy address is located 960 bytes inside of 1024-byte region [ffff8880011cc800, ffff8880011ccc00) 'hyperv_flush_tlb_multi+0xf88/0x1060' points to hv_cpu_number_to_vp_number() and '960 bytes' means we're trying to get VP_INDEX for CPU#240. 'nr_cpus' here is exactly 240 so we're trying to access past hv_vp_index's last element. This can (and will) happen when 'cpus' mask is empty and cpumask_last() will return '>=nr_cpus'. Commit ad0a6bad4475 ("x86/hyperv: check cpu mask after interrupt has been disabled") tried to deal with empty cpumask situation but apparently didn't fully fix the issue. 'cpus' cpumask which is passed to hyperv_flush_tlb_multi() is 'mm_cpumask(mm)' (which is '&mm->cpu_bitmap'). This mask changes every time the particular mm is scheduled/unscheduled on some CPU (see switch_mm_irqs_off()), disabling IRQs on the CPU which is performing remote TLB flush has zero influence on whether the particular process can get scheduled/unscheduled on _other_ CPUs so e.g. in the case where the mm was scheduled on one other CPU and got unscheduled during hyperv_flush_tlb_multi()'s execution will lead to cpumask becoming empty. It doesn't seem that there's a good way to protect 'mm_cpumask(mm)' from changing during hyperv_flush_tlb_multi()'s execution. It would be possible to copy it in the very beginning of the function but this is a waste. It seems we can deal with changing cpumask just fine. When 'cpus' cpumask changes during hyperv_flush_tlb_multi()'s execution, there are two possible issues: - 'Under-flushing': we will not flush TLB on a CPU which got added to the mask while hyperv_flush_tlb_multi() was already running. This is not a problem as this is equal to mm getting scheduled on that CPU right after TLB flush. - 'Over-flushing': we may flush TLB on a CPU which is already cleared from the mask. First, extra TLB flush preserves correctness. Second, Hyper-V's TLB flush hypercall takes 'mm->pgd' argument so Hyper-V may avoid the flush if CR3 doesn't match. Fix the immediate issue with cpumask_last()/hv_cpu_number_to_vp_number() and remove the pointless cpumask_empty() check from the beginning of the function as it really doesn't protect anything. Also, avoid the hypercall altogether when 'flush->processor_mask' ends up being empty. Fixes: ad0a6bad4475 ("x86/hyperv: check cpu mask after interrupt has been disabled") Signed-off-by: Vitaly Kuznetsov Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20220106094611.1404218-1-vkuznets@redhat.com Signed-off-by: Wei Liu Signed-off-by: Gang Li --- arch/x86/hyperv/mmu.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c index 2c87350c1fb09..13838f1a45bd4 100644 --- a/arch/x86/hyperv/mmu.c +++ b/arch/x86/hyperv/mmu.c @@ -68,15 +68,6 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus, local_irq_save(flags); - /* - * Only check the mask _after_ interrupt has been disabled to avoid the - * mask changing under our feet. - */ - if (cpumask_empty(cpus)) { - local_irq_restore(flags); - return; - } - flush_pcpu = (struct hv_tlb_flush **) this_cpu_ptr(hyperv_pcpu_input_arg); @@ -115,7 +106,9 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus, * must. We will also check all VP numbers when walking the * supplied CPU set to remain correct in all cases. */ - if (hv_cpu_number_to_vp_number(cpumask_last(cpus)) >= 64) + cpu = cpumask_last(cpus); + + if (cpu < nr_cpumask_bits && hv_cpu_number_to_vp_number(cpu) >= 64) goto do_ex_hypercall; for_each_cpu(cpu, cpus) { @@ -131,6 +124,12 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus, __set_bit(vcpu, (unsigned long *) &flush->processor_mask); } + + /* nothing to flush if 'processor_mask' ends up being empty */ + if (!flush->processor_mask) { + local_irq_restore(flags); + return; + } } /* From 86422ef3d9ceacf3669fd0d2be528ace2c285f6f Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 30 Dec 2021 16:20:24 +0200 Subject: [PATCH 0168/5344] vdpa/mlx5: Fix wrong configuration of virtio_version_1_0 commit 97143b70aa847f2b0a1f959dde126b76ff7b5376 upstream. Remove overriding of virtio_version_1_0 which forced the virtqueue object to version 1. Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices") Signed-off-by: Eli Cohen Link: https://lore.kernel.org/r/20211230142024.142979-1-elic@nvidia.com Signed-off-by: Michael S. Tsirkin Reviewed-by: Parav Pandit Acked-by: Jason Wang Reviewed-by: Si-Wei Liu Signed-off-by: Gang Li --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 42e07847c1daa..34b73a96895b7 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -812,8 +812,6 @@ static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtque MLX5_SET(virtio_q, vq_ctx, umem_3_id, mvq->umem3.id); MLX5_SET(virtio_q, vq_ctx, umem_3_size, mvq->umem3.size); MLX5_SET(virtio_q, vq_ctx, pd, ndev->mvdev.res.pdn); - if (MLX5_CAP_DEV_VDPA_EMULATION(ndev->mvdev.mdev, eth_frame_offload_type)) - MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0, 1); err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out)); if (err) From 5be22defdb3b427f59ce970ffe0460603ac56e99 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:41 -0800 Subject: [PATCH 0169/5344] mm/damon/dbgfs: remove an unnecessary variable commit 70b8480812d0a3930049a44820a1fa149b090c10 upstream. Patch series "mm/damon: Hide unnecessary information disclosures". DAMON is exposing some unnecessary information including kernel pointer in kernel log and tracepoint. This patchset hides such information. The first patch is only for a trivial cleanup, though. This patch (of 4): This commit removes a unnecessarily used variable in dbgfs_target_ids_write(). Link: https://lkml.kernel.org/r/20211229131016.23641-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211229131016.23641-2-sj@kernel.org Fixes: 4bc05954d007 ("mm/damon: implement a debugfs-based user space interface") Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Gang Li --- mm/damon/dbgfs.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index d3bc110430f9d..d40dca3f74a8b 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -185,7 +185,7 @@ static ssize_t dbgfs_target_ids_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct damon_ctx *ctx = file->private_data; - char *kbuf, *nrs; + char *kbuf; unsigned long *targets; ssize_t nr_targets; ssize_t ret = count; @@ -196,9 +196,8 @@ static ssize_t dbgfs_target_ids_write(struct file *file, if (IS_ERR(kbuf)) return PTR_ERR(kbuf); - nrs = kbuf; - targets = str_to_target_ids(nrs, ret, &nr_targets); + targets = str_to_target_ids(kbuf, ret, &nr_targets); if (!targets) { ret = -ENOMEM; goto out; From 24a4caf66d01fab4a180016c87d95fd265b6758a Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 19 Jan 2022 18:10:28 -0800 Subject: [PATCH 0170/5344] lib/Kconfig.debug: make TEST_KMOD depend on PAGE_SIZE_LESS_THAN_256KB commit bbd2e05fad3e692ff2495895975bd0fce02bdbae upstream. Commit b05fbcc36be1 ("btrfs: disable build on platforms having page size 256K") disabled btrfs for configurations that used a 256kB page size. However, it did not fully solve the problem because CONFIG_TEST_KMOD selects CONFIG_BTRFS, which does not account for the dependency. This results in a Kconfig warning and the failed BUILD_BUG_ON error returning. WARNING: unmet direct dependencies detected for BTRFS_FS Depends on [n]: BLOCK [=y] && !PPC_256K_PAGES && !PAGE_SIZE_256KB [=y] Selected by [m]: - TEST_KMOD [=m] && RUNTIME_TESTING_MENU [=y] && m && MODULES [=y] && NETDEVICES [=y] && NET_CORE [=y] && INET [=y] && BLOCK [=y] To resolve this, add CONFIG_PAGE_SIZE_LESS_THAN_256KB as a dependency of CONFIG_TEST_KMOD so there is no more invalid configuration or build errors. Link: https://lkml.kernel.org/r/20211129230141.228085-4-nathan@kernel.org Fixes: b05fbcc36be1 ("btrfs: disable build on platforms having page size 256K") Signed-off-by: Nathan Chancellor Reported-by: kernel test robot Cc: Chris Mason Cc: David Sterba Cc: Josef Bacik Cc: Luis Chamberlain Cc: Nick Desaulniers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Gang Li --- lib/Kconfig.debug | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index ec54ee392d58e..ea6628a9d76bd 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1988,6 +1988,7 @@ config TEST_KMOD depends on m depends on NETDEVICES && NET_CORE && INET # for TUN depends on BLOCK + depends on PAGE_SIZE_LESS_THAN_256KB # for BTRFS select TEST_LKM select XFS_FS select TUN From b7d4f59cb941d890be6ddb36d7e0dd8f8e61a2ff Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 10 Dec 2021 14:46:22 -0800 Subject: [PATCH 0171/5344] timers: implement usleep_idle_range() commit e4779015fd5d2fb8390c258268addff24d6077c7 upstream. Patch series "mm/damon: Fix fake /proc/loadavg reports", v3. This patchset fixes DAMON's fake load report issue. The first patch makes yet another variant of usleep_range() for this fix, and the second patch fixes the issue of DAMON by making it using the newly introduced function. This patch (of 2): Some kernel threads such as DAMON could need to repeatedly sleep in micro seconds level. Because usleep_range() sleeps in uninterruptible state, however, such threads would make /proc/loadavg reports fake load. To help such cases, this commit implements a variant of usleep_range() called usleep_idle_range(). It is same to usleep_range() but sets the state of the current task as TASK_IDLE while sleeping. Link: https://lkml.kernel.org/r/20211126145015.15862-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211126145015.15862-2-sj@kernel.org Signed-off-by: SeongJae Park Suggested-by: Andrew Morton Reviewed-by: Thomas Gleixner Tested-by: Oleksandr Natalenko Cc: John Stultz Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Gang Li --- include/linux/delay.h | 14 +++++++++++++- kernel/time/timer.c | 16 +++++++++------- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/include/linux/delay.h b/include/linux/delay.h index 8e6828094c1ee..4e684562036d6 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -20,6 +20,7 @@ */ #include +#include extern unsigned long loops_per_jiffy; @@ -58,7 +59,18 @@ void calibrate_delay(void); void __attribute__((weak)) calibration_delay_done(void); void msleep(unsigned int msecs); unsigned long msleep_interruptible(unsigned int msecs); -void usleep_range(unsigned long min, unsigned long max); +void usleep_range_state(unsigned long min, unsigned long max, + unsigned int state); + +static inline void usleep_range(unsigned long min, unsigned long max) +{ + usleep_range_state(min, max, TASK_UNINTERRUPTIBLE); +} + +static inline void usleep_idle_range(unsigned long min, unsigned long max) +{ + usleep_range_state(min, max, TASK_IDLE); +} static inline void ssleep(unsigned int seconds) { diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 16a2b62f5f74c..93560ad5fc8d3 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -2080,26 +2080,28 @@ unsigned long msleep_interruptible(unsigned int msecs) EXPORT_SYMBOL(msleep_interruptible); /** - * usleep_range - Sleep for an approximate time - * @min: Minimum time in usecs to sleep - * @max: Maximum time in usecs to sleep + * usleep_range_state - Sleep for an approximate time in a given state + * @min: Minimum time in usecs to sleep + * @max: Maximum time in usecs to sleep + * @state: State of the current task that will be while sleeping * * In non-atomic context where the exact wakeup time is flexible, use - * usleep_range() instead of udelay(). The sleep improves responsiveness + * usleep_range_state() instead of udelay(). The sleep improves responsiveness * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces * power usage by allowing hrtimers to take advantage of an already- * scheduled interrupt instead of scheduling a new one just for this sleep. */ -void __sched usleep_range(unsigned long min, unsigned long max) +void __sched usleep_range_state(unsigned long min, unsigned long max, + unsigned int state) { ktime_t exp = ktime_add_us(ktime_get(), min); u64 delta = (u64)(max - min) * NSEC_PER_USEC; for (;;) { - __set_current_state(TASK_UNINTERRUPTIBLE); + __set_current_state(state); /* Do not return before the requested sleep time has elapsed */ if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS)) break; } } -EXPORT_SYMBOL(usleep_range); +EXPORT_SYMBOL(usleep_range_state); From 04bff2260fed2c07af82865b7683aaa0a096bdca Mon Sep 17 00:00:00 2001 From: Pavankumar Kondeti Date: Sat, 22 Jan 2022 08:33:22 +0530 Subject: [PATCH 0172/5344] usb: gadget: f_sourcesink: Fix isoc transfer for USB_SPEED_SUPER_PLUS commit 904edf8aeb459697129be5fde847e2a502f41fd9 upstream. Currently when gadget enumerates in super speed plus, the isoc endpoint request buffer size is not calculated correctly. Fix this by checking the gadget speed against USB_SPEED_SUPER_PLUS and update the request buffer size. Fixes: 90c4d05780d4 ("usb: fix various gadgets null ptr deref on 10gbps cabling.") Cc: stable Signed-off-by: Pavankumar Kondeti Link: https://lore.kernel.org/r/1642820602-20619-1-git-send-email-quic_pkondeti@quicinc.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Gang Li --- drivers/usb/gadget/function/f_sourcesink.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/gadget/function/f_sourcesink.c b/drivers/usb/gadget/function/f_sourcesink.c index 282737e4609ce..2c65a9bb3c81b 100644 --- a/drivers/usb/gadget/function/f_sourcesink.c +++ b/drivers/usb/gadget/function/f_sourcesink.c @@ -583,6 +583,7 @@ static int source_sink_start_ep(struct f_sourcesink *ss, bool is_in, if (is_iso) { switch (speed) { + case USB_SPEED_SUPER_PLUS: case USB_SPEED_SUPER: size = ss->isoc_maxpacket * (ss->isoc_mult + 1) * From ff49170ed74bda457fdfe3c8d7180adfae9b20f8 Mon Sep 17 00:00:00 2001 From: Georgi Valkov Date: Tue, 1 Feb 2022 08:16:18 +0100 Subject: [PATCH 0173/5344] ipheth: fix EOVERFLOW in ipheth_rcvbulk_callback commit 63e4b45c82ed1bde979da7052229a4229ce9cabf upstream. When rx_buf is allocated we need to account for IPHETH_IP_ALIGN, which reduces the usable size by 2 bytes. Otherwise we have 1512 bytes usable instead of 1514, and if we receive more than 1512 bytes, ipheth_rcvbulk_callback is called with status -EOVERFLOW, after which the driver malfunctiones and all communication stops. Resolves ipheth 2-1:4.2: ipheth_rcvbulk_callback: urb status: -75 Fixes: f33d9e2b48a3 ("usbnet: ipheth: fix connectivity with iOS 14") Signed-off-by: Georgi Valkov Tested-by: Jan Kiszka Link: https://lore.kernel.org/all/B60B8A4B-92A0-49B3-805D-809A2433B46C@abv.bg/ Link: https://lore.kernel.org/all/24851bd2769434a5fc24730dce8e8a984c5a4505.1643699778.git.jan.kiszka@siemens.com/ Signed-off-by: Jakub Kicinski Signed-off-by: Gang Li --- drivers/net/usb/ipheth.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/usb/ipheth.c b/drivers/net/usb/ipheth.c index 345576f1a7470..73ad78f47763c 100644 --- a/drivers/net/usb/ipheth.c +++ b/drivers/net/usb/ipheth.c @@ -121,7 +121,7 @@ static int ipheth_alloc_urbs(struct ipheth_device *iphone) if (tx_buf == NULL) goto free_rx_urb; - rx_buf = usb_alloc_coherent(iphone->udev, IPHETH_BUF_SIZE, + rx_buf = usb_alloc_coherent(iphone->udev, IPHETH_BUF_SIZE + IPHETH_IP_ALIGN, GFP_KERNEL, &rx_urb->transfer_dma); if (rx_buf == NULL) goto free_tx_buf; @@ -146,7 +146,7 @@ static int ipheth_alloc_urbs(struct ipheth_device *iphone) static void ipheth_free_urbs(struct ipheth_device *iphone) { - usb_free_coherent(iphone->udev, IPHETH_BUF_SIZE, iphone->rx_buf, + usb_free_coherent(iphone->udev, IPHETH_BUF_SIZE + IPHETH_IP_ALIGN, iphone->rx_buf, iphone->rx_urb->transfer_dma); usb_free_coherent(iphone->udev, IPHETH_BUF_SIZE, iphone->tx_buf, iphone->tx_urb->transfer_dma); @@ -317,7 +317,7 @@ static int ipheth_rx_submit(struct ipheth_device *dev, gfp_t mem_flags) usb_fill_bulk_urb(dev->rx_urb, udev, usb_rcvbulkpipe(udev, dev->bulk_in), - dev->rx_buf, IPHETH_BUF_SIZE, + dev->rx_buf, IPHETH_BUF_SIZE + IPHETH_IP_ALIGN, ipheth_rcvbulk_callback, dev); dev->rx_urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; From eb5a40f094fb6067fd6478a98429a295910cebe7 Mon Sep 17 00:00:00 2001 From: Antony Antony Date: Tue, 1 Feb 2022 07:51:57 +0100 Subject: [PATCH 0174/5344] xfrm: fix the if_id check in changelink commit 6d0d95a1c2b07270870e7be16575c513c29af3f1 upstream. if_id will be always 0, because it was not yet initialized. Fixes: 8dce43919566 ("xfrm: interface with if_id 0 should return error") Reported-by: Pavel Machek Signed-off-by: Antony Antony Signed-off-by: Steffen Klassert Signed-off-by: Gang Li --- net/xfrm/xfrm_interface.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 08343201513a9..3932d3aaff270 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -695,12 +695,12 @@ static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], struct net *net = xi->net; struct xfrm_if_parms p = {}; + xfrmi_netlink_parms(data, &p); if (!p.if_id) { NL_SET_ERR_MSG(extack, "if_id must be non zero"); return -EINVAL; } - xfrmi_netlink_parms(data, &p); xi = xfrmi_locate(net, &p); if (!xi) { xi = netdev_priv(dev); From 81145b1f0db9cadb7d6ce32a42f7e3c05e415622 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 20 Dec 2021 16:38:02 +0000 Subject: [PATCH 0175/5344] livepatch: Fix build failure on 32 bits processors commit 2f293651eca3eacaeb56747dede31edace7329d2 upstream. Trying to build livepatch on powerpc/32 results in: kernel/livepatch/core.c: In function 'klp_resolve_symbols': kernel/livepatch/core.c:221:23: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] 221 | sym = (Elf64_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info); | ^ kernel/livepatch/core.c:221:21: error: assignment to 'Elf32_Sym *' {aka 'struct elf32_sym *'} from incompatible pointer type 'Elf64_Sym *' {aka 'struct elf64_sym *'} [-Werror=incompatible-pointer-types] 221 | sym = (Elf64_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info); | ^ kernel/livepatch/core.c: In function 'klp_apply_section_relocs': kernel/livepatch/core.c:312:35: error: passing argument 1 of 'klp_resolve_symbols' from incompatible pointer type [-Werror=incompatible-pointer-types] 312 | ret = klp_resolve_symbols(sechdrs, strtab, symndx, sec, sec_objname); | ^~~~~~~ | | | Elf32_Shdr * {aka struct elf32_shdr *} kernel/livepatch/core.c:193:44: note: expected 'Elf64_Shdr *' {aka 'struct elf64_shdr *'} but argument is of type 'Elf32_Shdr *' {aka 'struct elf32_shdr *'} 193 | static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab, | ~~~~~~~~~~~~^~~~~~~ Fix it by using the right types instead of forcing 64 bits types. Fixes: 7c8e2bdd5f0d ("livepatch: Apply vmlinux-specific KLP relocations early") Signed-off-by: Christophe Leroy Acked-by: Petr Mladek Acked-by: Joe Lawrence Acked-by: Miroslav Benes Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/5288e11b018a762ea3351cc8fb2d4f15093a4457.1640017960.git.christophe.leroy@csgroup.eu Signed-off-by: Gang Li --- kernel/livepatch/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 5954d52d0a1d4..9db10ba07f8d2 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -190,7 +190,7 @@ static int klp_find_object_symbol(const char *objname, const char *name, return -EINVAL; } -static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab, +static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab, unsigned int symndx, Elf_Shdr *relasec, const char *sec_objname) { @@ -218,7 +218,7 @@ static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab, relas = (Elf_Rela *) relasec->sh_addr; /* For each rela in this klp relocation section */ for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) { - sym = (Elf64_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info); + sym = (Elf_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info); if (sym->st_shndx != SHN_LIVEPATCH) { pr_err("symbol %s is not marked as a livepatch symbol\n", strtab + sym->st_name); From a4dc4496fc16caece3a17e5b13b2ecc1c7288ef8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 14 Feb 2022 20:10:03 -0800 Subject: [PATCH 0176/5344] io_uring: add a schedule point in io_add_buffers() commit f240762f88b4b1b58561939ffd44837759756477 upstream. Looping ~65535 times doing kmalloc() calls can trigger soft lockups, especially with DEBUG features (like KASAN). [ 253.536212] watchdog: BUG: soft lockup - CPU#64 stuck for 26s! [b219417889:12575] [ 253.544433] Modules linked in: vfat fat i2c_mux_pca954x i2c_mux spidev cdc_acm xhci_pci xhci_hcd sha3_generic gq(O) [ 253.544451] CPU: 64 PID: 12575 Comm: b219417889 Tainted: G S O 5.17.0-smp-DEV #801 [ 253.544457] RIP: 0010:kernel_text_address (./include/asm-generic/sections.h:192 ./include/linux/kallsyms.h:29 kernel/extable.c:67 kernel/extable.c:98) [ 253.544464] Code: 0f 93 c0 48 c7 c1 e0 63 d7 a4 48 39 cb 0f 92 c1 20 c1 0f b6 c1 5b 5d c3 90 0f 1f 44 00 00 55 48 89 e5 41 57 41 56 53 48 89 fb <48> c7 c0 00 00 80 a0 41 be 01 00 00 00 48 39 c7 72 0c 48 c7 c0 40 [ 253.544468] RSP: 0018:ffff8882d8baf4c0 EFLAGS: 00000246 [ 253.544471] RAX: 1ffff1105b175e00 RBX: ffffffffa13ef09a RCX: 00000000a13ef001 [ 253.544474] RDX: ffffffffa13ef09a RSI: ffff8882d8baf558 RDI: ffffffffa13ef09a [ 253.544476] RBP: ffff8882d8baf4d8 R08: ffff8882d8baf5e0 R09: 0000000000000004 [ 253.544479] R10: ffff8882d8baf5e8 R11: ffffffffa0d59a50 R12: ffff8882eab20380 [ 253.544481] R13: ffffffffa0d59a50 R14: dffffc0000000000 R15: 1ffff1105b175eb0 [ 253.544483] FS: 00000000016d3380(0000) GS:ffff88af48c00000(0000) knlGS:0000000000000000 [ 253.544486] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 253.544488] CR2: 00000000004af0f0 CR3: 00000002eabfa004 CR4: 00000000003706e0 [ 253.544491] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 253.544492] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 253.544494] Call Trace: [ 253.544496] [ 253.544498] ? io_queue_sqe (fs/io_uring.c:7143) [ 253.544505] __kernel_text_address (kernel/extable.c:78) [ 253.544508] unwind_get_return_address (arch/x86/kernel/unwind_frame.c:19) [ 253.544514] arch_stack_walk (arch/x86/kernel/stacktrace.c:27) [ 253.544517] ? io_queue_sqe (fs/io_uring.c:7143) [ 253.544521] stack_trace_save (kernel/stacktrace.c:123) [ 253.544527] ____kasan_kmalloc (mm/kasan/common.c:39 mm/kasan/common.c:45 mm/kasan/common.c:436 mm/kasan/common.c:515) [ 253.544531] ? ____kasan_kmalloc (mm/kasan/common.c:39 mm/kasan/common.c:45 mm/kasan/common.c:436 mm/kasan/common.c:515) [ 253.544533] ? __kasan_kmalloc (mm/kasan/common.c:524) [ 253.544535] ? kmem_cache_alloc_trace (./include/linux/kasan.h:270 mm/slab.c:3567) [ 253.544541] ? io_issue_sqe (fs/io_uring.c:4556 fs/io_uring.c:4589 fs/io_uring.c:6828) [ 253.544544] ? __io_queue_sqe (fs/io_uring.c:?) [ 253.544551] __kasan_kmalloc (mm/kasan/common.c:524) [ 253.544553] kmem_cache_alloc_trace (./include/linux/kasan.h:270 mm/slab.c:3567) [ 253.544556] ? io_issue_sqe (fs/io_uring.c:4556 fs/io_uring.c:4589 fs/io_uring.c:6828) [ 253.544560] io_issue_sqe (fs/io_uring.c:4556 fs/io_uring.c:4589 fs/io_uring.c:6828) [ 253.544564] ? __kasan_slab_alloc (mm/kasan/common.c:45 mm/kasan/common.c:436 mm/kasan/common.c:469) [ 253.544567] ? __kasan_slab_alloc (mm/kasan/common.c:39 mm/kasan/common.c:45 mm/kasan/common.c:436 mm/kasan/common.c:469) [ 253.544569] ? kmem_cache_alloc_bulk (mm/slab.h:732 mm/slab.c:3546) [ 253.544573] ? __io_alloc_req_refill (fs/io_uring.c:2078) [ 253.544578] ? io_submit_sqes (fs/io_uring.c:7441) [ 253.544581] ? __se_sys_io_uring_enter (fs/io_uring.c:10154 fs/io_uring.c:10096) [ 253.544584] ? __x64_sys_io_uring_enter (fs/io_uring.c:10096) [ 253.544587] ? do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) [ 253.544590] ? entry_SYSCALL_64_after_hwframe (??:?) [ 253.544596] __io_queue_sqe (fs/io_uring.c:?) [ 253.544600] io_queue_sqe (fs/io_uring.c:7143) [ 253.544603] io_submit_sqe (fs/io_uring.c:?) [ 253.544608] io_submit_sqes (fs/io_uring.c:?) [ 253.544612] __se_sys_io_uring_enter (fs/io_uring.c:10154 fs/io_uring.c:10096) [ 253.544616] __x64_sys_io_uring_enter (fs/io_uring.c:10096) [ 253.544619] do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) [ 253.544623] entry_SYSCALL_64_after_hwframe (??:?) Fixes: ddf0322db79c ("io_uring: add IORING_OP_PROVIDE_BUFFERS") Signed-off-by: Eric Dumazet Cc: Jens Axboe Cc: Pavel Begunkov Cc: io-uring Reported-by: syzbot Link: https://lore.kernel.org/r/20220215041003.2394784-1-eric.dumazet@gmail.com Signed-off-by: Jens Axboe Signed-off-by: Gang Li --- fs/io_uring.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index fa10f4469d18b..70bf380bd7d3f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4298,6 +4298,7 @@ static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head) } else { list_add_tail(&buf->list, &(*head)->list); } + cond_resched(); } return i ? i : -ENOMEM; From ad09bcd76a3c843c84858f7031b827c09e8b3a28 Mon Sep 17 00:00:00 2001 From: Laibin Qiu Date: Sat, 22 Jan 2022 19:10:45 +0800 Subject: [PATCH 0177/5344] block/wbt: fix negative inflight counter when remove scsi device commit e92bc4cd34de2ce454bdea8cd198b8067ee4e123 upstream. Now that we disable wbt by set WBT_STATE_OFF_DEFAULT in wbt_disable_default() when switch elevator to bfq. And when we remove scsi device, wbt will be enabled by wbt_enable_default. If it become false positive between wbt_wait() and wbt_track() when submit write request. The following is the scenario that triggered the problem. T1 T2 T3 elevator_switch_mq bfq_init_queue wbt_disable_default <= Set rwb->enable_state (OFF) Submit_bio blk_mq_make_request rq_qos_throttle <= rwb->enable_state (OFF) scsi_remove_device sd_remove del_gendisk blk_unregister_queue elv_unregister_queue wbt_enable_default <= Set rwb->enable_state (ON) q_qos_track <= rwb->enable_state (ON) ^^^^^^ this request will mark WBT_TRACKED without inflight add and will lead to drop rqw->inflight to -1 in wbt_done() which will trigger IO hung. Fix this by move wbt_enable_default() from elv_unregister to bfq_exit_queue(). Only re-enable wbt when bfq exit. Fixes: 76a8040817b4b ("blk-wbt: make sure throttle is enabled properly") Remove oneline stale comment, and kill one oneshot local variable. Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/linux-block/20211214133103.551813-1-qiulaibin@huawei.com/ Signed-off-by: Laibin Qiu Signed-off-by: Jens Axboe Signed-off-by: Gang Li --- block/bfq-iosched.c | 2 ++ block/elevator.c | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 6264db493d45d..efb190b4d955a 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -6396,6 +6396,8 @@ static void bfq_exit_queue(struct elevator_queue *e) spin_unlock_irq(&bfqd->lock); #endif + wbt_enable_default(bfqd->queue); + kfree(bfqd); } diff --git a/block/elevator.c b/block/elevator.c index 30c934efacb4c..3ba826230c578 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -525,8 +525,6 @@ void elv_unregister_queue(struct request_queue *q) kobject_del(&e->kobj); e->registered = 0; - /* Re-enable throttling in case elevator disabled it */ - wbt_enable_default(q); } } From e6b8599581344f8a833b5ca9cbc818090bf18354 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sun, 20 Feb 2022 12:17:40 +0900 Subject: [PATCH 0178/5344] scsi: libsas: Fix sas_ata_qc_issue() handling of NCQ NON DATA commands commit 8454563e4c2aafbfb81a383ab423ea8b9b430a25 upstream. To detect for the DMA_NONE (no data transfer) DMA direction, sas_ata_qc_issue() tests if the command protocol is ATA_PROT_NODATA. This test does not include the ATA_CMD_NCQ_NON_DATA command as this command protocol is defined as ATA_PROT_NCQ_NODATA (equal to ATA_PROT_FLAG_NCQ) and not as ATA_PROT_NODATA. To include both NCQ and non-NCQ commands when testing for the DMA_NONE DMA direction, use "!ata_is_data()". Link: https://lore.kernel.org/r/20220220031810.738362-2-damien.lemoal@opensource.wdc.com Fixes: 176ddd89171d ("scsi: libsas: Reset num_scatter if libata marks qc as NODATA") Cc: stable@vger.kernel.org Reviewed-by: John Garry Reviewed-by: Jack Wang Signed-off-by: Damien Le Moal Signed-off-by: Martin K. Petersen Signed-off-by: Gang Li --- drivers/scsi/libsas/sas_ata.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c index 5d28bb7f2ca40..5c801705c4700 100644 --- a/drivers/scsi/libsas/sas_ata.c +++ b/drivers/scsi/libsas/sas_ata.c @@ -201,7 +201,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc) task->total_xfer_len = qc->nbytes; task->num_scatter = qc->n_elem; task->data_dir = qc->dma_dir; - } else if (qc->tf.protocol == ATA_PROT_NODATA) { + } else if (!ata_is_data(qc->tf.protocol)) { task->data_dir = DMA_NONE; } else { for_each_sg(qc->sg, sg, qc->n_elem, si) From 98770680c131a0363dd3bde885c1cc7a9e2b8380 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ma=C3=ADra=20Canal?= Date: Tue, 22 Feb 2022 10:16:57 -0300 Subject: [PATCH 0179/5344] drm/amd/display: Remove vupdate_int_entry definition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 3679b8518cd213c25d555553ef212e233faf698c upstream. Remove the vupdate_int_entry definition and utilization to avoid the following warning by Clang: drivers/gpu/drm/amd/amdgpu/../display/dc/irq/dcn21/irq_service_dcn21.c:410:2: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides] vupdate_no_lock_int_entry(0), ^~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/gpu/drm/amd/amdgpu/../display/dc/irq/dcn21/irq_service_dcn21.c:280:39: note: expanded from macro 'vupdate_no_lock_int_entry' [DC_IRQ_SOURCE_VUPDATE1 + reg_num] = {\ ^~ drivers/gpu/drm/amd/amdgpu/../display/dc/irq/dcn21/irq_service_dcn21.c:404:2: note: previous initialization is here vupdate_int_entry(0), ^~~~~~~~~~~~~~~~~~~~ drivers/gpu/drm/amd/amdgpu/../display/dc/irq/dcn21/irq_service_dcn21.c:269:39: note: expanded from macro 'vupdate_int_entry' [DC_IRQ_SOURCE_VUPDATE1 + reg_num] = {\ ^~ drivers/gpu/drm/amd/amdgpu/../display/dc/irq/dcn21/irq_service_dcn21.c:411:2: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides] vupdate_no_lock_int_entry(1), ^~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/gpu/drm/amd/amdgpu/../display/dc/irq/dcn21/irq_service_dcn21.c:280:39: note: expanded from macro 'vupdate_no_lock_int_entry' [DC_IRQ_SOURCE_VUPDATE1 + reg_num] = {\ ^~ drivers/gpu/drm/amd/amdgpu/../display/dc/irq/dcn21/irq_service_dcn21.c:405:2: note: previous initialization is here vupdate_int_entry(1), ^~~~~~~~~~~~~~~~~~~~ drivers/gpu/drm/amd/amdgpu/../display/dc/irq/dcn21/irq_service_dcn21.c:269:39: note: expanded from macro 'vupdate_int_entry' [DC_IRQ_SOURCE_VUPDATE1 + reg_num] = {\ ^~ drivers/gpu/drm/amd/amdgpu/../display/dc/irq/dcn21/irq_service_dcn21.c:412:2: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides] vupdate_no_lock_int_entry(2), ^~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/gpu/drm/amd/amdgpu/../display/dc/irq/dcn21/irq_service_dcn21.c:280:39: note: expanded from macro 'vupdate_no_lock_int_entry' [DC_IRQ_SOURCE_VUPDATE1 + reg_num] = {\ ^~ drivers/gpu/drm/amd/amdgpu/../display/dc/irq/dcn21/irq_service_dcn21.c:406:2: note: previous initialization is here vupdate_int_entry(2), ^~~~~~~~~~~~~~~~~~~~ drivers/gpu/drm/amd/amdgpu/../display/dc/irq/dcn21/irq_service_dcn21.c:269:39: note: expanded from macro 'vupdate_int_entry' [DC_IRQ_SOURCE_VUPDATE1 + reg_num] = {\ ^~ drivers/gpu/drm/amd/amdgpu/../display/dc/irq/dcn21/irq_service_dcn21.c:413:2: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides] vupdate_no_lock_int_entry(3), ^~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/gpu/drm/amd/amdgpu/../display/dc/irq/dcn21/irq_service_dcn21.c:280:39: note: expanded from macro 'vupdate_no_lock_int_entry' [DC_IRQ_SOURCE_VUPDATE1 + reg_num] = {\ ^~ drivers/gpu/drm/amd/amdgpu/../display/dc/irq/dcn21/irq_service_dcn21.c:407:2: note: previous initialization is here vupdate_int_entry(3), ^~~~~~~~~~~~~~~~~~~~ drivers/gpu/drm/amd/amdgpu/../display/dc/irq/dcn21/irq_service_dcn21.c:269:39: note: expanded from macro 'vupdate_int_entry' [DC_IRQ_SOURCE_VUPDATE1 + reg_num] = {\ ^~ drivers/gpu/drm/amd/amdgpu/../display/dc/irq/dcn21/irq_service_dcn21.c:414:2: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides] vupdate_no_lock_int_entry(4), ^~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/gpu/drm/amd/amdgpu/../display/dc/irq/dcn21/irq_service_dcn21.c:280:39: note: expanded from macro 'vupdate_no_lock_int_entry' [DC_IRQ_SOURCE_VUPDATE1 + reg_num] = {\ ^~ drivers/gpu/drm/amd/amdgpu/../display/dc/irq/dcn21/irq_service_dcn21.c:408:2: note: previous initialization is here vupdate_int_entry(4), ^~~~~~~~~~~~~~~~~~~~ drivers/gpu/drm/amd/amdgpu/../display/dc/irq/dcn21/irq_service_dcn21.c:269:39: note: expanded from macro 'vupdate_int_entry' [DC_IRQ_SOURCE_VUPDATE1 + reg_num] = {\ ^~ drivers/gpu/drm/amd/amdgpu/../display/dc/irq/dcn21/irq_service_dcn21.c:415:2: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides] vupdate_no_lock_int_entry(5), ^~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/gpu/drm/amd/amdgpu/../display/dc/irq/dcn21/irq_service_dcn21.c:280:39: note: expanded from macro 'vupdate_no_lock_int_entry' [DC_IRQ_SOURCE_VUPDATE1 + reg_num] = {\ ^~ drivers/gpu/drm/amd/amdgpu/../display/dc/irq/dcn21/irq_service_dcn21.c:409:2: note: previous initialization is here vupdate_int_entry(5), ^~~~~~~~~~~~~~~~~~~~ drivers/gpu/drm/amd/amdgpu/../display/dc/irq/dcn21/irq_service_dcn21.c:269:39: note: expanded from macro 'vupdate_int_entry' [DC_IRQ_SOURCE_VUPDATE1 + reg_num] = {\ ^~ 6 warnings generated. Fixes: 688f97ed3f5e ("drm/amd/display: Add vupdate_no_lock interrupts for DCN2.1") Signed-off-by: Maíra Canal Signed-off-by: Alex Deucher Signed-off-by: Gang Li --- .../amd/display/dc/irq/dcn21/irq_service_dcn21.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/irq/dcn21/irq_service_dcn21.c b/drivers/gpu/drm/amd/display/dc/irq/dcn21/irq_service_dcn21.c index 623455cd75203..d6f988403941b 100644 --- a/drivers/gpu/drm/amd/display/dc/irq/dcn21/irq_service_dcn21.c +++ b/drivers/gpu/drm/amd/display/dc/irq/dcn21/irq_service_dcn21.c @@ -227,14 +227,6 @@ static const struct irq_source_info_funcs vupdate_no_lock_irq_info_funcs = { .funcs = &pflip_irq_info_funcs\ } -#define vupdate_int_entry(reg_num)\ - [DC_IRQ_SOURCE_VUPDATE1 + reg_num] = {\ - IRQ_REG_ENTRY(OTG, reg_num,\ - OTG_GLOBAL_SYNC_STATUS, VUPDATE_INT_EN,\ - OTG_GLOBAL_SYNC_STATUS, VUPDATE_EVENT_CLEAR),\ - .funcs = &vblank_irq_info_funcs\ - } - /* vupdate_no_lock_int_entry maps to DC_IRQ_SOURCE_VUPDATEx, to match semantic * of DCE's DC_IRQ_SOURCE_VUPDATEx. */ @@ -348,12 +340,6 @@ irq_source_info_dcn21[DAL_IRQ_SOURCES_NUMBER] = { dc_underflow_int_entry(6), [DC_IRQ_SOURCE_DMCU_SCP] = dummy_irq_entry(), [DC_IRQ_SOURCE_VBIOS_SW] = dummy_irq_entry(), - vupdate_int_entry(0), - vupdate_int_entry(1), - vupdate_int_entry(2), - vupdate_int_entry(3), - vupdate_int_entry(4), - vupdate_int_entry(5), vupdate_no_lock_int_entry(0), vupdate_no_lock_int_entry(1), vupdate_no_lock_int_entry(2), From aa387a8e7d4eeff0b2d64740ab97a2bfa976d5fa Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Mon, 24 Jan 2022 21:25:04 +0200 Subject: [PATCH 0180/5344] net/mlx5: Fix possible deadlock on rule deletion commit b645e57debca846f51b3209907546ea857ddd3f5 upstream. Add missing call to up_write_ref_node() which releases the semaphore in case the FTE doesn't have destinations, such in drop rule case. Fixes: 465e7baab6d9 ("net/mlx5: Fix deletion of duplicate rules") Signed-off-by: Maor Gottlieb Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed Signed-off-by: Gang Li --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index cb25e091777b8..0ae3846d75e91 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -1939,6 +1939,8 @@ void mlx5_del_flow_rules(struct mlx5_flow_handle *handle) fte->node.del_hw_func = NULL; up_write_ref_node(&fte->node, false); tree_put_node(&fte->node, false); + } else { + up_write_ref_node(&fte->node, false); } kfree(handle); } From f39fab5fc6b7d8d2e23c7659b0906f4c42cae407 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 1 Mar 2022 00:46:19 +0100 Subject: [PATCH 0181/5344] netfilter: nf_queue: handle socket prefetch commit 3b836da4081fa585cf6c392f62557496f2cb0efe upstream. In case someone combines bpf socket assign and nf_queue, then we will queue an skb who references a struct sock that did not have its reference count incremented. As we leave rcu protection, there is no guarantee that skb->sk is still valid. For refcount-less skb->sk case, try to increment the reference count and then override the destructor. In case of failure we have two choices: orphan the skb and 'delete' preselect or let nf_queue() drop the packet. Do the latter, it should not happen during normal operation. Fixes: cf7fbe660f2d ("bpf: Add socket assign support") Acked-by: Joe Stringer Signed-off-by: Florian Westphal Signed-off-by: Gang Li --- net/netfilter/nf_queue.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index f8f52ff99cfb0..0e221df8da753 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -183,6 +183,18 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, break; } + if (skb_sk_is_prefetched(skb)) { + struct sock *sk = skb->sk; + + if (!sk_is_refcounted(sk)) { + if (!refcount_inc_not_zero(&sk->sk_refcnt)) + return -ENOTCONN; + + /* drop refcount on skb_orphan */ + skb->destructor = sock_edemux; + } + } + entry = kmalloc(sizeof(*entry) + route_key_size, GFP_ATOMIC); if (!entry) { status = -ENOMEM; From 9bb218dbebc938880ee10801ce353099e5bfc278 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 1 Mar 2022 22:29:04 -0500 Subject: [PATCH 0182/5344] tracing/histogram: Fix sorting on old "cpu" value commit 1d1898f65616c4601208963c3376c1d828cbf2c7 upstream. When trying to add a histogram against an event with the "cpu" field, it was impossible due to "cpu" being a keyword to key off of the running CPU. So to fix this, it was changed to "common_cpu" to match the other generic fields (like "common_pid"). But since some scripts used "cpu" for keying off of the CPU (for events that did not have "cpu" as a field, which is most of them), a backward compatibility trick was added such that if "cpu" was used as a key, and the event did not have "cpu" as a field name, then it would fallback and switch over to "common_cpu". This fix has a couple of subtle bugs. One was that when switching over to "common_cpu", it did not change the field name, it just set a flag. But the code still found a "cpu" field. The "cpu" field is used for filtering and is returned when the event does not have a "cpu" field. This was found by: # cd /sys/kernel/tracing # echo hist:key=cpu,pid:sort=cpu > events/sched/sched_wakeup/trigger # cat events/sched/sched_wakeup/hist Which showed the histogram unsorted: { cpu: 19, pid: 1175 } hitcount: 1 { cpu: 6, pid: 239 } hitcount: 2 { cpu: 23, pid: 1186 } hitcount: 14 { cpu: 12, pid: 249 } hitcount: 2 { cpu: 3, pid: 994 } hitcount: 5 Instead of hard coding the "cpu" checks, take advantage of the fact that trace_event_field_field() returns a special field for "cpu" and "CPU" if the event does not have "cpu" as a field. This special field has the "filter_type" of "FILTER_CPU". Check that to test if the returned field is of the CPU type instead of doing the string compare. Also, fix the sorting bug by testing for the hist_field flag of HIST_FIELD_FL_CPU when setting up the sort routine. Otherwise it will use the special CPU field to know what compare routine to use, and since that special field does not have a size, it returns tracing_map_cmp_none. Cc: stable@vger.kernel.org Fixes: 1e3bac71c505 ("tracing/histogram: Rename "cpu" to "common_cpu"") Reported-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) Signed-off-by: Gang Li --- kernel/trace/trace_events_hist.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index f63766366e238..6c8c9869d71cc 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2882,9 +2882,9 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, /* * For backward compatibility, if field_name * was "cpu", then we treat this the same as - * common_cpu. + * common_cpu. This also works for "CPU". */ - if (strcmp(field_name, "cpu") == 0) { + if (field && field->filter_type == FILTER_CPU) { *flags |= HIST_FIELD_FL_CPU; } else { hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, @@ -5231,7 +5231,7 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data) if (hist_field->flags & HIST_FIELD_FL_STACKTRACE) cmp_fn = tracing_map_cmp_none; - else if (!field) + else if (!field || hist_field->flags & HIST_FIELD_FL_CPU) cmp_fn = tracing_map_cmp_num(hist_field->size, hist_field->is_signed); else if (is_string_field(field)) From 1008f060bdfda923ac9e01d50b4920c9fa1808fb Mon Sep 17 00:00:00 2001 From: Anirudh Rayabharam Date: Sat, 5 Mar 2022 15:25:25 +0530 Subject: [PATCH 0183/5344] vhost: fix hung thread due to erroneous iotlb entries commit e2ae38cf3d91837a493cb2093c87700ff3cbe667 upstream. In vhost_iotlb_add_range_ctx(), range size can overflow to 0 when start is 0 and last is ULONG_MAX. One instance where it can happen is when userspace sends an IOTLB message with iova=size=uaddr=0 (vhost_process_iotlb_msg). So, an entry with size = 0, start = 0, last = ULONG_MAX ends up in the iotlb. Next time a packet is sent, iotlb_access_ok() loops indefinitely due to that erroneous entry. Call Trace: iotlb_access_ok+0x21b/0x3e0 drivers/vhost/vhost.c:1340 vq_meta_prefetch+0xbc/0x280 drivers/vhost/vhost.c:1366 vhost_transport_do_send_pkt+0xe0/0xfd0 drivers/vhost/vsock.c:104 vhost_worker+0x23d/0x3d0 drivers/vhost/vhost.c:372 kthread+0x2e9/0x3a0 kernel/kthread.c:377 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 Reported by syzbot at: https://syzkaller.appspot.com/bug?extid=0abd373e2e50d704db87 To fix this, do two things: 1. Return -EINVAL in vhost_chr_write_iter() when userspace asks to map a range with size 0. 2. Fix vhost_iotlb_add_range_ctx() to handle the range [0, ULONG_MAX] by splitting it into two entries. Fixes: 0bbe30668d89e ("vhost: factor out IOTLB") Reported-by: syzbot+0abd373e2e50d704db87@syzkaller.appspotmail.com Tested-by: syzbot+0abd373e2e50d704db87@syzkaller.appspotmail.com Signed-off-by: Anirudh Rayabharam Link: https://lore.kernel.org/r/20220305095525.5145-1-mail@anirudhrb.com Signed-off-by: Michael S. Tsirkin Signed-off-by: Gang Li --- drivers/vhost/iotlb.c | 11 +++++++++++ drivers/vhost/vhost.c | 5 +++++ 2 files changed, 16 insertions(+) diff --git a/drivers/vhost/iotlb.c b/drivers/vhost/iotlb.c index 5c99e1112cbb7..e262d7366c398 100644 --- a/drivers/vhost/iotlb.c +++ b/drivers/vhost/iotlb.c @@ -57,6 +57,17 @@ int vhost_iotlb_add_range_ctx(struct vhost_iotlb *iotlb, if (last < start) return -EFAULT; + /* If the range being mapped is [0, ULONG_MAX], split it into two entries + * otherwise its size would overflow u64. + */ + if (start == 0 && last == ULONG_MAX) { + u64 mid = last / 2; + + vhost_iotlb_add_range_ctx(iotlb, start, mid, addr, perm, opaque); + addr += mid + 1; + start = mid + 1; + } + if (iotlb->limit && iotlb->nmaps == iotlb->limit && iotlb->flags & VHOST_IOTLB_FLAG_RETIRE) { diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 0c741df2e34b9..cc78ffdd2bd9a 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1158,6 +1158,11 @@ ssize_t vhost_chr_write_iter(struct vhost_dev *dev, goto done; } + if (msg.size == 0) { + ret = -EINVAL; + goto done; + } + if (dev->msg_handler) ret = dev->msg_handler(dev, &msg); else From 2542408d71d6324fe97d080d92ad77af65f5c47a Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Fri, 4 Feb 2022 11:47:44 +0200 Subject: [PATCH 0184/5344] net/mlx5: Fix a race on command flush flow commit 063bd355595428750803d8736a9bb7c8db67d42d upstream. Fix a refcount use after free warning due to a race on command entry. Such race occurs when one of the commands releases its last refcount and frees its index and entry while another process running command flush flow takes refcount to this command entry. The process which handles commands flush may see this command as needed to be flushed if the other process released its refcount but didn't release the index yet. Fix it by adding the needed spin lock. It fixes the following warning trace: refcount_t: addition on 0; use-after-free. WARNING: CPU: 11 PID: 540311 at lib/refcount.c:25 refcount_warn_saturate+0x80/0xe0 ... RIP: 0010:refcount_warn_saturate+0x80/0xe0 ... Call Trace: mlx5_cmd_trigger_completions+0x293/0x340 [mlx5_core] mlx5_cmd_flush+0x3a/0xf0 [mlx5_core] enter_error_state+0x44/0x80 [mlx5_core] mlx5_fw_fatal_reporter_err_work+0x37/0xe0 [mlx5_core] process_one_work+0x1be/0x390 worker_thread+0x4d/0x3d0 ? rescuer_thread+0x350/0x350 kthread+0x141/0x160 ? set_kthread_struct+0x40/0x40 ret_from_fork+0x1f/0x30 Fixes: 50b2412b7e78 ("net/mlx5: Avoid possible free of command entry while timeout comp handler") Signed-off-by: Moshe Shemesh Reviewed-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed Signed-off-by: Gang Li --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 60a0b3a04e730..402dae3bca4da 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -130,11 +130,8 @@ static int cmd_alloc_index(struct mlx5_cmd *cmd) static void cmd_free_index(struct mlx5_cmd *cmd, int idx) { - unsigned long flags; - - spin_lock_irqsave(&cmd->alloc_lock, flags); + lockdep_assert_held(&cmd->alloc_lock); set_bit(idx, &cmd->bitmask); - spin_unlock_irqrestore(&cmd->alloc_lock, flags); } static void cmd_ent_get(struct mlx5_cmd_work_ent *ent) @@ -144,17 +141,21 @@ static void cmd_ent_get(struct mlx5_cmd_work_ent *ent) static void cmd_ent_put(struct mlx5_cmd_work_ent *ent) { + struct mlx5_cmd *cmd = ent->cmd; + unsigned long flags; + + spin_lock_irqsave(&cmd->alloc_lock, flags); if (!refcount_dec_and_test(&ent->refcnt)) - return; + goto out; if (ent->idx >= 0) { - struct mlx5_cmd *cmd = ent->cmd; - cmd_free_index(cmd, ent->idx); up(ent->page_queue ? &cmd->pages_sem : &cmd->sem); } cmd_free_ent(ent); +out: + spin_unlock_irqrestore(&cmd->alloc_lock, flags); } static struct mlx5_cmd_layout *get_inst(struct mlx5_cmd *cmd, int idx) From b3a4207d8c4d8959ead7376e881a08b5138b3baf Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 10 Mar 2022 15:52:11 +0800 Subject: [PATCH 0185/5344] vhost: allow batching hint without size commit 95932ab2ea07b79cdb33121e2f40ccda9e6a73b5 upstream. Commit e2ae38cf3d91 ("vhost: fix hung thread due to erroneous iotlb entries") tries to reject the IOTLB message whose size is zero. But the size is not necessarily meaningful, one example is the batching hint, so the commit breaks that. Fixing this be reject zero size message only if the message is used to update/invalidate the IOTLB. Fixes: e2ae38cf3d91 ("vhost: fix hung thread due to erroneous iotlb entries") Reported-by: Eli Cohen Cc: Anirudh Rayabharam Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20220310075211.4801-1-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin Tested-by: Eli Cohen Signed-off-by: Gang Li --- drivers/vhost/vhost.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index cc78ffdd2bd9a..c0958d1472841 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1158,7 +1158,9 @@ ssize_t vhost_chr_write_iter(struct vhost_dev *dev, goto done; } - if (msg.size == 0) { + if ((msg.type == VHOST_IOTLB_UPDATE || + msg.type == VHOST_IOTLB_INVALIDATE) && + msg.size == 0) { ret = -EINVAL; goto done; } From 6f6081d971c445d957de39954a1837f0df649be8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 18 Mar 2022 11:28:13 -0600 Subject: [PATCH 0186/5344] io_uring: terminate manual loop iterator loop correctly for non-vecs commit 5e929367468c8f97cd1ffb0417316cecfebef94b upstream. The fix for not advancing the iterator if we're using fixed buffers is broken in that it can hit a condition where we don't terminate the loop. This results in io-wq looping forever, asking to read (or write) 0 bytes for every subsequent loop. Reported-by: Joel Jaeschke Link: https://github.com/axboe/liburing/issues/549 Fixes: 16c8d2df7ec0 ("io_uring: ensure symmetry in handling iter types in loop_rw_iter()") Signed-off-by: Jens Axboe Signed-off-by: Gang Li --- fs/io_uring.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 70bf380bd7d3f..efde3b86689e8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3311,13 +3311,15 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) ret = nr; break; } + ret += nr; if (!iov_iter_is_bvec(iter)) { iov_iter_advance(iter, nr); } else { - req->rw.len -= nr; req->rw.addr += nr; + req->rw.len -= nr; + if (!req->rw.len) + break; } - ret += nr; if (nr != iovec.iov_len) break; } From 52acb65ff4e2fbf3e3d255e8b0889800ca86814a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 21 Mar 2022 09:59:57 -0700 Subject: [PATCH 0187/5344] tcp: ensure PMTU updates are processed during fastopen commit ed0c99dc0f499ff8b6e75b5ae6092ab42be1ad39 upstream. tp->rx_opt.mss_clamp is not populated, yet, during TFO send so we rise it to the local MSS. tp->mss_cache is not updated, however: tcp_v6_connect(): tp->rx_opt.mss_clamp = IPV6_MIN_MTU - headers; tcp_connect(): tcp_connect_init(): tp->mss_cache = min(mtu, tp->rx_opt.mss_clamp) tcp_send_syn_data(): tp->rx_opt.mss_clamp = tp->advmss After recent fixes to ICMPv6 PTB handling we started dropping PMTU updates higher than tp->mss_cache. Because of the stale tp->mss_cache value PMTU updates during TFO are always dropped. Thanks to Wei for helping zero in on the problem and the fix! Fixes: c7bb4b89033b ("ipv6: tcp: drop silly ICMPv6 packet too big messages") Reported-by: Andre Nash Reported-by: Neil Spring Reviewed-by: Wei Wang Acked-by: Yuchung Cheng Acked-by: Martin KaFai Lau Signed-off-by: Jakub Kicinski Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20220321165957.1769954-1-kuba@kernel.org Signed-off-by: Jakub Kicinski Signed-off-by: Gang Li --- net/ipv4/tcp_output.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a170bcf6d6cd0..95ad9d3373f37 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3867,6 +3867,7 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) */ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct tcp_fastopen_request *fo = tp->fastopen_req; int space, err = 0; @@ -3881,8 +3882,10 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) * private TCP options. The cost is reduced data space in SYN :( */ tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp); + /* Sync mss_cache after updating the mss_clamp */ + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); - space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) - + space = __tcp_mtu_to_mss(sk, icsk->icsk_pmtu_cookie) - MAX_TCP_OPTION_SPACE; space = min_t(size_t, space, fo->size); From dce558f70fd64ab15d9a3208f6ac0f415ca8c091 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Tue, 22 Mar 2022 14:44:06 -0700 Subject: [PATCH 0188/5344] mm/hwpoison: fix error page recovered but reported "not recovered" commit 046545a661af2beec21de7b90ca0e35f05088a81 upstream. When an uncorrected memory error is consumed there is a race between the CMCI from the memory controller reporting an uncorrected error with a UCNA signature, and the core reporting and SRAR signature machine check when the data is about to be consumed. If the CMCI wins that race, the page is marked poisoned when uc_decode_notifier() calls memory_failure() and the machine check processing code finds the page already poisoned. It calls kill_accessing_process() to make sure a SIGBUS is sent. But returns the wrong error code. Console log looks like this: mce: Uncorrected hardware memory error in user-access at 3710b3400 Memory failure: 0x3710b3: recovery action for dirty LRU page: Recovered Memory failure: 0x3710b3: already hardware poisoned Memory failure: 0x3710b3: Sending SIGBUS to einj_mem_uc:361438 due to hardware memory corruption mce: Memory error not recovered kill_accessing_process() is supposed to return -EHWPOISON to notify that SIGBUS is already set to the process and kill_me_maybe() doesn't have to send it again. But current code simply fails to do this, so fix it to make sure to work as intended. This change avoids the noise message "Memory error not recovered" and skips duplicate SIGBUSs. [tony.luck@intel.com: reword some parts of commit message] Link: https://lkml.kernel.org/r/20220113231117.1021405-1-naoya.horiguchi@linux.dev Fixes: a3f5d80ea401 ("mm,hwpoison: send SIGBUS with error virutal address") Signed-off-by: Naoya Horiguchi Reported-by: Youquan Song Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Gang Li --- mm/memory-failure.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index c874c3510ab13..9e68f49cf4d7a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -666,8 +666,10 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn, (void *)&priv); if (ret == 1 && priv.tk.addr) kill_proc(&priv.tk, pfn, flags); + else + ret = 0; mmap_read_unlock(p->mm); - return ret ? -EFAULT : -EHWPOISON; + return ret > 0 ? -EHWPOISON : -EFAULT; } static const char *action_name[] = { From bd2f2af6f8193eb3fb677ad1442e0c26feb73b5d Mon Sep 17 00:00:00 2001 From: Anirudh Rayabharam Date: Sat, 12 Mar 2022 19:41:21 +0530 Subject: [PATCH 0189/5344] vhost: handle error while adding split ranges to iotlb commit 03a91c9af2c42ae14afafb829a4b7e6589ab5892 upstream. vhost_iotlb_add_range_ctx() handles the range [0, ULONG_MAX] by splitting it into two ranges and adding them separately. The return value of adding the first range to the iotlb is currently ignored. Check the return value and bail out in case of an error. Signed-off-by: Anirudh Rayabharam Link: https://lore.kernel.org/r/20220312141121.4981-1-mail@anirudhrb.com Signed-off-by: Michael S. Tsirkin Fixes: e2ae38cf3d91 ("vhost: fix hung thread due to erroneous iotlb entries") Reviewed-by: Stefano Garzarella Signed-off-by: Gang Li --- drivers/vhost/iotlb.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/iotlb.c b/drivers/vhost/iotlb.c index e262d7366c398..483ea5451b520 100644 --- a/drivers/vhost/iotlb.c +++ b/drivers/vhost/iotlb.c @@ -62,8 +62,12 @@ int vhost_iotlb_add_range_ctx(struct vhost_iotlb *iotlb, */ if (start == 0 && last == ULONG_MAX) { u64 mid = last / 2; + int err = vhost_iotlb_add_range_ctx(iotlb, start, mid, addr, + perm, opaque); + + if (err) + return err; - vhost_iotlb_add_range_ctx(iotlb, start, mid, addr, perm, opaque); addr += mid + 1; start = mid + 1; } From 65d8f7473e68e626d2b6f4a108e63f5aed023387 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 31 Mar 2022 08:27:09 -0400 Subject: [PATCH 0190/5344] XArray: Update the LRU list in xas_split() commit 3ed4bb77156da0bc732847c8c9df92454c1fbeea upstream. When splitting a value entry, we may need to add the new nodes to the LRU list and remove the parent node from the LRU list. The WARN_ON checks in shadow_lru_isolate() catch this oversight. This bug was latent until we stopped splitting folios in shrink_page_list() with commit 820c4e2e6f51 ("mm/vmscan: Free non-shmem folios without splitting them"). That allows the creation of large shadow entries, and subsequently when trying to page in a small page, we will split the large shadow entry in __filemap_add_folio(). Fixes: 8fc75643c5e1 ("XArray: add xas_split") Reported-by: Hugh Dickins Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Gang Li --- lib/xarray.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/xarray.c b/lib/xarray.c index 7d22b30591275..96e108940afd7 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -1078,6 +1078,7 @@ void xas_split(struct xa_state *xas, void *entry, unsigned int order) xa_mk_node(child)); if (xa_is_value(curr)) values--; + xas_update(xas, child); } else { unsigned int canon = offset - xas->xa_sibs; @@ -1092,6 +1093,7 @@ void xas_split(struct xa_state *xas, void *entry, unsigned int order) } while (offset-- > xas->xa_offset); node->nr_values += values; + xas_update(xas, node); } EXPORT_SYMBOL_GPL(xas_split); #endif From 6e922d957464eb6562000ba02a9776b6a3ac01bf Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 4 Jun 2020 10:48:19 +0200 Subject: [PATCH 0191/5344] ovl: only pass ->ki_flags to ovl_iocb_to_rwf() commit b778e1ee1afe2784c2a034bb1e0192423097cb36 upstream. Next patch will want to pass a modified set of flags, so... Signed-off-by: Miklos Szeredi Signed-off-by: Jiachen Zhang --- fs/overlayfs/file.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index ab5e92897270a..5bd9f3bb91423 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -213,9 +213,8 @@ static void ovl_file_accessed(struct file *file) touch_atime(&file->f_path); } -static rwf_t ovl_iocb_to_rwf(struct kiocb *iocb) +static rwf_t ovl_iocb_to_rwf(int ifl) { - int ifl = iocb->ki_flags; rwf_t flags = 0; if (ifl & IOCB_NOWAIT) @@ -246,7 +245,7 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) old_cred = ovl_override_creds(file_inode(file)->i_sb); ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, - ovl_iocb_to_rwf(iocb)); + ovl_iocb_to_rwf(iocb->ki_flags)); revert_creds(old_cred); ovl_file_accessed(file); @@ -281,7 +280,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) old_cred = ovl_override_creds(file_inode(file)->i_sb); file_start_write(real.file); ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, - ovl_iocb_to_rwf(iocb)); + ovl_iocb_to_rwf(iocb->ki_flags)); file_end_write(real.file); revert_creds(old_cred); From 46b9f752278cd875e0aaebdb6bf46bca1786f950 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 31 Aug 2020 14:15:29 -0400 Subject: [PATCH 0192/5344] ovl: provide a mount option "volatile" commit c86243b090bc25f81abe33ddfa9fe833e02c4d64 upstream. Container folks are complaining that dnf/yum issues too many sync while installing packages and this slows down the image build. Build requirement is such that they don't care if a node goes down while build was still going on. In that case, they will simply throw away unfinished layer and start new build. So they don't care about syncing intermediate state to the disk and hence don't want to pay the price associated with sync. So they are asking for mount options where they can disable sync on overlay mount point. They primarily seem to have two use cases. - For building images, they will mount overlay with nosync and then sync upper layer after unmounting overlay and reuse upper as lower for next layer. - For running containers, they don't seem to care about syncing upper layer because if node goes down, they will simply throw away upper layer and create a fresh one. So this patch provides a mount option "volatile" which disables all forms of sync. Now it is caller's responsibility to throw away upper if system crashes or shuts down and start fresh. With "volatile", I am seeing roughly 20% speed up in my VM where I am just installing emacs in an image. Installation time drops from 31 seconds to 25 seconds when nosync option is used. This is for the case of building on top of an image where all packages are already cached. That way I take out the network operations latency out of the measurement. Giuseppe is also looking to cut down on number of iops done on the disk. He is complaining that often in cloud their VMs are throttled if they cross the limit. This option can help them where they reduce number of iops (by cutting down on frequent sync and writebacks). Signed-off-by: Giuseppe Scrivano Signed-off-by: Vivek Goyal Signed-off-by: Miklos Szeredi Signed-off-by: Jiachen Zhang --- Documentation/filesystems/overlayfs.txt | 19 +++++++ fs/overlayfs/copy_up.c | 12 +++-- fs/overlayfs/file.c | 9 +++- fs/overlayfs/ovl_entry.h | 6 +++ fs/overlayfs/readdir.c | 3 ++ fs/overlayfs/super.c | 66 +++++++++++++++++++++++++ 6 files changed, 110 insertions(+), 5 deletions(-) diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt index 845d689e0fd71..83337edb21cf2 100644 --- a/Documentation/filesystems/overlayfs.txt +++ b/Documentation/filesystems/overlayfs.txt @@ -481,6 +481,25 @@ verified on mount time to check that upper file handles are not stale. This verification may cause significant overhead in some cases. +Volatile mount +-------------- + +This is enabled with the "volatile" mount option. Volatile mounts are not +guaranteed to survive a crash. It is strongly recommended that volatile +mounts are only used if data written to the overlay can be recreated +without significant effort. + +The advantage of mounting with the "volatile" option is that all forms of +sync calls to the upper filesystem are omitted. + +When overlay is mounted with "volatile" option, the directory +"$workdir/work/incompat/volatile" is created. During next mount, overlay +checks for this directory and refuses to mount if present. This is a strong +indicator that user should throw away upper and work directories and create +fresh one. In very limited cases where the user knows that the system has +not crashed and contents of upperdir are intact, The "volatile" directory +can be removed. + Testsuite --------- diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 143c52de97ecc..3e5f412471dbf 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -117,7 +117,8 @@ int ovl_copy_xattr(struct dentry *old, struct dentry *new) return error; } -static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) +static int ovl_copy_up_data(struct ovl_fs *ofs, struct path *old, + struct path *new, loff_t len) { struct file *old_file; struct file *new_file; @@ -170,7 +171,7 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) len -= bytes; } out: - if (!error) + if (!error && ovl_should_sync(ofs)) error = vfs_fsync(new_file, 0); fput(new_file); out_fput: @@ -439,6 +440,7 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) { + struct ovl_fs *ofs = c->dentry->d_sb->s_fs_info; int err; /* @@ -454,7 +456,8 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) upperpath.dentry = temp; ovl_path_lowerdata(c->dentry, &datapath); - err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size); + err = ovl_copy_up_data(ofs, &datapath, &upperpath, + c->stat.size); if (err) return err; } @@ -738,6 +741,7 @@ static bool ovl_need_meta_copy_up(struct dentry *dentry, umode_t mode, /* Copy up data of an inode which was copied up metadata only in the past. */ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) { + struct ovl_fs *ofs = c->dentry->d_sb->s_fs_info; struct path upperpath, datapath; int err; char *capability = NULL; @@ -758,7 +762,7 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) goto out; } - err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size); + err = ovl_copy_up_data(ofs, &datapath, &upperpath, c->stat.size); if (err) goto out_free; diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 5bd9f3bb91423..e5227ddc40abb 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -262,6 +262,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) struct fd real; const struct cred *old_cred; ssize_t ret; + int ifl = iocb->ki_flags; if (!iov_iter_count(iter)) return 0; @@ -277,10 +278,13 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) if (ret) goto out_unlock; + if (!ovl_should_sync(inode->i_sb->s_fs_info)) + ifl &= ~(IOCB_DSYNC | IOCB_SYNC); + old_cred = ovl_override_creds(file_inode(file)->i_sb); file_start_write(real.file); ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, - ovl_iocb_to_rwf(iocb->ki_flags)); + ovl_iocb_to_rwf(ifl)); file_end_write(real.file); revert_creds(old_cred); @@ -343,6 +347,9 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync) const struct cred *old_cred; int ret; + if (!ovl_should_sync(file_inode(file)->i_sb->s_fs_info)) + return 0; + ret = ovl_real_fdget_meta(file, &real, !datasync); if (ret) return ret; diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 28348c44ea5b2..d9f3d366e48d5 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -17,6 +17,7 @@ struct ovl_config { bool nfs_export; int xino; bool metacopy; + bool ovl_volatile; }; struct ovl_sb { @@ -75,6 +76,11 @@ struct ovl_fs { unsigned int xino_bits; }; +static inline bool ovl_should_sync(struct ovl_fs *ofs) +{ + return !ofs->config.ovl_volatile; +} + /* private information held for every overlayfs dentry */ struct ovl_entry { union { diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 7255e6a5838f8..0cee14cc21712 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -831,6 +831,9 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, if (!OVL_TYPE_UPPER(ovl_path_type(dentry))) return 0; + if (!ovl_should_sync(dentry->d_sb->s_fs_info)) + return 0; + /* * Need to check if we started out being a lower dir, but got copied up */ diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index f036d7544d4a6..cfaff4dc2730d 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -262,6 +262,8 @@ static int ovl_sync_fs(struct super_block *sb, int wait) if (!ofs->upper_mnt) return 0; + if (!ovl_should_sync(ofs)) + return 0; /* * If this is a sync(2) call or an emergency sync, all the super blocks * will be iterated, including upper_sb, so no need to do anything. @@ -366,6 +368,8 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry) if (ofs->config.metacopy != ovl_metacopy_def) seq_printf(m, ",metacopy=%s", ofs->config.metacopy ? "on" : "off"); + if (ofs->config.ovl_volatile) + seq_puts(m, ",volatile"); return 0; } @@ -406,6 +410,7 @@ enum { OPT_XINO_AUTO, OPT_METACOPY_ON, OPT_METACOPY_OFF, + OPT_VOLATILE, OPT_ERR, }; @@ -424,6 +429,7 @@ static const match_table_t ovl_tokens = { {OPT_XINO_AUTO, "xino=auto"}, {OPT_METACOPY_ON, "metacopy=on"}, {OPT_METACOPY_OFF, "metacopy=off"}, + {OPT_VOLATILE, "volatile"}, {OPT_ERR, NULL} }; @@ -562,6 +568,10 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) config->metacopy = false; break; + case OPT_VOLATILE: + config->ovl_volatile = true; + break; + default: pr_err("overlayfs: unrecognized mount option \"%s\" or missing value\n", p); return -EINVAL; @@ -576,6 +586,11 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) config->workdir = NULL; } + if (!config->upperdir && config->ovl_volatile) { + pr_info("option \"volatile\" is meaningless in a non-upper mount, ignoring it.\n"); + config->ovl_volatile = false; + } + err = ovl_parse_redirect_mode(config, config->redirect_mode); if (err) return err; @@ -1076,6 +1091,45 @@ static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs, return err; } +static struct dentry *ovl_lookup_or_create(struct dentry *parent, + const char *name, umode_t mode) +{ + size_t len = strlen(name); + struct dentry *child; + + inode_lock_nested(parent->d_inode, I_MUTEX_PARENT); + child = lookup_one_len(name, parent, len); + if (!IS_ERR(child) && !child->d_inode) + child = ovl_create_real(parent->d_inode, child, + OVL_CATTR(mode)); + inode_unlock(parent->d_inode); + dput(parent); + + return child; +} + +/* + * Creates $workdir/work/incompat/volatile/dirty file if it is not already + * present. + */ +static int ovl_create_volatile_dirty(struct ovl_fs *ofs) +{ + unsigned int ctr; + struct dentry *d = dget(ofs->workbasedir); + static const char *const volatile_path[] = { + OVL_WORKDIR_NAME, "incompat", "volatile", "dirty" + }; + const char *const *name = volatile_path; + + for (ctr = ARRAY_SIZE(volatile_path); ctr; ctr--, name++) { + d = ovl_lookup_or_create(d, *name, ctr > 1 ? S_IFDIR : S_IFREG); + if (IS_ERR(d)) + return PTR_ERR(d); + } + dput(d); + return 0; +} + static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, struct path *workpath) { @@ -1135,6 +1189,18 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, vfs_removexattr(ofs->workdir, OVL_XATTR_OPAQUE); } + /* + * For volatile mount, create a incompat/volatile/dirty file to keep + * track of it. + */ + if (ofs->config.ovl_volatile) { + err = ovl_create_volatile_dirty(ofs); + if (err < 0) { + pr_err("Failed to create volatile/dirty file.\n"); + goto out; + } + } + /* Check if upper/work fs supports file handles */ fh_type = ovl_can_decode_fh(ofs->workdir->d_sb); if (ofs->config.index && !fh_type) { From ce35a5b26cfb551c8c3c9b23fba8130280c5a051 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 1 Jun 2020 21:45:36 -0700 Subject: [PATCH 0193/5344] vfs: track per-sb writeback errors and report them to syncfs commit 735e4ae5ba28c886d249ad04d3c8cc097dad6336 upstream. Patch series "vfs: have syncfs() return error when there are writeback errors", v6. Currently, syncfs does not return errors when one of the inodes fails to be written back. It will return errors based on the legacy AS_EIO and AS_ENOSPC flags when syncing out the block device fails, but that's not particularly helpful for filesystems that aren't backed by a blockdev. It's also possible for a stray sync to lose those errors. The basic idea in this set is to track writeback errors at the superblock level, so that we can quickly and easily check whether something bad happened without having to fsync each file individually. syncfs is then changed to reliably report writeback errors after they occur, much in the same fashion as fsync does now. This patch (of 2): Usually we suggest that applications call fsync when they want to ensure that all data written to the file has made it to the backing store, but that can be inefficient when there are a lot of open files. Calling syncfs on the filesystem can be more efficient in some situations, but the error reporting doesn't currently work the way most people expect. If a single inode on a filesystem reports a writeback error, syncfs won't necessarily return an error. syncfs only returns an error if __sync_blockdev fails, and on some filesystems that's a no-op. It would be better if syncfs reported an error if there were any writeback failures. Then applications could call syncfs to see if there are any errors on any open files, and could then call fsync on all of the other descriptors to figure out which one failed. This patch adds a new errseq_t to struct super_block, and has mapping_set_error also record writeback errors there. To report those errors, we also need to keep an errseq_t in struct file to act as a cursor. This patch adds a dedicated field for that purpose, which slots nicely into 4 bytes of padding at the end of struct file on x86_64. An earlier version of this patch used an O_PATH file descriptor to cue the kernel that the open file should track the superblock error and not the inode's writeback error. I think that API is just too weird though. This is simpler and should make syncfs error reporting "just work" even if someone is multiplexing fsync and syncfs on the same fds. Signed-off-by: Jeff Layton Signed-off-by: Andrew Morton Reviewed-by: Jan Kara Cc: Andres Freund Cc: Matthew Wilcox Cc: Al Viro Cc: Christoph Hellwig Cc: Dave Chinner Cc: David Howells Link: http://lkml.kernel.org/r/20200428135155.19223-1-jlayton@kernel.org Link: http://lkml.kernel.org/r/20200428135155.19223-2-jlayton@kernel.org Signed-off-by: Linus Torvalds Signed-off-by: Jiachen Zhang --- drivers/dax/device.c | 1 + fs/file_table.c | 1 + fs/open.c | 3 +-- fs/sync.c | 6 ++++-- include/linux/fs.h | 16 ++++++++++++++++ include/linux/pagemap.h | 5 ++++- 6 files changed, 27 insertions(+), 5 deletions(-) diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 1af823b2fe6bd..4c0af2eb7e196 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -377,6 +377,7 @@ static int dax_open(struct inode *inode, struct file *filp) inode->i_mapping->a_ops = &dev_dax_aops; filp->f_mapping = inode->i_mapping; filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); + filp->f_sb_err = file_sample_sb_err(filp); filp->private_data = dev_dax; inode->i_flags = S_DAX; diff --git a/fs/file_table.c b/fs/file_table.c index b2c2d716914ea..7e3270224a8b9 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -198,6 +198,7 @@ static struct file *alloc_file(const struct path *path, int flags, file->f_inode = path->dentry->d_inode; file->f_mapping = path->dentry->d_inode->i_mapping; file->f_wb_err = filemap_sample_wb_err(file->f_mapping); + file->f_sb_err = file_sample_sb_err(file); if ((file->f_mode & FMODE_READ) && likely(fop->read || fop->read_iter)) file->f_mode |= FMODE_CAN_READ; diff --git a/fs/open.c b/fs/open.c index 0c976e884b027..828eb7b78f198 100644 --- a/fs/open.c +++ b/fs/open.c @@ -743,9 +743,8 @@ static int do_dentry_open(struct file *f, path_get(&f->f_path); f->f_inode = inode; f->f_mapping = inode->i_mapping; - - /* Ensure that we skip any errors that predate opening of the file */ f->f_wb_err = filemap_sample_wb_err(f->f_mapping); + f->f_sb_err = file_sample_sb_err(f); if (unlikely(f->f_flags & O_PATH)) { f->f_mode = FMODE_PATH | FMODE_OPENED; diff --git a/fs/sync.c b/fs/sync.c index 4d1ff010bc5af..c6f6f5be5682a 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -161,7 +161,7 @@ SYSCALL_DEFINE1(syncfs, int, fd) { struct fd f = fdget(fd); struct super_block *sb; - int ret; + int ret, ret2; if (!f.file) return -EBADF; @@ -171,8 +171,10 @@ SYSCALL_DEFINE1(syncfs, int, fd) ret = sync_filesystem(sb); up_read(&sb->s_umount); + ret2 = errseq_check_and_advance(&sb->s_wb_err, &f.file->f_sb_err); + fdput(f); - return ret; + return ret ? ret : ret2; } /** diff --git a/include/linux/fs.h b/include/linux/fs.h index b3b357efc731b..c4438a53d7891 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -983,6 +983,7 @@ struct file { #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; errseq_t f_wb_err; + errseq_t f_sb_err; /* for syncfs */ } __randomize_layout __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ @@ -1527,6 +1528,9 @@ struct super_block { /* Being remounted read-only */ int s_readonly_remount; + /* per-sb errseq_t for reporting writeback errors via syncfs */ + errseq_t s_wb_err; + /* AIO completions deferred from interrupt context */ struct workqueue_struct *s_dio_done_wq; struct hlist_head s_pins; @@ -2834,6 +2838,18 @@ static inline errseq_t filemap_sample_wb_err(struct address_space *mapping) return errseq_sample(&mapping->wb_err); } +/** + * file_sample_sb_err - sample the current errseq_t to test for later errors + * @mapping: mapping to be sampled + * + * Grab the most current superblock-level errseq_t value for the given + * struct file. + */ +static inline errseq_t file_sample_sb_err(struct file *file) +{ + return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err); +} + static inline int filemap_nr_thps(struct address_space *mapping) { #ifdef CONFIG_READ_ONLY_THP_FOR_FS diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index fac2dc02bd27a..3b1a6ec3ca8f3 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -51,7 +51,10 @@ static inline void mapping_set_error(struct address_space *mapping, int error) return; /* Record in wb_err for checkers using errseq_t based tracking */ - filemap_set_wb_err(mapping, error); + __filemap_set_wb_err(mapping, error); + + /* Record it in superblock */ + errseq_set(&mapping->host->i_sb->s_wb_err, error); /* Record it in flags for now, for legacy callers */ if (error == -ENOSPC) From 22a3fd1d5e563e6e561b12b2c92a268044a3a76e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 1 Jun 2020 21:45:40 -0700 Subject: [PATCH 0194/5344] fs/buffer.c: record blockdev write errors in super_block that it backs commit 485e9605c05733759d3bd5aba4fbe561801f3658 upstream. When syncing out a block device (a'la __sync_blockdev), any error encountered will only be recorded in the bd_inode's mapping. When the blockdev contains a filesystem however, we'd like to also record the error in the super_block that's stored there. Make mark_buffer_write_io_error also record the error in the corresponding super_block when a writeback error occurs and the block device contains a mounted superblock. Since superblocks are RCU freed, hold the rcu_read_lock to ensure that the superblock doesn't go away while we're marking it. Signed-off-by: Jeff Layton Signed-off-by: Andrew Morton Reviewed-by: Jan Kara Cc: Al Viro Cc: Andres Freund Cc: Matthew Wilcox Cc: David Howells Cc: Christoph Hellwig Cc: Dave Chinner Link: http://lkml.kernel.org/r/20200428135155.19223-3-jlayton@kernel.org Signed-off-by: Linus Torvalds Signed-off-by: Jiachen Zhang --- fs/buffer.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/buffer.c b/fs/buffer.c index 847cfffd9c071..86e21ed6e2eae 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1120,12 +1120,19 @@ EXPORT_SYMBOL(mark_buffer_dirty); void mark_buffer_write_io_error(struct buffer_head *bh) { + struct super_block *sb; + set_buffer_write_io_error(bh); /* FIXME: do we need to set this in both places? */ if (bh->b_page && bh->b_page->mapping) mapping_set_error(bh->b_page->mapping, -EIO); if (bh->b_assoc_map) mapping_set_error(bh->b_assoc_map, -EIO); + rcu_read_lock(); + sb = READ_ONCE(bh->b_bdev->bd_super); + if (sb) + errseq_set(&sb->s_wb_err, -EIO); + rcu_read_unlock(); } EXPORT_SYMBOL(mark_buffer_write_io_error); From 8cc1cea85458ab22c81969aac1006e7068fc438b Mon Sep 17 00:00:00 2001 From: Sargun Dhillon Date: Thu, 7 Jan 2021 16:10:43 -0800 Subject: [PATCH 0195/5344] ovl: implement volatile-specific fsync error behaviour commit 335d3fc57941e5c6164c69d439aec1cb7a800876 upstream. Overlayfs's volatile option allows the user to bypass all forced sync calls to the upperdir filesystem. This comes at the cost of safety. We can never ensure that the user's data is intact, but we can make a best effort to expose whether or not the data is likely to be in a bad state. The best way to handle this in the time being is that if an overlayfs's upperdir experiences an error after a volatile mount occurs, that error will be returned on fsync, fdatasync, sync, and syncfs. This is contradictory to the traditional behaviour of VFS which fails the call once, and only raises an error if a subsequent fsync error has occurred, and been raised by the filesystem. One awkward aspect of the patch is that we have to manually set the superblock's errseq_t after the sync_fs callback as opposed to just returning an error from syncfs. This is because the call chain looks something like this: sys_syncfs -> sync_filesystem -> __sync_filesystem -> /* The return value is ignored here sb->s_op->sync_fs(sb) _sync_blockdev /* Where the VFS fetches the error to raise to userspace */ errseq_check_and_advance Because of this we call errseq_set every time the sync_fs callback occurs. Due to the nature of this seen / unseen dichotomy, if the upperdir is an inconsistent state at the initial mount time, overlayfs will refuse to mount, as overlayfs cannot get a snapshot of the upperdir's errseq that will increment on error until the user calls syncfs. Signed-off-by: Sargun Dhillon Suggested-by: Amir Goldstein Reviewed-by: Amir Goldstein Fixes: c86243b090bc ("ovl: provide a mount option "volatile"") Cc: stable@vger.kernel.org Reviewed-by: Vivek Goyal Reviewed-by: Jeff Layton Signed-off-by: Miklos Szeredi Signed-off-by: Jiachen Zhang --- Documentation/filesystems/overlayfs.txt | 8 ++++++ fs/overlayfs/file.c | 5 ++-- fs/overlayfs/overlayfs.h | 1 + fs/overlayfs/ovl_entry.h | 2 ++ fs/overlayfs/readdir.c | 6 +++-- fs/overlayfs/super.c | 34 ++++++++++++++++++++----- fs/overlayfs/util.c | 27 ++++++++++++++++++++ 7 files changed, 72 insertions(+), 11 deletions(-) diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt index 83337edb21cf2..11551b0f7ce16 100644 --- a/Documentation/filesystems/overlayfs.txt +++ b/Documentation/filesystems/overlayfs.txt @@ -492,6 +492,14 @@ without significant effort. The advantage of mounting with the "volatile" option is that all forms of sync calls to the upper filesystem are omitted. +In order to avoid a giving a false sense of safety, the syncfs (and fsync) +semantics of volatile mounts are slightly different than that of the rest of +VFS. If any writeback error occurs on the upperdir's filesystem after a +volatile mount takes place, all sync functions will return an error. Once this +condition is reached, the filesystem will not recover, and every subsequent sync +call will return an error, even if the upperdir has not experience a new error +since the last sync call. + When overlay is mounted with "volatile" option, the directory "$workdir/work/incompat/volatile" is created. During next mount, overlay checks for this directory and refuses to mount if present. This is a strong diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index e5227ddc40abb..8d43faf5bda04 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -347,8 +347,9 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync) const struct cred *old_cred; int ret; - if (!ovl_should_sync(file_inode(file)->i_sb->s_fs_info)) - return 0; + ret = ovl_sync_status(file_inode(file)->i_sb->s_fs_info); + if (ret <= 0) + return ret; ret = ovl_real_fdget_meta(file, &real, !datasync); if (ret) diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 6934bcf030f0b..2a131e6fe415a 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -277,6 +277,7 @@ bool ovl_is_metacopy_dentry(struct dentry *dentry); char *ovl_get_redirect_xattr(struct dentry *dentry, int padding); ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value, size_t padding); +int ovl_sync_status(struct ovl_fs *ofs); static inline bool ovl_is_impuredir(struct dentry *dentry) { diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index d9f3d366e48d5..547f2925ba307 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -74,6 +74,8 @@ struct ovl_fs { struct inode *indexdir_trap; /* Inode numbers in all layers do not use the high xino_bits */ unsigned int xino_bits; + /* r/o snapshot of upperdir sb's only taken on volatile mounts */ + errseq_t errseq; }; static inline bool ovl_should_sync(struct ovl_fs *ofs) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 0cee14cc21712..1d30428879a7f 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -826,13 +826,15 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, struct ovl_dir_file *od = file->private_data; struct dentry *dentry = file->f_path.dentry; struct file *realfile = od->realfile; + int err; /* Nothing to sync for lower */ if (!OVL_TYPE_UPPER(ovl_path_type(dentry))) return 0; - if (!ovl_should_sync(dentry->d_sb->s_fs_info)) - return 0; + err = ovl_sync_status(dentry->d_sb->s_fs_info); + if (err <= 0) + return err; /* * Need to check if we started out being a lower dir, but got copied up diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index cfaff4dc2730d..ad4d7735bf49e 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -259,11 +259,20 @@ static int ovl_sync_fs(struct super_block *sb, int wait) struct super_block *upper_sb; int ret; - if (!ofs->upper_mnt) - return 0; + ret = ovl_sync_status(ofs); + /* + * We have to always set the err, because the return value isn't + * checked in syncfs, and instead indirectly return an error via + * the sb's writeback errseq, which VFS inspects after this call. + */ + if (ret < 0) { + errseq_set(&sb->s_wb_err, -EIO); + return -EIO; + } + + if (!ret) + return ret; - if (!ovl_should_sync(ofs)) - return 0; /* * If this is a sync(2) call or an emergency sync, all the super blocks * will be iterated, including upper_sb, so no need to do anything. @@ -1701,6 +1710,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &ovl_super_operations; if (ofs->config.upperdir) { + struct super_block *upper_sb; + if (!ofs->config.workdir) { pr_err("overlayfs: missing 'workdir'\n"); goto out_err; @@ -1710,6 +1721,16 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (err) goto out_err; + upper_sb = ofs->upper_mnt->mnt_sb; + if (!ovl_should_sync(ofs)) { + ofs->errseq = errseq_sample(&upper_sb->s_wb_err); + if (errseq_check(&upper_sb->s_wb_err, ofs->errseq)) { + err = -EIO; + pr_err("Cannot mount volatile when upperdir has an unseen error. Sync upperdir fs to clear state.\n"); + goto out_err; + } + } + err = ovl_get_workdir(sb, ofs, &upperpath); if (err) goto out_err; @@ -1717,9 +1738,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (!ofs->workdir) sb->s_flags |= SB_RDONLY; - sb->s_stack_depth = ofs->upper_mnt->mnt_sb->s_stack_depth; - sb->s_time_gran = ofs->upper_mnt->mnt_sb->s_time_gran; - + sb->s_stack_depth = upper_sb->s_stack_depth; + sb->s_time_gran = upper_sb->s_time_gran; } oe = ovl_get_lowerstack(sb, ofs); err = PTR_ERR(oe); diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index f5678a3f83508..60dcf42ea38ed 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -936,3 +936,30 @@ char *ovl_get_redirect_xattr(struct dentry *dentry, int padding) kfree(buf); return ERR_PTR(res); } + +/* + * ovl_sync_status() - Check fs sync status for volatile mounts + * + * Returns 1 if this is not a volatile mount and a real sync is required. + * + * Returns 0 if syncing can be skipped because mount is volatile, and no errors + * have occurred on the upperdir since the mount. + * + * Returns -errno if it is a volatile mount, and the error that occurred since + * the last mount. If the error code changes, it'll return the latest error + * code. + */ + +int ovl_sync_status(struct ovl_fs *ofs) +{ + struct vfsmount *mnt; + + if (ovl_should_sync(ofs)) + return 1; + + mnt = ofs->upper_mnt; + if (!mnt) + return 0; + + return errseq_check(&mnt->mnt_sb->s_wb_err, ofs->errseq); +} From 6b2a97b498b9617c31df67fdd1e31c690a5a3295 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Sat, 10 Oct 2020 23:16:37 -0700 Subject: [PATCH 0196/5344] mm: validate inode in mapping_set_error() commit 8b7b2eb131d3476062ffd34358785b44be25172f upstream. The swap address_space doesn't have host. Thus, it makes kernel crash once swap write meets error. Fix it. Fixes: 735e4ae5ba28 ("vfs: track per-sb writeback errors and report them to syncfs") Signed-off-by: Minchan Kim Signed-off-by: Andrew Morton Acked-by: Jeff Layton Cc: Jan Kara Cc: Andres Freund Cc: Matthew Wilcox Cc: Al Viro Cc: Christoph Hellwig Cc: Dave Chinner Cc: David Howells Cc: Link: https://lkml.kernel.org/r/20201010000650.750063-1-minchan@kernel.org Signed-off-by: Linus Torvalds Signed-off-by: Jiachen Zhang --- include/linux/pagemap.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 3b1a6ec3ca8f3..1b0591edad49b 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -54,7 +54,8 @@ static inline void mapping_set_error(struct address_space *mapping, int error) __filemap_set_wb_err(mapping, error); /* Record it in superblock */ - errseq_set(&mapping->host->i_sb->s_wb_err, error); + if (mapping->host) + errseq_set(&mapping->host->i_sb->s_wb_err, error); /* Record it in flags for now, for legacy callers */ if (error == -ENOSPC) From 3eea1c9cfe19204460efe5ccc2d3c9fc1ece2279 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Thu, 28 Apr 2022 21:26:19 +0800 Subject: [PATCH 0197/5344] vringh: Fix loop descriptors check in the indirect cases We should use size of descriptor chain to test loop condition in indirect case. And another statistical counts is also introduced for indirect descriptors to avoid conflict with the statistical counts of direct descriptors. Fixes: f87d0fbb5798 ("vringh: host-side implementation of virtio rings.") Link: https://lore.kernel.org/all/20220505100910.137-1-xieyongji@bytedance.com/ Signed-off-by: Xie Yongji Signed-off-by: Fam Zheng --- drivers/vhost/vringh.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c index 65c17f56efe6a..8b7f1071b8897 100644 --- a/drivers/vhost/vringh.c +++ b/drivers/vhost/vringh.c @@ -289,7 +289,7 @@ __vringh_iov(struct vringh *vrh, u16 i, int (*copy)(const struct vringh *vrh, void *dst, const void *src, size_t len)) { - int err, count = 0, up_next, desc_max; + int err, count = 0, indirect_count = 0, up_next, desc_max; struct vring_desc desc, *descs; struct vringh_range range = { -1ULL, 0 }, slowrange; bool slow = false; @@ -346,7 +346,12 @@ __vringh_iov(struct vringh *vrh, u16 i, continue; } - if (count++ == vrh->vring.num) { + if (up_next == -1) + count++; + else + indirect_count++; + + if (count > vrh->vring.num || indirect_count > desc_max) { vringh_bad("Descriptor loop in %p", descs); err = -ELOOP; goto fail; @@ -408,6 +413,7 @@ __vringh_iov(struct vringh *vrh, u16 i, i = return_from_indirect(vrh, &up_next, &descs, &desc_max); slow = false; + indirect_count = 0; } else break; } From 5919cdd0fe372da71337d8be3a54fa52e5b5e5a5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Jul 2019 10:37:22 -0700 Subject: [PATCH 0198/5344] blkcg: rename blkcg->cgwb_refcnt to ->online_pin and always use it commit d866dbf6178713e37d2fec2870af00b345684e1a upstream. blkcg->cgwb_refcnt is used to delay blkcg offlining so that blkgs don't get offlined while there are active cgwbs on them. However, it ends up making offlining unordered sometimes causing parents to be offlined before children. To fix it, we want child blkcgs to pin the parents' online states turning the refcnt into a more generic online pinning mechanism. In prepartion, * blkcg->cgwb_refcnt -> blkcg->online_pin * blkcg_cgwb_get/put() -> blkcg_pin/unpin_online() * Take them out of CONFIG_CGROUP_WRITEBACK Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: hanjinke.666 --- block/blk-cgroup.c | 6 +++--- include/linux/blk-cgroup.h | 39 +++++++++++++------------------------- mm/backing-dev.c | 6 +++--- 3 files changed, 19 insertions(+), 32 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 3358daa0ac8e3..44867534b14ea 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1025,8 +1025,8 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css) /* this prevents anyone from attaching or migrating to this blkcg */ wb_blkcg_offline(blkcg); - /* put the base cgwb reference allowing step 2 to be triggered */ - blkcg_cgwb_put(blkcg); + /* put the base online pin allowing step 2 to be triggered */ + blkcg_unpin_online(blkcg); } /** @@ -1133,11 +1133,11 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) } spin_lock_init(&blkcg->lock); + refcount_set(&blkcg->online_pin, 1); INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN); INIT_HLIST_HEAD(&blkcg->blkg_list); #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&blkcg->cgwb_list); - refcount_set(&blkcg->cgwb_refcnt, 1); #endif list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs); diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 19394c77ed995..f1caa4f454a2a 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -46,6 +46,7 @@ struct blkcg_gq; struct blkcg { struct cgroup_subsys_state css; spinlock_t lock; + refcount_t online_pin; struct radix_tree_root blkg_tree; struct blkcg_gq __rcu *blkg_hint; @@ -56,7 +57,6 @@ struct blkcg { struct list_head all_blkcgs_node; #ifdef CONFIG_CGROUP_WRITEBACK struct list_head cgwb_list; - refcount_t cgwb_refcnt; #endif }; @@ -413,47 +413,34 @@ static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd) extern void blkcg_destroy_blkgs(struct blkcg *blkcg); -#ifdef CONFIG_CGROUP_WRITEBACK - /** - * blkcg_cgwb_get - get a reference for blkcg->cgwb_list + * blkcg_pin_online - pin online state * @blkcg: blkcg of interest * - * This is used to track the number of active wb's related to a blkcg. + * While pinned, a blkcg is kept online. This is primarily used to + * impedance-match blkg and cgwb lifetimes so that blkg doesn't go offline + * while an associated cgwb is still active. */ -static inline void blkcg_cgwb_get(struct blkcg *blkcg) +static inline void blkcg_pin_online(struct blkcg *blkcg) { - refcount_inc(&blkcg->cgwb_refcnt); + refcount_inc(&blkcg->online_pin); } /** - * blkcg_cgwb_put - put a reference for @blkcg->cgwb_list + * blkcg_unpin_online - unpin online state * @blkcg: blkcg of interest * - * This is used to track the number of active wb's related to a blkcg. - * When this count goes to zero, all active wb has finished so the + * This is primarily used to impedance-match blkg and cgwb lifetimes so + * that blkg doesn't go offline while an associated cgwb is still active. + * When this count goes to zero, all active cgwbs have finished so the * blkcg can continue destruction by calling blkcg_destroy_blkgs(). - * This work may occur in cgwb_release_workfn() on the cgwb_release - * workqueue. */ -static inline void blkcg_cgwb_put(struct blkcg *blkcg) +static inline void blkcg_unpin_online(struct blkcg *blkcg) { - if (refcount_dec_and_test(&blkcg->cgwb_refcnt)) + if (refcount_dec_and_test(&blkcg->online_pin)) blkcg_destroy_blkgs(blkcg); } -#else - -static inline void blkcg_cgwb_get(struct blkcg *blkcg) { } - -static inline void blkcg_cgwb_put(struct blkcg *blkcg) -{ - /* wb isn't being accounted, so trigger destruction right away */ - blkcg_destroy_blkgs(blkcg); -} - -#endif - /** * blkg_path - format cgroup path of blkg * @blkg: blkg of interest diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 44c9abf56c9d1..cf242c88fe20c 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -493,8 +493,8 @@ static void cgwb_release_workfn(struct work_struct *work) css_put(wb->blkcg_css); mutex_unlock(&wb->bdi->cgwb_release_mutex); - /* triggers blkg destruction if cgwb_refcnt becomes zero */ - blkcg_cgwb_put(blkcg); + /* triggers blkg destruction if no online users left */ + blkcg_unpin_online(blkcg); fprop_local_destroy_percpu(&wb->memcg_completions); @@ -602,7 +602,7 @@ static int cgwb_create(struct backing_dev_info *bdi, list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list); list_add(&wb->memcg_node, memcg_cgwb_list); list_add(&wb->blkcg_node, blkcg_cgwb_list); - blkcg_cgwb_get(blkcg); + blkcg_pin_online(blkcg); css_get(memcg_css); css_get(blkcg_css); } From 879fb0022e7f97002be378c9568d12290691cb16 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Jul 2019 10:37:55 -0700 Subject: [PATCH 0199/5344] blkcg: don't offline parent blkcg first commit 4308a434e5e08c78676aa66bc626ef78cbef0883 upstream. blkcg->cgwb_refcnt is used to delay blkcg offlining so that blkgs don't get offlined while there are active cgwbs on them. However, it ends up making offlining unordered sometimes causing parents to be offlined before children. Let's fix this by making child blkcgs pin the parents' online states. Note that pin/unpin names are chosen over get/put intentionally because css uses get/put online for something different. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: hanjinke.666 --- block/blk-cgroup.c | 16 ++++++++++++++++ include/linux/blk-cgroup.h | 6 +++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 44867534b14ea..381047f68c720 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1156,6 +1156,21 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) return ret; } +static int blkcg_css_online(struct cgroup_subsys_state *css) +{ + struct blkcg *blkcg = css_to_blkcg(css); + struct blkcg *parent = blkcg_parent(blkcg); + + /* + * blkcg_pin_online() is used to delay blkcg offline so that blkgs + * don't go offline while cgwbs are still active on them. Pin the + * parent so that offline always happens towards the root. + */ + if (parent) + blkcg_pin_online(parent); + return 0; +} + /** * blkcg_init_queue - initialize blkcg part of request queue * @q: request_queue to initialize @@ -1300,6 +1315,7 @@ static void blkcg_exit(struct task_struct *tsk) struct cgroup_subsys io_cgrp_subsys = { .css_alloc = blkcg_css_alloc, + .css_online = blkcg_css_online, .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .can_attach = blkcg_can_attach, diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index f1caa4f454a2a..868b8a05c7e35 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -437,8 +437,12 @@ static inline void blkcg_pin_online(struct blkcg *blkcg) */ static inline void blkcg_unpin_online(struct blkcg *blkcg) { - if (refcount_dec_and_test(&blkcg->online_pin)) + do { + if (!refcount_dec_and_test(&blkcg->online_pin)) + break; blkcg_destroy_blkgs(blkcg); + blkcg = blkcg_parent(blkcg); + } while (blkcg); } /** From a5f01c4762d4adefa4e484545631fd3fdc006d97 Mon Sep 17 00:00:00 2001 From: Gang Li Date: Tue, 10 May 2022 10:42:55 +0800 Subject: [PATCH 0200/5344] locking/local_lock: Remove initialization of non-existent members MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is an error when compiling the kernel with `CONFIG_DEBUG_LOCK_ALLOC` enabled. Remove the initialization code of `lock_type`. ``` In file included from ./include/linux/local_lock.h:5, from mm/mmap_lock.c:14: ./include/linux/local_lock_internal.h:21:4: error: ‘struct lockdep_map’ has no member named ‘lock_type’ .lock_type = LD_LOCK_PERCPU, \ ^~~~~~~~~ ``` Fixes: 5501aa12d32d ("locking/local_lock: Add missing owner initialization") Signed-off-by: Gang Li --- include/linux/local_lock_internal.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h index 3f02b818625ef..4d11e3d5e76c8 100644 --- a/include/linux/local_lock_internal.h +++ b/include/linux/local_lock_internal.h @@ -18,7 +18,6 @@ typedef struct { .dep_map = { \ .name = #lockname, \ .wait_type_inner = LD_WAIT_CONFIG, \ - .lock_type = LD_LOCK_PERCPU, \ }, \ .owner = NULL, From 6b357bf6e580b30e0a47ffdcb3b22baead7c9703 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Tue, 20 Jul 2021 11:46:39 +0800 Subject: [PATCH 0201/5344] crypto: sm4 - create SM4 library based on sm4 generic code commit 2b31277af577b1b2da62c3ad7d3315b422869102 upstream. Take the existing small footprint and mostly time invariant C code and turn it into a SM4 library that can be used for non-performance critical, casual use of SM4, and as a fallback for, e.g., SIMD code that needs a secondary path that can be taken in contexts where the SIMD unit is off limits. Secondly, some codes have been optimized, such as unrolling small times loop, removing unnecessary memory shifts, exporting sbox, fk, ck arrays, and basic encryption and decryption functions. Signed-off-by: Tianjia Zhang Signed-off-by: Herbert Xu Signed-off-by: Guixiong Wei --- crypto/Kconfig | 4 + crypto/sm4_generic.c | 153 +------------------------------------ include/crypto/sm4.h | 24 +++++- lib/crypto/Makefile | 4 + lib/crypto/sm4.c | 176 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 209 insertions(+), 152 deletions(-) create mode 100644 lib/crypto/sm4.c diff --git a/crypto/Kconfig b/crypto/Kconfig index b2cc0ad3792ad..655a6f51ab131 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1534,9 +1534,13 @@ config CRYPTO_SERPENT_AVX2_X86_64 See also: +config CRYPTO_LIB_SM4 + tristate + config CRYPTO_SM4 tristate "SM4 cipher algorithm" select CRYPTO_ALGAPI + select CRYPTO_LIB_SM4 help SM4 cipher algorithms (OSCCA GB/T 32907-2016). diff --git a/crypto/sm4_generic.c b/crypto/sm4_generic.c index 71ffb343709a5..544ea9ee1c0cf 100644 --- a/crypto/sm4_generic.c +++ b/crypto/sm4_generic.c @@ -16,132 +16,6 @@ #include #include -static const u32 fk[4] = { - 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc -}; - -static const u8 sbox[256] = { - 0xd6, 0x90, 0xe9, 0xfe, 0xcc, 0xe1, 0x3d, 0xb7, - 0x16, 0xb6, 0x14, 0xc2, 0x28, 0xfb, 0x2c, 0x05, - 0x2b, 0x67, 0x9a, 0x76, 0x2a, 0xbe, 0x04, 0xc3, - 0xaa, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99, - 0x9c, 0x42, 0x50, 0xf4, 0x91, 0xef, 0x98, 0x7a, - 0x33, 0x54, 0x0b, 0x43, 0xed, 0xcf, 0xac, 0x62, - 0xe4, 0xb3, 0x1c, 0xa9, 0xc9, 0x08, 0xe8, 0x95, - 0x80, 0xdf, 0x94, 0xfa, 0x75, 0x8f, 0x3f, 0xa6, - 0x47, 0x07, 0xa7, 0xfc, 0xf3, 0x73, 0x17, 0xba, - 0x83, 0x59, 0x3c, 0x19, 0xe6, 0x85, 0x4f, 0xa8, - 0x68, 0x6b, 0x81, 0xb2, 0x71, 0x64, 0xda, 0x8b, - 0xf8, 0xeb, 0x0f, 0x4b, 0x70, 0x56, 0x9d, 0x35, - 0x1e, 0x24, 0x0e, 0x5e, 0x63, 0x58, 0xd1, 0xa2, - 0x25, 0x22, 0x7c, 0x3b, 0x01, 0x21, 0x78, 0x87, - 0xd4, 0x00, 0x46, 0x57, 0x9f, 0xd3, 0x27, 0x52, - 0x4c, 0x36, 0x02, 0xe7, 0xa0, 0xc4, 0xc8, 0x9e, - 0xea, 0xbf, 0x8a, 0xd2, 0x40, 0xc7, 0x38, 0xb5, - 0xa3, 0xf7, 0xf2, 0xce, 0xf9, 0x61, 0x15, 0xa1, - 0xe0, 0xae, 0x5d, 0xa4, 0x9b, 0x34, 0x1a, 0x55, - 0xad, 0x93, 0x32, 0x30, 0xf5, 0x8c, 0xb1, 0xe3, - 0x1d, 0xf6, 0xe2, 0x2e, 0x82, 0x66, 0xca, 0x60, - 0xc0, 0x29, 0x23, 0xab, 0x0d, 0x53, 0x4e, 0x6f, - 0xd5, 0xdb, 0x37, 0x45, 0xde, 0xfd, 0x8e, 0x2f, - 0x03, 0xff, 0x6a, 0x72, 0x6d, 0x6c, 0x5b, 0x51, - 0x8d, 0x1b, 0xaf, 0x92, 0xbb, 0xdd, 0xbc, 0x7f, - 0x11, 0xd9, 0x5c, 0x41, 0x1f, 0x10, 0x5a, 0xd8, - 0x0a, 0xc1, 0x31, 0x88, 0xa5, 0xcd, 0x7b, 0xbd, - 0x2d, 0x74, 0xd0, 0x12, 0xb8, 0xe5, 0xb4, 0xb0, - 0x89, 0x69, 0x97, 0x4a, 0x0c, 0x96, 0x77, 0x7e, - 0x65, 0xb9, 0xf1, 0x09, 0xc5, 0x6e, 0xc6, 0x84, - 0x18, 0xf0, 0x7d, 0xec, 0x3a, 0xdc, 0x4d, 0x20, - 0x79, 0xee, 0x5f, 0x3e, 0xd7, 0xcb, 0x39, 0x48 -}; - -static const u32 ck[] = { - 0x00070e15, 0x1c232a31, 0x383f464d, 0x545b6269, - 0x70777e85, 0x8c939aa1, 0xa8afb6bd, 0xc4cbd2d9, - 0xe0e7eef5, 0xfc030a11, 0x181f262d, 0x343b4249, - 0x50575e65, 0x6c737a81, 0x888f969d, 0xa4abb2b9, - 0xc0c7ced5, 0xdce3eaf1, 0xf8ff060d, 0x141b2229, - 0x30373e45, 0x4c535a61, 0x686f767d, 0x848b9299, - 0xa0a7aeb5, 0xbcc3cad1, 0xd8dfe6ed, 0xf4fb0209, - 0x10171e25, 0x2c333a41, 0x484f565d, 0x646b7279 -}; - -static u32 sm4_t_non_lin_sub(u32 x) -{ - int i; - u8 *b = (u8 *)&x; - - for (i = 0; i < 4; ++i) - b[i] = sbox[b[i]]; - - return x; -} - -static u32 sm4_key_lin_sub(u32 x) -{ - return x ^ rol32(x, 13) ^ rol32(x, 23); - -} - -static u32 sm4_enc_lin_sub(u32 x) -{ - return x ^ rol32(x, 2) ^ rol32(x, 10) ^ rol32(x, 18) ^ rol32(x, 24); -} - -static u32 sm4_key_sub(u32 x) -{ - return sm4_key_lin_sub(sm4_t_non_lin_sub(x)); -} - -static u32 sm4_enc_sub(u32 x) -{ - return sm4_enc_lin_sub(sm4_t_non_lin_sub(x)); -} - -static u32 sm4_round(const u32 *x, const u32 rk) -{ - return x[0] ^ sm4_enc_sub(x[1] ^ x[2] ^ x[3] ^ rk); -} - - -/** - * crypto_sm4_expand_key - Expands the SM4 key as described in GB/T 32907-2016 - * @ctx: The location where the computed key will be stored. - * @in_key: The supplied key. - * @key_len: The length of the supplied key. - * - * Returns 0 on success. The function fails only if an invalid key size (or - * pointer) is supplied. - */ -int crypto_sm4_expand_key(struct crypto_sm4_ctx *ctx, const u8 *in_key, - unsigned int key_len) -{ - u32 rk[4], t; - const u32 *key = (u32 *)in_key; - int i; - - if (key_len != SM4_KEY_SIZE) - return -EINVAL; - - for (i = 0; i < 4; ++i) - rk[i] = get_unaligned_be32(&key[i]) ^ fk[i]; - - for (i = 0; i < 32; ++i) { - t = rk[0] ^ sm4_key_sub(rk[1] ^ rk[2] ^ rk[3] ^ ck[i]); - ctx->rkey_enc[i] = t; - rk[0] = rk[1]; - rk[1] = rk[2]; - rk[2] = rk[3]; - rk[3] = t; - } - - for (i = 0; i < 32; ++i) - ctx->rkey_dec[i] = ctx->rkey_enc[31 - i]; - - return 0; -} -EXPORT_SYMBOL_GPL(crypto_sm4_expand_key); - /** * crypto_sm4_set_key - Set the AES key. * @tfm: The %crypto_tfm that is used in the context. @@ -149,7 +23,7 @@ EXPORT_SYMBOL_GPL(crypto_sm4_expand_key); * @key_len: The size of the key. * * Returns 0 on success, on failure the %CRYPTO_TFM_RES_BAD_KEY_LEN flag in tfm - * is set. The function uses crypto_sm4_expand_key() to expand the key. + * is set. This function uses sm4_expandkey() to expand the key. * &crypto_sm4_ctx _must_ be the private data embedded in @tfm which is * retrieved with crypto_tfm_ctx(). */ @@ -160,7 +34,7 @@ int crypto_sm4_set_key(struct crypto_tfm *tfm, const u8 *in_key, u32 *flags = &tfm->crt_flags; int ret; - ret = crypto_sm4_expand_key(ctx, in_key, key_len); + ret = sm4_expandkey(ctx, in_key, key_len); if (!ret) return 0; @@ -169,32 +43,13 @@ int crypto_sm4_set_key(struct crypto_tfm *tfm, const u8 *in_key, } EXPORT_SYMBOL_GPL(crypto_sm4_set_key); -static void sm4_do_crypt(const u32 *rk, u32 *out, const u32 *in) -{ - u32 x[4], i, t; - - for (i = 0; i < 4; ++i) - x[i] = get_unaligned_be32(&in[i]); - - for (i = 0; i < 32; ++i) { - t = sm4_round(x, rk[i]); - x[0] = x[1]; - x[1] = x[2]; - x[2] = x[3]; - x[3] = t; - } - - for (i = 0; i < 4; ++i) - put_unaligned_be32(x[3 - i], &out[i]); -} - /* encrypt a block of text */ void crypto_sm4_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) { const struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm); - sm4_do_crypt(ctx->rkey_enc, (u32 *)out, (u32 *)in); + sm4_crypt_block(ctx->rkey_enc, out, in); } EXPORT_SYMBOL_GPL(crypto_sm4_encrypt); @@ -204,7 +59,7 @@ void crypto_sm4_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) { const struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm); - sm4_do_crypt(ctx->rkey_dec, (u32 *)out, (u32 *)in); + sm4_crypt_block(ctx->rkey_dec, out, in); } EXPORT_SYMBOL_GPL(crypto_sm4_decrypt); diff --git a/include/crypto/sm4.h b/include/crypto/sm4.h index 7afd730d16ffb..06322325f8627 100644 --- a/include/crypto/sm4.h +++ b/include/crypto/sm4.h @@ -3,6 +3,7 @@ /* * Common values for the SM4 algorithm * Copyright (C) 2018 ARM Limited or its affiliates. + * Copyright (c) 2021 Tianjia Zhang */ #ifndef _CRYPTO_SM4_H @@ -20,11 +21,28 @@ struct crypto_sm4_ctx { u32 rkey_dec[SM4_RKEY_WORDS]; }; -int crypto_sm4_set_key(struct crypto_tfm *tfm, const u8 *in_key, - unsigned int key_len); -int crypto_sm4_expand_key(struct crypto_sm4_ctx *ctx, const u8 *in_key, +/** + * sm4_expandkey - Expands the SM4 key as described in GB/T 32907-2016 + * @ctx: The location where the computed key will be stored. + * @in_key: The supplied key. + * @key_len: The length of the supplied key. + * + * Returns 0 on success. The function fails only if an invalid key size (or + * pointer) is supplied. + */ +int sm4_expandkey(struct crypto_sm4_ctx *ctx, const u8 *in_key, unsigned int key_len); +/** + * sm4_crypt_block - Encrypt or decrypt a single SM4 block + * @rk: The rkey_enc for encrypt or rkey_dec for decrypt + * @out: Buffer to store output data + * @in: Buffer containing the input data + */ +void sm4_crypt_block(const u32 *rk, u8 *out, const u8 *in); + +int crypto_sm4_set_key(struct crypto_tfm *tfm, const u8 *in_key, + unsigned int key_len); void crypto_sm4_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in); void crypto_sm4_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in); diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index cbe0b6a6450d7..5f9e4b3899948 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -11,3 +11,7 @@ libdes-y := des.o obj-$(CONFIG_CRYPTO_LIB_SHA256) += libsha256.o libsha256-y := sha256.o + +obj-$(CONFIG_CRYPTO_LIB_SM4) += libsm4.o +libsm4-y := sm4.o + diff --git a/lib/crypto/sm4.c b/lib/crypto/sm4.c new file mode 100644 index 0000000000000..5fbf8c741a2fc --- /dev/null +++ b/lib/crypto/sm4.c @@ -0,0 +1,176 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SM4, as specified in + * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html + * + * Copyright (C) 2018 ARM Limited or its affiliates. + * Copyright (c) 2021 Tianjia Zhang + */ + +#include +#include +#include + +static const u32 fk[4] = { + 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc +}; + +static const u32 __cacheline_aligned ck[32] = { + 0x00070e15, 0x1c232a31, 0x383f464d, 0x545b6269, + 0x70777e85, 0x8c939aa1, 0xa8afb6bd, 0xc4cbd2d9, + 0xe0e7eef5, 0xfc030a11, 0x181f262d, 0x343b4249, + 0x50575e65, 0x6c737a81, 0x888f969d, 0xa4abb2b9, + 0xc0c7ced5, 0xdce3eaf1, 0xf8ff060d, 0x141b2229, + 0x30373e45, 0x4c535a61, 0x686f767d, 0x848b9299, + 0xa0a7aeb5, 0xbcc3cad1, 0xd8dfe6ed, 0xf4fb0209, + 0x10171e25, 0x2c333a41, 0x484f565d, 0x646b7279 +}; + +static const u8 __cacheline_aligned sbox[256] = { + 0xd6, 0x90, 0xe9, 0xfe, 0xcc, 0xe1, 0x3d, 0xb7, + 0x16, 0xb6, 0x14, 0xc2, 0x28, 0xfb, 0x2c, 0x05, + 0x2b, 0x67, 0x9a, 0x76, 0x2a, 0xbe, 0x04, 0xc3, + 0xaa, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99, + 0x9c, 0x42, 0x50, 0xf4, 0x91, 0xef, 0x98, 0x7a, + 0x33, 0x54, 0x0b, 0x43, 0xed, 0xcf, 0xac, 0x62, + 0xe4, 0xb3, 0x1c, 0xa9, 0xc9, 0x08, 0xe8, 0x95, + 0x80, 0xdf, 0x94, 0xfa, 0x75, 0x8f, 0x3f, 0xa6, + 0x47, 0x07, 0xa7, 0xfc, 0xf3, 0x73, 0x17, 0xba, + 0x83, 0x59, 0x3c, 0x19, 0xe6, 0x85, 0x4f, 0xa8, + 0x68, 0x6b, 0x81, 0xb2, 0x71, 0x64, 0xda, 0x8b, + 0xf8, 0xeb, 0x0f, 0x4b, 0x70, 0x56, 0x9d, 0x35, + 0x1e, 0x24, 0x0e, 0x5e, 0x63, 0x58, 0xd1, 0xa2, + 0x25, 0x22, 0x7c, 0x3b, 0x01, 0x21, 0x78, 0x87, + 0xd4, 0x00, 0x46, 0x57, 0x9f, 0xd3, 0x27, 0x52, + 0x4c, 0x36, 0x02, 0xe7, 0xa0, 0xc4, 0xc8, 0x9e, + 0xea, 0xbf, 0x8a, 0xd2, 0x40, 0xc7, 0x38, 0xb5, + 0xa3, 0xf7, 0xf2, 0xce, 0xf9, 0x61, 0x15, 0xa1, + 0xe0, 0xae, 0x5d, 0xa4, 0x9b, 0x34, 0x1a, 0x55, + 0xad, 0x93, 0x32, 0x30, 0xf5, 0x8c, 0xb1, 0xe3, + 0x1d, 0xf6, 0xe2, 0x2e, 0x82, 0x66, 0xca, 0x60, + 0xc0, 0x29, 0x23, 0xab, 0x0d, 0x53, 0x4e, 0x6f, + 0xd5, 0xdb, 0x37, 0x45, 0xde, 0xfd, 0x8e, 0x2f, + 0x03, 0xff, 0x6a, 0x72, 0x6d, 0x6c, 0x5b, 0x51, + 0x8d, 0x1b, 0xaf, 0x92, 0xbb, 0xdd, 0xbc, 0x7f, + 0x11, 0xd9, 0x5c, 0x41, 0x1f, 0x10, 0x5a, 0xd8, + 0x0a, 0xc1, 0x31, 0x88, 0xa5, 0xcd, 0x7b, 0xbd, + 0x2d, 0x74, 0xd0, 0x12, 0xb8, 0xe5, 0xb4, 0xb0, + 0x89, 0x69, 0x97, 0x4a, 0x0c, 0x96, 0x77, 0x7e, + 0x65, 0xb9, 0xf1, 0x09, 0xc5, 0x6e, 0xc6, 0x84, + 0x18, 0xf0, 0x7d, 0xec, 0x3a, 0xdc, 0x4d, 0x20, + 0x79, 0xee, 0x5f, 0x3e, 0xd7, 0xcb, 0x39, 0x48 +}; + +static inline u32 sm4_t_non_lin_sub(u32 x) +{ + u32 out; + + out = (u32)sbox[x & 0xff]; + out |= (u32)sbox[(x >> 8) & 0xff] << 8; + out |= (u32)sbox[(x >> 16) & 0xff] << 16; + out |= (u32)sbox[(x >> 24) & 0xff] << 24; + + return out; +} + +static inline u32 sm4_key_lin_sub(u32 x) +{ + return x ^ rol32(x, 13) ^ rol32(x, 23); +} + +static inline u32 sm4_enc_lin_sub(u32 x) +{ + return x ^ rol32(x, 2) ^ rol32(x, 10) ^ rol32(x, 18) ^ rol32(x, 24); +} + +static inline u32 sm4_key_sub(u32 x) +{ + return sm4_key_lin_sub(sm4_t_non_lin_sub(x)); +} + +static inline u32 sm4_enc_sub(u32 x) +{ + return sm4_enc_lin_sub(sm4_t_non_lin_sub(x)); +} + +static inline u32 sm4_round(u32 x0, u32 x1, u32 x2, u32 x3, u32 rk) +{ + return x0 ^ sm4_enc_sub(x1 ^ x2 ^ x3 ^ rk); +} + + +/** + * sm4_expandkey - Expands the SM4 key as described in GB/T 32907-2016 + * @ctx: The location where the computed key will be stored. + * @in_key: The supplied key. + * @key_len: The length of the supplied key. + * + * Returns 0 on success. The function fails only if an invalid key size (or + * pointer) is supplied. + */ +int sm4_expandkey(struct crypto_sm4_ctx *ctx, const u8 *in_key, + unsigned int key_len) +{ + u32 rk[4]; + const u32 *key = (u32 *)in_key; + int i; + + if (key_len != SM4_KEY_SIZE) + return -EINVAL; + + rk[0] = get_unaligned_be32(&key[0]) ^ fk[0]; + rk[1] = get_unaligned_be32(&key[1]) ^ fk[1]; + rk[2] = get_unaligned_be32(&key[2]) ^ fk[2]; + rk[3] = get_unaligned_be32(&key[3]) ^ fk[3]; + + for (i = 0; i < 32; i += 4) { + rk[0] ^= sm4_key_sub(rk[1] ^ rk[2] ^ rk[3] ^ ck[i + 0]); + rk[1] ^= sm4_key_sub(rk[2] ^ rk[3] ^ rk[0] ^ ck[i + 1]); + rk[2] ^= sm4_key_sub(rk[3] ^ rk[0] ^ rk[1] ^ ck[i + 2]); + rk[3] ^= sm4_key_sub(rk[0] ^ rk[1] ^ rk[2] ^ ck[i + 3]); + + ctx->rkey_enc[i + 0] = rk[0]; + ctx->rkey_enc[i + 1] = rk[1]; + ctx->rkey_enc[i + 2] = rk[2]; + ctx->rkey_enc[i + 3] = rk[3]; + ctx->rkey_dec[31 - 0 - i] = rk[0]; + ctx->rkey_dec[31 - 1 - i] = rk[1]; + ctx->rkey_dec[31 - 2 - i] = rk[2]; + ctx->rkey_dec[31 - 3 - i] = rk[3]; + } + + return 0; +} +EXPORT_SYMBOL_GPL(sm4_expandkey); + +/** + * sm4_crypt_block - Encrypt or decrypt a single SM4 block + * @rk: The rkey_enc for encrypt or rkey_dec for decrypt + * @out: Buffer to store output data + * @in: Buffer containing the input data + */ +void sm4_crypt_block(const u32 *rk, u8 *out, const u8 *in) +{ + u32 x[4], i; + + x[0] = get_unaligned_be32(in + 0 * 4); + x[1] = get_unaligned_be32(in + 1 * 4); + x[2] = get_unaligned_be32(in + 2 * 4); + x[3] = get_unaligned_be32(in + 3 * 4); + + for (i = 0; i < 32; i += 4) { + x[0] = sm4_round(x[0], x[1], x[2], x[3], rk[i + 0]); + x[1] = sm4_round(x[1], x[2], x[3], x[0], rk[i + 1]); + x[2] = sm4_round(x[2], x[3], x[0], x[1], rk[i + 2]); + x[3] = sm4_round(x[3], x[0], x[1], x[2], rk[i + 3]); + } + + put_unaligned_be32(x[3 - 0], out + 0 * 4); + put_unaligned_be32(x[3 - 1], out + 1 * 4); + put_unaligned_be32(x[3 - 2], out + 2 * 4); + put_unaligned_be32(x[3 - 3], out + 3 * 4); +} +EXPORT_SYMBOL_GPL(sm4_crypt_block); + +MODULE_DESCRIPTION("Generic SM4 library"); +MODULE_LICENSE("GPL v2"); From ee46130aa07cf0395848ec9a7c066ce79efb1230 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Tue, 20 Jul 2021 11:46:40 +0800 Subject: [PATCH 0202/5344] crypto: arm64/sm4-ce - Make dependent on sm4 library instead of sm4-generic commit c59de48e125c6d49a8abd165e388ca57bfe37b17 upstream. SM4 library is abstracted from sm4-generic algorithm, sm4-ce can depend on the SM4 library instead of sm4-generic, and some functions in sm4-generic do not need to be exported. Signed-off-by: Tianjia Zhang Signed-off-by: Herbert Xu Signed-off-by: Guixiong Wei --- arch/arm64/crypto/Kconfig | 2 +- arch/arm64/crypto/sm4-ce-glue.c | 20 ++++++++++++++------ crypto/sm4_generic.c | 27 ++++++++++++--------------- include/crypto/sm4.h | 9 ++------- lib/crypto/sm4.c | 2 +- 5 files changed, 30 insertions(+), 30 deletions(-) diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 4922c4451e7c3..9a4aa1a419bb2 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -51,7 +51,7 @@ config CRYPTO_SM4_ARM64_CE tristate "SM4 symmetric cipher (ARMv8.2 Crypto Extensions)" depends on KERNEL_MODE_NEON select CRYPTO_ALGAPI - select CRYPTO_SM4 + select CRYPTO_LIB_SM4 config CRYPTO_GHASH_ARM64_CE tristate "GHASH/AES-GCM using ARMv8 Crypto Extensions" diff --git a/arch/arm64/crypto/sm4-ce-glue.c b/arch/arm64/crypto/sm4-ce-glue.c index 2754c875d39c3..9c93cfc4841bc 100644 --- a/arch/arm64/crypto/sm4-ce-glue.c +++ b/arch/arm64/crypto/sm4-ce-glue.c @@ -17,12 +17,20 @@ MODULE_LICENSE("GPL v2"); asmlinkage void sm4_ce_do_crypt(const u32 *rk, void *out, const void *in); +static int sm4_ce_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int key_len) +{ + struct sm4_ctx *ctx = crypto_tfm_ctx(tfm); + + return sm4_expandkey(ctx, key, key_len); +} + static void sm4_ce_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) { - const struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm); + const struct sm4_ctx *ctx = crypto_tfm_ctx(tfm); if (!crypto_simd_usable()) { - crypto_sm4_encrypt(tfm, out, in); + sm4_crypt_block(ctx->rkey_enc, out, in); } else { kernel_neon_begin(); sm4_ce_do_crypt(ctx->rkey_enc, out, in); @@ -32,10 +40,10 @@ static void sm4_ce_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) static void sm4_ce_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) { - const struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm); + const struct sm4_ctx *ctx = crypto_tfm_ctx(tfm); if (!crypto_simd_usable()) { - crypto_sm4_decrypt(tfm, out, in); + sm4_crypt_block(ctx->rkey_dec, out, in); } else { kernel_neon_begin(); sm4_ce_do_crypt(ctx->rkey_dec, out, in); @@ -49,12 +57,12 @@ static struct crypto_alg sm4_ce_alg = { .cra_priority = 200, .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = SM4_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypto_sm4_ctx), + .cra_ctxsize = sizeof(struct sm4_ctx), .cra_module = THIS_MODULE, .cra_u.cipher = { .cia_min_keysize = SM4_KEY_SIZE, .cia_max_keysize = SM4_KEY_SIZE, - .cia_setkey = crypto_sm4_set_key, + .cia_setkey = sm4_ce_setkey, .cia_encrypt = sm4_ce_encrypt, .cia_decrypt = sm4_ce_decrypt } diff --git a/crypto/sm4_generic.c b/crypto/sm4_generic.c index 544ea9ee1c0cf..ba539d05481f5 100644 --- a/crypto/sm4_generic.c +++ b/crypto/sm4_generic.c @@ -17,20 +17,20 @@ #include /** - * crypto_sm4_set_key - Set the AES key. + * sm4_setkey - Set the SM4 key. * @tfm: The %crypto_tfm that is used in the context. * @in_key: The input key. * @key_len: The size of the key. * * Returns 0 on success, on failure the %CRYPTO_TFM_RES_BAD_KEY_LEN flag in tfm * is set. This function uses sm4_expandkey() to expand the key. - * &crypto_sm4_ctx _must_ be the private data embedded in @tfm which is + * &sm4_ctx _must_ be the private data embedded in @tfm which is * retrieved with crypto_tfm_ctx(). */ -int crypto_sm4_set_key(struct crypto_tfm *tfm, const u8 *in_key, +static int sm4_setkey(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len) { - struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm); + struct sm4_ctx *ctx = crypto_tfm_ctx(tfm); u32 *flags = &tfm->crt_flags; int ret; @@ -41,27 +41,24 @@ int crypto_sm4_set_key(struct crypto_tfm *tfm, const u8 *in_key, *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; return -EINVAL; } -EXPORT_SYMBOL_GPL(crypto_sm4_set_key); /* encrypt a block of text */ -void crypto_sm4_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) +static void sm4_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) { - const struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm); + const struct sm4_ctx *ctx = crypto_tfm_ctx(tfm); sm4_crypt_block(ctx->rkey_enc, out, in); } -EXPORT_SYMBOL_GPL(crypto_sm4_encrypt); /* decrypt a block of text */ -void crypto_sm4_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) +static void sm4_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) { - const struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm); + const struct sm4_ctx *ctx = crypto_tfm_ctx(tfm); sm4_crypt_block(ctx->rkey_dec, out, in); } -EXPORT_SYMBOL_GPL(crypto_sm4_decrypt); static struct crypto_alg sm4_alg = { .cra_name = "sm4", @@ -69,15 +66,15 @@ static struct crypto_alg sm4_alg = { .cra_priority = 100, .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = SM4_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypto_sm4_ctx), + .cra_ctxsize = sizeof(struct sm4_ctx), .cra_module = THIS_MODULE, .cra_u = { .cipher = { .cia_min_keysize = SM4_KEY_SIZE, .cia_max_keysize = SM4_KEY_SIZE, - .cia_setkey = crypto_sm4_set_key, - .cia_encrypt = crypto_sm4_encrypt, - .cia_decrypt = crypto_sm4_decrypt + .cia_setkey = sm4_setkey, + .cia_encrypt = sm4_encrypt, + .cia_decrypt = sm4_decrypt } } }; diff --git a/include/crypto/sm4.h b/include/crypto/sm4.h index 06322325f8627..709f286e7b253 100644 --- a/include/crypto/sm4.h +++ b/include/crypto/sm4.h @@ -16,7 +16,7 @@ #define SM4_BLOCK_SIZE 16 #define SM4_RKEY_WORDS 32 -struct crypto_sm4_ctx { +struct sm4_ctx { u32 rkey_enc[SM4_RKEY_WORDS]; u32 rkey_dec[SM4_RKEY_WORDS]; }; @@ -30,7 +30,7 @@ struct crypto_sm4_ctx { * Returns 0 on success. The function fails only if an invalid key size (or * pointer) is supplied. */ -int sm4_expandkey(struct crypto_sm4_ctx *ctx, const u8 *in_key, +int sm4_expandkey(struct sm4_ctx *ctx, const u8 *in_key, unsigned int key_len); /** @@ -41,9 +41,4 @@ int sm4_expandkey(struct crypto_sm4_ctx *ctx, const u8 *in_key, */ void sm4_crypt_block(const u32 *rk, u8 *out, const u8 *in); -int crypto_sm4_set_key(struct crypto_tfm *tfm, const u8 *in_key, - unsigned int key_len); -void crypto_sm4_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in); -void crypto_sm4_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in); - #endif diff --git a/lib/crypto/sm4.c b/lib/crypto/sm4.c index 5fbf8c741a2fc..633b59fed9db8 100644 --- a/lib/crypto/sm4.c +++ b/lib/crypto/sm4.c @@ -108,7 +108,7 @@ static inline u32 sm4_round(u32 x0, u32 x1, u32 x2, u32 x3, u32 rk) * Returns 0 on success. The function fails only if an invalid key size (or * pointer) is supplied. */ -int sm4_expandkey(struct crypto_sm4_ctx *ctx, const u8 *in_key, +int sm4_expandkey(struct sm4_ctx *ctx, const u8 *in_key, unsigned int key_len) { u32 rk[4]; From 563e7ef4b3f900ec7848ecc1f8e5531226e5557e Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Tue, 20 Jul 2021 11:46:41 +0800 Subject: [PATCH 0203/5344] crypto: x86/sm4 - add AES-NI/AVX/x86_64 implementation commit a7ee22ee1445c7fdb00ab80116bb9710ca86a860 upstream. This patch adds AES-NI/AVX/x86_64 assembler implementation of SM4 block cipher. Through two affine transforms, we can use the AES S-Box to simulate the SM4 S-Box to achieve the effect of instruction acceleration. The main algorithm implementation comes from SM4 AES-NI work by libgcrypt and Markku-Juhani O. Saarinen at: https://github.com/mjosaarinen/sm4ni This optimization supports the four modes of SM4, ECB, CBC, CFB, and CTR. Since CBC and CFB do not support multiple block parallel encryption, the optimization effect is not obvious. Benchmark on Intel Xeon Cascadelake, the data comes from the 218 mode and 518 mode of tcrypt. The abscissas are blocks of different lengths. The data is tabulated and the unit is Mb/s: sm4-generic | 16 64 128 256 1024 1420 4096 ECB enc | 40.99 46.50 48.05 48.41 49.20 49.25 49.28 ECB dec | 41.07 46.99 48.15 48.67 49.20 49.25 49.29 CBC enc | 37.71 45.28 46.77 47.60 48.32 48.37 48.40 CBC dec | 36.48 44.82 46.43 47.45 48.23 48.30 48.36 CFB enc | 37.94 44.84 46.12 46.94 47.57 47.46 47.68 CFB dec | 37.50 42.84 43.74 44.37 44.85 44.80 44.96 CTR enc | 39.20 45.63 46.75 47.49 48.09 47.85 48.08 CTR dec | 39.64 45.70 46.72 47.47 47.98 47.88 48.06 sm4-aesni-avx ECB enc | 33.75 134.47 221.64 243.43 264.05 251.58 258.13 ECB dec | 34.02 134.92 223.11 245.14 264.12 251.04 258.33 CBC enc | 38.85 46.18 47.67 48.34 49.00 48.96 49.14 CBC dec | 33.54 131.29 223.88 245.27 265.50 252.41 263.78 CFB enc | 38.70 46.10 47.58 48.29 49.01 48.94 49.19 CFB dec | 32.79 128.40 223.23 244.87 265.77 253.31 262.79 CTR enc | 32.58 122.23 220.29 241.16 259.57 248.32 256.69 CTR dec | 32.81 122.47 218.99 241.54 258.42 248.58 256.61 Signed-off-by: Tianjia Zhang Signed-off-by: Herbert Xu Signed-off-by: Guixiong Wei --- arch/x86/crypto/Makefile | 4 + arch/x86/crypto/sm4-aesni-avx-asm_64.S | 589 +++++++++++++++++++++++++ arch/x86/crypto/sm4_aesni_avx_glue.c | 459 +++++++++++++++++++ crypto/Kconfig | 21 + 4 files changed, 1073 insertions(+) create mode 100644 arch/x86/crypto/sm4-aesni-avx-asm_64.S create mode 100644 arch/x86/crypto/sm4_aesni_avx_glue.c diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 759b1a927826b..9275e113a177d 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -115,3 +115,7 @@ sha256-ssse3-y += sha256_ni_asm.o endif sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o + +obj-$(CONFIG_CRYPTO_SM4_AESNI_AVX_X86_64) += sm4-aesni-avx-x86_64.o +sm4-aesni-avx-x86_64-y := sm4-aesni-avx-asm_64.o sm4_aesni_avx_glue.o + diff --git a/arch/x86/crypto/sm4-aesni-avx-asm_64.S b/arch/x86/crypto/sm4-aesni-avx-asm_64.S new file mode 100644 index 0000000000000..fa2c3f50aecbd --- /dev/null +++ b/arch/x86/crypto/sm4-aesni-avx-asm_64.S @@ -0,0 +1,589 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SM4 Cipher Algorithm, AES-NI/AVX optimized. + * as specified in + * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html + * + * Copyright (C) 2018 Markku-Juhani O. Saarinen + * Copyright (C) 2020 Jussi Kivilinna + * Copyright (c) 2021 Tianjia Zhang + */ + +/* Based on SM4 AES-NI work by libgcrypt and Markku-Juhani O. Saarinen at: + * https://github.com/mjosaarinen/sm4ni + */ + +#include +#include + +#define rRIP (%rip) + +#define RX0 %xmm0 +#define RX1 %xmm1 +#define MASK_4BIT %xmm2 +#define RTMP0 %xmm3 +#define RTMP1 %xmm4 +#define RTMP2 %xmm5 +#define RTMP3 %xmm6 +#define RTMP4 %xmm7 + +#define RA0 %xmm8 +#define RA1 %xmm9 +#define RA2 %xmm10 +#define RA3 %xmm11 + +#define RB0 %xmm12 +#define RB1 %xmm13 +#define RB2 %xmm14 +#define RB3 %xmm15 + +#define RNOT %xmm0 +#define RBSWAP %xmm1 + + +/* Transpose four 32-bit words between 128-bit vectors. */ +#define transpose_4x4(x0, x1, x2, x3, t1, t2) \ + vpunpckhdq x1, x0, t2; \ + vpunpckldq x1, x0, x0; \ + \ + vpunpckldq x3, x2, t1; \ + vpunpckhdq x3, x2, x2; \ + \ + vpunpckhqdq t1, x0, x1; \ + vpunpcklqdq t1, x0, x0; \ + \ + vpunpckhqdq x2, t2, x3; \ + vpunpcklqdq x2, t2, x2; + +/* pre-SubByte transform. */ +#define transform_pre(x, lo_t, hi_t, mask4bit, tmp0) \ + vpand x, mask4bit, tmp0; \ + vpandn x, mask4bit, x; \ + vpsrld $4, x, x; \ + \ + vpshufb tmp0, lo_t, tmp0; \ + vpshufb x, hi_t, x; \ + vpxor tmp0, x, x; + +/* post-SubByte transform. Note: x has been XOR'ed with mask4bit by + * 'vaeslastenc' instruction. + */ +#define transform_post(x, lo_t, hi_t, mask4bit, tmp0) \ + vpandn mask4bit, x, tmp0; \ + vpsrld $4, x, x; \ + vpand x, mask4bit, x; \ + \ + vpshufb tmp0, lo_t, tmp0; \ + vpshufb x, hi_t, x; \ + vpxor tmp0, x, x; + + +.section .rodata.cst164, "aM", @progbits, 164 +.align 16 + +/* + * Following four affine transform look-up tables are from work by + * Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni + * + * These allow exposing SM4 S-Box from AES SubByte. + */ + +/* pre-SubByte affine transform, from SM4 field to AES field. */ +.Lpre_tf_lo_s: + .quad 0x9197E2E474720701, 0xC7C1B4B222245157 +.Lpre_tf_hi_s: + .quad 0xE240AB09EB49A200, 0xF052B91BF95BB012 + +/* post-SubByte affine transform, from AES field to SM4 field. */ +.Lpost_tf_lo_s: + .quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82 +.Lpost_tf_hi_s: + .quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF + +/* For isolating SubBytes from AESENCLAST, inverse shift row */ +.Linv_shift_row: + .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b + .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 + +/* Inverse shift row + Rotate left by 8 bits on 32-bit words with vpshufb */ +.Linv_shift_row_rol_8: + .byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e + .byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06 + +/* Inverse shift row + Rotate left by 16 bits on 32-bit words with vpshufb */ +.Linv_shift_row_rol_16: + .byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01 + .byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09 + +/* Inverse shift row + Rotate left by 24 bits on 32-bit words with vpshufb */ +.Linv_shift_row_rol_24: + .byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04 + .byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c + +/* For CTR-mode IV byteswap */ +.Lbswap128_mask: + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +/* For input word byte-swap */ +.Lbswap32_mask: + .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 + +.align 4 +/* 4-bit mask */ +.L0f0f0f0f: + .long 0x0f0f0f0f + + +.text +.align 16 + +/* + * void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst, + * const u8 *src, int nblocks) + */ +.align 8 +SYM_FUNC_START(sm4_aesni_avx_crypt4) + /* input: + * %rdi: round key array, CTX + * %rsi: dst (1..4 blocks) + * %rdx: src (1..4 blocks) + * %rcx: num blocks (1..4) + */ + FRAME_BEGIN + + vmovdqu 0*16(%rdx), RA0; + vmovdqa RA0, RA1; + vmovdqa RA0, RA2; + vmovdqa RA0, RA3; + cmpq $2, %rcx; + jb .Lblk4_load_input_done; + vmovdqu 1*16(%rdx), RA1; + je .Lblk4_load_input_done; + vmovdqu 2*16(%rdx), RA2; + cmpq $3, %rcx; + je .Lblk4_load_input_done; + vmovdqu 3*16(%rdx), RA3; + +.Lblk4_load_input_done: + + vmovdqa .Lbswap32_mask rRIP, RTMP2; + vpshufb RTMP2, RA0, RA0; + vpshufb RTMP2, RA1, RA1; + vpshufb RTMP2, RA2, RA2; + vpshufb RTMP2, RA3, RA3; + + vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT; + vmovdqa .Lpre_tf_lo_s rRIP, RTMP4; + vmovdqa .Lpre_tf_hi_s rRIP, RB0; + vmovdqa .Lpost_tf_lo_s rRIP, RB1; + vmovdqa .Lpost_tf_hi_s rRIP, RB2; + vmovdqa .Linv_shift_row rRIP, RB3; + vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP2; + vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP3; + transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1); + +#define ROUND(round, s0, s1, s2, s3) \ + vbroadcastss (4*(round))(%rdi), RX0; \ + vpxor s1, RX0, RX0; \ + vpxor s2, RX0, RX0; \ + vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \ + \ + /* sbox, non-linear part */ \ + transform_pre(RX0, RTMP4, RB0, MASK_4BIT, RTMP0); \ + vaesenclast MASK_4BIT, RX0, RX0; \ + transform_post(RX0, RB1, RB2, MASK_4BIT, RTMP0); \ + \ + /* linear part */ \ + vpshufb RB3, RX0, RTMP0; \ + vpxor RTMP0, s0, s0; /* s0 ^ x */ \ + vpshufb RTMP2, RX0, RTMP1; \ + vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */ \ + vpshufb RTMP3, RX0, RTMP1; \ + vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \ + vpshufb .Linv_shift_row_rol_24 rRIP, RX0, RTMP1; \ + vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */ \ + vpslld $2, RTMP0, RTMP1; \ + vpsrld $30, RTMP0, RTMP0; \ + vpxor RTMP0, s0, s0; \ + /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \ + vpxor RTMP1, s0, s0; + + leaq (32*4)(%rdi), %rax; +.align 16 +.Lroundloop_blk4: + ROUND(0, RA0, RA1, RA2, RA3); + ROUND(1, RA1, RA2, RA3, RA0); + ROUND(2, RA2, RA3, RA0, RA1); + ROUND(3, RA3, RA0, RA1, RA2); + leaq (4*4)(%rdi), %rdi; + cmpq %rax, %rdi; + jne .Lroundloop_blk4; + +#undef ROUND + + vmovdqa .Lbswap128_mask rRIP, RTMP2; + + transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1); + vpshufb RTMP2, RA0, RA0; + vpshufb RTMP2, RA1, RA1; + vpshufb RTMP2, RA2, RA2; + vpshufb RTMP2, RA3, RA3; + + vmovdqu RA0, 0*16(%rsi); + cmpq $2, %rcx; + jb .Lblk4_store_output_done; + vmovdqu RA1, 1*16(%rsi); + je .Lblk4_store_output_done; + vmovdqu RA2, 2*16(%rsi); + cmpq $3, %rcx; + je .Lblk4_store_output_done; + vmovdqu RA3, 3*16(%rsi); + +.Lblk4_store_output_done: + vzeroall; + FRAME_END + ret; +SYM_FUNC_END(sm4_aesni_avx_crypt4) + +.align 8 +SYM_FUNC_START_LOCAL(__sm4_crypt_blk8) + /* input: + * %rdi: round key array, CTX + * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel + * plaintext blocks + * output: + * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel + * ciphertext blocks + */ + FRAME_BEGIN + + vmovdqa .Lbswap32_mask rRIP, RTMP2; + vpshufb RTMP2, RA0, RA0; + vpshufb RTMP2, RA1, RA1; + vpshufb RTMP2, RA2, RA2; + vpshufb RTMP2, RA3, RA3; + vpshufb RTMP2, RB0, RB0; + vpshufb RTMP2, RB1, RB1; + vpshufb RTMP2, RB2, RB2; + vpshufb RTMP2, RB3, RB3; + + vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT; + transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1); + transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1); + +#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3) \ + vbroadcastss (4*(round))(%rdi), RX0; \ + vmovdqa .Lpre_tf_lo_s rRIP, RTMP4; \ + vmovdqa .Lpre_tf_hi_s rRIP, RTMP1; \ + vmovdqa RX0, RX1; \ + vpxor s1, RX0, RX0; \ + vpxor s2, RX0, RX0; \ + vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \ + vmovdqa .Lpost_tf_lo_s rRIP, RTMP2; \ + vmovdqa .Lpost_tf_hi_s rRIP, RTMP3; \ + vpxor r1, RX1, RX1; \ + vpxor r2, RX1, RX1; \ + vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */ \ + \ + /* sbox, non-linear part */ \ + transform_pre(RX0, RTMP4, RTMP1, MASK_4BIT, RTMP0); \ + transform_pre(RX1, RTMP4, RTMP1, MASK_4BIT, RTMP0); \ + vmovdqa .Linv_shift_row rRIP, RTMP4; \ + vaesenclast MASK_4BIT, RX0, RX0; \ + vaesenclast MASK_4BIT, RX1, RX1; \ + transform_post(RX0, RTMP2, RTMP3, MASK_4BIT, RTMP0); \ + transform_post(RX1, RTMP2, RTMP3, MASK_4BIT, RTMP0); \ + \ + /* linear part */ \ + vpshufb RTMP4, RX0, RTMP0; \ + vpxor RTMP0, s0, s0; /* s0 ^ x */ \ + vpshufb RTMP4, RX1, RTMP2; \ + vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP4; \ + vpxor RTMP2, r0, r0; /* r0 ^ x */ \ + vpshufb RTMP4, RX0, RTMP1; \ + vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */ \ + vpshufb RTMP4, RX1, RTMP3; \ + vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP4; \ + vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) */ \ + vpshufb RTMP4, RX0, RTMP1; \ + vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \ + vpshufb RTMP4, RX1, RTMP3; \ + vmovdqa .Linv_shift_row_rol_24 rRIP, RTMP4; \ + vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */ \ + vpshufb RTMP4, RX0, RTMP1; \ + vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */ \ + /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \ + vpslld $2, RTMP0, RTMP1; \ + vpsrld $30, RTMP0, RTMP0; \ + vpxor RTMP0, s0, s0; \ + vpxor RTMP1, s0, s0; \ + vpshufb RTMP4, RX1, RTMP3; \ + vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */ \ + /* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \ + vpslld $2, RTMP2, RTMP3; \ + vpsrld $30, RTMP2, RTMP2; \ + vpxor RTMP2, r0, r0; \ + vpxor RTMP3, r0, r0; + + leaq (32*4)(%rdi), %rax; +.align 16 +.Lroundloop_blk8: + ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3); + ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0); + ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1); + ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2); + leaq (4*4)(%rdi), %rdi; + cmpq %rax, %rdi; + jne .Lroundloop_blk8; + +#undef ROUND + + vmovdqa .Lbswap128_mask rRIP, RTMP2; + + transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1); + transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1); + vpshufb RTMP2, RA0, RA0; + vpshufb RTMP2, RA1, RA1; + vpshufb RTMP2, RA2, RA2; + vpshufb RTMP2, RA3, RA3; + vpshufb RTMP2, RB0, RB0; + vpshufb RTMP2, RB1, RB1; + vpshufb RTMP2, RB2, RB2; + vpshufb RTMP2, RB3, RB3; + + FRAME_END + ret; +SYM_FUNC_END(__sm4_crypt_blk8) + +/* + * void sm4_aesni_avx_crypt8(const u32 *rk, u8 *dst, + * const u8 *src, int nblocks) + */ +.align 8 +SYM_FUNC_START(sm4_aesni_avx_crypt8) + /* input: + * %rdi: round key array, CTX + * %rsi: dst (1..8 blocks) + * %rdx: src (1..8 blocks) + * %rcx: num blocks (1..8) + */ + FRAME_BEGIN + + cmpq $5, %rcx; + jb sm4_aesni_avx_crypt4; + vmovdqu (0 * 16)(%rdx), RA0; + vmovdqu (1 * 16)(%rdx), RA1; + vmovdqu (2 * 16)(%rdx), RA2; + vmovdqu (3 * 16)(%rdx), RA3; + vmovdqu (4 * 16)(%rdx), RB0; + vmovdqa RB0, RB1; + vmovdqa RB0, RB2; + vmovdqa RB0, RB3; + je .Lblk8_load_input_done; + vmovdqu (5 * 16)(%rdx), RB1; + cmpq $7, %rcx; + jb .Lblk8_load_input_done; + vmovdqu (6 * 16)(%rdx), RB2; + je .Lblk8_load_input_done; + vmovdqu (7 * 16)(%rdx), RB3; + +.Lblk8_load_input_done: + call __sm4_crypt_blk8; + + cmpq $6, %rcx; + vmovdqu RA0, (0 * 16)(%rsi); + vmovdqu RA1, (1 * 16)(%rsi); + vmovdqu RA2, (2 * 16)(%rsi); + vmovdqu RA3, (3 * 16)(%rsi); + vmovdqu RB0, (4 * 16)(%rsi); + jb .Lblk8_store_output_done; + vmovdqu RB1, (5 * 16)(%rsi); + je .Lblk8_store_output_done; + vmovdqu RB2, (6 * 16)(%rsi); + cmpq $7, %rcx; + je .Lblk8_store_output_done; + vmovdqu RB3, (7 * 16)(%rsi); + +.Lblk8_store_output_done: + vzeroall; + FRAME_END + ret; +SYM_FUNC_END(sm4_aesni_avx_crypt8) + +/* + * void sm4_aesni_avx_ctr_enc_blk8(const u32 *rk, u8 *dst, + * const u8 *src, u8 *iv) + */ +.align 8 +SYM_FUNC_START(sm4_aesni_avx_ctr_enc_blk8) + /* input: + * %rdi: round key array, CTX + * %rsi: dst (8 blocks) + * %rdx: src (8 blocks) + * %rcx: iv (big endian, 128bit) + */ + FRAME_BEGIN + + /* load IV and byteswap */ + vmovdqu (%rcx), RA0; + + vmovdqa .Lbswap128_mask rRIP, RBSWAP; + vpshufb RBSWAP, RA0, RTMP0; /* be => le */ + + vpcmpeqd RNOT, RNOT, RNOT; + vpsrldq $8, RNOT, RNOT; /* low: -1, high: 0 */ + +#define inc_le128(x, minus_one, tmp) \ + vpcmpeqq minus_one, x, tmp; \ + vpsubq minus_one, x, x; \ + vpslldq $8, tmp, tmp; \ + vpsubq tmp, x, x; + + /* construct IVs */ + inc_le128(RTMP0, RNOT, RTMP2); /* +1 */ + vpshufb RBSWAP, RTMP0, RA1; + inc_le128(RTMP0, RNOT, RTMP2); /* +2 */ + vpshufb RBSWAP, RTMP0, RA2; + inc_le128(RTMP0, RNOT, RTMP2); /* +3 */ + vpshufb RBSWAP, RTMP0, RA3; + inc_le128(RTMP0, RNOT, RTMP2); /* +4 */ + vpshufb RBSWAP, RTMP0, RB0; + inc_le128(RTMP0, RNOT, RTMP2); /* +5 */ + vpshufb RBSWAP, RTMP0, RB1; + inc_le128(RTMP0, RNOT, RTMP2); /* +6 */ + vpshufb RBSWAP, RTMP0, RB2; + inc_le128(RTMP0, RNOT, RTMP2); /* +7 */ + vpshufb RBSWAP, RTMP0, RB3; + inc_le128(RTMP0, RNOT, RTMP2); /* +8 */ + vpshufb RBSWAP, RTMP0, RTMP1; + + /* store new IV */ + vmovdqu RTMP1, (%rcx); + + call __sm4_crypt_blk8; + + vpxor (0 * 16)(%rdx), RA0, RA0; + vpxor (1 * 16)(%rdx), RA1, RA1; + vpxor (2 * 16)(%rdx), RA2, RA2; + vpxor (3 * 16)(%rdx), RA3, RA3; + vpxor (4 * 16)(%rdx), RB0, RB0; + vpxor (5 * 16)(%rdx), RB1, RB1; + vpxor (6 * 16)(%rdx), RB2, RB2; + vpxor (7 * 16)(%rdx), RB3, RB3; + + vmovdqu RA0, (0 * 16)(%rsi); + vmovdqu RA1, (1 * 16)(%rsi); + vmovdqu RA2, (2 * 16)(%rsi); + vmovdqu RA3, (3 * 16)(%rsi); + vmovdqu RB0, (4 * 16)(%rsi); + vmovdqu RB1, (5 * 16)(%rsi); + vmovdqu RB2, (6 * 16)(%rsi); + vmovdqu RB3, (7 * 16)(%rsi); + + vzeroall; + FRAME_END + ret; +SYM_FUNC_END(sm4_aesni_avx_ctr_enc_blk8) + +/* + * void sm4_aesni_avx_cbc_dec_blk8(const u32 *rk, u8 *dst, + * const u8 *src, u8 *iv) + */ +.align 8 +SYM_FUNC_START(sm4_aesni_avx_cbc_dec_blk8) + /* input: + * %rdi: round key array, CTX + * %rsi: dst (8 blocks) + * %rdx: src (8 blocks) + * %rcx: iv + */ + FRAME_BEGIN + + vmovdqu (0 * 16)(%rdx), RA0; + vmovdqu (1 * 16)(%rdx), RA1; + vmovdqu (2 * 16)(%rdx), RA2; + vmovdqu (3 * 16)(%rdx), RA3; + vmovdqu (4 * 16)(%rdx), RB0; + vmovdqu (5 * 16)(%rdx), RB1; + vmovdqu (6 * 16)(%rdx), RB2; + vmovdqu (7 * 16)(%rdx), RB3; + + call __sm4_crypt_blk8; + + vmovdqu (7 * 16)(%rdx), RNOT; + vpxor (%rcx), RA0, RA0; + vpxor (0 * 16)(%rdx), RA1, RA1; + vpxor (1 * 16)(%rdx), RA2, RA2; + vpxor (2 * 16)(%rdx), RA3, RA3; + vpxor (3 * 16)(%rdx), RB0, RB0; + vpxor (4 * 16)(%rdx), RB1, RB1; + vpxor (5 * 16)(%rdx), RB2, RB2; + vpxor (6 * 16)(%rdx), RB3, RB3; + vmovdqu RNOT, (%rcx); /* store new IV */ + + vmovdqu RA0, (0 * 16)(%rsi); + vmovdqu RA1, (1 * 16)(%rsi); + vmovdqu RA2, (2 * 16)(%rsi); + vmovdqu RA3, (3 * 16)(%rsi); + vmovdqu RB0, (4 * 16)(%rsi); + vmovdqu RB1, (5 * 16)(%rsi); + vmovdqu RB2, (6 * 16)(%rsi); + vmovdqu RB3, (7 * 16)(%rsi); + + vzeroall; + FRAME_END + ret; +SYM_FUNC_END(sm4_aesni_avx_cbc_dec_blk8) + +/* + * void sm4_aesni_avx_cfb_dec_blk8(const u32 *rk, u8 *dst, + * const u8 *src, u8 *iv) + */ +.align 8 +SYM_FUNC_START(sm4_aesni_avx_cfb_dec_blk8) + /* input: + * %rdi: round key array, CTX + * %rsi: dst (8 blocks) + * %rdx: src (8 blocks) + * %rcx: iv + */ + FRAME_BEGIN + + /* Load input */ + vmovdqu (%rcx), RA0; + vmovdqu 0 * 16(%rdx), RA1; + vmovdqu 1 * 16(%rdx), RA2; + vmovdqu 2 * 16(%rdx), RA3; + vmovdqu 3 * 16(%rdx), RB0; + vmovdqu 4 * 16(%rdx), RB1; + vmovdqu 5 * 16(%rdx), RB2; + vmovdqu 6 * 16(%rdx), RB3; + + /* Update IV */ + vmovdqu 7 * 16(%rdx), RNOT; + vmovdqu RNOT, (%rcx); + + call __sm4_crypt_blk8; + + vpxor (0 * 16)(%rdx), RA0, RA0; + vpxor (1 * 16)(%rdx), RA1, RA1; + vpxor (2 * 16)(%rdx), RA2, RA2; + vpxor (3 * 16)(%rdx), RA3, RA3; + vpxor (4 * 16)(%rdx), RB0, RB0; + vpxor (5 * 16)(%rdx), RB1, RB1; + vpxor (6 * 16)(%rdx), RB2, RB2; + vpxor (7 * 16)(%rdx), RB3, RB3; + + vmovdqu RA0, (0 * 16)(%rsi); + vmovdqu RA1, (1 * 16)(%rsi); + vmovdqu RA2, (2 * 16)(%rsi); + vmovdqu RA3, (3 * 16)(%rsi); + vmovdqu RB0, (4 * 16)(%rsi); + vmovdqu RB1, (5 * 16)(%rsi); + vmovdqu RB2, (6 * 16)(%rsi); + vmovdqu RB3, (7 * 16)(%rsi); + + vzeroall; + FRAME_END + ret; +SYM_FUNC_END(sm4_aesni_avx_cfb_dec_blk8) diff --git a/arch/x86/crypto/sm4_aesni_avx_glue.c b/arch/x86/crypto/sm4_aesni_avx_glue.c new file mode 100644 index 0000000000000..c1f5728efd1d7 --- /dev/null +++ b/arch/x86/crypto/sm4_aesni_avx_glue.c @@ -0,0 +1,459 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SM4 Cipher Algorithm, AES-NI/AVX optimized. + * as specified in + * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html + * + * Copyright (c) 2021, Alibaba Group. + * Copyright (c) 2021 Tianjia Zhang + */ + +#include +#include +#include +#include +#include +#include +#include + +#define SM4_CRYPT8_BLOCK_SIZE (SM4_BLOCK_SIZE * 8) + +asmlinkage void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst, + const u8 *src, int nblocks); +asmlinkage void sm4_aesni_avx_crypt8(const u32 *rk, u8 *dst, + const u8 *src, int nblocks); +asmlinkage void sm4_aesni_avx_ctr_enc_blk8(const u32 *rk, u8 *dst, + const u8 *src, u8 *iv); +asmlinkage void sm4_aesni_avx_cbc_dec_blk8(const u32 *rk, u8 *dst, + const u8 *src, u8 *iv); +asmlinkage void sm4_aesni_avx_cfb_dec_blk8(const u32 *rk, u8 *dst, + const u8 *src, u8 *iv); + +static int sm4_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int key_len) +{ + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + + return sm4_expandkey(ctx, key, key_len); +} + +static int ecb_do_crypt(struct skcipher_request *req, const u32 *rkey) +{ + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + + kernel_fpu_begin(); + while (nbytes >= SM4_CRYPT8_BLOCK_SIZE) { + sm4_aesni_avx_crypt8(rkey, dst, src, 8); + dst += SM4_CRYPT8_BLOCK_SIZE; + src += SM4_CRYPT8_BLOCK_SIZE; + nbytes -= SM4_CRYPT8_BLOCK_SIZE; + } + while (nbytes >= SM4_BLOCK_SIZE) { + unsigned int nblocks = min(nbytes >> 4, 4u); + sm4_aesni_avx_crypt4(rkey, dst, src, nblocks); + dst += nblocks * SM4_BLOCK_SIZE; + src += nblocks * SM4_BLOCK_SIZE; + nbytes -= nblocks * SM4_BLOCK_SIZE; + } + kernel_fpu_end(); + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static int ecb_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + + return ecb_do_crypt(req, ctx->rkey_enc); +} + +static int ecb_decrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + + return ecb_do_crypt(req, ctx->rkey_dec); +} + +static int cbc_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *iv = walk.iv; + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + + while (nbytes >= SM4_BLOCK_SIZE) { + crypto_xor_cpy(dst, src, iv, SM4_BLOCK_SIZE); + sm4_crypt_block(ctx->rkey_enc, dst, dst); + iv = dst; + src += SM4_BLOCK_SIZE; + dst += SM4_BLOCK_SIZE; + nbytes -= SM4_BLOCK_SIZE; + } + if (iv != walk.iv) + memcpy(walk.iv, iv, SM4_BLOCK_SIZE); + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static int cbc_decrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + + kernel_fpu_begin(); + + while (nbytes >= SM4_CRYPT8_BLOCK_SIZE) { + sm4_aesni_avx_cbc_dec_blk8(ctx->rkey_dec, dst, + src, walk.iv); + dst += SM4_CRYPT8_BLOCK_SIZE; + src += SM4_CRYPT8_BLOCK_SIZE; + nbytes -= SM4_CRYPT8_BLOCK_SIZE; + } + + if (nbytes >= SM4_BLOCK_SIZE) { + u8 keystream[SM4_BLOCK_SIZE * 8]; + u8 iv[SM4_BLOCK_SIZE]; + unsigned int nblocks = min(nbytes >> 4, 8u); + int i; + + sm4_aesni_avx_crypt8(ctx->rkey_dec, keystream, + src, nblocks); + + src += ((int)nblocks - 2) * SM4_BLOCK_SIZE; + dst += (nblocks - 1) * SM4_BLOCK_SIZE; + memcpy(iv, src + SM4_BLOCK_SIZE, SM4_BLOCK_SIZE); + + for (i = nblocks - 1; i > 0; i--) { + crypto_xor_cpy(dst, src, + &keystream[i * SM4_BLOCK_SIZE], + SM4_BLOCK_SIZE); + src -= SM4_BLOCK_SIZE; + dst -= SM4_BLOCK_SIZE; + } + crypto_xor_cpy(dst, walk.iv, keystream, SM4_BLOCK_SIZE); + memcpy(walk.iv, iv, SM4_BLOCK_SIZE); + nbytes -= nblocks * SM4_BLOCK_SIZE; + } + + kernel_fpu_end(); + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static int cfb_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + u8 keystream[SM4_BLOCK_SIZE]; + const u8 *iv = walk.iv; + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + + while (nbytes >= SM4_BLOCK_SIZE) { + sm4_crypt_block(ctx->rkey_enc, keystream, iv); + crypto_xor_cpy(dst, src, keystream, SM4_BLOCK_SIZE); + iv = dst; + src += SM4_BLOCK_SIZE; + dst += SM4_BLOCK_SIZE; + nbytes -= SM4_BLOCK_SIZE; + } + if (iv != walk.iv) + memcpy(walk.iv, iv, SM4_BLOCK_SIZE); + + /* tail */ + if (walk.nbytes == walk.total && nbytes > 0) { + sm4_crypt_block(ctx->rkey_enc, keystream, walk.iv); + crypto_xor_cpy(dst, src, keystream, nbytes); + nbytes = 0; + } + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static int cfb_decrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + + kernel_fpu_begin(); + + while (nbytes >= SM4_CRYPT8_BLOCK_SIZE) { + sm4_aesni_avx_cfb_dec_blk8(ctx->rkey_enc, dst, + src, walk.iv); + dst += SM4_CRYPT8_BLOCK_SIZE; + src += SM4_CRYPT8_BLOCK_SIZE; + nbytes -= SM4_CRYPT8_BLOCK_SIZE; + } + + if (nbytes >= SM4_BLOCK_SIZE) { + u8 keystream[SM4_BLOCK_SIZE * 8]; + unsigned int nblocks = min(nbytes >> 4, 8u); + + memcpy(keystream, walk.iv, SM4_BLOCK_SIZE); + if (nblocks > 1) + memcpy(&keystream[SM4_BLOCK_SIZE], src, + (nblocks - 1) * SM4_BLOCK_SIZE); + memcpy(walk.iv, src + (nblocks - 1) * SM4_BLOCK_SIZE, + SM4_BLOCK_SIZE); + + sm4_aesni_avx_crypt8(ctx->rkey_enc, keystream, + keystream, nblocks); + + crypto_xor_cpy(dst, src, keystream, + nblocks * SM4_BLOCK_SIZE); + dst += nblocks * SM4_BLOCK_SIZE; + src += nblocks * SM4_BLOCK_SIZE; + nbytes -= nblocks * SM4_BLOCK_SIZE; + } + + kernel_fpu_end(); + + /* tail */ + if (walk.nbytes == walk.total && nbytes > 0) { + u8 keystream[SM4_BLOCK_SIZE]; + + sm4_crypt_block(ctx->rkey_enc, keystream, walk.iv); + crypto_xor_cpy(dst, src, keystream, nbytes); + nbytes = 0; + } + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static int ctr_crypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + + kernel_fpu_begin(); + + while (nbytes >= SM4_CRYPT8_BLOCK_SIZE) { + sm4_aesni_avx_ctr_enc_blk8(ctx->rkey_enc, dst, + src, walk.iv); + dst += SM4_CRYPT8_BLOCK_SIZE; + src += SM4_CRYPT8_BLOCK_SIZE; + nbytes -= SM4_CRYPT8_BLOCK_SIZE; + } + + if (nbytes >= SM4_BLOCK_SIZE) { + u8 keystream[SM4_BLOCK_SIZE * 8]; + unsigned int nblocks = min(nbytes >> 4, 8u); + int i; + + for (i = 0; i < nblocks; i++) { + memcpy(&keystream[i * SM4_BLOCK_SIZE], + walk.iv, SM4_BLOCK_SIZE); + crypto_inc(walk.iv, SM4_BLOCK_SIZE); + } + sm4_aesni_avx_crypt8(ctx->rkey_enc, keystream, + keystream, nblocks); + + crypto_xor_cpy(dst, src, keystream, + nblocks * SM4_BLOCK_SIZE); + dst += nblocks * SM4_BLOCK_SIZE; + src += nblocks * SM4_BLOCK_SIZE; + nbytes -= nblocks * SM4_BLOCK_SIZE; + } + + kernel_fpu_end(); + + /* tail */ + if (walk.nbytes == walk.total && nbytes > 0) { + u8 keystream[SM4_BLOCK_SIZE]; + + memcpy(keystream, walk.iv, SM4_BLOCK_SIZE); + crypto_inc(walk.iv, SM4_BLOCK_SIZE); + + sm4_crypt_block(ctx->rkey_enc, keystream, keystream); + + crypto_xor_cpy(dst, src, keystream, nbytes); + dst += nbytes; + src += nbytes; + nbytes = 0; + } + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static struct skcipher_alg sm4_aesni_avx_skciphers[] = { + { + .base = { + .cra_name = "__ecb(sm4)", + .cra_driver_name = "__ecb-sm4-aesni-avx", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = SM4_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .walksize = 8 * SM4_BLOCK_SIZE, + .setkey = sm4_skcipher_setkey, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, { + .base = { + .cra_name = "__cbc(sm4)", + .cra_driver_name = "__cbc-sm4-aesni-avx", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = SM4_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .ivsize = SM4_BLOCK_SIZE, + .walksize = 8 * SM4_BLOCK_SIZE, + .setkey = sm4_skcipher_setkey, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, { + .base = { + .cra_name = "__cfb(sm4)", + .cra_driver_name = "__cfb-sm4-aesni-avx", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .ivsize = SM4_BLOCK_SIZE, + .chunksize = SM4_BLOCK_SIZE, + .walksize = 8 * SM4_BLOCK_SIZE, + .setkey = sm4_skcipher_setkey, + .encrypt = cfb_encrypt, + .decrypt = cfb_decrypt, + }, { + .base = { + .cra_name = "__ctr(sm4)", + .cra_driver_name = "__ctr-sm4-aesni-avx", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .ivsize = SM4_BLOCK_SIZE, + .chunksize = SM4_BLOCK_SIZE, + .walksize = 8 * SM4_BLOCK_SIZE, + .setkey = sm4_skcipher_setkey, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + } +}; + +static struct simd_skcipher_alg * +simd_sm4_aesni_avx_skciphers[ARRAY_SIZE(sm4_aesni_avx_skciphers)]; + +static int __init sm4_init(void) +{ + const char *feature_name; + + if (!boot_cpu_has(X86_FEATURE_AVX) || + !boot_cpu_has(X86_FEATURE_AES) || + !boot_cpu_has(X86_FEATURE_OSXSAVE)) { + pr_info("AVX or AES-NI instructions are not detected.\n"); + return -ENODEV; + } + + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); + return -ENODEV; + } + + return simd_register_skciphers_compat(sm4_aesni_avx_skciphers, + ARRAY_SIZE(sm4_aesni_avx_skciphers), + simd_sm4_aesni_avx_skciphers); +} + +static void __exit sm4_exit(void) +{ + simd_unregister_skciphers(sm4_aesni_avx_skciphers, + ARRAY_SIZE(sm4_aesni_avx_skciphers), + simd_sm4_aesni_avx_skciphers); +} + +module_init(sm4_init); +module_exit(sm4_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Tianjia Zhang "); +MODULE_DESCRIPTION("SM4 Cipher Algorithm, AES-NI/AVX optimized"); +MODULE_ALIAS_CRYPTO("sm4"); +MODULE_ALIAS_CRYPTO("sm4-aesni-avx"); diff --git a/crypto/Kconfig b/crypto/Kconfig index 655a6f51ab131..7e75808e4b9c3 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1563,6 +1563,27 @@ config CRYPTO_SM4 If unsure, say N. +config CRYPTO_SM4_AESNI_AVX_X86_64 + tristate "SM4 cipher algorithm (x86_64/AES-NI/AVX)" + depends on X86 && 64BIT + select CRYPTO_SKCIPHER + select CRYPTO_SIMD + select CRYPTO_ALGAPI + select CRYPTO_LIB_SM4 + help + SM4 cipher algorithms (OSCCA GB/T 32907-2016) (x86_64/AES-NI/AVX). + + SM4 (GBT.32907-2016) is a cryptographic standard issued by the + Organization of State Commercial Administration of China (OSCCA) + as an authorized cryptographic algorithms for the use within China. + + This is SM4 optimized implementation using AES-NI/AVX/x86_64 + instruction set for block cipher. Through two affine transforms, + we can use the AES S-Box to simulate the SM4 S-Box to achieve the + effect of instruction acceleration. + + If unsure, say N. + config CRYPTO_TEA tristate "TEA, XTEA and XETA cipher algorithms" select CRYPTO_ALGAPI From 9ed1104f9f5d542712a83cbed947135a09fb6576 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Tue, 20 Jul 2021 11:46:42 +0800 Subject: [PATCH 0204/5344] crypto: tcrypt - add the asynchronous speed test for SM4 commit a7fc80bb22eb0f13791ee4f70484e88316cc2a24 upstream. tcrypt supports testing of SM4 cipher algorithms that use avx instruction set acceleration. The implementation of sm4 instruction set acceleration supports up to 8 blocks in parallel encryption and decryption, which is 128 bytes. Therefore, the 128-byte block size is also added to block_sizes. Signed-off-by: Tianjia Zhang Signed-off-by: Herbert Xu Signed-off-by: Guixiong Wei --- crypto/tcrypt.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index 0cece1f883ebe..8ff8e2f39df12 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -76,7 +76,7 @@ static char *check[] = { NULL }; -static u32 block_sizes[] = { 16, 64, 256, 1024, 1472, 8192, 0 }; +static u32 block_sizes[] = { 16, 64, 128, 256, 1024, 1472, 8192, 0 }; static u32 aead_sizes[] = { 16, 64, 256, 512, 1024, 2048, 4096, 8192, 0 }; #define XBUFSIZE 8 @@ -2047,6 +2047,7 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) case 191: ret += tcrypt_test("ecb(sm4)"); ret += tcrypt_test("cbc(sm4)"); + ret += tcrypt_test("cfb(sm4)"); ret += tcrypt_test("ctr(sm4)"); break; case 200: @@ -2310,6 +2311,10 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) speed_template_16); test_cipher_speed("cbc(sm4)", DECRYPT, sec, NULL, 0, speed_template_16); + test_cipher_speed("cfb(sm4)", ENCRYPT, sec, NULL, 0, + speed_template_16); + test_cipher_speed("cfb(sm4)", DECRYPT, sec, NULL, 0, + speed_template_16); test_cipher_speed("ctr(sm4)", ENCRYPT, sec, NULL, 0, speed_template_16); test_cipher_speed("ctr(sm4)", DECRYPT, sec, NULL, 0, @@ -2801,6 +2806,25 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) speed_template_8_32); break; + case 518: + test_acipher_speed("ecb(sm4)", ENCRYPT, sec, NULL, 0, + speed_template_16); + test_acipher_speed("ecb(sm4)", DECRYPT, sec, NULL, 0, + speed_template_16); + test_acipher_speed("cbc(sm4)", ENCRYPT, sec, NULL, 0, + speed_template_16); + test_acipher_speed("cbc(sm4)", DECRYPT, sec, NULL, 0, + speed_template_16); + test_acipher_speed("cfb(sm4)", ENCRYPT, sec, NULL, 0, + speed_template_16); + test_acipher_speed("cfb(sm4)", DECRYPT, sec, NULL, 0, + speed_template_16); + test_acipher_speed("ctr(sm4)", ENCRYPT, sec, NULL, 0, + speed_template_16); + test_acipher_speed("ctr(sm4)", DECRYPT, sec, NULL, 0, + speed_template_16); + break; + case 600: test_mb_skcipher_speed("ecb(aes)", ENCRYPT, sec, NULL, 0, speed_template_16_24_32, num_mb); From fe181109e80f897e8d48bde13064b6a98caf5648 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Wed, 18 Aug 2021 11:31:16 +0800 Subject: [PATCH 0205/5344] crypto: x86/sm4 - export reusable AESNI/AVX functions commit de79d9aae493a29d02926f396a4fd1a1309436fc upstream. Export the reusable functions in the SM4 AESNI/AVX implementation, mainly public functions, which are used to develop the SM4 AESNI/AVX2 implementation, and eliminate unnecessary duplication of code. At the same time, in order to make the public function universal, minor fixes was added. Signed-off-by: Tianjia Zhang Signed-off-by: Herbert Xu Signed-off-by: Guixiong Wei --- arch/x86/crypto/sm4-avx.h | 24 ++++++++ arch/x86/crypto/sm4_aesni_avx_glue.c | 92 ++++++++++++++++++---------- 2 files changed, 84 insertions(+), 32 deletions(-) create mode 100644 arch/x86/crypto/sm4-avx.h diff --git a/arch/x86/crypto/sm4-avx.h b/arch/x86/crypto/sm4-avx.h new file mode 100644 index 0000000000000..1bceab7516aa1 --- /dev/null +++ b/arch/x86/crypto/sm4-avx.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef ASM_X86_SM4_AVX_H +#define ASM_X86_SM4_AVX_H + +#include +#include + +typedef void (*sm4_crypt_func)(const u32 *rk, u8 *dst, const u8 *src, u8 *iv); + +int sm4_avx_ecb_encrypt(struct skcipher_request *req); +int sm4_avx_ecb_decrypt(struct skcipher_request *req); + +int sm4_cbc_encrypt(struct skcipher_request *req); +int sm4_avx_cbc_decrypt(struct skcipher_request *req, + unsigned int bsize, sm4_crypt_func func); + +int sm4_cfb_encrypt(struct skcipher_request *req); +int sm4_avx_cfb_decrypt(struct skcipher_request *req, + unsigned int bsize, sm4_crypt_func func); + +int sm4_avx_ctr_crypt(struct skcipher_request *req, + unsigned int bsize, sm4_crypt_func func); + +#endif diff --git a/arch/x86/crypto/sm4_aesni_avx_glue.c b/arch/x86/crypto/sm4_aesni_avx_glue.c index c1f5728efd1d7..7800f77d68add 100644 --- a/arch/x86/crypto/sm4_aesni_avx_glue.c +++ b/arch/x86/crypto/sm4_aesni_avx_glue.c @@ -15,6 +15,7 @@ #include #include #include +#include "sm4-avx.h" #define SM4_CRYPT8_BLOCK_SIZE (SM4_BLOCK_SIZE * 8) @@ -71,23 +72,25 @@ static int ecb_do_crypt(struct skcipher_request *req, const u32 *rkey) return err; } -static int ecb_encrypt(struct skcipher_request *req) +int sm4_avx_ecb_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); return ecb_do_crypt(req, ctx->rkey_enc); } +EXPORT_SYMBOL_GPL(sm4_avx_ecb_encrypt); -static int ecb_decrypt(struct skcipher_request *req) +int sm4_avx_ecb_decrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); return ecb_do_crypt(req, ctx->rkey_dec); } +EXPORT_SYMBOL_GPL(sm4_avx_ecb_decrypt); -static int cbc_encrypt(struct skcipher_request *req) +int sm4_cbc_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); @@ -118,8 +121,10 @@ static int cbc_encrypt(struct skcipher_request *req) return err; } +EXPORT_SYMBOL_GPL(sm4_cbc_encrypt); -static int cbc_decrypt(struct skcipher_request *req) +int sm4_avx_cbc_decrypt(struct skcipher_request *req, + unsigned int bsize, sm4_crypt_func func) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); @@ -135,15 +140,14 @@ static int cbc_decrypt(struct skcipher_request *req) kernel_fpu_begin(); - while (nbytes >= SM4_CRYPT8_BLOCK_SIZE) { - sm4_aesni_avx_cbc_dec_blk8(ctx->rkey_dec, dst, - src, walk.iv); - dst += SM4_CRYPT8_BLOCK_SIZE; - src += SM4_CRYPT8_BLOCK_SIZE; - nbytes -= SM4_CRYPT8_BLOCK_SIZE; + while (nbytes >= bsize) { + func(ctx->rkey_dec, dst, src, walk.iv); + dst += bsize; + src += bsize; + nbytes -= bsize; } - if (nbytes >= SM4_BLOCK_SIZE) { + while (nbytes >= SM4_BLOCK_SIZE) { u8 keystream[SM4_BLOCK_SIZE * 8]; u8 iv[SM4_BLOCK_SIZE]; unsigned int nblocks = min(nbytes >> 4, 8u); @@ -165,6 +169,8 @@ static int cbc_decrypt(struct skcipher_request *req) } crypto_xor_cpy(dst, walk.iv, keystream, SM4_BLOCK_SIZE); memcpy(walk.iv, iv, SM4_BLOCK_SIZE); + dst += nblocks * SM4_BLOCK_SIZE; + src += (nblocks + 1) * SM4_BLOCK_SIZE; nbytes -= nblocks * SM4_BLOCK_SIZE; } @@ -174,8 +180,15 @@ static int cbc_decrypt(struct skcipher_request *req) return err; } +EXPORT_SYMBOL_GPL(sm4_avx_cbc_decrypt); + +static int cbc_decrypt(struct skcipher_request *req) +{ + return sm4_avx_cbc_decrypt(req, SM4_CRYPT8_BLOCK_SIZE, + sm4_aesni_avx_cbc_dec_blk8); +} -static int cfb_encrypt(struct skcipher_request *req) +int sm4_cfb_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); @@ -214,8 +227,10 @@ static int cfb_encrypt(struct skcipher_request *req) return err; } +EXPORT_SYMBOL_GPL(sm4_cfb_encrypt); -static int cfb_decrypt(struct skcipher_request *req) +int sm4_avx_cfb_decrypt(struct skcipher_request *req, + unsigned int bsize, sm4_crypt_func func) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); @@ -231,15 +246,14 @@ static int cfb_decrypt(struct skcipher_request *req) kernel_fpu_begin(); - while (nbytes >= SM4_CRYPT8_BLOCK_SIZE) { - sm4_aesni_avx_cfb_dec_blk8(ctx->rkey_enc, dst, - src, walk.iv); - dst += SM4_CRYPT8_BLOCK_SIZE; - src += SM4_CRYPT8_BLOCK_SIZE; - nbytes -= SM4_CRYPT8_BLOCK_SIZE; + while (nbytes >= bsize) { + func(ctx->rkey_enc, dst, src, walk.iv); + dst += bsize; + src += bsize; + nbytes -= bsize; } - if (nbytes >= SM4_BLOCK_SIZE) { + while (nbytes >= SM4_BLOCK_SIZE) { u8 keystream[SM4_BLOCK_SIZE * 8]; unsigned int nblocks = min(nbytes >> 4, 8u); @@ -276,8 +290,16 @@ static int cfb_decrypt(struct skcipher_request *req) return err; } +EXPORT_SYMBOL_GPL(sm4_avx_cfb_decrypt); -static int ctr_crypt(struct skcipher_request *req) +static int cfb_decrypt(struct skcipher_request *req) +{ + return sm4_avx_cfb_decrypt(req, SM4_CRYPT8_BLOCK_SIZE, + sm4_aesni_avx_cfb_dec_blk8); +} + +int sm4_avx_ctr_crypt(struct skcipher_request *req, + unsigned int bsize, sm4_crypt_func func) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); @@ -293,15 +315,14 @@ static int ctr_crypt(struct skcipher_request *req) kernel_fpu_begin(); - while (nbytes >= SM4_CRYPT8_BLOCK_SIZE) { - sm4_aesni_avx_ctr_enc_blk8(ctx->rkey_enc, dst, - src, walk.iv); - dst += SM4_CRYPT8_BLOCK_SIZE; - src += SM4_CRYPT8_BLOCK_SIZE; - nbytes -= SM4_CRYPT8_BLOCK_SIZE; + while (nbytes >= bsize) { + func(ctx->rkey_enc, dst, src, walk.iv); + dst += bsize; + src += bsize; + nbytes -= bsize; } - if (nbytes >= SM4_BLOCK_SIZE) { + while (nbytes >= SM4_BLOCK_SIZE) { u8 keystream[SM4_BLOCK_SIZE * 8]; unsigned int nblocks = min(nbytes >> 4, 8u); int i; @@ -343,6 +364,13 @@ static int ctr_crypt(struct skcipher_request *req) return err; } +EXPORT_SYMBOL_GPL(sm4_avx_ctr_crypt); + +static int ctr_crypt(struct skcipher_request *req) +{ + return sm4_avx_ctr_crypt(req, SM4_CRYPT8_BLOCK_SIZE, + sm4_aesni_avx_ctr_enc_blk8); +} static struct skcipher_alg sm4_aesni_avx_skciphers[] = { { @@ -359,8 +387,8 @@ static struct skcipher_alg sm4_aesni_avx_skciphers[] = { .max_keysize = SM4_KEY_SIZE, .walksize = 8 * SM4_BLOCK_SIZE, .setkey = sm4_skcipher_setkey, - .encrypt = ecb_encrypt, - .decrypt = ecb_decrypt, + .encrypt = sm4_avx_ecb_encrypt, + .decrypt = sm4_avx_ecb_decrypt, }, { .base = { .cra_name = "__cbc(sm4)", @@ -376,7 +404,7 @@ static struct skcipher_alg sm4_aesni_avx_skciphers[] = { .ivsize = SM4_BLOCK_SIZE, .walksize = 8 * SM4_BLOCK_SIZE, .setkey = sm4_skcipher_setkey, - .encrypt = cbc_encrypt, + .encrypt = sm4_cbc_encrypt, .decrypt = cbc_decrypt, }, { .base = { @@ -394,7 +422,7 @@ static struct skcipher_alg sm4_aesni_avx_skciphers[] = { .chunksize = SM4_BLOCK_SIZE, .walksize = 8 * SM4_BLOCK_SIZE, .setkey = sm4_skcipher_setkey, - .encrypt = cfb_encrypt, + .encrypt = sm4_cfb_encrypt, .decrypt = cfb_decrypt, }, { .base = { From ac1f7cceb5ac1705bd88f0519074ba9f64494f78 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Wed, 18 Aug 2021 11:31:17 +0800 Subject: [PATCH 0206/5344] crypto: x86/sm4 - add AES-NI/AVX2/x86_64 implementation commit 5b2efa2bb865eb784e06987c7ce98c3c835b495b upstream. Like the implementation of AESNI/AVX, this patch adds an accelerated implementation of AESNI/AVX2. In terms of code implementation, by reusing AESNI/AVX mode-related codes, the amount of code is greatly reduced. From the benchmark data, it can be seen that when the block size is 1024, compared to AVX acceleration, the performance achieved by AVX2 has increased by about 70%, it is also 7.7 times of the pure software implementation of sm4-generic. The main algorithm implementation comes from SM4 AES-NI work by libgcrypt and Markku-Juhani O. Saarinen at: https://github.com/mjosaarinen/sm4ni This optimization supports the four modes of SM4, ECB, CBC, CFB, and CTR. Since CBC and CFB do not support multiple block parallel encryption, the optimization effect is not obvious. Benchmark on Intel i5-6200U 2.30GHz, performance data of three implementation methods, pure software sm4-generic, aesni/avx acceleration, and aesni/avx2 acceleration, the data comes from the 218 mode and 518 mode of tcrypt. The abscissas are blocks of different lengths. The data is tabulated and the unit is Mb/s: block-size | 16 64 128 256 1024 1420 4096 sm4-generic ECB enc | 60.94 70.41 72.27 73.02 73.87 73.58 73.59 ECB dec | 61.87 70.53 72.15 73.09 73.89 73.92 73.86 CBC enc | 56.71 66.31 68.05 69.84 70.02 70.12 70.24 CBC dec | 54.54 65.91 68.22 69.51 70.63 70.79 70.82 CFB enc | 57.21 67.24 69.10 70.25 70.73 70.52 71.42 CFB dec | 57.22 64.74 66.31 67.24 67.40 67.64 67.58 CTR enc | 59.47 68.64 69.91 71.02 71.86 71.61 71.95 CTR dec | 59.94 68.77 69.95 71.00 71.84 71.55 71.95 sm4-aesni-avx ECB enc | 44.95 177.35 292.06 316.98 339.48 322.27 330.59 ECB dec | 45.28 178.66 292.31 317.52 339.59 322.52 331.16 CBC enc | 57.75 67.68 69.72 70.60 71.48 71.63 71.74 CBC dec | 44.32 176.83 284.32 307.24 328.61 312.61 325.82 CFB enc | 57.81 67.64 69.63 70.55 71.40 71.35 71.70 CFB dec | 43.14 167.78 282.03 307.20 328.35 318.24 325.95 CTR enc | 42.35 163.32 279.11 302.93 320.86 310.56 317.93 CTR dec | 42.39 162.81 278.49 302.37 321.11 310.33 318.37 sm4-aesni-avx2 ECB enc | 45.19 177.41 292.42 316.12 339.90 322.53 330.54 ECB dec | 44.83 178.90 291.45 317.31 339.85 322.55 331.07 CBC enc | 57.66 67.62 69.73 70.55 71.58 71.66 71.77 CBC dec | 44.34 176.86 286.10 501.68 559.58 483.87 527.46 CFB enc | 57.43 67.60 69.61 70.52 71.43 71.28 71.65 CFB dec | 43.12 167.75 268.09 499.33 558.35 490.36 524.73 CTR enc | 42.42 163.39 256.17 493.95 552.45 481.58 517.19 CTR dec | 42.49 163.11 256.36 493.34 552.62 481.49 516.83 Signed-off-by: Tianjia Zhang Signed-off-by: Herbert Xu Signed-off-by: Guixiong Wei --- arch/x86/crypto/Makefile | 3 + arch/x86/crypto/sm4-aesni-avx2-asm_64.S | 497 ++++++++++++++++++++++++ arch/x86/crypto/sm4_aesni_avx2_glue.c | 169 ++++++++ crypto/Kconfig | 22 ++ 4 files changed, 691 insertions(+) create mode 100644 arch/x86/crypto/sm4-aesni-avx2-asm_64.S create mode 100644 arch/x86/crypto/sm4_aesni_avx2_glue.c diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 9275e113a177d..97711d9b15bdc 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -119,3 +119,6 @@ crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o obj-$(CONFIG_CRYPTO_SM4_AESNI_AVX_X86_64) += sm4-aesni-avx-x86_64.o sm4-aesni-avx-x86_64-y := sm4-aesni-avx-asm_64.o sm4_aesni_avx_glue.o +obj-$(CONFIG_CRYPTO_SM4_AESNI_AVX2_X86_64) += sm4-aesni-avx2-x86_64.o +sm4-aesni-avx2-x86_64-y := sm4-aesni-avx2-asm_64.o sm4_aesni_avx2_glue.o + diff --git a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S new file mode 100644 index 0000000000000..d2ffd7f76ee23 --- /dev/null +++ b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S @@ -0,0 +1,497 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * SM4 Cipher Algorithm, AES-NI/AVX2 optimized. + * as specified in + * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html + * + * Copyright (C) 2018 Markku-Juhani O. Saarinen + * Copyright (C) 2020 Jussi Kivilinna + * Copyright (c) 2021 Tianjia Zhang + */ + +/* Based on SM4 AES-NI work by libgcrypt and Markku-Juhani O. Saarinen at: + * https://github.com/mjosaarinen/sm4ni + */ + +#include +#include + +#define rRIP (%rip) + +/* vector registers */ +#define RX0 %ymm0 +#define RX1 %ymm1 +#define MASK_4BIT %ymm2 +#define RTMP0 %ymm3 +#define RTMP1 %ymm4 +#define RTMP2 %ymm5 +#define RTMP3 %ymm6 +#define RTMP4 %ymm7 + +#define RA0 %ymm8 +#define RA1 %ymm9 +#define RA2 %ymm10 +#define RA3 %ymm11 + +#define RB0 %ymm12 +#define RB1 %ymm13 +#define RB2 %ymm14 +#define RB3 %ymm15 + +#define RNOT %ymm0 +#define RBSWAP %ymm1 + +#define RX0x %xmm0 +#define RX1x %xmm1 +#define MASK_4BITx %xmm2 + +#define RNOTx %xmm0 +#define RBSWAPx %xmm1 + +#define RTMP0x %xmm3 +#define RTMP1x %xmm4 +#define RTMP2x %xmm5 +#define RTMP3x %xmm6 +#define RTMP4x %xmm7 + + +/* helper macros */ + +/* Transpose four 32-bit words between 128-bit vector lanes. */ +#define transpose_4x4(x0, x1, x2, x3, t1, t2) \ + vpunpckhdq x1, x0, t2; \ + vpunpckldq x1, x0, x0; \ + \ + vpunpckldq x3, x2, t1; \ + vpunpckhdq x3, x2, x2; \ + \ + vpunpckhqdq t1, x0, x1; \ + vpunpcklqdq t1, x0, x0; \ + \ + vpunpckhqdq x2, t2, x3; \ + vpunpcklqdq x2, t2, x2; + +/* post-SubByte transform. */ +#define transform_pre(x, lo_t, hi_t, mask4bit, tmp0) \ + vpand x, mask4bit, tmp0; \ + vpandn x, mask4bit, x; \ + vpsrld $4, x, x; \ + \ + vpshufb tmp0, lo_t, tmp0; \ + vpshufb x, hi_t, x; \ + vpxor tmp0, x, x; + +/* post-SubByte transform. Note: x has been XOR'ed with mask4bit by + * 'vaeslastenc' instruction. */ +#define transform_post(x, lo_t, hi_t, mask4bit, tmp0) \ + vpandn mask4bit, x, tmp0; \ + vpsrld $4, x, x; \ + vpand x, mask4bit, x; \ + \ + vpshufb tmp0, lo_t, tmp0; \ + vpshufb x, hi_t, x; \ + vpxor tmp0, x, x; + + +.section .rodata.cst164, "aM", @progbits, 164 +.align 16 + +/* + * Following four affine transform look-up tables are from work by + * Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni + * + * These allow exposing SM4 S-Box from AES SubByte. + */ + +/* pre-SubByte affine transform, from SM4 field to AES field. */ +.Lpre_tf_lo_s: + .quad 0x9197E2E474720701, 0xC7C1B4B222245157 +.Lpre_tf_hi_s: + .quad 0xE240AB09EB49A200, 0xF052B91BF95BB012 + +/* post-SubByte affine transform, from AES field to SM4 field. */ +.Lpost_tf_lo_s: + .quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82 +.Lpost_tf_hi_s: + .quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF + +/* For isolating SubBytes from AESENCLAST, inverse shift row */ +.Linv_shift_row: + .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b + .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 + +/* Inverse shift row + Rotate left by 8 bits on 32-bit words with vpshufb */ +.Linv_shift_row_rol_8: + .byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e + .byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06 + +/* Inverse shift row + Rotate left by 16 bits on 32-bit words with vpshufb */ +.Linv_shift_row_rol_16: + .byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01 + .byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09 + +/* Inverse shift row + Rotate left by 24 bits on 32-bit words with vpshufb */ +.Linv_shift_row_rol_24: + .byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04 + .byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c + +/* For CTR-mode IV byteswap */ +.Lbswap128_mask: + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +/* For input word byte-swap */ +.Lbswap32_mask: + .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 + +.align 4 +/* 4-bit mask */ +.L0f0f0f0f: + .long 0x0f0f0f0f + +.text +.align 16 + +.align 8 +SYM_FUNC_START_LOCAL(__sm4_crypt_blk16) + /* input: + * %rdi: round key array, CTX + * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel + * plaintext blocks + * output: + * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel + * ciphertext blocks + */ + FRAME_BEGIN + + vbroadcasti128 .Lbswap32_mask rRIP, RTMP2; + vpshufb RTMP2, RA0, RA0; + vpshufb RTMP2, RA1, RA1; + vpshufb RTMP2, RA2, RA2; + vpshufb RTMP2, RA3, RA3; + vpshufb RTMP2, RB0, RB0; + vpshufb RTMP2, RB1, RB1; + vpshufb RTMP2, RB2, RB2; + vpshufb RTMP2, RB3, RB3; + + vpbroadcastd .L0f0f0f0f rRIP, MASK_4BIT; + transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1); + transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1); + +#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3) \ + vpbroadcastd (4*(round))(%rdi), RX0; \ + vbroadcasti128 .Lpre_tf_lo_s rRIP, RTMP4; \ + vbroadcasti128 .Lpre_tf_hi_s rRIP, RTMP1; \ + vmovdqa RX0, RX1; \ + vpxor s1, RX0, RX0; \ + vpxor s2, RX0, RX0; \ + vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \ + vbroadcasti128 .Lpost_tf_lo_s rRIP, RTMP2; \ + vbroadcasti128 .Lpost_tf_hi_s rRIP, RTMP3; \ + vpxor r1, RX1, RX1; \ + vpxor r2, RX1, RX1; \ + vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */ \ + \ + /* sbox, non-linear part */ \ + transform_pre(RX0, RTMP4, RTMP1, MASK_4BIT, RTMP0); \ + transform_pre(RX1, RTMP4, RTMP1, MASK_4BIT, RTMP0); \ + vextracti128 $1, RX0, RTMP4x; \ + vextracti128 $1, RX1, RTMP0x; \ + vaesenclast MASK_4BITx, RX0x, RX0x; \ + vaesenclast MASK_4BITx, RTMP4x, RTMP4x; \ + vaesenclast MASK_4BITx, RX1x, RX1x; \ + vaesenclast MASK_4BITx, RTMP0x, RTMP0x; \ + vinserti128 $1, RTMP4x, RX0, RX0; \ + vbroadcasti128 .Linv_shift_row rRIP, RTMP4; \ + vinserti128 $1, RTMP0x, RX1, RX1; \ + transform_post(RX0, RTMP2, RTMP3, MASK_4BIT, RTMP0); \ + transform_post(RX1, RTMP2, RTMP3, MASK_4BIT, RTMP0); \ + \ + /* linear part */ \ + vpshufb RTMP4, RX0, RTMP0; \ + vpxor RTMP0, s0, s0; /* s0 ^ x */ \ + vpshufb RTMP4, RX1, RTMP2; \ + vbroadcasti128 .Linv_shift_row_rol_8 rRIP, RTMP4; \ + vpxor RTMP2, r0, r0; /* r0 ^ x */ \ + vpshufb RTMP4, RX0, RTMP1; \ + vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */ \ + vpshufb RTMP4, RX1, RTMP3; \ + vbroadcasti128 .Linv_shift_row_rol_16 rRIP, RTMP4; \ + vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) */ \ + vpshufb RTMP4, RX0, RTMP1; \ + vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \ + vpshufb RTMP4, RX1, RTMP3; \ + vbroadcasti128 .Linv_shift_row_rol_24 rRIP, RTMP4; \ + vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */ \ + vpshufb RTMP4, RX0, RTMP1; \ + vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */ \ + vpslld $2, RTMP0, RTMP1; \ + vpsrld $30, RTMP0, RTMP0; \ + vpxor RTMP0, s0, s0; \ + /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \ + vpxor RTMP1, s0, s0; \ + vpshufb RTMP4, RX1, RTMP3; \ + vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */ \ + vpslld $2, RTMP2, RTMP3; \ + vpsrld $30, RTMP2, RTMP2; \ + vpxor RTMP2, r0, r0; \ + /* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \ + vpxor RTMP3, r0, r0; + + leaq (32*4)(%rdi), %rax; +.align 16 +.Lroundloop_blk8: + ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3); + ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0); + ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1); + ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2); + leaq (4*4)(%rdi), %rdi; + cmpq %rax, %rdi; + jne .Lroundloop_blk8; + +#undef ROUND + + vbroadcasti128 .Lbswap128_mask rRIP, RTMP2; + + transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1); + transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1); + vpshufb RTMP2, RA0, RA0; + vpshufb RTMP2, RA1, RA1; + vpshufb RTMP2, RA2, RA2; + vpshufb RTMP2, RA3, RA3; + vpshufb RTMP2, RB0, RB0; + vpshufb RTMP2, RB1, RB1; + vpshufb RTMP2, RB2, RB2; + vpshufb RTMP2, RB3, RB3; + + FRAME_END + ret; +SYM_FUNC_END(__sm4_crypt_blk16) + +#define inc_le128(x, minus_one, tmp) \ + vpcmpeqq minus_one, x, tmp; \ + vpsubq minus_one, x, x; \ + vpslldq $8, tmp, tmp; \ + vpsubq tmp, x, x; + +/* + * void sm4_aesni_avx2_ctr_enc_blk16(const u32 *rk, u8 *dst, + * const u8 *src, u8 *iv) + */ +.align 8 +SYM_FUNC_START(sm4_aesni_avx2_ctr_enc_blk16) + /* input: + * %rdi: round key array, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + * %rcx: iv (big endian, 128bit) + */ + FRAME_BEGIN + + movq 8(%rcx), %rax; + bswapq %rax; + + vzeroupper; + + vbroadcasti128 .Lbswap128_mask rRIP, RTMP3; + vpcmpeqd RNOT, RNOT, RNOT; + vpsrldq $8, RNOT, RNOT; /* ab: -1:0 ; cd: -1:0 */ + vpaddq RNOT, RNOT, RTMP2; /* ab: -2:0 ; cd: -2:0 */ + + /* load IV and byteswap */ + vmovdqu (%rcx), RTMP4x; + vpshufb RTMP3x, RTMP4x, RTMP4x; + vmovdqa RTMP4x, RTMP0x; + inc_le128(RTMP4x, RNOTx, RTMP1x); + vinserti128 $1, RTMP4x, RTMP0, RTMP0; + vpshufb RTMP3, RTMP0, RA0; /* +1 ; +0 */ + + /* check need for handling 64-bit overflow and carry */ + cmpq $(0xffffffffffffffff - 16), %rax; + ja .Lhandle_ctr_carry; + + /* construct IVs */ + vpsubq RTMP2, RTMP0, RTMP0; /* +3 ; +2 */ + vpshufb RTMP3, RTMP0, RA1; + vpsubq RTMP2, RTMP0, RTMP0; /* +5 ; +4 */ + vpshufb RTMP3, RTMP0, RA2; + vpsubq RTMP2, RTMP0, RTMP0; /* +7 ; +6 */ + vpshufb RTMP3, RTMP0, RA3; + vpsubq RTMP2, RTMP0, RTMP0; /* +9 ; +8 */ + vpshufb RTMP3, RTMP0, RB0; + vpsubq RTMP2, RTMP0, RTMP0; /* +11 ; +10 */ + vpshufb RTMP3, RTMP0, RB1; + vpsubq RTMP2, RTMP0, RTMP0; /* +13 ; +12 */ + vpshufb RTMP3, RTMP0, RB2; + vpsubq RTMP2, RTMP0, RTMP0; /* +15 ; +14 */ + vpshufb RTMP3, RTMP0, RB3; + vpsubq RTMP2, RTMP0, RTMP0; /* +16 */ + vpshufb RTMP3x, RTMP0x, RTMP0x; + + jmp .Lctr_carry_done; + +.Lhandle_ctr_carry: + /* construct IVs */ + inc_le128(RTMP0, RNOT, RTMP1); + inc_le128(RTMP0, RNOT, RTMP1); + vpshufb RTMP3, RTMP0, RA1; /* +3 ; +2 */ + inc_le128(RTMP0, RNOT, RTMP1); + inc_le128(RTMP0, RNOT, RTMP1); + vpshufb RTMP3, RTMP0, RA2; /* +5 ; +4 */ + inc_le128(RTMP0, RNOT, RTMP1); + inc_le128(RTMP0, RNOT, RTMP1); + vpshufb RTMP3, RTMP0, RA3; /* +7 ; +6 */ + inc_le128(RTMP0, RNOT, RTMP1); + inc_le128(RTMP0, RNOT, RTMP1); + vpshufb RTMP3, RTMP0, RB0; /* +9 ; +8 */ + inc_le128(RTMP0, RNOT, RTMP1); + inc_le128(RTMP0, RNOT, RTMP1); + vpshufb RTMP3, RTMP0, RB1; /* +11 ; +10 */ + inc_le128(RTMP0, RNOT, RTMP1); + inc_le128(RTMP0, RNOT, RTMP1); + vpshufb RTMP3, RTMP0, RB2; /* +13 ; +12 */ + inc_le128(RTMP0, RNOT, RTMP1); + inc_le128(RTMP0, RNOT, RTMP1); + vpshufb RTMP3, RTMP0, RB3; /* +15 ; +14 */ + inc_le128(RTMP0, RNOT, RTMP1); + vextracti128 $1, RTMP0, RTMP0x; + vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */ + +.align 4 +.Lctr_carry_done: + /* store new IV */ + vmovdqu RTMP0x, (%rcx); + + call __sm4_crypt_blk16; + + vpxor (0 * 32)(%rdx), RA0, RA0; + vpxor (1 * 32)(%rdx), RA1, RA1; + vpxor (2 * 32)(%rdx), RA2, RA2; + vpxor (3 * 32)(%rdx), RA3, RA3; + vpxor (4 * 32)(%rdx), RB0, RB0; + vpxor (5 * 32)(%rdx), RB1, RB1; + vpxor (6 * 32)(%rdx), RB2, RB2; + vpxor (7 * 32)(%rdx), RB3, RB3; + + vmovdqu RA0, (0 * 32)(%rsi); + vmovdqu RA1, (1 * 32)(%rsi); + vmovdqu RA2, (2 * 32)(%rsi); + vmovdqu RA3, (3 * 32)(%rsi); + vmovdqu RB0, (4 * 32)(%rsi); + vmovdqu RB1, (5 * 32)(%rsi); + vmovdqu RB2, (6 * 32)(%rsi); + vmovdqu RB3, (7 * 32)(%rsi); + + vzeroall; + FRAME_END + ret; +SYM_FUNC_END(sm4_aesni_avx2_ctr_enc_blk16) + +/* + * void sm4_aesni_avx2_cbc_dec_blk16(const u32 *rk, u8 *dst, + * const u8 *src, u8 *iv) + */ +.align 8 +SYM_FUNC_START(sm4_aesni_avx2_cbc_dec_blk16) + /* input: + * %rdi: round key array, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + * %rcx: iv + */ + FRAME_BEGIN + + vzeroupper; + + vmovdqu (0 * 32)(%rdx), RA0; + vmovdqu (1 * 32)(%rdx), RA1; + vmovdqu (2 * 32)(%rdx), RA2; + vmovdqu (3 * 32)(%rdx), RA3; + vmovdqu (4 * 32)(%rdx), RB0; + vmovdqu (5 * 32)(%rdx), RB1; + vmovdqu (6 * 32)(%rdx), RB2; + vmovdqu (7 * 32)(%rdx), RB3; + + call __sm4_crypt_blk16; + + vmovdqu (%rcx), RNOTx; + vinserti128 $1, (%rdx), RNOT, RNOT; + vpxor RNOT, RA0, RA0; + vpxor (0 * 32 + 16)(%rdx), RA1, RA1; + vpxor (1 * 32 + 16)(%rdx), RA2, RA2; + vpxor (2 * 32 + 16)(%rdx), RA3, RA3; + vpxor (3 * 32 + 16)(%rdx), RB0, RB0; + vpxor (4 * 32 + 16)(%rdx), RB1, RB1; + vpxor (5 * 32 + 16)(%rdx), RB2, RB2; + vpxor (6 * 32 + 16)(%rdx), RB3, RB3; + vmovdqu (7 * 32 + 16)(%rdx), RNOTx; + vmovdqu RNOTx, (%rcx); /* store new IV */ + + vmovdqu RA0, (0 * 32)(%rsi); + vmovdqu RA1, (1 * 32)(%rsi); + vmovdqu RA2, (2 * 32)(%rsi); + vmovdqu RA3, (3 * 32)(%rsi); + vmovdqu RB0, (4 * 32)(%rsi); + vmovdqu RB1, (5 * 32)(%rsi); + vmovdqu RB2, (6 * 32)(%rsi); + vmovdqu RB3, (7 * 32)(%rsi); + + vzeroall; + FRAME_END + ret; +SYM_FUNC_END(sm4_aesni_avx2_cbc_dec_blk16) + +/* + * void sm4_aesni_avx2_cfb_dec_blk16(const u32 *rk, u8 *dst, + * const u8 *src, u8 *iv) + */ +.align 8 +SYM_FUNC_START(sm4_aesni_avx2_cfb_dec_blk16) + /* input: + * %rdi: round key array, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + * %rcx: iv + */ + FRAME_BEGIN + + vzeroupper; + + /* Load input */ + vmovdqu (%rcx), RNOTx; + vinserti128 $1, (%rdx), RNOT, RA0; + vmovdqu (0 * 32 + 16)(%rdx), RA1; + vmovdqu (1 * 32 + 16)(%rdx), RA2; + vmovdqu (2 * 32 + 16)(%rdx), RA3; + vmovdqu (3 * 32 + 16)(%rdx), RB0; + vmovdqu (4 * 32 + 16)(%rdx), RB1; + vmovdqu (5 * 32 + 16)(%rdx), RB2; + vmovdqu (6 * 32 + 16)(%rdx), RB3; + + /* Update IV */ + vmovdqu (7 * 32 + 16)(%rdx), RNOTx; + vmovdqu RNOTx, (%rcx); + + call __sm4_crypt_blk16; + + vpxor (0 * 32)(%rdx), RA0, RA0; + vpxor (1 * 32)(%rdx), RA1, RA1; + vpxor (2 * 32)(%rdx), RA2, RA2; + vpxor (3 * 32)(%rdx), RA3, RA3; + vpxor (4 * 32)(%rdx), RB0, RB0; + vpxor (5 * 32)(%rdx), RB1, RB1; + vpxor (6 * 32)(%rdx), RB2, RB2; + vpxor (7 * 32)(%rdx), RB3, RB3; + + vmovdqu RA0, (0 * 32)(%rsi); + vmovdqu RA1, (1 * 32)(%rsi); + vmovdqu RA2, (2 * 32)(%rsi); + vmovdqu RA3, (3 * 32)(%rsi); + vmovdqu RB0, (4 * 32)(%rsi); + vmovdqu RB1, (5 * 32)(%rsi); + vmovdqu RB2, (6 * 32)(%rsi); + vmovdqu RB3, (7 * 32)(%rsi); + + vzeroall; + FRAME_END + ret; +SYM_FUNC_END(sm4_aesni_avx2_cfb_dec_blk16) diff --git a/arch/x86/crypto/sm4_aesni_avx2_glue.c b/arch/x86/crypto/sm4_aesni_avx2_glue.c new file mode 100644 index 0000000000000..84bc718f49a3d --- /dev/null +++ b/arch/x86/crypto/sm4_aesni_avx2_glue.c @@ -0,0 +1,169 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SM4 Cipher Algorithm, AES-NI/AVX2 optimized. + * as specified in + * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html + * + * Copyright (c) 2021, Alibaba Group. + * Copyright (c) 2021 Tianjia Zhang + */ + +#include +#include +#include +#include +#include +#include +#include +#include "sm4-avx.h" + +#define SM4_CRYPT16_BLOCK_SIZE (SM4_BLOCK_SIZE * 16) + +asmlinkage void sm4_aesni_avx2_ctr_enc_blk16(const u32 *rk, u8 *dst, + const u8 *src, u8 *iv); +asmlinkage void sm4_aesni_avx2_cbc_dec_blk16(const u32 *rk, u8 *dst, + const u8 *src, u8 *iv); +asmlinkage void sm4_aesni_avx2_cfb_dec_blk16(const u32 *rk, u8 *dst, + const u8 *src, u8 *iv); + +static int sm4_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int key_len) +{ + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + + return sm4_expandkey(ctx, key, key_len); +} + +static int cbc_decrypt(struct skcipher_request *req) +{ + return sm4_avx_cbc_decrypt(req, SM4_CRYPT16_BLOCK_SIZE, + sm4_aesni_avx2_cbc_dec_blk16); +} + + +static int cfb_decrypt(struct skcipher_request *req) +{ + return sm4_avx_cfb_decrypt(req, SM4_CRYPT16_BLOCK_SIZE, + sm4_aesni_avx2_cfb_dec_blk16); +} + +static int ctr_crypt(struct skcipher_request *req) +{ + return sm4_avx_ctr_crypt(req, SM4_CRYPT16_BLOCK_SIZE, + sm4_aesni_avx2_ctr_enc_blk16); +} + +static struct skcipher_alg sm4_aesni_avx2_skciphers[] = { + { + .base = { + .cra_name = "__ecb(sm4)", + .cra_driver_name = "__ecb-sm4-aesni-avx2", + .cra_priority = 500, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = SM4_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .walksize = 16 * SM4_BLOCK_SIZE, + .setkey = sm4_skcipher_setkey, + .encrypt = sm4_avx_ecb_encrypt, + .decrypt = sm4_avx_ecb_decrypt, + }, { + .base = { + .cra_name = "__cbc(sm4)", + .cra_driver_name = "__cbc-sm4-aesni-avx2", + .cra_priority = 500, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = SM4_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .ivsize = SM4_BLOCK_SIZE, + .walksize = 16 * SM4_BLOCK_SIZE, + .setkey = sm4_skcipher_setkey, + .encrypt = sm4_cbc_encrypt, + .decrypt = cbc_decrypt, + }, { + .base = { + .cra_name = "__cfb(sm4)", + .cra_driver_name = "__cfb-sm4-aesni-avx2", + .cra_priority = 500, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .ivsize = SM4_BLOCK_SIZE, + .chunksize = SM4_BLOCK_SIZE, + .walksize = 16 * SM4_BLOCK_SIZE, + .setkey = sm4_skcipher_setkey, + .encrypt = sm4_cfb_encrypt, + .decrypt = cfb_decrypt, + }, { + .base = { + .cra_name = "__ctr(sm4)", + .cra_driver_name = "__ctr-sm4-aesni-avx2", + .cra_priority = 500, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .ivsize = SM4_BLOCK_SIZE, + .chunksize = SM4_BLOCK_SIZE, + .walksize = 16 * SM4_BLOCK_SIZE, + .setkey = sm4_skcipher_setkey, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + } +}; + +static struct simd_skcipher_alg * +simd_sm4_aesni_avx2_skciphers[ARRAY_SIZE(sm4_aesni_avx2_skciphers)]; + +static int __init sm4_init(void) +{ + const char *feature_name; + + if (!boot_cpu_has(X86_FEATURE_AVX) || + !boot_cpu_has(X86_FEATURE_AVX2) || + !boot_cpu_has(X86_FEATURE_AES) || + !boot_cpu_has(X86_FEATURE_OSXSAVE)) { + pr_info("AVX2 or AES-NI instructions are not detected.\n"); + return -ENODEV; + } + + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); + return -ENODEV; + } + + return simd_register_skciphers_compat(sm4_aesni_avx2_skciphers, + ARRAY_SIZE(sm4_aesni_avx2_skciphers), + simd_sm4_aesni_avx2_skciphers); +} + +static void __exit sm4_exit(void) +{ + simd_unregister_skciphers(sm4_aesni_avx2_skciphers, + ARRAY_SIZE(sm4_aesni_avx2_skciphers), + simd_sm4_aesni_avx2_skciphers); +} + +module_init(sm4_init); +module_exit(sm4_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Tianjia Zhang "); +MODULE_DESCRIPTION("SM4 Cipher Algorithm, AES-NI/AVX2 optimized"); +MODULE_ALIAS_CRYPTO("sm4"); +MODULE_ALIAS_CRYPTO("sm4-aesni-avx2"); diff --git a/crypto/Kconfig b/crypto/Kconfig index 7e75808e4b9c3..4b397f32ba1e6 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1584,6 +1584,28 @@ config CRYPTO_SM4_AESNI_AVX_X86_64 If unsure, say N. +config CRYPTO_SM4_AESNI_AVX2_X86_64 + tristate "SM4 cipher algorithm (x86_64/AES-NI/AVX2)" + depends on X86 && 64BIT + select CRYPTO_SKCIPHER + select CRYPTO_SIMD + select CRYPTO_ALGAPI + select CRYPTO_LIB_SM4 + select CRYPTO_SM4_AESNI_AVX_X86_64 + help + SM4 cipher algorithms (OSCCA GB/T 32907-2016) (x86_64/AES-NI/AVX2). + + SM4 (GBT.32907-2016) is a cryptographic standard issued by the + Organization of State Commercial Administration of China (OSCCA) + as an authorized cryptographic algorithms for the use within China. + + This is SM4 optimized implementation using AES-NI/AVX2/x86_64 + instruction set for block cipher. Through two affine transforms, + we can use the AES S-Box to simulate the SM4 S-Box to achieve the + effect of instruction acceleration. + + If unsure, say N. + config CRYPTO_TEA tristate "TEA, XTEA and XETA cipher algorithms" select CRYPTO_ALGAPI From ec55f450456f8d15afc9c6977730bbe1ad1534b7 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 25 Aug 2021 13:38:59 -0700 Subject: [PATCH 0207/5344] crypto: sm4 - Do not change section of ck and sbox commit 4a7e1e5fc294687a8941fa3eeb4a7e8539ca5e2f upstream. When building with clang and GNU as, there is a warning about ignored changed section attributes: /tmp/sm4-c916c8.s: Assembler messages: /tmp/sm4-c916c8.s:677: Warning: ignoring changed section attributes for .data..cacheline_aligned "static const" places the data in .rodata but __cacheline_aligned has the section attribute to place it in .data..cacheline_aligned, in addition to the aligned attribute. To keep the alignment but avoid attempting to change sections, use the ____cacheline_aligned attribute, which is just the aligned attribute. Fixes: 2b31277af577 ("crypto: sm4 - create SM4 library based on sm4 generic code") Link: https://github.com/ClangBuiltLinux/linux/issues/1441 Signed-off-by: Nathan Chancellor Reviewed-by: Tianjia Zhang Signed-off-by: Herbert Xu Signed-off-by: Guixiong Wei --- lib/crypto/sm4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/crypto/sm4.c b/lib/crypto/sm4.c index 633b59fed9db8..284e62576d0c6 100644 --- a/lib/crypto/sm4.c +++ b/lib/crypto/sm4.c @@ -15,7 +15,7 @@ static const u32 fk[4] = { 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc }; -static const u32 __cacheline_aligned ck[32] = { +static const u32 ____cacheline_aligned ck[32] = { 0x00070e15, 0x1c232a31, 0x383f464d, 0x545b6269, 0x70777e85, 0x8c939aa1, 0xa8afb6bd, 0xc4cbd2d9, 0xe0e7eef5, 0xfc030a11, 0x181f262d, 0x343b4249, @@ -26,7 +26,7 @@ static const u32 __cacheline_aligned ck[32] = { 0x10171e25, 0x2c333a41, 0x484f565d, 0x646b7279 }; -static const u8 __cacheline_aligned sbox[256] = { +static const u8 ____cacheline_aligned sbox[256] = { 0xd6, 0x90, 0xe9, 0xfe, 0xcc, 0xe1, 0x3d, 0xb7, 0x16, 0xb6, 0x14, 0xc2, 0x28, 0xfb, 0x2c, 0x05, 0x2b, 0x67, 0x9a, 0x76, 0x2a, 0xbe, 0x04, 0xc3, From 3e854390c8262882156dffb3ead73eb7d0d9988d Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 21 Sep 2021 22:40:26 -0700 Subject: [PATCH 0208/5344] crypto: x86/sm4 - Fix frame pointer stack corruption commit 0e14ef38669ce4faa80589247fe8ed8a3780f414 upstream. sm4_aesni_avx_crypt8() sets up the frame pointer (which includes pushing RBP) before doing a conditional sibling call to sm4_aesni_avx_crypt4(), which sets up an additional frame pointer. Things will not go well when sm4_aesni_avx_crypt4() pops only the innermost single frame pointer and then tries to return to the outermost frame pointer. Sibling calls need to occur with an empty stack frame. Do the conditional sibling call *before* setting up the stack pointer. This fixes the following warning: arch/x86/crypto/sm4-aesni-avx-asm_64.o: warning: objtool: sm4_aesni_avx_crypt8()+0x8: sibling call from callable instruction with modified stack frame Fixes: a7ee22ee1445 ("crypto: x86/sm4 - add AES-NI/AVX/x86_64 implementation") Reported-by: kernel test robot Reported-by: Arnd Bergmann Acked-by: Peter Zijlstra (Intel) Reviewed-by: Tianjia Zhang Signed-off-by: Josh Poimboeuf Signed-off-by: Herbert Xu Signed-off-by: Guixiong Wei --- arch/x86/crypto/sm4-aesni-avx-asm_64.S | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/crypto/sm4-aesni-avx-asm_64.S b/arch/x86/crypto/sm4-aesni-avx-asm_64.S index fa2c3f50aecbd..18d2f51991944 100644 --- a/arch/x86/crypto/sm4-aesni-avx-asm_64.S +++ b/arch/x86/crypto/sm4-aesni-avx-asm_64.S @@ -367,10 +367,11 @@ SYM_FUNC_START(sm4_aesni_avx_crypt8) * %rdx: src (1..8 blocks) * %rcx: num blocks (1..8) */ - FRAME_BEGIN - cmpq $5, %rcx; jb sm4_aesni_avx_crypt4; + + FRAME_BEGIN + vmovdqu (0 * 16)(%rdx), RA0; vmovdqu (1 * 16)(%rdx), RA1; vmovdqu (2 * 16)(%rdx), RA2; From 7b4cb241dab19dc7d02ec27aac507c5e29f5501b Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Fri, 15 Oct 2021 11:47:33 +0800 Subject: [PATCH 0209/5344] crypto: x86/sm4 - Fix invalid section entry size commit f8690a4b5a1b64f74ae5c4f7c4ea880d8a8e1a0d upstream. This fixes the following warning: vmlinux.o: warning: objtool: elf_update: invalid section entry size The size of the rodata section is 164 bytes, directly using the entry_size of 164 bytes will cause errors in some versions of the gcc compiler, while using 16 bytes directly will cause errors in the clang compiler. This patch correct it by filling the size of rodata to a 16-byte boundary. Fixes: a7ee22ee1445 ("crypto: x86/sm4 - add AES-NI/AVX/x86_64 implementation") Fixes: 5b2efa2bb865 ("crypto: x86/sm4 - add AES-NI/AVX2/x86_64 implementation") Reported-by: Peter Zijlstra Reported-by: Abaci Robot Signed-off-by: Tianjia Zhang Tested-by: Heyuan Shi Signed-off-by: Herbert Xu Signed-off-by: Guixiong Wei --- arch/x86/crypto/sm4-aesni-avx-asm_64.S | 6 +++++- arch/x86/crypto/sm4-aesni-avx2-asm_64.S | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/crypto/sm4-aesni-avx-asm_64.S b/arch/x86/crypto/sm4-aesni-avx-asm_64.S index 18d2f51991944..1cc72b4804fab 100644 --- a/arch/x86/crypto/sm4-aesni-avx-asm_64.S +++ b/arch/x86/crypto/sm4-aesni-avx-asm_64.S @@ -78,7 +78,7 @@ vpxor tmp0, x, x; -.section .rodata.cst164, "aM", @progbits, 164 +.section .rodata.cst16, "aM", @progbits, 16 .align 16 /* @@ -133,6 +133,10 @@ .L0f0f0f0f: .long 0x0f0f0f0f +/* 12 bytes, only for padding */ +.Lpadding_deadbeef: + .long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef + .text .align 16 diff --git a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S index d2ffd7f76ee23..9c5d3f3ad45a9 100644 --- a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S @@ -93,7 +93,7 @@ vpxor tmp0, x, x; -.section .rodata.cst164, "aM", @progbits, 164 +.section .rodata.cst16, "aM", @progbits, 16 .align 16 /* @@ -148,6 +148,10 @@ .L0f0f0f0f: .long 0x0f0f0f0f +/* 12 bytes, only for padding */ +.Lpadding_deadbeef: + .long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef + .text .align 16 From 1798445897806fc0807929371fb57b65726f9294 Mon Sep 17 00:00:00 2001 From: Xu Jia Date: Wed, 22 Dec 2021 17:06:59 +0800 Subject: [PATCH 0210/5344] xfrm: Add support for SM4 symmetric cipher algorithm commit 23b6a6df94c6ce434e7947cfad14b1640fb9f794 upstream. This patch adds SM4 encryption algorithm entry to ealg_list. Signed-off-by: Xu Jia Signed-off-by: Steffen Klassert Signed-off-by: Guixiong Wei --- include/uapi/linux/pfkeyv2.h | 1 + net/xfrm/xfrm_algo.c | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/include/uapi/linux/pfkeyv2.h b/include/uapi/linux/pfkeyv2.h index d65b117852604..48174753d50da 100644 --- a/include/uapi/linux/pfkeyv2.h +++ b/include/uapi/linux/pfkeyv2.h @@ -329,6 +329,7 @@ struct sadb_x_filter { #define SADB_X_EALG_AES_GCM_ICV16 20 #define SADB_X_EALG_CAMELLIACBC 22 #define SADB_X_EALG_NULL_AES_GMAC 23 +#define SADB_X_EALG_SM4CBC 24 #define SADB_EALG_MAX 253 /* last EALG */ /* private allocations should use 249-255 (RFC2407) */ #define SADB_X_EALG_SERPENTCBC 252 /* draft-ietf-ipsec-ciph-aes-cbc-00 */ diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c index 32a378e7011ff..86df553395e0a 100644 --- a/net/xfrm/xfrm_algo.c +++ b/net/xfrm/xfrm_algo.c @@ -552,6 +552,27 @@ static struct xfrm_algo_desc ealg_list[] = { .sadb_alg_maxbits = 288 } }, +{ + .name = "cbc(sm4)", + .compat = "sm4", + + .uinfo = { + .encr = { + .geniv = "echainiv", + .blockbits = 128, + .defkeybits = 128, + } + }, + + .pfkey_supported = 1, + + .desc = { + .sadb_alg_id = SADB_X_EALG_SM4CBC, + .sadb_alg_ivlen = 16, + .sadb_alg_minbits = 128, + .sadb_alg_maxbits = 256 + } +}, }; static struct xfrm_algo_desc calg_list[] = { From 1bb3a88807436b4d5a5c254f992914a944359eaf Mon Sep 17 00:00:00 2001 From: Xu Jia Date: Wed, 22 Dec 2021 17:06:58 +0800 Subject: [PATCH 0211/5344] xfrm: Add support for SM3 secure hash commit e6911affa416dc4e0c0b3f04cbe6b02ce13277f1 upstream. This patch allows IPsec to use SM3 HMAC authentication algorithm. Signed-off-by: Xu Jia Signed-off-by: Steffen Klassert Signed-off-by: Guixiong Wei --- include/uapi/linux/pfkeyv2.h | 1 + net/xfrm/xfrm_algo.c | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/include/uapi/linux/pfkeyv2.h b/include/uapi/linux/pfkeyv2.h index 48174753d50da..8abae1f6749c5 100644 --- a/include/uapi/linux/pfkeyv2.h +++ b/include/uapi/linux/pfkeyv2.h @@ -309,6 +309,7 @@ struct sadb_x_filter { #define SADB_X_AALG_SHA2_512HMAC 7 #define SADB_X_AALG_RIPEMD160HMAC 8 #define SADB_X_AALG_AES_XCBC_MAC 9 +#define SADB_X_AALG_SM3_256HMAC 10 #define SADB_X_AALG_NULL 251 /* kame */ #define SADB_AALG_MAX 251 diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c index 86df553395e0a..1b138a9d2369a 100644 --- a/net/xfrm/xfrm_algo.c +++ b/net/xfrm/xfrm_algo.c @@ -341,6 +341,26 @@ static struct xfrm_algo_desc aalg_list[] = { .pfkey_supported = 0, }, +{ + .name = "hmac(sm3)", + .compat = "sm3", + + .uinfo = { + .auth = { + .icv_truncbits = 256, + .icv_fullbits = 256, + } + }, + + .pfkey_supported = 1, + + .desc = { + .sadb_alg_id = SADB_X_AALG_SM3_256HMAC, + .sadb_alg_ivlen = 0, + .sadb_alg_minbits = 256, + .sadb_alg_maxbits = 256 + } +}, }; static struct xfrm_algo_desc ealg_list[] = { From 9cb0e3bc5b668270d9c32f3ac735d123cbc86f57 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Fri, 7 Jan 2022 20:06:55 +0800 Subject: [PATCH 0212/5344] crypto: sm3 - create SM3 stand-alone library commit eb90686d5d10fef9cadd9c0eb30f3fee66d2b2a5 upstream. Stand-alone implementation of the SM3 algorithm. It is designed to have as little dependencies as possible. In other cases you should generally use the hash APIs from include/crypto/hash.h. Especially when hashing large amounts of data as those APIs may be hw-accelerated. In the new SM3 stand-alone library, sm3_transform() has also been optimized, instead of simply using the code in sm3_generic. Signed-off-by: Tianjia Zhang Reviewed-by: Gilad Ben-Yossef Signed-off-by: Herbert Xu --- crypto/Kconfig | 3 + include/crypto/sm3.h | 32 ++++++ lib/crypto/Makefile | 3 + lib/crypto/sm3.c | 246 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 284 insertions(+) create mode 100644 lib/crypto/sm3.c diff --git a/crypto/Kconfig b/crypto/Kconfig index 4b397f32ba1e6..30b0bbef5b0ad 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -961,6 +961,9 @@ config CRYPTO_SHA3 References: http://keccak.noekeon.org/ +config CRYPTO_LIB_SM3 + tristate + config CRYPTO_SM3 tristate "SM3 digest algorithm" select CRYPTO_HASH diff --git a/include/crypto/sm3.h b/include/crypto/sm3.h index 1438942dc7737..299417a1910c8 100644 --- a/include/crypto/sm3.h +++ b/include/crypto/sm3.h @@ -1,5 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Common values for SM3 algorithm + * + * Copyright (C) 2017 ARM Limited or its affiliates. + * Copyright (C) 2017 Gilad Ben-Yossef + * Copyright (C) 2021 Tianjia Zhang */ #ifndef _CRYPTO_SM3_H @@ -37,4 +42,31 @@ extern int crypto_sm3_update(struct shash_desc *desc, const u8 *data, extern int crypto_sm3_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *hash); + +/* + * Stand-alone implementation of the SM3 algorithm. It is designed to + * have as little dependencies as possible so it can be used in the + * kexec_file purgatory. In other cases you should generally use the + * hash APIs from include/crypto/hash.h. Especially when hashing large + * amounts of data as those APIs may be hw-accelerated. + * + * For details see lib/crypto/sm3.c + */ + +static inline void sm3_init(struct sm3_state *sctx) +{ + sctx->state[0] = SM3_IVA; + sctx->state[1] = SM3_IVB; + sctx->state[2] = SM3_IVC; + sctx->state[3] = SM3_IVD; + sctx->state[4] = SM3_IVE; + sctx->state[5] = SM3_IVF; + sctx->state[6] = SM3_IVG; + sctx->state[7] = SM3_IVH; + sctx->count = 0; +} + +void sm3_update(struct sm3_state *sctx, const u8 *data, unsigned int len); +void sm3_final(struct sm3_state *sctx, u8 *out); + #endif diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 5f9e4b3899948..e6f9ed44b463e 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -12,6 +12,9 @@ libdes-y := des.o obj-$(CONFIG_CRYPTO_LIB_SHA256) += libsha256.o libsha256-y := sha256.o +obj-$(CONFIG_CRYPTO_LIB_SM3) += libsm3.o +libsm3-y := sm3.o + obj-$(CONFIG_CRYPTO_LIB_SM4) += libsm4.o libsm4-y := sm4.o diff --git a/lib/crypto/sm3.c b/lib/crypto/sm3.c new file mode 100644 index 0000000000000..d473e358a873a --- /dev/null +++ b/lib/crypto/sm3.c @@ -0,0 +1,246 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * SM3 secure hash, as specified by OSCCA GM/T 0004-2012 SM3 and described + * at https://datatracker.ietf.org/doc/html/draft-sca-cfrg-sm3-02 + * + * Copyright (C) 2017 ARM Limited or its affiliates. + * Copyright (C) 2017 Gilad Ben-Yossef + * Copyright (C) 2021 Tianjia Zhang + */ + +#include +#include +#include + +static const u32 ____cacheline_aligned K[64] = { + 0x79cc4519, 0xf3988a32, 0xe7311465, 0xce6228cb, + 0x9cc45197, 0x3988a32f, 0x7311465e, 0xe6228cbc, + 0xcc451979, 0x988a32f3, 0x311465e7, 0x6228cbce, + 0xc451979c, 0x88a32f39, 0x11465e73, 0x228cbce6, + 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c, + 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce, + 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec, + 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5, + 0x7a879d8a, 0xf50f3b14, 0xea1e7629, 0xd43cec53, + 0xa879d8a7, 0x50f3b14f, 0xa1e7629e, 0x43cec53d, + 0x879d8a7a, 0x0f3b14f5, 0x1e7629ea, 0x3cec53d4, + 0x79d8a7a8, 0xf3b14f50, 0xe7629ea1, 0xcec53d43, + 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c, + 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce, + 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec, + 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5 +}; + +/* + * Transform the message X which consists of 16 32-bit-words. See + * GM/T 004-2012 for details. + */ +#define R(i, a, b, c, d, e, f, g, h, t, w1, w2) \ + do { \ + ss1 = rol32((rol32((a), 12) + (e) + (t)), 7); \ + ss2 = ss1 ^ rol32((a), 12); \ + d += FF ## i(a, b, c) + ss2 + ((w1) ^ (w2)); \ + h += GG ## i(e, f, g) + ss1 + (w1); \ + b = rol32((b), 9); \ + f = rol32((f), 19); \ + h = P0((h)); \ + } while (0) + +#define R1(a, b, c, d, e, f, g, h, t, w1, w2) \ + R(1, a, b, c, d, e, f, g, h, t, w1, w2) +#define R2(a, b, c, d, e, f, g, h, t, w1, w2) \ + R(2, a, b, c, d, e, f, g, h, t, w1, w2) + +#define FF1(x, y, z) (x ^ y ^ z) +#define FF2(x, y, z) ((x & y) | (x & z) | (y & z)) + +#define GG1(x, y, z) FF1(x, y, z) +#define GG2(x, y, z) ((x & y) | (~x & z)) + +/* Message expansion */ +#define P0(x) ((x) ^ rol32((x), 9) ^ rol32((x), 17)) +#define P1(x) ((x) ^ rol32((x), 15) ^ rol32((x), 23)) +#define I(i) (W[i] = get_unaligned_be32(data + i * 4)) +#define W1(i) (W[i & 0x0f]) +#define W2(i) (W[i & 0x0f] = \ + P1(W[i & 0x0f] \ + ^ W[(i-9) & 0x0f] \ + ^ rol32(W[(i-3) & 0x0f], 15)) \ + ^ rol32(W[(i-13) & 0x0f], 7) \ + ^ W[(i-6) & 0x0f]) + +static void sm3_transform(struct sm3_state *sctx, u8 const *data, u32 W[16]) +{ + u32 a, b, c, d, e, f, g, h, ss1, ss2; + + a = sctx->state[0]; + b = sctx->state[1]; + c = sctx->state[2]; + d = sctx->state[3]; + e = sctx->state[4]; + f = sctx->state[5]; + g = sctx->state[6]; + h = sctx->state[7]; + + R1(a, b, c, d, e, f, g, h, K[0], I(0), I(4)); + R1(d, a, b, c, h, e, f, g, K[1], I(1), I(5)); + R1(c, d, a, b, g, h, e, f, K[2], I(2), I(6)); + R1(b, c, d, a, f, g, h, e, K[3], I(3), I(7)); + R1(a, b, c, d, e, f, g, h, K[4], W1(4), I(8)); + R1(d, a, b, c, h, e, f, g, K[5], W1(5), I(9)); + R1(c, d, a, b, g, h, e, f, K[6], W1(6), I(10)); + R1(b, c, d, a, f, g, h, e, K[7], W1(7), I(11)); + R1(a, b, c, d, e, f, g, h, K[8], W1(8), I(12)); + R1(d, a, b, c, h, e, f, g, K[9], W1(9), I(13)); + R1(c, d, a, b, g, h, e, f, K[10], W1(10), I(14)); + R1(b, c, d, a, f, g, h, e, K[11], W1(11), I(15)); + R1(a, b, c, d, e, f, g, h, K[12], W1(12), W2(16)); + R1(d, a, b, c, h, e, f, g, K[13], W1(13), W2(17)); + R1(c, d, a, b, g, h, e, f, K[14], W1(14), W2(18)); + R1(b, c, d, a, f, g, h, e, K[15], W1(15), W2(19)); + + R2(a, b, c, d, e, f, g, h, K[16], W1(16), W2(20)); + R2(d, a, b, c, h, e, f, g, K[17], W1(17), W2(21)); + R2(c, d, a, b, g, h, e, f, K[18], W1(18), W2(22)); + R2(b, c, d, a, f, g, h, e, K[19], W1(19), W2(23)); + R2(a, b, c, d, e, f, g, h, K[20], W1(20), W2(24)); + R2(d, a, b, c, h, e, f, g, K[21], W1(21), W2(25)); + R2(c, d, a, b, g, h, e, f, K[22], W1(22), W2(26)); + R2(b, c, d, a, f, g, h, e, K[23], W1(23), W2(27)); + R2(a, b, c, d, e, f, g, h, K[24], W1(24), W2(28)); + R2(d, a, b, c, h, e, f, g, K[25], W1(25), W2(29)); + R2(c, d, a, b, g, h, e, f, K[26], W1(26), W2(30)); + R2(b, c, d, a, f, g, h, e, K[27], W1(27), W2(31)); + R2(a, b, c, d, e, f, g, h, K[28], W1(28), W2(32)); + R2(d, a, b, c, h, e, f, g, K[29], W1(29), W2(33)); + R2(c, d, a, b, g, h, e, f, K[30], W1(30), W2(34)); + R2(b, c, d, a, f, g, h, e, K[31], W1(31), W2(35)); + + R2(a, b, c, d, e, f, g, h, K[32], W1(32), W2(36)); + R2(d, a, b, c, h, e, f, g, K[33], W1(33), W2(37)); + R2(c, d, a, b, g, h, e, f, K[34], W1(34), W2(38)); + R2(b, c, d, a, f, g, h, e, K[35], W1(35), W2(39)); + R2(a, b, c, d, e, f, g, h, K[36], W1(36), W2(40)); + R2(d, a, b, c, h, e, f, g, K[37], W1(37), W2(41)); + R2(c, d, a, b, g, h, e, f, K[38], W1(38), W2(42)); + R2(b, c, d, a, f, g, h, e, K[39], W1(39), W2(43)); + R2(a, b, c, d, e, f, g, h, K[40], W1(40), W2(44)); + R2(d, a, b, c, h, e, f, g, K[41], W1(41), W2(45)); + R2(c, d, a, b, g, h, e, f, K[42], W1(42), W2(46)); + R2(b, c, d, a, f, g, h, e, K[43], W1(43), W2(47)); + R2(a, b, c, d, e, f, g, h, K[44], W1(44), W2(48)); + R2(d, a, b, c, h, e, f, g, K[45], W1(45), W2(49)); + R2(c, d, a, b, g, h, e, f, K[46], W1(46), W2(50)); + R2(b, c, d, a, f, g, h, e, K[47], W1(47), W2(51)); + + R2(a, b, c, d, e, f, g, h, K[48], W1(48), W2(52)); + R2(d, a, b, c, h, e, f, g, K[49], W1(49), W2(53)); + R2(c, d, a, b, g, h, e, f, K[50], W1(50), W2(54)); + R2(b, c, d, a, f, g, h, e, K[51], W1(51), W2(55)); + R2(a, b, c, d, e, f, g, h, K[52], W1(52), W2(56)); + R2(d, a, b, c, h, e, f, g, K[53], W1(53), W2(57)); + R2(c, d, a, b, g, h, e, f, K[54], W1(54), W2(58)); + R2(b, c, d, a, f, g, h, e, K[55], W1(55), W2(59)); + R2(a, b, c, d, e, f, g, h, K[56], W1(56), W2(60)); + R2(d, a, b, c, h, e, f, g, K[57], W1(57), W2(61)); + R2(c, d, a, b, g, h, e, f, K[58], W1(58), W2(62)); + R2(b, c, d, a, f, g, h, e, K[59], W1(59), W2(63)); + R2(a, b, c, d, e, f, g, h, K[60], W1(60), W2(64)); + R2(d, a, b, c, h, e, f, g, K[61], W1(61), W2(65)); + R2(c, d, a, b, g, h, e, f, K[62], W1(62), W2(66)); + R2(b, c, d, a, f, g, h, e, K[63], W1(63), W2(67)); + + sctx->state[0] ^= a; + sctx->state[1] ^= b; + sctx->state[2] ^= c; + sctx->state[3] ^= d; + sctx->state[4] ^= e; + sctx->state[5] ^= f; + sctx->state[6] ^= g; + sctx->state[7] ^= h; +} +#undef R +#undef R1 +#undef R2 +#undef I +#undef W1 +#undef W2 + +static inline void sm3_block(struct sm3_state *sctx, + u8 const *data, int blocks, u32 W[16]) +{ + while (blocks--) { + sm3_transform(sctx, data, W); + data += SM3_BLOCK_SIZE; + } +} + +void sm3_update(struct sm3_state *sctx, const u8 *data, unsigned int len) +{ + unsigned int partial = sctx->count % SM3_BLOCK_SIZE; + u32 W[16]; + + sctx->count += len; + + if ((partial + len) >= SM3_BLOCK_SIZE) { + int blocks; + + if (partial) { + int p = SM3_BLOCK_SIZE - partial; + + memcpy(sctx->buffer + partial, data, p); + data += p; + len -= p; + + sm3_block(sctx, sctx->buffer, 1, W); + } + + blocks = len / SM3_BLOCK_SIZE; + len %= SM3_BLOCK_SIZE; + + if (blocks) { + sm3_block(sctx, data, blocks, W); + data += blocks * SM3_BLOCK_SIZE; + } + + memzero_explicit(W, sizeof(W)); + + partial = 0; + } + if (len) + memcpy(sctx->buffer + partial, data, len); +} +EXPORT_SYMBOL_GPL(sm3_update); + +void sm3_final(struct sm3_state *sctx, u8 *out) +{ + const int bit_offset = SM3_BLOCK_SIZE - sizeof(u64); + __be64 *bits = (__be64 *)(sctx->buffer + bit_offset); + __be32 *digest = (__be32 *)out; + unsigned int partial = sctx->count % SM3_BLOCK_SIZE; + u32 W[16]; + int i; + + sctx->buffer[partial++] = 0x80; + if (partial > bit_offset) { + memset(sctx->buffer + partial, 0, SM3_BLOCK_SIZE - partial); + partial = 0; + + sm3_block(sctx, sctx->buffer, 1, W); + } + + memset(sctx->buffer + partial, 0, bit_offset - partial); + *bits = cpu_to_be64(sctx->count << 3); + sm3_block(sctx, sctx->buffer, 1, W); + + for (i = 0; i < 8; i++) + put_unaligned_be32(sctx->state[i], digest++); + + /* Zeroize sensitive information. */ + memzero_explicit(W, sizeof(W)); + memzero_explicit(sctx, sizeof(*sctx)); +} +EXPORT_SYMBOL_GPL(sm3_final); + +MODULE_DESCRIPTION("Generic SM3 library"); +MODULE_LICENSE("GPL v2"); From a33b3520c2920c983156458e45c7bff6e9b3908c Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Fri, 7 Jan 2022 20:06:56 +0800 Subject: [PATCH 0213/5344] crypto: arm64/sm3-ce - make dependent on sm3 library commit f3a03d319dbdbb206530ebfce977c334ee2f8765 upstream. SM3 generic library is stand-alone implementation, sm3-ce can depend on the SM3 library instead of sm3-generic. Signed-off-by: Tianjia Zhang Signed-off-by: Herbert Xu --- arch/arm64/crypto/Kconfig | 2 +- arch/arm64/crypto/sm3-ce-glue.c | 28 ++++++++++++++++++++-------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 9a4aa1a419bb2..7163328213189 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -45,7 +45,7 @@ config CRYPTO_SM3_ARM64_CE tristate "SM3 digest algorithm (ARMv8.2 Crypto Extensions)" depends on KERNEL_MODE_NEON select CRYPTO_HASH - select CRYPTO_SM3 + select CRYPTO_LIB_SM3 config CRYPTO_SM4_ARM64_CE tristate "SM4 symmetric cipher (ARMv8.2 Crypto Extensions)" diff --git a/arch/arm64/crypto/sm3-ce-glue.c b/arch/arm64/crypto/sm3-ce-glue.c index d71faca322f2a..ee98954ae8ca6 100644 --- a/arch/arm64/crypto/sm3-ce-glue.c +++ b/arch/arm64/crypto/sm3-ce-glue.c @@ -26,8 +26,10 @@ asmlinkage void sm3_ce_transform(struct sm3_state *sst, u8 const *src, static int sm3_ce_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - if (!crypto_simd_usable()) - return crypto_sm3_update(desc, data, len); + if (!crypto_simd_usable()) { + sm3_update(shash_desc_ctx(desc), data, len); + return 0; + } kernel_neon_begin(); sm3_base_do_update(desc, data, len, sm3_ce_transform); @@ -38,8 +40,10 @@ static int sm3_ce_update(struct shash_desc *desc, const u8 *data, static int sm3_ce_final(struct shash_desc *desc, u8 *out) { - if (!crypto_simd_usable()) - return crypto_sm3_finup(desc, NULL, 0, out); + if (!crypto_simd_usable()) { + sm3_final(shash_desc_ctx(desc), out); + return 0; + } kernel_neon_begin(); sm3_base_do_finalize(desc, sm3_ce_transform); @@ -51,14 +55,22 @@ static int sm3_ce_final(struct shash_desc *desc, u8 *out) static int sm3_ce_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - if (!crypto_simd_usable()) - return crypto_sm3_finup(desc, data, len, out); + if (!crypto_simd_usable()) { + struct sm3_state *sctx = shash_desc_ctx(desc); + + if (len) + sm3_update(sctx, data, len); + sm3_final(sctx, out); + return 0; + } kernel_neon_begin(); - sm3_base_do_update(desc, data, len, sm3_ce_transform); + if (len) + sm3_base_do_update(desc, data, len, sm3_ce_transform); + sm3_base_do_finalize(desc, sm3_ce_transform); kernel_neon_end(); - return sm3_ce_final(desc, out); + return sm3_base_finish(desc, out); } static struct shash_alg sm3_alg = { From 49e39f71c0ba5df2e23839f7da91c128c77276ab Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Fri, 7 Jan 2022 20:06:58 +0800 Subject: [PATCH 0214/5344] crypto: sm3 - make dependent on sm3 library commit b4784a45ea69577f21f89898c71127774a090a2a upstream. SM3 generic library is stand-alone implementation, it is necessary making the sm3-generic implementation to depends on SM3 library. The functions crypto_sm3_*() provided by sm3_generic is no longer exported. Signed-off-by: Tianjia Zhang Signed-off-by: Herbert Xu --- crypto/Kconfig | 1 + crypto/sm3_generic.c | 143 +++++-------------------------------------- include/crypto/sm3.h | 8 --- 3 files changed, 17 insertions(+), 135 deletions(-) diff --git a/crypto/Kconfig b/crypto/Kconfig index 30b0bbef5b0ad..daea3358559b4 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -967,6 +967,7 @@ config CRYPTO_LIB_SM3 config CRYPTO_SM3 tristate "SM3 digest algorithm" select CRYPTO_HASH + select CRYPTO_LIB_SM3 help SM3 secure hash function as defined by OSCCA GM/T 0004-2012 SM3). It is part of the Chinese Commercial Cryptography suite. diff --git a/crypto/sm3_generic.c b/crypto/sm3_generic.c index 3468975215cae..a215c1c37e730 100644 --- a/crypto/sm3_generic.c +++ b/crypto/sm3_generic.c @@ -5,6 +5,7 @@ * * Copyright (C) 2017 ARM Limited or its affiliates. * Written by Gilad Ben-Yossef + * Copyright (C) 2021 Tianjia Zhang */ #include @@ -26,153 +27,41 @@ const u8 sm3_zero_message_hash[SM3_DIGEST_SIZE] = { }; EXPORT_SYMBOL_GPL(sm3_zero_message_hash); -static inline u32 p0(u32 x) -{ - return x ^ rol32(x, 9) ^ rol32(x, 17); -} - -static inline u32 p1(u32 x) -{ - return x ^ rol32(x, 15) ^ rol32(x, 23); -} - -static inline u32 ff(unsigned int n, u32 a, u32 b, u32 c) -{ - return (n < 16) ? (a ^ b ^ c) : ((a & b) | (a & c) | (b & c)); -} - -static inline u32 gg(unsigned int n, u32 e, u32 f, u32 g) -{ - return (n < 16) ? (e ^ f ^ g) : ((e & f) | ((~e) & g)); -} - -static inline u32 t(unsigned int n) -{ - return (n < 16) ? SM3_T1 : SM3_T2; -} - -static void sm3_expand(u32 *t, u32 *w, u32 *wt) -{ - int i; - unsigned int tmp; - - /* load the input */ - for (i = 0; i <= 15; i++) - w[i] = get_unaligned_be32((__u32 *)t + i); - - for (i = 16; i <= 67; i++) { - tmp = w[i - 16] ^ w[i - 9] ^ rol32(w[i - 3], 15); - w[i] = p1(tmp) ^ (rol32(w[i - 13], 7)) ^ w[i - 6]; - } - - for (i = 0; i <= 63; i++) - wt[i] = w[i] ^ w[i + 4]; -} - -static void sm3_compress(u32 *w, u32 *wt, u32 *m) -{ - u32 ss1; - u32 ss2; - u32 tt1; - u32 tt2; - u32 a, b, c, d, e, f, g, h; - int i; - - a = m[0]; - b = m[1]; - c = m[2]; - d = m[3]; - e = m[4]; - f = m[5]; - g = m[6]; - h = m[7]; - - for (i = 0; i <= 63; i++) { - - ss1 = rol32((rol32(a, 12) + e + rol32(t(i), i & 31)), 7); - - ss2 = ss1 ^ rol32(a, 12); - - tt1 = ff(i, a, b, c) + d + ss2 + *wt; - wt++; - - tt2 = gg(i, e, f, g) + h + ss1 + *w; - w++; - - d = c; - c = rol32(b, 9); - b = a; - a = tt1; - h = g; - g = rol32(f, 19); - f = e; - e = p0(tt2); - } - - m[0] = a ^ m[0]; - m[1] = b ^ m[1]; - m[2] = c ^ m[2]; - m[3] = d ^ m[3]; - m[4] = e ^ m[4]; - m[5] = f ^ m[5]; - m[6] = g ^ m[6]; - m[7] = h ^ m[7]; - - a = b = c = d = e = f = g = h = ss1 = ss2 = tt1 = tt2 = 0; -} - -static void sm3_transform(struct sm3_state *sst, u8 const *src) -{ - unsigned int w[68]; - unsigned int wt[64]; - - sm3_expand((u32 *)src, w, wt); - sm3_compress(w, wt, sst->state); - - memzero_explicit(w, sizeof(w)); - memzero_explicit(wt, sizeof(wt)); -} - -static void sm3_generic_block_fn(struct sm3_state *sst, u8 const *src, - int blocks) -{ - while (blocks--) { - sm3_transform(sst, src); - src += SM3_BLOCK_SIZE; - } -} - -int crypto_sm3_update(struct shash_desc *desc, const u8 *data, +static int crypto_sm3_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - return sm3_base_do_update(desc, data, len, sm3_generic_block_fn); + sm3_update(shash_desc_ctx(desc), data, len); + return 0; } -EXPORT_SYMBOL(crypto_sm3_update); -static int sm3_final(struct shash_desc *desc, u8 *out) +static int crypto_sm3_final(struct shash_desc *desc, u8 *out) { - sm3_base_do_finalize(desc, sm3_generic_block_fn); - return sm3_base_finish(desc, out); + sm3_final(shash_desc_ctx(desc), out); + return 0; } -int crypto_sm3_finup(struct shash_desc *desc, const u8 *data, +static int crypto_sm3_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *hash) { - sm3_base_do_update(desc, data, len, sm3_generic_block_fn); - return sm3_final(desc, hash); + struct sm3_state *sctx = shash_desc_ctx(desc); + + if (len) + sm3_update(sctx, data, len); + sm3_final(sctx, hash); + return 0; } -EXPORT_SYMBOL(crypto_sm3_finup); static struct shash_alg sm3_alg = { .digestsize = SM3_DIGEST_SIZE, .init = sm3_base_init, .update = crypto_sm3_update, - .final = sm3_final, + .final = crypto_sm3_final, .finup = crypto_sm3_finup, .descsize = sizeof(struct sm3_state), .base = { .cra_name = "sm3", .cra_driver_name = "sm3-generic", + .cra_priority = 100, .cra_blocksize = SM3_BLOCK_SIZE, .cra_module = THIS_MODULE, } diff --git a/include/crypto/sm3.h b/include/crypto/sm3.h index 299417a1910c8..1f021ad0533ff 100644 --- a/include/crypto/sm3.h +++ b/include/crypto/sm3.h @@ -35,14 +35,6 @@ struct sm3_state { u8 buffer[SM3_BLOCK_SIZE]; }; -struct shash_desc; - -extern int crypto_sm3_update(struct shash_desc *desc, const u8 *data, - unsigned int len); - -extern int crypto_sm3_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *hash); - /* * Stand-alone implementation of the SM3 algorithm. It is designed to * have as little dependencies as possible so it can be used in the From 805cddd84c9c4fdf8c5088de67db6497a89a3715 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Fri, 7 Jan 2022 20:06:59 +0800 Subject: [PATCH 0215/5344] crypto: x86/sm3 - add AVX assembly implementation commit 930ab34d906d9c44727c9dcfeafcfcd33e3639e7 upstream. This patch adds AVX assembly accelerated implementation of SM3 secure hash algorithm. From the benchmark data, compared to pure software implementation sm3-generic, the performance increase is up to 38%. The main algorithm implementation based on SM3 AES/BMI2 accelerated work by libgcrypt at: https://gnupg.org/software/libgcrypt/index.html Benchmark on Intel i5-6200U 2.30GHz, performance data of two implementations, pure software sm3-generic and sm3-avx acceleration. The data comes from the 326 mode and 422 mode of tcrypt. The abscissas are different lengths of per update. The data is tabulated and the unit is Mb/s: update-size | 16 64 256 1024 2048 4096 8192 ------------+------------------------------------------------------- sm3-generic | 105.97 129.60 182.12 189.62 188.06 193.66 194.88 sm3-avx | 119.87 163.05 244.44 260.92 257.60 264.87 265.88 Signed-off-by: Tianjia Zhang Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 3 + arch/x86/crypto/sm3-avx-asm_64.S | 517 +++++++++++++++++++++++++++++++ arch/x86/crypto/sm3_avx_glue.c | 134 ++++++++ crypto/Kconfig | 13 + 4 files changed, 667 insertions(+) create mode 100644 arch/x86/crypto/sm3-avx-asm_64.S create mode 100644 arch/x86/crypto/sm3_avx_glue.c diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 97711d9b15bdc..e3412e06fd4bb 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -116,6 +116,9 @@ endif sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o +obj-$(CONFIG_CRYPTO_SM3_AVX_X86_64) += sm3-avx-x86_64.o +sm3-avx-x86_64-y := sm3-avx-asm_64.o sm3_avx_glue.o + obj-$(CONFIG_CRYPTO_SM4_AESNI_AVX_X86_64) += sm4-aesni-avx-x86_64.o sm4-aesni-avx-x86_64-y := sm4-aesni-avx-asm_64.o sm4_aesni_avx_glue.o diff --git a/arch/x86/crypto/sm3-avx-asm_64.S b/arch/x86/crypto/sm3-avx-asm_64.S new file mode 100644 index 0000000000000..71e6aae23e17c --- /dev/null +++ b/arch/x86/crypto/sm3-avx-asm_64.S @@ -0,0 +1,517 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SM3 AVX accelerated transform. + * specified in: https://datatracker.ietf.org/doc/html/draft-sca-cfrg-sm3-02 + * + * Copyright (C) 2021 Jussi Kivilinna + * Copyright (C) 2021 Tianjia Zhang + */ + +/* Based on SM3 AES/BMI2 accelerated work by libgcrypt at: + * https://gnupg.org/software/libgcrypt/index.html + */ + +#include +#include + +/* Context structure */ + +#define state_h0 0 +#define state_h1 4 +#define state_h2 8 +#define state_h3 12 +#define state_h4 16 +#define state_h5 20 +#define state_h6 24 +#define state_h7 28 + +/* Constants */ + +/* Round constant macros */ + +#define K0 2043430169 /* 0x79cc4519 */ +#define K1 -208106958 /* 0xf3988a32 */ +#define K2 -416213915 /* 0xe7311465 */ +#define K3 -832427829 /* 0xce6228cb */ +#define K4 -1664855657 /* 0x9cc45197 */ +#define K5 965255983 /* 0x3988a32f */ +#define K6 1930511966 /* 0x7311465e */ +#define K7 -433943364 /* 0xe6228cbc */ +#define K8 -867886727 /* 0xcc451979 */ +#define K9 -1735773453 /* 0x988a32f3 */ +#define K10 823420391 /* 0x311465e7 */ +#define K11 1646840782 /* 0x6228cbce */ +#define K12 -1001285732 /* 0xc451979c */ +#define K13 -2002571463 /* 0x88a32f39 */ +#define K14 289824371 /* 0x11465e73 */ +#define K15 579648742 /* 0x228cbce6 */ +#define K16 -1651869049 /* 0x9d8a7a87 */ +#define K17 991229199 /* 0x3b14f50f */ +#define K18 1982458398 /* 0x7629ea1e */ +#define K19 -330050500 /* 0xec53d43c */ +#define K20 -660100999 /* 0xd8a7a879 */ +#define K21 -1320201997 /* 0xb14f50f3 */ +#define K22 1654563303 /* 0x629ea1e7 */ +#define K23 -985840690 /* 0xc53d43ce */ +#define K24 -1971681379 /* 0x8a7a879d */ +#define K25 351604539 /* 0x14f50f3b */ +#define K26 703209078 /* 0x29ea1e76 */ +#define K27 1406418156 /* 0x53d43cec */ +#define K28 -1482130984 /* 0xa7a879d8 */ +#define K29 1330705329 /* 0x4f50f3b1 */ +#define K30 -1633556638 /* 0x9ea1e762 */ +#define K31 1027854021 /* 0x3d43cec5 */ +#define K32 2055708042 /* 0x7a879d8a */ +#define K33 -183551212 /* 0xf50f3b14 */ +#define K34 -367102423 /* 0xea1e7629 */ +#define K35 -734204845 /* 0xd43cec53 */ +#define K36 -1468409689 /* 0xa879d8a7 */ +#define K37 1358147919 /* 0x50f3b14f */ +#define K38 -1578671458 /* 0xa1e7629e */ +#define K39 1137624381 /* 0x43cec53d */ +#define K40 -2019718534 /* 0x879d8a7a */ +#define K41 255530229 /* 0x0f3b14f5 */ +#define K42 511060458 /* 0x1e7629ea */ +#define K43 1022120916 /* 0x3cec53d4 */ +#define K44 2044241832 /* 0x79d8a7a8 */ +#define K45 -206483632 /* 0xf3b14f50 */ +#define K46 -412967263 /* 0xe7629ea1 */ +#define K47 -825934525 /* 0xcec53d43 */ +#define K48 -1651869049 /* 0x9d8a7a87 */ +#define K49 991229199 /* 0x3b14f50f */ +#define K50 1982458398 /* 0x7629ea1e */ +#define K51 -330050500 /* 0xec53d43c */ +#define K52 -660100999 /* 0xd8a7a879 */ +#define K53 -1320201997 /* 0xb14f50f3 */ +#define K54 1654563303 /* 0x629ea1e7 */ +#define K55 -985840690 /* 0xc53d43ce */ +#define K56 -1971681379 /* 0x8a7a879d */ +#define K57 351604539 /* 0x14f50f3b */ +#define K58 703209078 /* 0x29ea1e76 */ +#define K59 1406418156 /* 0x53d43cec */ +#define K60 -1482130984 /* 0xa7a879d8 */ +#define K61 1330705329 /* 0x4f50f3b1 */ +#define K62 -1633556638 /* 0x9ea1e762 */ +#define K63 1027854021 /* 0x3d43cec5 */ + +/* Register macros */ + +#define RSTATE %rdi +#define RDATA %rsi +#define RNBLKS %rdx + +#define t0 %eax +#define t1 %ebx +#define t2 %ecx + +#define a %r8d +#define b %r9d +#define c %r10d +#define d %r11d +#define e %r12d +#define f %r13d +#define g %r14d +#define h %r15d + +#define W0 %xmm0 +#define W1 %xmm1 +#define W2 %xmm2 +#define W3 %xmm3 +#define W4 %xmm4 +#define W5 %xmm5 + +#define XTMP0 %xmm6 +#define XTMP1 %xmm7 +#define XTMP2 %xmm8 +#define XTMP3 %xmm9 +#define XTMP4 %xmm10 +#define XTMP5 %xmm11 +#define XTMP6 %xmm12 + +#define BSWAP_REG %xmm15 + +/* Stack structure */ + +#define STACK_W_SIZE (32 * 2 * 3) +#define STACK_REG_SAVE_SIZE (64) + +#define STACK_W (0) +#define STACK_REG_SAVE (STACK_W + STACK_W_SIZE) +#define STACK_SIZE (STACK_REG_SAVE + STACK_REG_SAVE_SIZE) + +/* Instruction helpers. */ + +#define roll2(v, reg) \ + roll $(v), reg; + +#define roll3mov(v, src, dst) \ + movl src, dst; \ + roll $(v), dst; + +#define roll3(v, src, dst) \ + rorxl $(32-(v)), src, dst; + +#define addl2(a, out) \ + leal (a, out), out; + +/* Round function macros. */ + +#define GG1(x, y, z, o, t) \ + movl x, o; \ + xorl y, o; \ + xorl z, o; + +#define FF1(x, y, z, o, t) GG1(x, y, z, o, t) + +#define GG2(x, y, z, o, t) \ + andnl z, x, o; \ + movl y, t; \ + andl x, t; \ + addl2(t, o); + +#define FF2(x, y, z, o, t) \ + movl y, o; \ + xorl x, o; \ + movl y, t; \ + andl x, t; \ + andl z, o; \ + xorl t, o; + +#define R(i, a, b, c, d, e, f, g, h, round, widx, wtype) \ + /* rol(a, 12) => t0 */ \ + roll3mov(12, a, t0); /* rorxl here would reduce perf by 6% on zen3 */ \ + /* rol (t0 + e + t), 7) => t1 */ \ + leal K##round(t0, e, 1), t1; \ + roll2(7, t1); \ + /* h + w1 => h */ \ + addl wtype##_W1_ADDR(round, widx), h; \ + /* h + t1 => h */ \ + addl2(t1, h); \ + /* t1 ^ t0 => t0 */ \ + xorl t1, t0; \ + /* w1w2 + d => d */ \ + addl wtype##_W1W2_ADDR(round, widx), d; \ + /* FF##i(a,b,c) => t1 */ \ + FF##i(a, b, c, t1, t2); \ + /* d + t1 => d */ \ + addl2(t1, d); \ + /* GG#i(e,f,g) => t2 */ \ + GG##i(e, f, g, t2, t1); \ + /* h + t2 => h */ \ + addl2(t2, h); \ + /* rol (f, 19) => f */ \ + roll2(19, f); \ + /* d + t0 => d */ \ + addl2(t0, d); \ + /* rol (b, 9) => b */ \ + roll2(9, b); \ + /* P0(h) => h */ \ + roll3(9, h, t2); \ + roll3(17, h, t1); \ + xorl t2, h; \ + xorl t1, h; + +#define R1(a, b, c, d, e, f, g, h, round, widx, wtype) \ + R(1, a, b, c, d, e, f, g, h, round, widx, wtype) + +#define R2(a, b, c, d, e, f, g, h, round, widx, wtype) \ + R(2, a, b, c, d, e, f, g, h, round, widx, wtype) + +/* Input expansion macros. */ + +/* Byte-swapped input address. */ +#define IW_W_ADDR(round, widx, offs) \ + (STACK_W + ((round) / 4) * 64 + (offs) + ((widx) * 4))(%rsp) + +/* Expanded input address. */ +#define XW_W_ADDR(round, widx, offs) \ + (STACK_W + ((((round) / 3) - 4) % 2) * 64 + (offs) + ((widx) * 4))(%rsp) + +/* Rounds 1-12, byte-swapped input block addresses. */ +#define IW_W1_ADDR(round, widx) IW_W_ADDR(round, widx, 0) +#define IW_W1W2_ADDR(round, widx) IW_W_ADDR(round, widx, 32) + +/* Rounds 1-12, expanded input block addresses. */ +#define XW_W1_ADDR(round, widx) XW_W_ADDR(round, widx, 0) +#define XW_W1W2_ADDR(round, widx) XW_W_ADDR(round, widx, 32) + +/* Input block loading. */ +#define LOAD_W_XMM_1() \ + vmovdqu 0*16(RDATA), XTMP0; /* XTMP0: w3, w2, w1, w0 */ \ + vmovdqu 1*16(RDATA), XTMP1; /* XTMP1: w7, w6, w5, w4 */ \ + vmovdqu 2*16(RDATA), XTMP2; /* XTMP2: w11, w10, w9, w8 */ \ + vmovdqu 3*16(RDATA), XTMP3; /* XTMP3: w15, w14, w13, w12 */ \ + vpshufb BSWAP_REG, XTMP0, XTMP0; \ + vpshufb BSWAP_REG, XTMP1, XTMP1; \ + vpshufb BSWAP_REG, XTMP2, XTMP2; \ + vpshufb BSWAP_REG, XTMP3, XTMP3; \ + vpxor XTMP0, XTMP1, XTMP4; \ + vpxor XTMP1, XTMP2, XTMP5; \ + vpxor XTMP2, XTMP3, XTMP6; \ + leaq 64(RDATA), RDATA; \ + vmovdqa XTMP0, IW_W1_ADDR(0, 0); \ + vmovdqa XTMP4, IW_W1W2_ADDR(0, 0); \ + vmovdqa XTMP1, IW_W1_ADDR(4, 0); \ + vmovdqa XTMP5, IW_W1W2_ADDR(4, 0); + +#define LOAD_W_XMM_2() \ + vmovdqa XTMP2, IW_W1_ADDR(8, 0); \ + vmovdqa XTMP6, IW_W1W2_ADDR(8, 0); + +#define LOAD_W_XMM_3() \ + vpshufd $0b00000000, XTMP0, W0; /* W0: xx, w0, xx, xx */ \ + vpshufd $0b11111001, XTMP0, W1; /* W1: xx, w3, w2, w1 */ \ + vmovdqa XTMP1, W2; /* W2: xx, w6, w5, w4 */ \ + vpalignr $12, XTMP1, XTMP2, W3; /* W3: xx, w9, w8, w7 */ \ + vpalignr $8, XTMP2, XTMP3, W4; /* W4: xx, w12, w11, w10 */ \ + vpshufd $0b11111001, XTMP3, W5; /* W5: xx, w15, w14, w13 */ + +/* Message scheduling. Note: 3 words per XMM register. */ +#define SCHED_W_0(round, w0, w1, w2, w3, w4, w5) \ + /* Load (w[i - 16]) => XTMP0 */ \ + vpshufd $0b10111111, w0, XTMP0; \ + vpalignr $12, XTMP0, w1, XTMP0; /* XTMP0: xx, w2, w1, w0 */ \ + /* Load (w[i - 13]) => XTMP1 */ \ + vpshufd $0b10111111, w1, XTMP1; \ + vpalignr $12, XTMP1, w2, XTMP1; \ + /* w[i - 9] == w3 */ \ + /* XMM3 ^ XTMP0 => XTMP0 */ \ + vpxor w3, XTMP0, XTMP0; + +#define SCHED_W_1(round, w0, w1, w2, w3, w4, w5) \ + /* w[i - 3] == w5 */ \ + /* rol(XMM5, 15) ^ XTMP0 => XTMP0 */ \ + vpslld $15, w5, XTMP2; \ + vpsrld $(32-15), w5, XTMP3; \ + vpxor XTMP2, XTMP3, XTMP3; \ + vpxor XTMP3, XTMP0, XTMP0; \ + /* rol(XTMP1, 7) => XTMP1 */ \ + vpslld $7, XTMP1, XTMP5; \ + vpsrld $(32-7), XTMP1, XTMP1; \ + vpxor XTMP5, XTMP1, XTMP1; \ + /* XMM4 ^ XTMP1 => XTMP1 */ \ + vpxor w4, XTMP1, XTMP1; \ + /* w[i - 6] == XMM4 */ \ + /* P1(XTMP0) ^ XTMP1 => XMM0 */ \ + vpslld $15, XTMP0, XTMP5; \ + vpsrld $(32-15), XTMP0, XTMP6; \ + vpslld $23, XTMP0, XTMP2; \ + vpsrld $(32-23), XTMP0, XTMP3; \ + vpxor XTMP0, XTMP1, XTMP1; \ + vpxor XTMP6, XTMP5, XTMP5; \ + vpxor XTMP3, XTMP2, XTMP2; \ + vpxor XTMP2, XTMP5, XTMP5; \ + vpxor XTMP5, XTMP1, w0; + +#define SCHED_W_2(round, w0, w1, w2, w3, w4, w5) \ + /* W1 in XMM12 */ \ + vpshufd $0b10111111, w4, XTMP4; \ + vpalignr $12, XTMP4, w5, XTMP4; \ + vmovdqa XTMP4, XW_W1_ADDR((round), 0); \ + /* W1 ^ W2 => XTMP1 */ \ + vpxor w0, XTMP4, XTMP1; \ + vmovdqa XTMP1, XW_W1W2_ADDR((round), 0); + + +.section .rodata.cst16, "aM", @progbits, 16 +.align 16 + +.Lbe32mask: + .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f + +.text + +/* + * Transform nblocks*64 bytes (nblocks*16 32-bit words) at DATA. + * + * void sm3_transform_avx(struct sm3_state *state, + * const u8 *data, int nblocks); + */ +.align 16 +SYM_FUNC_START(sm3_transform_avx) + /* input: + * %rdi: ctx, CTX + * %rsi: data (64*nblks bytes) + * %rdx: nblocks + */ + vzeroupper; + + pushq %rbp; + movq %rsp, %rbp; + + movq %rdx, RNBLKS; + + subq $STACK_SIZE, %rsp; + andq $(~63), %rsp; + + movq %rbx, (STACK_REG_SAVE + 0 * 8)(%rsp); + movq %r15, (STACK_REG_SAVE + 1 * 8)(%rsp); + movq %r14, (STACK_REG_SAVE + 2 * 8)(%rsp); + movq %r13, (STACK_REG_SAVE + 3 * 8)(%rsp); + movq %r12, (STACK_REG_SAVE + 4 * 8)(%rsp); + + vmovdqa .Lbe32mask (%rip), BSWAP_REG; + + /* Get the values of the chaining variables. */ + movl state_h0(RSTATE), a; + movl state_h1(RSTATE), b; + movl state_h2(RSTATE), c; + movl state_h3(RSTATE), d; + movl state_h4(RSTATE), e; + movl state_h5(RSTATE), f; + movl state_h6(RSTATE), g; + movl state_h7(RSTATE), h; + +.align 16 +.Loop: + /* Load data part1. */ + LOAD_W_XMM_1(); + + leaq -1(RNBLKS), RNBLKS; + + /* Transform 0-3 + Load data part2. */ + R1(a, b, c, d, e, f, g, h, 0, 0, IW); LOAD_W_XMM_2(); + R1(d, a, b, c, h, e, f, g, 1, 1, IW); + R1(c, d, a, b, g, h, e, f, 2, 2, IW); + R1(b, c, d, a, f, g, h, e, 3, 3, IW); LOAD_W_XMM_3(); + + /* Transform 4-7 + Precalc 12-14. */ + R1(a, b, c, d, e, f, g, h, 4, 0, IW); + R1(d, a, b, c, h, e, f, g, 5, 1, IW); + R1(c, d, a, b, g, h, e, f, 6, 2, IW); SCHED_W_0(12, W0, W1, W2, W3, W4, W5); + R1(b, c, d, a, f, g, h, e, 7, 3, IW); SCHED_W_1(12, W0, W1, W2, W3, W4, W5); + + /* Transform 8-11 + Precalc 12-17. */ + R1(a, b, c, d, e, f, g, h, 8, 0, IW); SCHED_W_2(12, W0, W1, W2, W3, W4, W5); + R1(d, a, b, c, h, e, f, g, 9, 1, IW); SCHED_W_0(15, W1, W2, W3, W4, W5, W0); + R1(c, d, a, b, g, h, e, f, 10, 2, IW); SCHED_W_1(15, W1, W2, W3, W4, W5, W0); + R1(b, c, d, a, f, g, h, e, 11, 3, IW); SCHED_W_2(15, W1, W2, W3, W4, W5, W0); + + /* Transform 12-14 + Precalc 18-20 */ + R1(a, b, c, d, e, f, g, h, 12, 0, XW); SCHED_W_0(18, W2, W3, W4, W5, W0, W1); + R1(d, a, b, c, h, e, f, g, 13, 1, XW); SCHED_W_1(18, W2, W3, W4, W5, W0, W1); + R1(c, d, a, b, g, h, e, f, 14, 2, XW); SCHED_W_2(18, W2, W3, W4, W5, W0, W1); + + /* Transform 15-17 + Precalc 21-23 */ + R1(b, c, d, a, f, g, h, e, 15, 0, XW); SCHED_W_0(21, W3, W4, W5, W0, W1, W2); + R2(a, b, c, d, e, f, g, h, 16, 1, XW); SCHED_W_1(21, W3, W4, W5, W0, W1, W2); + R2(d, a, b, c, h, e, f, g, 17, 2, XW); SCHED_W_2(21, W3, W4, W5, W0, W1, W2); + + /* Transform 18-20 + Precalc 24-26 */ + R2(c, d, a, b, g, h, e, f, 18, 0, XW); SCHED_W_0(24, W4, W5, W0, W1, W2, W3); + R2(b, c, d, a, f, g, h, e, 19, 1, XW); SCHED_W_1(24, W4, W5, W0, W1, W2, W3); + R2(a, b, c, d, e, f, g, h, 20, 2, XW); SCHED_W_2(24, W4, W5, W0, W1, W2, W3); + + /* Transform 21-23 + Precalc 27-29 */ + R2(d, a, b, c, h, e, f, g, 21, 0, XW); SCHED_W_0(27, W5, W0, W1, W2, W3, W4); + R2(c, d, a, b, g, h, e, f, 22, 1, XW); SCHED_W_1(27, W5, W0, W1, W2, W3, W4); + R2(b, c, d, a, f, g, h, e, 23, 2, XW); SCHED_W_2(27, W5, W0, W1, W2, W3, W4); + + /* Transform 24-26 + Precalc 30-32 */ + R2(a, b, c, d, e, f, g, h, 24, 0, XW); SCHED_W_0(30, W0, W1, W2, W3, W4, W5); + R2(d, a, b, c, h, e, f, g, 25, 1, XW); SCHED_W_1(30, W0, W1, W2, W3, W4, W5); + R2(c, d, a, b, g, h, e, f, 26, 2, XW); SCHED_W_2(30, W0, W1, W2, W3, W4, W5); + + /* Transform 27-29 + Precalc 33-35 */ + R2(b, c, d, a, f, g, h, e, 27, 0, XW); SCHED_W_0(33, W1, W2, W3, W4, W5, W0); + R2(a, b, c, d, e, f, g, h, 28, 1, XW); SCHED_W_1(33, W1, W2, W3, W4, W5, W0); + R2(d, a, b, c, h, e, f, g, 29, 2, XW); SCHED_W_2(33, W1, W2, W3, W4, W5, W0); + + /* Transform 30-32 + Precalc 36-38 */ + R2(c, d, a, b, g, h, e, f, 30, 0, XW); SCHED_W_0(36, W2, W3, W4, W5, W0, W1); + R2(b, c, d, a, f, g, h, e, 31, 1, XW); SCHED_W_1(36, W2, W3, W4, W5, W0, W1); + R2(a, b, c, d, e, f, g, h, 32, 2, XW); SCHED_W_2(36, W2, W3, W4, W5, W0, W1); + + /* Transform 33-35 + Precalc 39-41 */ + R2(d, a, b, c, h, e, f, g, 33, 0, XW); SCHED_W_0(39, W3, W4, W5, W0, W1, W2); + R2(c, d, a, b, g, h, e, f, 34, 1, XW); SCHED_W_1(39, W3, W4, W5, W0, W1, W2); + R2(b, c, d, a, f, g, h, e, 35, 2, XW); SCHED_W_2(39, W3, W4, W5, W0, W1, W2); + + /* Transform 36-38 + Precalc 42-44 */ + R2(a, b, c, d, e, f, g, h, 36, 0, XW); SCHED_W_0(42, W4, W5, W0, W1, W2, W3); + R2(d, a, b, c, h, e, f, g, 37, 1, XW); SCHED_W_1(42, W4, W5, W0, W1, W2, W3); + R2(c, d, a, b, g, h, e, f, 38, 2, XW); SCHED_W_2(42, W4, W5, W0, W1, W2, W3); + + /* Transform 39-41 + Precalc 45-47 */ + R2(b, c, d, a, f, g, h, e, 39, 0, XW); SCHED_W_0(45, W5, W0, W1, W2, W3, W4); + R2(a, b, c, d, e, f, g, h, 40, 1, XW); SCHED_W_1(45, W5, W0, W1, W2, W3, W4); + R2(d, a, b, c, h, e, f, g, 41, 2, XW); SCHED_W_2(45, W5, W0, W1, W2, W3, W4); + + /* Transform 42-44 + Precalc 48-50 */ + R2(c, d, a, b, g, h, e, f, 42, 0, XW); SCHED_W_0(48, W0, W1, W2, W3, W4, W5); + R2(b, c, d, a, f, g, h, e, 43, 1, XW); SCHED_W_1(48, W0, W1, W2, W3, W4, W5); + R2(a, b, c, d, e, f, g, h, 44, 2, XW); SCHED_W_2(48, W0, W1, W2, W3, W4, W5); + + /* Transform 45-47 + Precalc 51-53 */ + R2(d, a, b, c, h, e, f, g, 45, 0, XW); SCHED_W_0(51, W1, W2, W3, W4, W5, W0); + R2(c, d, a, b, g, h, e, f, 46, 1, XW); SCHED_W_1(51, W1, W2, W3, W4, W5, W0); + R2(b, c, d, a, f, g, h, e, 47, 2, XW); SCHED_W_2(51, W1, W2, W3, W4, W5, W0); + + /* Transform 48-50 + Precalc 54-56 */ + R2(a, b, c, d, e, f, g, h, 48, 0, XW); SCHED_W_0(54, W2, W3, W4, W5, W0, W1); + R2(d, a, b, c, h, e, f, g, 49, 1, XW); SCHED_W_1(54, W2, W3, W4, W5, W0, W1); + R2(c, d, a, b, g, h, e, f, 50, 2, XW); SCHED_W_2(54, W2, W3, W4, W5, W0, W1); + + /* Transform 51-53 + Precalc 57-59 */ + R2(b, c, d, a, f, g, h, e, 51, 0, XW); SCHED_W_0(57, W3, W4, W5, W0, W1, W2); + R2(a, b, c, d, e, f, g, h, 52, 1, XW); SCHED_W_1(57, W3, W4, W5, W0, W1, W2); + R2(d, a, b, c, h, e, f, g, 53, 2, XW); SCHED_W_2(57, W3, W4, W5, W0, W1, W2); + + /* Transform 54-56 + Precalc 60-62 */ + R2(c, d, a, b, g, h, e, f, 54, 0, XW); SCHED_W_0(60, W4, W5, W0, W1, W2, W3); + R2(b, c, d, a, f, g, h, e, 55, 1, XW); SCHED_W_1(60, W4, W5, W0, W1, W2, W3); + R2(a, b, c, d, e, f, g, h, 56, 2, XW); SCHED_W_2(60, W4, W5, W0, W1, W2, W3); + + /* Transform 57-59 + Precalc 63 */ + R2(d, a, b, c, h, e, f, g, 57, 0, XW); SCHED_W_0(63, W5, W0, W1, W2, W3, W4); + R2(c, d, a, b, g, h, e, f, 58, 1, XW); + R2(b, c, d, a, f, g, h, e, 59, 2, XW); SCHED_W_1(63, W5, W0, W1, W2, W3, W4); + + /* Transform 60-62 + Precalc 63 */ + R2(a, b, c, d, e, f, g, h, 60, 0, XW); + R2(d, a, b, c, h, e, f, g, 61, 1, XW); SCHED_W_2(63, W5, W0, W1, W2, W3, W4); + R2(c, d, a, b, g, h, e, f, 62, 2, XW); + + /* Transform 63 */ + R2(b, c, d, a, f, g, h, e, 63, 0, XW); + + /* Update the chaining variables. */ + xorl state_h0(RSTATE), a; + xorl state_h1(RSTATE), b; + xorl state_h2(RSTATE), c; + xorl state_h3(RSTATE), d; + movl a, state_h0(RSTATE); + movl b, state_h1(RSTATE); + movl c, state_h2(RSTATE); + movl d, state_h3(RSTATE); + xorl state_h4(RSTATE), e; + xorl state_h5(RSTATE), f; + xorl state_h6(RSTATE), g; + xorl state_h7(RSTATE), h; + movl e, state_h4(RSTATE); + movl f, state_h5(RSTATE); + movl g, state_h6(RSTATE); + movl h, state_h7(RSTATE); + + cmpq $0, RNBLKS; + jne .Loop; + + vzeroall; + + movq (STACK_REG_SAVE + 0 * 8)(%rsp), %rbx; + movq (STACK_REG_SAVE + 1 * 8)(%rsp), %r15; + movq (STACK_REG_SAVE + 2 * 8)(%rsp), %r14; + movq (STACK_REG_SAVE + 3 * 8)(%rsp), %r13; + movq (STACK_REG_SAVE + 4 * 8)(%rsp), %r12; + + vmovdqa %xmm0, IW_W1_ADDR(0, 0); + vmovdqa %xmm0, IW_W1W2_ADDR(0, 0); + vmovdqa %xmm0, IW_W1_ADDR(4, 0); + vmovdqa %xmm0, IW_W1W2_ADDR(4, 0); + vmovdqa %xmm0, IW_W1_ADDR(8, 0); + vmovdqa %xmm0, IW_W1W2_ADDR(8, 0); + + movq %rbp, %rsp; + popq %rbp; + ret; +SYM_FUNC_END(sm3_transform_avx) diff --git a/arch/x86/crypto/sm3_avx_glue.c b/arch/x86/crypto/sm3_avx_glue.c new file mode 100644 index 0000000000000..661b6f22ffcd8 --- /dev/null +++ b/arch/x86/crypto/sm3_avx_glue.c @@ -0,0 +1,134 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SM3 Secure Hash Algorithm, AVX assembler accelerated. + * specified in: https://datatracker.ietf.org/doc/html/draft-sca-cfrg-sm3-02 + * + * Copyright (C) 2021 Tianjia Zhang + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +asmlinkage void sm3_transform_avx(struct sm3_state *state, + const u8 *data, int nblocks); + +static int sm3_avx_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + struct sm3_state *sctx = shash_desc_ctx(desc); + + if (!crypto_simd_usable() || + (sctx->count % SM3_BLOCK_SIZE) + len < SM3_BLOCK_SIZE) { + sm3_update(sctx, data, len); + return 0; + } + + /* + * Make sure struct sm3_state begins directly with the SM3 + * 256-bit internal state, as this is what the asm functions expect. + */ + BUILD_BUG_ON(offsetof(struct sm3_state, state) != 0); + + kernel_fpu_begin(); + sm3_base_do_update(desc, data, len, sm3_transform_avx); + kernel_fpu_end(); + + return 0; +} + +static int sm3_avx_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + if (!crypto_simd_usable()) { + struct sm3_state *sctx = shash_desc_ctx(desc); + + if (len) + sm3_update(sctx, data, len); + + sm3_final(sctx, out); + return 0; + } + + kernel_fpu_begin(); + if (len) + sm3_base_do_update(desc, data, len, sm3_transform_avx); + sm3_base_do_finalize(desc, sm3_transform_avx); + kernel_fpu_end(); + + return sm3_base_finish(desc, out); +} + +static int sm3_avx_final(struct shash_desc *desc, u8 *out) +{ + if (!crypto_simd_usable()) { + sm3_final(shash_desc_ctx(desc), out); + return 0; + } + + kernel_fpu_begin(); + sm3_base_do_finalize(desc, sm3_transform_avx); + kernel_fpu_end(); + + return sm3_base_finish(desc, out); +} + +static struct shash_alg sm3_avx_alg = { + .digestsize = SM3_DIGEST_SIZE, + .init = sm3_base_init, + .update = sm3_avx_update, + .final = sm3_avx_final, + .finup = sm3_avx_finup, + .descsize = sizeof(struct sm3_state), + .base = { + .cra_name = "sm3", + .cra_driver_name = "sm3-avx", + .cra_priority = 300, + .cra_blocksize = SM3_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static int __init sm3_avx_mod_init(void) +{ + const char *feature_name; + + if (!boot_cpu_has(X86_FEATURE_AVX)) { + pr_info("AVX instruction are not detected.\n"); + return -ENODEV; + } + + if (!boot_cpu_has(X86_FEATURE_BMI2)) { + pr_info("BMI2 instruction are not detected.\n"); + return -ENODEV; + } + + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); + return -ENODEV; + } + + return crypto_register_shash(&sm3_avx_alg); +} + +static void __exit sm3_avx_mod_exit(void) +{ + crypto_unregister_shash(&sm3_avx_alg); +} + +module_init(sm3_avx_mod_init); +module_exit(sm3_avx_mod_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Tianjia Zhang "); +MODULE_DESCRIPTION("SM3 Secure Hash Algorithm, AVX assembler accelerated"); +MODULE_ALIAS_CRYPTO("sm3"); +MODULE_ALIAS_CRYPTO("sm3-avx"); diff --git a/crypto/Kconfig b/crypto/Kconfig index daea3358559b4..fda4faf5fb6d0 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -976,6 +976,19 @@ config CRYPTO_SM3 http://www.oscca.gov.cn/UpFile/20101222141857786.pdf https://datatracker.ietf.org/doc/html/draft-shen-sm3-hash +config CRYPTO_SM3_AVX_X86_64 + tristate "SM3 digest algorithm (x86_64/AVX)" + depends on X86 && 64BIT + select CRYPTO_HASH + select CRYPTO_LIB_SM3 + help + SM3 secure hash function as defined by OSCCA GM/T 0004-2012 SM3). + It is part of the Chinese Commercial Cryptography suite. This is + SM3 optimized implementation using Advanced Vector Extensions (AVX) + when available. + + If unsure, say N. + config CRYPTO_STREEBOG tristate "Streebog Hash Function" select CRYPTO_HASH From 146298487c27c917513ed165184aba27560268d4 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Fri, 7 Jan 2022 20:07:00 +0800 Subject: [PATCH 0216/5344] crypto: tcrypt - add asynchronous speed test for SM3 commit ba2c149d0812cee653a186c9cfe451699b211c91 upstream. tcrypt supports testing of SM3 hash algorithms that use AVX instruction acceleration. In order to add the sm3 asynchronous test to the appropriate position, shift the testcase sequence number of the multi buffer backward and start from 450. Signed-off-by: Tianjia Zhang Signed-off-by: Herbert Xu --- crypto/tcrypt.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index 8ff8e2f39df12..845982ea8cdd5 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -2562,31 +2562,35 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) if (mode > 400 && mode < 500) break; /* fall through */ case 422: + test_ahash_speed("sm3", sec, generic_hash_speed_template); + if (mode > 400 && mode < 500) break; + /* fall through */ + case 450: test_mb_ahash_speed("sha1", sec, generic_hash_speed_template, num_mb); if (mode > 400 && mode < 500) break; /* fall through */ - case 423: + case 451: test_mb_ahash_speed("sha256", sec, generic_hash_speed_template, num_mb); if (mode > 400 && mode < 500) break; /* fall through */ - case 424: + case 452: test_mb_ahash_speed("sha512", sec, generic_hash_speed_template, num_mb); if (mode > 400 && mode < 500) break; /* fall through */ - case 425: + case 453: test_mb_ahash_speed("sm3", sec, generic_hash_speed_template, num_mb); if (mode > 400 && mode < 500) break; /* fall through */ - case 426: + case 454: test_mb_ahash_speed("streebog256", sec, generic_hash_speed_template, num_mb); if (mode > 400 && mode < 500) break; /* fall through */ - case 427: + case 455: test_mb_ahash_speed("streebog512", sec, generic_hash_speed_template, num_mb); if (mode > 400 && mode < 500) break; From 5199a6a794da3ffacbfa0c6dbdf072905eeec2f2 Mon Sep 17 00:00:00 2001 From: Guixiong Wei Date: Mon, 23 May 2022 11:47:56 +0800 Subject: [PATCH 0217/5344] bytedance: config: enable SM3/SM4 config Enable SM3/SM4 algo module required by IPSEC. Signed-off-by: Guixiong Wei --- config.x86_64 | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/config.x86_64 b/config.x86_64 index 7517987f3226e..25e7f821d0c83 100644 --- a/config.x86_64 +++ b/config.x86_64 @@ -8672,7 +8672,9 @@ CONFIG_CRYPTO_LIB_SHA256=y CONFIG_CRYPTO_SHA256=y CONFIG_CRYPTO_SHA512=m CONFIG_CRYPTO_SHA3=m -# CONFIG_CRYPTO_SM3 is not set +CONFIG_CRYPTO_LIB_SM3=m +CONFIG_CRYPTO_SM3=m +CONFIG_CRYPTO_SM3_AVX_X86_64=m # CONFIG_CRYPTO_STREEBOG is not set CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m @@ -8713,7 +8715,10 @@ CONFIG_CRYPTO_SERPENT=m CONFIG_CRYPTO_SERPENT_SSE2_X86_64=m CONFIG_CRYPTO_SERPENT_AVX_X86_64=m CONFIG_CRYPTO_SERPENT_AVX2_X86_64=m -# CONFIG_CRYPTO_SM4 is not set +CONFIG_CRYPTO_LIB_SM4=m +CONFIG_CRYPTO_SM4=m +CONFIG_CRYPTO_SM4_AESNI_AVX_X86_64=m +CONFIG_CRYPTO_SM4_AESNI_AVX2_X86_64=m CONFIG_CRYPTO_TEA=m CONFIG_CRYPTO_TWOFISH=m CONFIG_CRYPTO_TWOFISH_COMMON=m From ca3949f381ae6819d5df178fabdafccc490798ba Mon Sep 17 00:00:00 2001 From: Guixiong Wei Date: Mon, 23 May 2022 22:17:28 +0800 Subject: [PATCH 0218/5344] bytedance: config: enable xfrm config Enable xfrm module required by IPSEC. Signed-off-by: Guixiong Wei --- config.x86_64 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/config.x86_64 b/config.x86_64 index 25e7f821d0c83..b31fc2bebd54f 100644 --- a/config.x86_64 +++ b/config.x86_64 @@ -1033,10 +1033,10 @@ CONFIG_XFRM=y CONFIG_XFRM_OFFLOAD=y CONFIG_XFRM_ALGO=m CONFIG_XFRM_USER=m -# CONFIG_XFRM_INTERFACE is not set +CONFIG_XFRM_INTERFACE=m CONFIG_XFRM_SUB_POLICY=y CONFIG_XFRM_MIGRATE=y -# CONFIG_XFRM_STATISTICS is not set +CONFIG_XFRM_STATISTICS=y CONFIG_XFRM_IPCOMP=m CONFIG_NET_KEY=m CONFIG_NET_KEY_MIGRATE=y @@ -1212,7 +1212,7 @@ CONFIG_NFT_COMPAT=m CONFIG_NFT_HASH=m CONFIG_NFT_FIB=m CONFIG_NFT_FIB_INET=m -# CONFIG_NFT_XFRM is not set +CONFIG_NFT_XFRM=m CONFIG_NFT_SOCKET=m CONFIG_NFT_OSF=m CONFIG_NFT_TPROXY=m From d399676d5b13e69bf6025ac51cdbe5db800b09aa Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 20 Apr 2022 18:24:45 +0800 Subject: [PATCH 0219/5344] bytedance: arm64: cpuinfo: Add physical id Add physical id for arm64, just like x86 does. This is needed by TCE. Actually, the user should use a more generic approach to detect physical id, however, TCE cannot do a upgrade quickly. So this is workaround in kernel. We could remove this once TCE is ready. Signed-off-by: Muchun Song --- arch/arm64/kernel/cpuinfo.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 05933c065732b..f28d69da97802 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -139,6 +139,9 @@ static int c_show(struct seq_file *m, void *v) * "processor". Give glibc what it expects. */ seq_printf(m, "processor\t: %d\n", i); +#ifdef CONFIG_NUMA + seq_printf(m, "physical id\t: %d\n", topology_physical_package_id(i)); +#endif if (compat) seq_printf(m, "model name\t: ARMv8 Processor rev %d (%s)\n", MIDR_REVISION(midr), COMPAT_ELF_PLATFORM); From cafaf2f0b9c1473418656147ca65e01a4514c7aa Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 17 Mar 2020 18:22:54 +0000 Subject: [PATCH 0220/5344] arm64: perf: Clean up enable/disable calls commit 29227d6ea1572b160e5bea45b3c93a0346444dfa upstream. Reading this code bordered on painful, what with all the repetition and pointless return values. More fundamentally, dribbling the hardware enables and disables in one bit at a time incurs needless system register overhead for chained events and on reset. We already use bitmask values for the KVM hooks, so consolidate all the register accesses to match, and make a reasonable saving in both source and object code. Signed-off-by: Robin Murphy Signed-off-by: Will Deacon Signed-off-by: Feng Zhou --- arch/arm64/kernel/perf_event.c | 87 ++++++++++++++-------------------- 1 file changed, 35 insertions(+), 52 deletions(-) diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 19128d994ee97..d2de291ed2f0d 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -512,86 +512,74 @@ static inline void armv8pmu_write_event_type(struct perf_event *event) } } -static inline int armv8pmu_enable_counter(int idx) +static u32 armv8pmu_event_cnten_mask(struct perf_event *event) { - u32 counter = ARMV8_IDX_TO_COUNTER(idx); - write_sysreg(BIT(counter), pmcntenset_el0); - return idx; + int counter = ARMV8_IDX_TO_COUNTER(event->hw.idx); + u32 mask = BIT(counter); + + if (armv8pmu_event_is_chained(event)) + mask |= BIT(counter - 1); + return mask; +} + +static inline void armv8pmu_enable_counter(u32 mask) +{ + write_sysreg(mask, pmcntenset_el0); } static inline void armv8pmu_enable_event_counter(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; - int idx = event->hw.idx; - u32 counter_bits = BIT(ARMV8_IDX_TO_COUNTER(idx)); - - if (armv8pmu_event_is_chained(event)) - counter_bits |= BIT(ARMV8_IDX_TO_COUNTER(idx - 1)); + u32 mask = armv8pmu_event_cnten_mask(event); - kvm_set_pmu_events(counter_bits, attr); + kvm_set_pmu_events(mask, attr); /* We rely on the hypervisor switch code to enable guest counters */ - if (!kvm_pmu_counter_deferred(attr)) { - armv8pmu_enable_counter(idx); - if (armv8pmu_event_is_chained(event)) - armv8pmu_enable_counter(idx - 1); - } + if (!kvm_pmu_counter_deferred(attr)) + armv8pmu_enable_counter(mask); } -static inline int armv8pmu_disable_counter(int idx) +static inline void armv8pmu_disable_counter(u32 mask) { - u32 counter = ARMV8_IDX_TO_COUNTER(idx); - write_sysreg(BIT(counter), pmcntenclr_el0); - return idx; + write_sysreg(mask, pmcntenclr_el0); } static inline void armv8pmu_disable_event_counter(struct perf_event *event) { - struct hw_perf_event *hwc = &event->hw; struct perf_event_attr *attr = &event->attr; - int idx = hwc->idx; - u32 counter_bits = BIT(ARMV8_IDX_TO_COUNTER(idx)); + u32 mask = armv8pmu_event_cnten_mask(event); - if (armv8pmu_event_is_chained(event)) - counter_bits |= BIT(ARMV8_IDX_TO_COUNTER(idx - 1)); - - kvm_clr_pmu_events(counter_bits); + kvm_clr_pmu_events(mask); /* We rely on the hypervisor switch code to disable guest counters */ - if (!kvm_pmu_counter_deferred(attr)) { - if (armv8pmu_event_is_chained(event)) - armv8pmu_disable_counter(idx - 1); - armv8pmu_disable_counter(idx); - } + if (!kvm_pmu_counter_deferred(attr)) + armv8pmu_disable_counter(mask); } -static inline int armv8pmu_enable_intens(int idx) +static inline void armv8pmu_enable_intens(u32 mask) { - u32 counter = ARMV8_IDX_TO_COUNTER(idx); - write_sysreg(BIT(counter), pmintenset_el1); - return idx; + write_sysreg(mask, pmintenset_el1); } -static inline int armv8pmu_enable_event_irq(struct perf_event *event) +static inline void armv8pmu_enable_event_irq(struct perf_event *event) { - return armv8pmu_enable_intens(event->hw.idx); + u32 counter = ARMV8_IDX_TO_COUNTER(event->hw.idx); + armv8pmu_enable_intens(BIT(counter)); } -static inline int armv8pmu_disable_intens(int idx) +static inline void armv8pmu_disable_intens(u32 mask) { - u32 counter = ARMV8_IDX_TO_COUNTER(idx); - write_sysreg(BIT(counter), pmintenclr_el1); + write_sysreg(mask, pmintenclr_el1); isb(); /* Clear the overflow flag in case an interrupt is pending. */ - write_sysreg(BIT(counter), pmovsclr_el0); + write_sysreg(mask, pmovsclr_el0); isb(); - - return idx; } -static inline int armv8pmu_disable_event_irq(struct perf_event *event) +static inline void armv8pmu_disable_event_irq(struct perf_event *event) { - return armv8pmu_disable_intens(event->hw.idx); + u32 counter = ARMV8_IDX_TO_COUNTER(event->hw.idx); + armv8pmu_disable_intens(BIT(counter)); } static inline u32 armv8pmu_getreset_flags(void) @@ -876,14 +864,9 @@ static int armv8pmu_filter_match(struct perf_event *event) static void armv8pmu_reset(void *info) { - struct arm_pmu *cpu_pmu = (struct arm_pmu *)info; - u32 idx, nb_cnt = cpu_pmu->num_events; - /* The counter and interrupt enable registers are unknown at reset. */ - for (idx = ARMV8_IDX_CYCLE_COUNTER; idx < nb_cnt; ++idx) { - armv8pmu_disable_counter(idx); - armv8pmu_disable_intens(idx); - } + armv8pmu_disable_counter(U32_MAX); + armv8pmu_disable_intens(U32_MAX); /* Clear the counters we flip at guest entry/exit */ kvm_clr_pmu_events(U32_MAX); From a94b53c4da20093fe16d13a1963956700291e579 Mon Sep 17 00:00:00 2001 From: Alexandru Elisei Date: Thu, 24 Sep 2020 12:07:00 +0100 Subject: [PATCH 0221/5344] arm64: perf: Add missing ISB in armv8pmu_enable_counter() commit 490d7b7c0845eacf5593db333fd2ae7715416e16 upstream. Writes to the PMXEVTYPER_EL0 register are not self-synchronising. In armv8pmu_enable_event(), the PE can reorder configuring the event type after we have enabled the counter and the interrupt. This can lead to an interrupt being asserted because of the previous event type that we were counting using the same counter, not the one that we've just configured. The same rationale applies to writes to the PMINTENSET_EL1 register. The PE can reorder enabling the interrupt at any point in the future after we have enabled the event. Prevent both situations from happening by adding an ISB just before we enable the event counter. Fixes: 030896885ade ("arm64: Performance counters support") Reported-by: Julien Thierry Signed-off-by: Alexandru Elisei Tested-by: Sumit Garg (Developerbox) Cc: Julien Thierry Cc: Will Deacon Cc: Mark Rutland Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Catalin Marinas Link: https://lore.kernel.org/r/20200924110706.254996-2-alexandru.elisei@arm.com Signed-off-by: Will Deacon Signed-off-by: Feng Zhou --- arch/arm64/kernel/perf_event.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index d2de291ed2f0d..1e5dc4fcfbc0d 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -524,6 +524,11 @@ static u32 armv8pmu_event_cnten_mask(struct perf_event *event) static inline void armv8pmu_enable_counter(u32 mask) { + /* + * Make sure event configuration register writes are visible before we + * enable the counter. + * */ + isb(); write_sysreg(mask, pmcntenset_el0); } From d2127bb9372f444a17f4f3e1cc20b5ac5ec9fb83 Mon Sep 17 00:00:00 2001 From: Andrew Murray Date: Mon, 2 Mar 2020 18:17:50 +0000 Subject: [PATCH 0222/5344] arm64: cpufeature: Extract capped perfmon fields commit 8e35aa642ee4dab01b16cc4b2df59d1936f3b3c2 upstream. When emulating ID registers there is often a need to cap the version bits of a feature such that the guest will not use features that the host is not aware of. For example, when KVM mediates access to the PMU by emulating register accesses. Let's add a helper that extracts a performance monitors ID field and caps the version to a given value. Fields that identify the version of the Performance Monitors Extension do not follow the standard ID scheme, and instead follow the scheme described in ARM DDI 0487E.a page D13-2825 "Alternative ID scheme used for the Performance Monitors Extension version". The value 0xF means an IMPLEMENTATION DEFINED PMU is present, and values 0x0-OxE can be treated the same as an unsigned field with 0x0 meaning no PMU is present. Signed-off-by: Andrew Murray Reviewed-by: Suzuki K Poulose [Mark: rework to handle perfmon fields] Signed-off-by: Mark Rutland Signed-off-by: Will Deacon Signed-off-by: Feng Zhou --- arch/arm64/include/asm/cpufeature.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index ccae05da98a7f..22c5fd330ebd6 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -449,6 +449,29 @@ cpuid_feature_extract_unsigned_field(u64 features, int field) return cpuid_feature_extract_unsigned_field_width(features, field, 4); } +/* + * Fields that identify the version of the Performance Monitors Extension do + * not follow the standard ID scheme. See ARM DDI 0487E.a page D13-2825, + * "Alternative ID scheme used for the Performance Monitors Extension version". + */ +static inline u64 __attribute_const__ +cpuid_feature_cap_perfmon_field(u64 features, int field, u64 cap) +{ + u64 val = cpuid_feature_extract_unsigned_field(features, field); + u64 mask = GENMASK_ULL(field + 3, field); + + /* Treat IMPLEMENTATION DEFINED functionality as unimplemented */ + if (val == 0xf) + val = 0; + + if (val > cap) { + features &= ~mask; + features |= (cap << field) & mask; + } + + return features; +} + static inline u64 arm64_ftr_mask(const struct arm64_ftr_bits *ftrp) { return (u64)GENMASK(ftrp->shift + ftrp->width - 1, ftrp->shift); From 66ba1e630ed6242901bc94f4872035904af3308b Mon Sep 17 00:00:00 2001 From: Andrew Murray Date: Mon, 2 Mar 2020 18:17:51 +0000 Subject: [PATCH 0223/5344] KVM: arm64: limit PMU version to PMUv3 for ARMv8.1 commit c854188ea01062f5a5fd7f05658feb1863774eaa upstream. We currently expose the PMU version of the host to the guest via emulation of the DFR0_EL1 and AA64DFR0_EL1 debug feature registers. However many of the features offered beyond PMUv3 for 8.1 are not supported in KVM. Examples of this include support for the PMMIR registers (added in PMUv3 for ARMv8.4) and 64-bit event counters added in (PMUv3 for ARMv8.5). Let's trap the Debug Feature Registers in order to limit PMUVer/PerfMon in the Debug Feature Registers to PMUv3 for ARMv8.1 to avoid unexpected behaviour. Both ID_AA64DFR0.PMUVer and ID_DFR0.PerfMon follow the "Alternative ID scheme used for the Performance Monitors Extension version" where 0xF means an IMPLEMENTATION DEFINED PMU is implemented, and values 0x0-0xE are treated as with an unsigned field (with 0x0 meaning no PMU is present). As we don't expect to expose an IMPLEMENTATION DEFINED PMU, and our cap is below 0xF, we can treat these fields as unsigned when applying the cap. Signed-off-by: Andrew Murray Reviewed-by: Suzuki K Poulose [Mark: make field names consistent, use perfmon cap] Signed-off-by: Mark Rutland Signed-off-by: Will Deacon Signed-off-by: Feng Zhou --- arch/arm64/include/asm/sysreg.h | 6 ++++++ arch/arm64/kvm/sys_regs.c | 10 ++++++++++ 2 files changed, 16 insertions(+) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 9b68f1b3915ec..0aacbccefe76d 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -674,6 +674,12 @@ #define ID_AA64DFR0_TRACEVER_SHIFT 4 #define ID_AA64DFR0_DEBUGVER_SHIFT 0 +#define ID_AA64DFR0_PMUVER_8_1 0x4 + +#define ID_DFR0_PERFMON_SHIFT 24 + +#define ID_DFR0_PERFMON_8_1 0x4 + #define ID_ISAR5_RDM_SHIFT 24 #define ID_ISAR5_CRC32_SHIFT 16 #define ID_ISAR5_SHA2_SHIFT 12 diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index da649e90240c8..cb483863d25c0 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1089,6 +1089,16 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, (0xfUL << ID_AA64ISAR1_API_SHIFT) | (0xfUL << ID_AA64ISAR1_GPA_SHIFT) | (0xfUL << ID_AA64ISAR1_GPI_SHIFT)); + } else if (id == SYS_ID_AA64DFR0_EL1) { + /* Limit guests to PMUv3 for ARMv8.1 */ + val = cpuid_feature_cap_perfmon_field(val, + ID_AA64DFR0_PMUVER_SHIFT, + ID_AA64DFR0_PMUVER_8_1); + } else if (id == SYS_ID_DFR0_EL1) { + /* Limit guests to PMUv3 for ARMv8.1 */ + val = cpuid_feature_cap_perfmon_field(val, + ID_DFR0_PERFMON_SHIFT, + ID_DFR0_PERFMON_8_1); } return val; From b743d27195661cad6b6afd01e9b8a43a905b827d Mon Sep 17 00:00:00 2001 From: Andrew Murray Date: Mon, 2 Mar 2020 18:17:52 +0000 Subject: [PATCH 0224/5344] arm64: perf: Add support for ARMv8.5-PMU 64-bit counters commit 8673e02e58410e6c4cefa499efa846286e45a991 upstream. At present ARMv8 event counters are limited to 32-bits, though by using the CHAIN event it's possible to combine adjacent counters to achieve 64-bits. The perf config1:0 bit can be set to use such a configuration. With the introduction of ARMv8.5-PMU support, all event counters can now be used as 64-bit counters. Let's enable 64-bit event counters where support exists. Unless the user sets config1:0 we will adjust the counter value such that it overflows upon 32-bit overflow. This follows the same behaviour as the cycle counter which has always been (and remains) 64-bits. Signed-off-by: Andrew Murray Reviewed-by: Suzuki K Poulose [Mark: fix ID field names, compare with 8.5 value] Signed-off-by: Mark Rutland Signed-off-by: Will Deacon Signed-off-by: Feng Zhou --- arch/arm64/include/asm/perf_event.h | 3 +- arch/arm64/include/asm/sysreg.h | 4 ++ arch/arm64/kernel/perf_event.c | 87 +++++++++++++++++++++++------ include/linux/perf/arm_pmu.h | 1 + 4 files changed, 78 insertions(+), 17 deletions(-) diff --git a/arch/arm64/include/asm/perf_event.h b/arch/arm64/include/asm/perf_event.h index 2bdbc79bbd01b..e7765b62c7125 100644 --- a/arch/arm64/include/asm/perf_event.h +++ b/arch/arm64/include/asm/perf_event.h @@ -176,9 +176,10 @@ #define ARMV8_PMU_PMCR_X (1 << 4) /* Export to ETM */ #define ARMV8_PMU_PMCR_DP (1 << 5) /* Disable CCNT if non-invasive debug*/ #define ARMV8_PMU_PMCR_LC (1 << 6) /* Overflow on 64 bit cycle counter */ +#define ARMV8_PMU_PMCR_LP (1 << 7) /* Long event counter enable */ #define ARMV8_PMU_PMCR_N_SHIFT 11 /* Number of counters supported */ #define ARMV8_PMU_PMCR_N_MASK 0x1f -#define ARMV8_PMU_PMCR_MASK 0x7f /* Mask for writable bits */ +#define ARMV8_PMU_PMCR_MASK 0xff /* Mask for writable bits */ /* * PMOVSR: counters overflow flag status reg diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 0aacbccefe76d..e96d3457849a1 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -674,7 +674,11 @@ #define ID_AA64DFR0_TRACEVER_SHIFT 4 #define ID_AA64DFR0_DEBUGVER_SHIFT 0 +#define ID_AA64DFR0_PMUVER_8_0 0x1 #define ID_AA64DFR0_PMUVER_8_1 0x4 +#define ID_AA64DFR0_PMUVER_8_4 0x5 +#define ID_AA64DFR0_PMUVER_8_5 0x6 +#define ID_AA64DFR0_PMUVER_IMP_DEF 0xf #define ID_DFR0_PERFMON_SHIFT 24 diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 1e5dc4fcfbc0d..9841dd2db2ff1 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -347,6 +347,17 @@ static struct attribute_group armv8_pmuv3_format_attr_group = { #define ARMV8_IDX_COUNTER_LAST(cpu_pmu) \ (ARMV8_IDX_CYCLE_COUNTER + cpu_pmu->num_events - 1) + +/* + * We unconditionally enable ARMv8.5-PMU long event counter support + * (64-bit events) where supported. Indicate if this arm_pmu has long + * event counter support. + */ +static bool armv8pmu_has_long_event(struct arm_pmu *cpu_pmu) +{ + return (cpu_pmu->pmuver >= ID_AA64DFR0_PMUVER_8_5); +} + /* * We must chain two programmable counters for 64 bit events, * except when we have allocated the 64bit cycle counter (for CPU @@ -356,9 +367,11 @@ static struct attribute_group armv8_pmuv3_format_attr_group = { static inline bool armv8pmu_event_is_chained(struct perf_event *event) { int idx = event->hw.idx; + struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); return !WARN_ON(idx < 0) && armv8pmu_event_is_64bit(event) && + !armv8pmu_has_long_event(cpu_pmu) && (idx != ARMV8_IDX_CYCLE_COUNTER); } @@ -407,7 +420,7 @@ static inline void armv8pmu_select_counter(int idx) isb(); } -static inline u32 armv8pmu_read_evcntr(int idx) +static inline u64 armv8pmu_read_evcntr(int idx) { armv8pmu_select_counter(idx); return read_sysreg(pmxevcntr_el0); @@ -424,6 +437,44 @@ static inline u64 armv8pmu_read_hw_counter(struct perf_event *event) return val; } +/* + * The cycle counter is always a 64-bit counter. When ARMV8_PMU_PMCR_LP + * is set the event counters also become 64-bit counters. Unless the + * user has requested a long counter (attr.config1) then we want to + * interrupt upon 32-bit overflow - we achieve this by applying a bias. + */ +static bool armv8pmu_event_needs_bias(struct perf_event *event) +{ + struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + + if (armv8pmu_event_is_64bit(event)) + return false; + + if (armv8pmu_has_long_event(cpu_pmu) || + idx == ARMV8_IDX_CYCLE_COUNTER) + return true; + + return false; +} + +static u64 armv8pmu_bias_long_counter(struct perf_event *event, u64 value) +{ + if (armv8pmu_event_needs_bias(event)) + value |= GENMASK(63, 32); + + return value; +} + +static u64 armv8pmu_unbias_long_counter(struct perf_event *event, u64 value) +{ + if (armv8pmu_event_needs_bias(event)) + value &= ~GENMASK(63, 32); + + return value; +} + static u64 armv8pmu_read_counter(struct perf_event *event) { struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); @@ -439,10 +490,10 @@ static u64 armv8pmu_read_counter(struct perf_event *event) else value = armv8pmu_read_hw_counter(event); - return value; + return armv8pmu_unbias_long_counter(event, value); } -static inline void armv8pmu_write_evcntr(int idx, u32 value) +static inline void armv8pmu_write_evcntr(int idx, u64 value) { armv8pmu_select_counter(idx); write_sysreg(value, pmxevcntr_el0); @@ -467,20 +518,14 @@ static void armv8pmu_write_counter(struct perf_event *event, u64 value) struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; + value = armv8pmu_bias_long_counter(event, value); + if (!armv8pmu_counter_valid(cpu_pmu, idx)) pr_err("CPU%u writing wrong counter %d\n", smp_processor_id(), idx); - else if (idx == ARMV8_IDX_CYCLE_COUNTER) { - /* - * The cycles counter is really a 64-bit counter. - * When treating it as a 32-bit counter, we only count - * the lower 32 bits, and set the upper 32-bits so that - * we get an interrupt upon 32-bit overflow. - */ - if (!armv8pmu_event_is_64bit(event)) - value |= 0xffffffff00000000ULL; + else if (idx == ARMV8_IDX_CYCLE_COUNTER) write_sysreg(value, pmccntr_el0); - } else + else armv8pmu_write_hw_counter(event, value); } @@ -798,7 +843,8 @@ static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc, /* * Otherwise use events counters */ - if (armv8pmu_event_is_64bit(event)) + if (armv8pmu_event_is_64bit(event) && + !armv8pmu_has_long_event(cpu_pmu)) return armv8pmu_get_chain_idx(cpuc, cpu_pmu); else return armv8pmu_get_single_idx(cpuc, cpu_pmu); @@ -869,6 +915,9 @@ static int armv8pmu_filter_match(struct perf_event *event) static void armv8pmu_reset(void *info) { + struct arm_pmu *cpu_pmu = (struct arm_pmu *)info; + u32 pmcr; + /* The counter and interrupt enable registers are unknown at reset. */ armv8pmu_disable_counter(U32_MAX); armv8pmu_disable_intens(U32_MAX); @@ -880,8 +929,13 @@ static void armv8pmu_reset(void *info) * Initialize & Reset PMNC. Request overflow interrupt for * 64 bit cycle counter but cheat in armv8pmu_write_counter(). */ - armv8pmu_pmcr_write(ARMV8_PMU_PMCR_P | ARMV8_PMU_PMCR_C | - ARMV8_PMU_PMCR_LC); + pmcr = ARMV8_PMU_PMCR_P | ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_LC; + + /* Enable long event counter support where available */ + if (armv8pmu_has_long_event(cpu_pmu)) + pmcr |= ARMV8_PMU_PMCR_LP; + + armv8pmu_pmcr_write(pmcr); } static int __armv8_pmuv3_map_event(struct perf_event *event, @@ -964,6 +1018,7 @@ static void __armv8pmu_probe_pmu(void *info) if (pmuver == 0xf || pmuver == 0) return; + cpu_pmu->pmuver = pmuver; probe->present = true; /* Read the nb of CNTx counters supported from PMNC */ diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 71f525a35ac2b..5b616dde9a4c2 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -80,6 +80,7 @@ struct arm_pmu { struct pmu pmu; cpumask_t supported_cpus; char *name; + int pmuver; irqreturn_t (*handle_irq)(struct arm_pmu *pmu); void (*enable)(struct perf_event *event); void (*disable)(struct perf_event *event); From 7661cfebbe3b5d7070726fc4f582fce7d606c7ff Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 24 Sep 2020 12:07:01 +0100 Subject: [PATCH 0225/5344] arm64: perf: Avoid PMXEV* indirection commit 0fdf1bb75953a67e63e5055a7709c629ab6d7692 upstream. Currently we access the counter registers and their respective type registers indirectly. This requires us to write to PMSELR, issue an ISB, then access the relevant PMXEV* registers. This is unfortunate, because: * Under virtualization, accessing one register requires two traps to the hypervisor, even though we could access the register directly with a single trap. * We have to issue an ISB which we could otherwise avoid the cost of. * When we use NMIs, the NMI handler will have to save/restore the select register in case the code it preempted was attempting to access a counter or its type register. We can avoid these issues by directly accessing the relevant registers. This patch adds helpers to do so. In armv8pmu_enable_event() we still need the ISB to prevent the PE from reordering the write to PMINTENSET_EL1 register. If the interrupt is enabled before we disable the counter and the new event is configured, we might get an interrupt triggered by the previously programmed event overflowing, but which we wrongly attribute to the event that we are enabling. Execute an ISB after we disable the counter. In the process, remove the comment that refers to the ARMv7 PMU. [Julien T.: Don't inline read/write functions to avoid big code-size increase, remove unused read_pmevtypern function, fix counter index issue.] [Alexandru E.: Removed comment, removed trailing semicolons in macros, added ISB] Signed-off-by: Mark Rutland Signed-off-by: Julien Thierry Signed-off-by: Alexandru Elisei Tested-by: Sumit Garg (Developerbox) Cc: Julien Thierry Cc: Will Deacon Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Catalin Marinas Link: https://lore.kernel.org/r/20200924110706.254996-3-alexandru.elisei@arm.com Signed-off-by: Will Deacon Signed-off-by: Feng Zhou --- arch/arm64/kernel/perf_event.c | 99 +++++++++++++++++++++++++++++----- 1 file changed, 85 insertions(+), 14 deletions(-) diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 9841dd2db2ff1..d93d4527c9a22 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -385,6 +385,73 @@ static inline bool armv8pmu_event_is_chained(struct perf_event *event) #define ARMV8_IDX_TO_COUNTER(x) \ (((x) - ARMV8_IDX_COUNTER0) & ARMV8_PMU_COUNTER_MASK) +/* + * This code is really good + */ + +#define PMEVN_CASE(n, case_macro) \ + case n: case_macro(n); break + +#define PMEVN_SWITCH(x, case_macro) \ + do { \ + switch (x) { \ + PMEVN_CASE(0, case_macro); \ + PMEVN_CASE(1, case_macro); \ + PMEVN_CASE(2, case_macro); \ + PMEVN_CASE(3, case_macro); \ + PMEVN_CASE(4, case_macro); \ + PMEVN_CASE(5, case_macro); \ + PMEVN_CASE(6, case_macro); \ + PMEVN_CASE(7, case_macro); \ + PMEVN_CASE(8, case_macro); \ + PMEVN_CASE(9, case_macro); \ + PMEVN_CASE(10, case_macro); \ + PMEVN_CASE(11, case_macro); \ + PMEVN_CASE(12, case_macro); \ + PMEVN_CASE(13, case_macro); \ + PMEVN_CASE(14, case_macro); \ + PMEVN_CASE(15, case_macro); \ + PMEVN_CASE(16, case_macro); \ + PMEVN_CASE(17, case_macro); \ + PMEVN_CASE(18, case_macro); \ + PMEVN_CASE(19, case_macro); \ + PMEVN_CASE(20, case_macro); \ + PMEVN_CASE(21, case_macro); \ + PMEVN_CASE(22, case_macro); \ + PMEVN_CASE(23, case_macro); \ + PMEVN_CASE(24, case_macro); \ + PMEVN_CASE(25, case_macro); \ + PMEVN_CASE(26, case_macro); \ + PMEVN_CASE(27, case_macro); \ + PMEVN_CASE(28, case_macro); \ + PMEVN_CASE(29, case_macro); \ + PMEVN_CASE(30, case_macro); \ + default: WARN(1, "Invalid PMEV* index\n"); \ + } \ + } while (0) + +#define RETURN_READ_PMEVCNTRN(n) \ + return read_sysreg(pmevcntr##n##_el0) +static unsigned long read_pmevcntrn(int n) +{ + PMEVN_SWITCH(n, RETURN_READ_PMEVCNTRN); + return 0; +} + +#define WRITE_PMEVCNTRN(n) \ + write_sysreg(val, pmevcntr##n##_el0) +static void write_pmevcntrn(int n, unsigned long val) +{ + PMEVN_SWITCH(n, WRITE_PMEVCNTRN); +} + +#define WRITE_PMEVTYPERN(n) \ + write_sysreg(val, pmevtyper##n##_el0) +static void write_pmevtypern(int n, unsigned long val) +{ + PMEVN_SWITCH(n, WRITE_PMEVTYPERN); +} + static inline u32 armv8pmu_pmcr_read(void) { return read_sysreg(pmcr_el0); @@ -413,17 +480,11 @@ static inline int armv8pmu_counter_has_overflowed(u32 pmnc, int idx) return pmnc & BIT(ARMV8_IDX_TO_COUNTER(idx)); } -static inline void armv8pmu_select_counter(int idx) +static inline u32 armv8pmu_read_evcntr(int idx) { u32 counter = ARMV8_IDX_TO_COUNTER(idx); - write_sysreg(counter, pmselr_el0); - isb(); -} -static inline u64 armv8pmu_read_evcntr(int idx) -{ - armv8pmu_select_counter(idx); - return read_sysreg(pmxevcntr_el0); + return read_pmevcntrn(counter); } static inline u64 armv8pmu_read_hw_counter(struct perf_event *event) @@ -495,8 +556,9 @@ static u64 armv8pmu_read_counter(struct perf_event *event) static inline void armv8pmu_write_evcntr(int idx, u64 value) { - armv8pmu_select_counter(idx); - write_sysreg(value, pmxevcntr_el0); + u32 counter = ARMV8_IDX_TO_COUNTER(idx); + + write_pmevcntrn(counter, value); } static inline void armv8pmu_write_hw_counter(struct perf_event *event, @@ -531,9 +593,10 @@ static void armv8pmu_write_counter(struct perf_event *event, u64 value) static inline void armv8pmu_write_evtype(int idx, u32 val) { - armv8pmu_select_counter(idx); + u32 counter = ARMV8_IDX_TO_COUNTER(idx); + val &= ARMV8_PMU_EVTYPE_MASK; - write_sysreg(val, pmxevtyper_el0); + write_pmevtypern(counter, val); } static inline void armv8pmu_write_event_type(struct perf_event *event) @@ -553,7 +616,10 @@ static inline void armv8pmu_write_event_type(struct perf_event *event) armv8pmu_write_evtype(idx - 1, hwc->config_base); armv8pmu_write_evtype(idx, chain_evt); } else { - armv8pmu_write_evtype(idx, hwc->config_base); + if (idx == ARMV8_IDX_CYCLE_COUNTER) + write_sysreg(hwc->config_base, pmccfiltr_el0); + else + armv8pmu_write_evtype(idx, hwc->config_base); } } @@ -592,6 +658,11 @@ static inline void armv8pmu_enable_event_counter(struct perf_event *event) static inline void armv8pmu_disable_counter(u32 mask) { write_sysreg(mask, pmcntenclr_el0); + /* + * Make sure the effects of disabling the counter are visible before we + * start configuring the event. + */ + isb(); } static inline void armv8pmu_disable_event_counter(struct perf_event *event) @@ -664,7 +735,7 @@ static void armv8pmu_enable_event(struct perf_event *event) armv8pmu_disable_event_counter(event); /* - * Set event (if destined for PMNx counters). + * Set event. */ armv8pmu_write_event_type(event); From bd3e13af8160d9620e1a6bf93cd75bd7428c00c7 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Thu, 24 Sep 2020 12:07:02 +0100 Subject: [PATCH 0226/5344] arm64: perf: Remove PMU locking commit 2a0e2a02e4b719174547d6f04c27410c6fe456f5 upstream. The PMU is disabled and enabled, and the counters are programmed from contexts where interrupts or preemption is disabled. The functions to toggle the PMU and to program the PMU counters access the registers directly and don't access data modified by the interrupt handler. That, and the fact that they're always called from non-preemptible contexts, means that we don't need to disable interrupts or use a spinlock. [Alexandru E.: Explained why locking is not needed, removed WARN_ONs] Signed-off-by: Julien Thierry Signed-off-by: Alexandru Elisei Tested-by: Sumit Garg (Developerbox) Cc: Will Deacon Cc: Mark Rutland Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Catalin Marinas Link: https://lore.kernel.org/r/20200924110706.254996-4-alexandru.elisei@arm.com Signed-off-by: Will Deacon Signed-off-by: Feng Zhou --- arch/arm64/kernel/perf_event.c | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index d93d4527c9a22..529f380d63757 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -719,15 +719,10 @@ static inline u32 armv8pmu_getreset_flags(void) static void armv8pmu_enable_event(struct perf_event *event) { - unsigned long flags; - struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); - /* * Enable counter and interrupt, and set the counter to count * the event that we're interested in. */ - raw_spin_lock_irqsave(&events->pmu_lock, flags); /* * Disable counter @@ -748,21 +743,10 @@ static void armv8pmu_enable_event(struct perf_event *event) * Enable counter */ armv8pmu_enable_event_counter(event); - - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); } static void armv8pmu_disable_event(struct perf_event *event) { - unsigned long flags; - struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); - - /* - * Disable counter and interrupt - */ - raw_spin_lock_irqsave(&events->pmu_lock, flags); - /* * Disable counter */ @@ -772,30 +756,18 @@ static void armv8pmu_disable_event(struct perf_event *event) * Disable interrupt for this counter */ armv8pmu_disable_event_irq(event); - - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); } static void armv8pmu_start(struct arm_pmu *cpu_pmu) { - unsigned long flags; - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); - - raw_spin_lock_irqsave(&events->pmu_lock, flags); /* Enable all counters */ armv8pmu_pmcr_write(armv8pmu_pmcr_read() | ARMV8_PMU_PMCR_E); - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); } static void armv8pmu_stop(struct arm_pmu *cpu_pmu) { - unsigned long flags; - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); - - raw_spin_lock_irqsave(&events->pmu_lock, flags); /* Disable all counters */ armv8pmu_pmcr_write(armv8pmu_pmcr_read() & ~ARMV8_PMU_PMCR_E); - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); } static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu) From 996872eda3190e11e796d8e7d1e9d62466abf051 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Thu, 24 Sep 2020 12:07:03 +0100 Subject: [PATCH 0227/5344] arm64: perf: Defer irq_work to IPI_IRQ_WORK commit 05ab72813340d11205556c0d1bc08e6857a3856c upstream. When handling events, armv8pmu_handle_irq() calls perf_event_overflow(), and subsequently calls irq_work_run() to handle any work queued by perf_event_overflow(). As perf_event_overflow() raises IPI_IRQ_WORK when queuing the work, this isn't strictly necessary and the work could be handled as part of the IPI_IRQ_WORK handler. In the common case the IPI handler will run immediately after the PMU IRQ handler, and where the PE is heavily loaded with interrupts other handlers may run first, widening the window where some counters are disabled. In practice this window is unlikely to be a significant issue, and removing the call to irq_work_run() would make the PMU IRQ handler NMI safe in addition to making it simpler, so let's do that. [Alexandru E.: Reworded commit message] Signed-off-by: Julien Thierry Signed-off-by: Alexandru Elisei Cc: Julien Thierry Cc: Will Deacon Cc: Mark Rutland Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Catalin Marinas Link: https://lore.kernel.org/r/20200924110706.254996-5-alexandru.elisei@arm.com Signed-off-by: Will Deacon Signed-off-by: Feng Zhou --- arch/arm64/kernel/perf_event.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 529f380d63757..a084bb95363f7 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -820,20 +820,16 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu) if (!armpmu_event_set_period(event)) continue; + /* + * Perf event overflow will queue the processing of the event as + * an irq_work which will be taken care of in the handling of + * IPI_IRQ_WORK. + */ if (perf_event_overflow(event, &data, regs)) cpu_pmu->disable(event); } armv8pmu_start(cpu_pmu); - /* - * Handle the pending perf events. - * - * Note: this call *must* be run with interrupts disabled. For - * platforms that can have the PMU interrupts raised as an NMI, this - * will not work. - */ - irq_work_run(); - return IRQ_HANDLED; } From 9d86f4732416176807e744237eed215526a3cf1a Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Thu, 24 Sep 2020 12:07:04 +0100 Subject: [PATCH 0228/5344] KVM: arm64: pmu: Make overflow handler NMI safe commit 95e92e45a454a10a8114294d0f7aec930fb85891 upstream. conflict: functions in the different file. kvm_vcpu_kick() is not NMI safe. When the overflow handler is called from NMI context, defer waking the vcpu to an irq_work queue. A vcpu can be freed while it's not running by kvm_destroy_vm(). Prevent running the irq_work for a non-existent vcpu by calling irq_work_sync() on the PMU destroy path. [Alexandru E.: Added irq_work_sync()] Signed-off-by: Julien Thierry Signed-off-by: Alexandru Elisei Tested-by: Sumit Garg (Developerbox) Cc: Julien Thierry Cc: Marc Zyngier Cc: Will Deacon Cc: Mark Rutland Cc: Catalin Marinas Cc: James Morse Cc: Suzuki K Pouloze Cc: kvm@vger.kernel.org Cc: kvmarm@lists.cs.columbia.edu Link: https://lore.kernel.org/r/20200924110706.254996-6-alexandru.elisei@arm.com Signed-off-by: Will Deacon Signed-off-by: Feng Zhou --- include/kvm/arm_pmu.h | 1 + virt/kvm/arm/pmu.c | 26 +++++++++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 6db030439e29c..dbf4f08d42e52 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -27,6 +27,7 @@ struct kvm_pmu { bool ready; bool created; bool irq_level; + struct irq_work overflow_work; }; #define kvm_arm_pmu_v3_ready(v) ((v)->arch.pmu.ready) diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c index 4c08fd009768c..d522a1000497c 100644 --- a/virt/kvm/arm/pmu.c +++ b/virt/kvm/arm/pmu.c @@ -259,6 +259,7 @@ void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) kvm_pmu_release_perf_event(&pmu->pmc[i]); + irq_work_sync(&vcpu->arch.pmu.overflow_work); } u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu) @@ -435,6 +436,22 @@ void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu) kvm_pmu_update_state(vcpu); } +/** + * When perf interrupt is an NMI, we cannot safely notify the vcpu corresponding + * to the event. + * This is why we need a callback to do it once outside of the NMI context. + */ +static void kvm_pmu_perf_overflow_notify_vcpu(struct irq_work *work) +{ + struct kvm_vcpu *vcpu; + struct kvm_pmu *pmu; + + pmu = container_of(work, struct kvm_pmu, overflow_work); + vcpu = kvm_pmc_to_vcpu(pmu->pmc); + + kvm_vcpu_kick(vcpu); +} + /** * When the perf event overflows, set the overflow status and inform the vcpu. */ @@ -467,7 +484,11 @@ static void kvm_pmu_perf_overflow(struct perf_event *perf_event, if (kvm_pmu_overflow_status(vcpu)) { kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); - kvm_vcpu_kick(vcpu); + + if (!in_nmi()) + kvm_vcpu_kick(vcpu); + else + irq_work_queue(&vcpu->arch.pmu.overflow_work); } cpu_pmu->pmu.start(perf_event, PERF_EF_RELOAD); @@ -760,6 +781,9 @@ static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu) return ret; } + init_irq_work(&vcpu->arch.pmu.overflow_work, + kvm_pmu_perf_overflow_notify_vcpu); + vcpu->arch.pmu.created = true; return 0; } From 5d56af17b72e3beeff0e9add2ce757f443361a82 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Thu, 24 Sep 2020 12:07:05 +0100 Subject: [PATCH 0229/5344] arm_pmu: Introduce pmu_irq_ops commit f76b130bdb8949eac002b8e0ddb85576ed137838 upstream. Currently the PMU interrupt can either be a normal irq or a percpu irq. Supporting NMI will introduce two cases for each existing one. It becomes a mess of 'if's when managing the interrupt. Define sets of callbacks for operations commonly done on the interrupt. The appropriate set of callbacks is selected at interrupt request time and simplifies interrupt enabling/disabling and freeing. Signed-off-by: Julien Thierry Signed-off-by: Alexandru Elisei Tested-by: Sumit Garg (Developerbox) Cc: Julien Thierry Cc: Will Deacon Cc: Mark Rutland Link: https://lore.kernel.org/r/20200924110706.254996-7-alexandru.elisei@arm.com Signed-off-by: Will Deacon Signed-off-by: Feng Zhou --- drivers/perf/arm_pmu.c | 90 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 74 insertions(+), 16 deletions(-) diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index df352b334ea77..a770726e98d4d 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -26,8 +26,46 @@ #include +static int armpmu_count_irq_users(const int irq); + +struct pmu_irq_ops { + void (*enable_pmuirq)(unsigned int irq); + void (*disable_pmuirq)(unsigned int irq); + void (*free_pmuirq)(unsigned int irq, int cpu, void __percpu *devid); +}; + +static void armpmu_free_pmuirq(unsigned int irq, int cpu, void __percpu *devid) +{ + free_irq(irq, per_cpu_ptr(devid, cpu)); +} + +static const struct pmu_irq_ops pmuirq_ops = { + .enable_pmuirq = enable_irq, + .disable_pmuirq = disable_irq_nosync, + .free_pmuirq = armpmu_free_pmuirq +}; + +static void armpmu_enable_percpu_pmuirq(unsigned int irq) +{ + enable_percpu_irq(irq, IRQ_TYPE_NONE); +} + +static void armpmu_free_percpu_pmuirq(unsigned int irq, int cpu, + void __percpu *devid) +{ + if (armpmu_count_irq_users(irq) == 1) + free_percpu_irq(irq, devid); +} + +static const struct pmu_irq_ops percpu_pmuirq_ops = { + .enable_pmuirq = armpmu_enable_percpu_pmuirq, + .disable_pmuirq = disable_percpu_irq, + .free_pmuirq = armpmu_free_percpu_pmuirq +}; + static DEFINE_PER_CPU(struct arm_pmu *, cpu_armpmu); static DEFINE_PER_CPU(int, cpu_irq); +static DEFINE_PER_CPU(const struct pmu_irq_ops *, cpu_irq_ops); static inline u64 arm_pmu_event_max_period(struct perf_event *event) { @@ -544,6 +582,23 @@ static int armpmu_count_irq_users(const int irq) return count; } +static const struct pmu_irq_ops *armpmu_find_irq_ops(int irq) +{ + const struct pmu_irq_ops *ops = NULL; + int cpu; + + for_each_possible_cpu(cpu) { + if (per_cpu(cpu_irq, cpu) != irq) + continue; + + ops = per_cpu(cpu_irq_ops, cpu); + if (ops) + break; + } + + return ops; +} + void armpmu_free_irq(int irq, int cpu) { if (per_cpu(cpu_irq, cpu) == 0) @@ -551,18 +606,18 @@ void armpmu_free_irq(int irq, int cpu) if (WARN_ON(irq != per_cpu(cpu_irq, cpu))) return; - if (!irq_is_percpu_devid(irq)) - free_irq(irq, per_cpu_ptr(&cpu_armpmu, cpu)); - else if (armpmu_count_irq_users(irq) == 1) - free_percpu_irq(irq, &cpu_armpmu); + per_cpu(cpu_irq_ops, cpu)->free_pmuirq(irq, cpu, &cpu_armpmu); per_cpu(cpu_irq, cpu) = 0; + per_cpu(cpu_irq_ops, cpu) = NULL; } int armpmu_request_irq(int irq, int cpu) { int err = 0; const irq_handler_t handler = armpmu_dispatch_irq; + const struct pmu_irq_ops *irq_ops; + if (!irq) return 0; @@ -584,15 +639,26 @@ int armpmu_request_irq(int irq, int cpu) irq_set_status_flags(irq, IRQ_NOAUTOEN); err = request_irq(irq, handler, irq_flags, "arm-pmu", per_cpu_ptr(&cpu_armpmu, cpu)); + + irq_ops = &pmuirq_ops; } else if (armpmu_count_irq_users(irq) == 0) { err = request_percpu_irq(irq, handler, "arm-pmu", &cpu_armpmu); + + irq_ops = &percpu_pmuirq_ops; + } else { + /* Per cpudevid irq was already requested by another CPU */ + irq_ops = armpmu_find_irq_ops(irq); + + if (WARN_ON(!irq_ops)) + err = -EINVAL; } if (err) goto err_out; per_cpu(cpu_irq, cpu) = irq; + per_cpu(cpu_irq_ops, cpu) = irq_ops; return 0; err_out: @@ -625,12 +691,8 @@ static int arm_perf_starting_cpu(unsigned int cpu, struct hlist_node *node) per_cpu(cpu_armpmu, cpu) = pmu; irq = armpmu_get_cpu_irq(pmu, cpu); - if (irq) { - if (irq_is_percpu_devid(irq)) - enable_percpu_irq(irq, IRQ_TYPE_NONE); - else - enable_irq(irq); - } + if (irq) + per_cpu(cpu_irq_ops, cpu)->enable_pmuirq(irq); return 0; } @@ -644,12 +706,8 @@ static int arm_perf_teardown_cpu(unsigned int cpu, struct hlist_node *node) return 0; irq = armpmu_get_cpu_irq(pmu, cpu); - if (irq) { - if (irq_is_percpu_devid(irq)) - disable_percpu_irq(irq); - else - disable_irq_nosync(irq); - } + if (irq) + per_cpu(cpu_irq_ops, cpu)->disable_pmuirq(irq); per_cpu(cpu_armpmu, cpu) = NULL; From 3076ffe07948ba9c7878772433aed91873ec0a8a Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Thu, 24 Sep 2020 12:07:06 +0100 Subject: [PATCH 0230/5344] arm_pmu: arm64: Use NMIs for PMU commit d8f6267f7ce5dc7b8920910e7e75216f77e06d21 upstream. Add required PMU interrupt operations for NMIs. Request interrupt lines as NMIs when possible, otherwise fall back to normal interrupts. NMIs are only supported on the arm64 architecture with a GICv3 irqchip. [Alexandru E.: Added that NMIs only work on arm64 + GICv3, print message when PMU is using NMIs] Signed-off-by: Julien Thierry Signed-off-by: Alexandru Elisei Tested-by: Sumit Garg (Developerbox) Cc: Julien Thierry Cc: Will Deacon Cc: Mark Rutland Link: https://lore.kernel.org/r/20200924110706.254996-8-alexandru.elisei@arm.com Signed-off-by: Will Deacon Signed-off-by: Feng Zhou --- drivers/perf/arm_pmu.c | 71 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 63 insertions(+), 8 deletions(-) diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index a770726e98d4d..cb2f55f450e4a 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -45,6 +45,17 @@ static const struct pmu_irq_ops pmuirq_ops = { .free_pmuirq = armpmu_free_pmuirq }; +static void armpmu_free_pmunmi(unsigned int irq, int cpu, void __percpu *devid) +{ + free_nmi(irq, per_cpu_ptr(devid, cpu)); +} + +static const struct pmu_irq_ops pmunmi_ops = { + .enable_pmuirq = enable_nmi, + .disable_pmuirq = disable_nmi_nosync, + .free_pmuirq = armpmu_free_pmunmi +}; + static void armpmu_enable_percpu_pmuirq(unsigned int irq) { enable_percpu_irq(irq, IRQ_TYPE_NONE); @@ -63,10 +74,37 @@ static const struct pmu_irq_ops percpu_pmuirq_ops = { .free_pmuirq = armpmu_free_percpu_pmuirq }; +static void armpmu_enable_percpu_pmunmi(unsigned int irq) +{ + if (!prepare_percpu_nmi(irq)) + enable_percpu_nmi(irq, IRQ_TYPE_NONE); +} + +static void armpmu_disable_percpu_pmunmi(unsigned int irq) +{ + disable_percpu_nmi(irq); + teardown_percpu_nmi(irq); +} + +static void armpmu_free_percpu_pmunmi(unsigned int irq, int cpu, + void __percpu *devid) +{ + if (armpmu_count_irq_users(irq) == 1) + free_percpu_nmi(irq, devid); +} + +static const struct pmu_irq_ops percpu_pmunmi_ops = { + .enable_pmuirq = armpmu_enable_percpu_pmunmi, + .disable_pmuirq = armpmu_disable_percpu_pmunmi, + .free_pmuirq = armpmu_free_percpu_pmunmi +}; + static DEFINE_PER_CPU(struct arm_pmu *, cpu_armpmu); static DEFINE_PER_CPU(int, cpu_irq); static DEFINE_PER_CPU(const struct pmu_irq_ops *, cpu_irq_ops); +static bool has_nmi; + static inline u64 arm_pmu_event_max_period(struct perf_event *event) { if (event->hw.flags & ARMPMU_EVT_64BIT) @@ -637,15 +675,31 @@ int armpmu_request_irq(int irq, int cpu) IRQF_NO_THREAD; irq_set_status_flags(irq, IRQ_NOAUTOEN); - err = request_irq(irq, handler, irq_flags, "arm-pmu", + + err = request_nmi(irq, handler, irq_flags, "arm-pmu", per_cpu_ptr(&cpu_armpmu, cpu)); - irq_ops = &pmuirq_ops; + /* If cannot get an NMI, get a normal interrupt */ + if (err) { + err = request_irq(irq, handler, irq_flags, "arm-pmu", + per_cpu_ptr(&cpu_armpmu, cpu)); + irq_ops = &pmuirq_ops; + } else { + has_nmi = true; + irq_ops = &pmunmi_ops; + } } else if (armpmu_count_irq_users(irq) == 0) { - err = request_percpu_irq(irq, handler, "arm-pmu", - &cpu_armpmu); - - irq_ops = &percpu_pmuirq_ops; + err = request_percpu_nmi(irq, handler, "arm-pmu", &cpu_armpmu); + + /* If cannot get an NMI, get a normal interrupt */ + if (err) { + err = request_percpu_irq(irq, handler, "arm-pmu", + &cpu_armpmu); + irq_ops = &percpu_pmuirq_ops; + } else { + has_nmi= true; + irq_ops = &percpu_pmunmi_ops; + } } else { /* Per cpudevid irq was already requested by another CPU */ irq_ops = armpmu_find_irq_ops(irq); @@ -928,8 +982,9 @@ int armpmu_register(struct arm_pmu *pmu) if (!__oprofile_cpu_pmu) __oprofile_cpu_pmu = pmu; - pr_info("enabled with %s PMU driver, %d counters available\n", - pmu->name, pmu->num_events); + pr_info("enabled with %s PMU driver, %d counters available%s\n", + pmu->name, pmu->num_events, + has_nmi ? ", using NMIs" : ""); return 0; From 21d7c2918f9ff21cebe650d937840880276d8870 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 15 Jun 2021 12:12:24 +0100 Subject: [PATCH 0231/5344] arm64: Add cpuidle context save/restore helpers commit 8848f0665b3cd4fbb3107b384f5205380c90634d upstream. As we need to start doing some additional work on all idle paths, let's introduce a set of macros that will perform the work related to the GICv3 pseudo-NMI idle entry exit. Stubs are introduced to 32bit ARM for compatibility. As these helpers are currently unused, there is no functional change. Tested-by: Valentin Schneider Reviewed-by: Lorenzo Pieralisi Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210615111227.2454465-2-maz@kernel.org Signed-off-by: Will Deacon Signed-off-by: Feng Zhou --- arch/arm/include/asm/cpuidle.h | 5 +++++ arch/arm64/include/asm/cpuidle.h | 35 ++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/arch/arm/include/asm/cpuidle.h b/arch/arm/include/asm/cpuidle.h index 6b2ff7243b4b2..0230c60e7df0f 100644 --- a/arch/arm/include/asm/cpuidle.h +++ b/arch/arm/include/asm/cpuidle.h @@ -49,4 +49,9 @@ extern int arm_cpuidle_suspend(int index); extern int arm_cpuidle_init(int cpu); +struct arm_cpuidle_irq_context { }; + +#define arm_cpuidle_save_irq_context(c) (void)c +#define arm_cpuidle_restore_irq_context(c) (void)c + #endif diff --git a/arch/arm64/include/asm/cpuidle.h b/arch/arm64/include/asm/cpuidle.h index 3c5ddb429ea29..14a19d1141bd2 100644 --- a/arch/arm64/include/asm/cpuidle.h +++ b/arch/arm64/include/asm/cpuidle.h @@ -18,4 +18,39 @@ static inline int arm_cpuidle_suspend(int index) return -EOPNOTSUPP; } #endif + +#ifdef CONFIG_ARM64_PSEUDO_NMI +#include + +struct arm_cpuidle_irq_context { + unsigned long pmr; + unsigned long daif_bits; +}; + +#define arm_cpuidle_save_irq_context(__c) \ + do { \ + struct arm_cpuidle_irq_context *c = __c; \ + if (system_uses_irq_prio_masking()) { \ + c->daif_bits = read_sysreg(daif); \ + write_sysreg(c->daif_bits | PSR_I_BIT | PSR_F_BIT, \ + daif); \ + c->pmr = gic_read_pmr(); \ + gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); \ + } \ + } while (0) + +#define arm_cpuidle_restore_irq_context(__c) \ + do { \ + struct arm_cpuidle_irq_context *c = __c; \ + if (system_uses_irq_prio_masking()) { \ + gic_write_pmr(c->pmr); \ + write_sysreg(c->daif_bits, daif); \ + } \ + } while (0) +#else +struct arm_cpuidle_irq_context { }; + +#define arm_cpuidle_save_irq_context(c) (void)c +#define arm_cpuidle_restore_irq_context(c) (void)c +#endif #endif From 8964c6f657bc1cc6ee2467a3923acb1ddccbf3ea Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 30 Nov 2020 11:59:41 +0000 Subject: [PATCH 0232/5344] arm64: mark idle code as noinstr commit 114e0a684753516ef4b71ccb55a8ebcfa8735edb upstream. Core code disables RCU when calling arch_cpu_idle(), so it's not safe for arch_cpu_idle() or its calees to be instrumented, as the instrumentation callbacks may attempt to use RCU or other features which are unsafe to use in this context. Mark them noinstr to prevent issues. The use of local_irq_enable() in arch_cpu_idle() is similarly problematic, and the "sched/idle: Fix arch_cpu_idle() vs tracing" patch queued in the tip tree addresses that case. Reported-by: Marco Elver Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20201130115950.22492-3-mark.rutland@arm.com Signed-off-by: Will Deacon Signed-off-by: Feng Zhou --- arch/arm64/kernel/process.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 6e89d283d188e..c2fe6d549d056 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -68,13 +68,13 @@ EXPORT_SYMBOL_GPL(pm_power_off); void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd); -static void __cpu_do_idle(void) +static void noinstr __cpu_do_idle(void) { dsb(sy); wfi(); } -static void __cpu_do_idle_irqprio(void) +static void noinstr __cpu_do_idle_irqprio(void) { unsigned long pmr; unsigned long daif_bits; @@ -104,7 +104,7 @@ static void __cpu_do_idle_irqprio(void) * ensure that interrupts are not masked at the PMR (because the core will * not wake up if we block the wake up signal in the interrupt controller). */ -void cpu_do_idle(void) +void noinstr cpu_do_idle(void) { if (system_uses_irq_prio_masking()) __cpu_do_idle_irqprio(); @@ -115,7 +115,7 @@ void cpu_do_idle(void) /* * This is our default idle handler. */ -void arch_cpu_idle(void) +void noinstr arch_cpu_idle(void) { /* * This should do all the clock switching and wait for interrupt From adbc40fb77c7b68466f292d20b05c013327af7c0 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 2 Oct 2019 10:06:12 +0100 Subject: [PATCH 0233/5344] arm64: Relax ICC_PMR_EL1 accesses when ICC_CTLR_EL1.PMHE is clear commit f226650494c6aa87526d12135b7de8b8c074f3de upstream. The GICv3 architecture specification is incredibly misleading when it comes to PMR and the requirement for a DSB. It turns out that this DSB is only required if the CPU interface sends an Upstream Control message to the redistributor in order to update the RD's view of PMR. This message is only sent when ICC_CTLR_EL1.PMHE is set, which isn't the case in Linux. It can still be set from EL3, so some special care is required. But the upshot is that in the (hopefuly large) majority of the cases, we can drop the DSB altogether. This relies on a new static key being set if the boot CPU has PMHE set. The drawback is that this static key has to be exported to modules. Cc: Will Deacon Cc: James Morse Cc: Julien Thierry Cc: Suzuki K Poulose Signed-off-by: Marc Zyngier Signed-off-by: Catalin Marinas Signed-off-by: Feng Zhou --- arch/arm64/include/asm/barrier.h | 12 ++++++++++++ arch/arm64/include/asm/daifflags.h | 3 ++- arch/arm64/include/asm/irqflags.h | 19 ++++++++++--------- arch/arm64/include/asm/kvm_host.h | 3 +-- arch/arm64/kernel/entry.S | 6 ++++-- arch/arm64/kvm/hyp/switch.c | 4 ++-- drivers/irqchip/irq-gic-v3.c | 20 ++++++++++++++++++++ include/linux/irqchip/arm-gic-v3.h | 2 ++ 8 files changed, 53 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h index 0fcd854fc95f3..ee67911dab69e 100644 --- a/arch/arm64/include/asm/barrier.h +++ b/arch/arm64/include/asm/barrier.h @@ -29,6 +29,18 @@ SB_BARRIER_INSN"nop\n", \ ARM64_HAS_SB)) +#ifdef CONFIG_ARM64_PSEUDO_NMI +#define pmr_sync() \ + do { \ + extern struct static_key_false gic_pmr_sync; \ + \ + if (static_branch_unlikely(&gic_pmr_sync)) \ + dsb(sy); \ + } while(0) +#else +#define pmr_sync() do {} while (0) +#endif + #define mb() dsb(sy) #define rmb() dsb(ld) #define wmb() dsb(st) diff --git a/arch/arm64/include/asm/daifflags.h b/arch/arm64/include/asm/daifflags.h index 48bfbf70dbb07..f96d1adc52db9 100644 --- a/arch/arm64/include/asm/daifflags.h +++ b/arch/arm64/include/asm/daifflags.h @@ -8,6 +8,7 @@ #include #include +#include #include #define DAIF_PROCCTX 0 @@ -74,7 +75,7 @@ static inline void local_daif_restore(unsigned long flags) if (system_uses_irq_prio_masking()) { gic_write_pmr(GIC_PRIO_IRQON); - dsb(sy); + pmr_sync(); } } else if (system_uses_irq_prio_masking()) { u64 pmr; diff --git a/arch/arm64/include/asm/irqflags.h b/arch/arm64/include/asm/irqflags.h index 1a59f0ed1ae39..aa4b6521ef144 100644 --- a/arch/arm64/include/asm/irqflags.h +++ b/arch/arm64/include/asm/irqflags.h @@ -6,6 +6,7 @@ #define __ASM_IRQFLAGS_H #include +#include #include #include @@ -34,14 +35,14 @@ static inline void arch_local_irq_enable(void) } asm volatile(ALTERNATIVE( - "msr daifclr, #2 // arch_local_irq_enable\n" - "nop", - __msr_s(SYS_ICC_PMR_EL1, "%0") - "dsb sy", + "msr daifclr, #2 // arch_local_irq_enable", + __msr_s(SYS_ICC_PMR_EL1, "%0"), ARM64_HAS_IRQ_PRIO_MASKING) : : "r" ((unsigned long) GIC_PRIO_IRQON) : "memory"); + + pmr_sync(); } static inline void arch_local_irq_disable(void) @@ -116,14 +117,14 @@ static inline unsigned long arch_local_irq_save(void) static inline void arch_local_irq_restore(unsigned long flags) { asm volatile(ALTERNATIVE( - "msr daif, %0\n" - "nop", - __msr_s(SYS_ICC_PMR_EL1, "%0") - "dsb sy", - ARM64_HAS_IRQ_PRIO_MASKING) + "msr daif, %0", + __msr_s(SYS_ICC_PMR_EL1, "%0"), + ARM64_HAS_IRQ_PRIO_MASKING) : : "r" (flags) : "memory"); + + pmr_sync(); } #endif /* __ASM_IRQFLAGS_H */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 697702a1a1ff1..cba22a5e289a9 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -605,8 +605,7 @@ static inline void kvm_arm_vhe_guest_enter(void) * local_daif_mask() already sets GIC_PRIO_PSR_I_SET, we just need a * dsb to ensure the redistributor is forwards EL2 IRQs to the CPU. */ - if (system_uses_irq_prio_masking()) - dsb(sy); + pmr_sync(); } static inline void kvm_arm_vhe_guest_exit(void) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index db137746c6fa3..d6c60274c730a 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -269,8 +269,10 @@ alternative_else_nop_endif alternative_if ARM64_HAS_IRQ_PRIO_MASKING ldr x20, [sp, #S_PMR_SAVE] msr_s SYS_ICC_PMR_EL1, x20 - /* Ensure priority change is seen by redistributor */ - dsb sy + mrs_s x21, SYS_ICC_CTLR_EL1 + tbz x21, #6, .L__skip_pmr_sync\@ // Check for ICC_CTLR_EL1.PMHE + dsb sy // Ensure priority change is seen by redistributor +.L__skip_pmr_sync\@: alternative_else_nop_endif ldp x21, x22, [sp, #S_PC] // load ELR, SPSR diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index 14607fac7ca38..211dd63f5901a 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -12,7 +12,7 @@ #include -#include +#include #include #include #include @@ -669,7 +669,7 @@ int __hyp_text __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu) */ if (system_uses_irq_prio_masking()) { gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); - dsb(sy); + pmr_sync(); } vcpu = kern_hyp_va(vcpu); diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index 9d0b42cb9903e..17b3173d904e0 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -87,6 +87,15 @@ static DEFINE_STATIC_KEY_TRUE(supports_deactivate_key); */ static DEFINE_STATIC_KEY_FALSE(supports_pseudo_nmis); +/* + * Global static key controlling whether an update to PMR allowing more + * interrupts requires to be propagated to the redistributor (DSB SY). + * And this needs to be exported for modules to be able to enable + * interrupts... + */ +DEFINE_STATIC_KEY_FALSE(gic_pmr_sync); +EXPORT_SYMBOL(gic_pmr_sync); + /* ppi_nmi_refs[n] == number of cpus having ppi[n + 16] set as NMI */ static refcount_t *ppi_nmi_refs; @@ -1502,6 +1511,17 @@ static void gic_enable_nmi_support(void) for (i = 0; i < gic_data.ppi_nr; i++) refcount_set(&ppi_nmi_refs[i], 0); + /* + * Linux itself doesn't use 1:N distribution, so has no need to + * set PMHE. The only reason to have it set is if EL3 requires it + * (and we can't change it). + */ + if (gic_read_ctlr() & ICC_CTLR_EL1_PMHE_MASK) + static_branch_enable(&gic_pmr_sync); + + pr_info("%s ICC_PMR_EL1 synchronisation\n", + static_branch_unlikely(&gic_pmr_sync) ? "Forcing" : "Relaxing"); + static_branch_enable(&supports_pseudo_nmis); if (static_branch_likely(&supports_deactivate_key)) diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 5cc10cf7cb3e6..a0bde9e12efa3 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -487,6 +487,8 @@ #define ICC_CTLR_EL1_EOImode_MASK (1 << ICC_CTLR_EL1_EOImode_SHIFT) #define ICC_CTLR_EL1_CBPR_SHIFT 0 #define ICC_CTLR_EL1_CBPR_MASK (1 << ICC_CTLR_EL1_CBPR_SHIFT) +#define ICC_CTLR_EL1_PMHE_SHIFT 6 +#define ICC_CTLR_EL1_PMHE_MASK (1 << ICC_CTLR_EL1_PMHE_SHIFT) #define ICC_CTLR_EL1_PRI_BITS_SHIFT 8 #define ICC_CTLR_EL1_PRI_BITS_MASK (0x7 << ICC_CTLR_EL1_PRI_BITS_SHIFT) #define ICC_CTLR_EL1_ID_BITS_SHIFT 11 From f8f3faf8dfe8ca5da47d180b1dc091b6b41fea5b Mon Sep 17 00:00:00 2001 From: Hector Martin Date: Mon, 15 Mar 2021 11:56:28 +0000 Subject: [PATCH 0234/5344] arm64: Always keep DAIF.[IF] in sync commit f0098155d337cab638cf18e37a3e9257d653d481 upstream. Apple SoCs (A11 and newer) have some interrupt sources hardwired to the FIQ line. We implement support for this by simply treating IRQs and FIQs the same way in the interrupt vectors. To support these systems, the FIQ mask bit needs to be kept in sync with the IRQ mask bit, so both kinds of exceptions are masked together. No other platforms should be delivering FIQ exceptions right now, and we already unmask FIQ in normal process context, so this should not have an effect on other systems - if spurious FIQs were arriving, they would already panic the kernel. Signed-off-by: Hector Martin Signed-off-by: Mark Rutland Tested-by: Hector Martin Cc: James Morse Cc: Marc Zyngier Cc: Thomas Gleixner Cc: Will Deacon Acked-by: Will Deacon Link: https://lore.kernel.org/r/20210315115629.57191-6-mark.rutland@arm.com Signed-off-by: Catalin Marinas Signed-off-by: Feng Zhou --- arch/arm64/include/asm/arch_gicv3.h | 2 +- arch/arm64/include/asm/assembler.h | 8 ++++---- arch/arm64/include/asm/daifflags.h | 10 +++++----- arch/arm64/include/asm/irqflags.h | 16 +++++++--------- arch/arm64/kernel/entry.S | 18 ++++++++++-------- arch/arm64/kernel/process.c | 2 +- arch/arm64/kernel/smp.c | 1 + 7 files changed, 29 insertions(+), 28 deletions(-) diff --git a/arch/arm64/include/asm/arch_gicv3.h b/arch/arm64/include/asm/arch_gicv3.h index ee9bdaa40532d..a7b5f2230de1b 100644 --- a/arch/arm64/include/asm/arch_gicv3.h +++ b/arch/arm64/include/asm/arch_gicv3.h @@ -166,7 +166,7 @@ static inline void gic_pmr_mask_irqs(void) static inline void gic_arch_enable_irqs(void) { - asm volatile ("msr daifclr, #2" : : : "memory"); + asm volatile ("msr daifclr, #3" : : : "memory"); } #endif /* __ASSEMBLY__ */ diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 4a4258f17c868..8cb4a2fd19137 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -46,9 +46,9 @@ msr daif, \tmp .endm - /* IRQ is the lowest priority flag, unconditionally unmask the rest. */ - .macro enable_da_f - msr daifclr, #(8 | 4 | 1) + /* IRQ/FIQ are the lowest priority flags, unconditionally unmask the rest. */ + .macro enable_da + msr daifclr, #(8 | 4) .endm /* @@ -56,7 +56,7 @@ */ .macro save_and_disable_irq, flags mrs \flags, daif - msr daifset, #2 + msr daifset, #3 .endm .macro restore_irq, flags diff --git a/arch/arm64/include/asm/daifflags.h b/arch/arm64/include/asm/daifflags.h index f96d1adc52db9..009abc5b9e231 100644 --- a/arch/arm64/include/asm/daifflags.h +++ b/arch/arm64/include/asm/daifflags.h @@ -12,8 +12,8 @@ #include #define DAIF_PROCCTX 0 -#define DAIF_PROCCTX_NOIRQ PSR_I_BIT -#define DAIF_ERRCTX (PSR_I_BIT | PSR_A_BIT) +#define DAIF_PROCCTX_NOIRQ (PSR_I_BIT | PSR_F_BIT) +#define DAIF_ERRCTX (PSR_A_BIT | PSR_I_BIT | PSR_F_BIT) #define DAIF_MASK (PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT) @@ -46,7 +46,7 @@ static inline unsigned long local_daif_save_flags(void) if (system_uses_irq_prio_masking()) { /* If IRQs are masked with PMR, reflect it in the flags */ if (read_sysreg_s(SYS_ICC_PMR_EL1) != GIC_PRIO_IRQON) - flags |= PSR_I_BIT; + flags |= PSR_I_BIT | PSR_F_BIT; } return flags; @@ -68,7 +68,7 @@ static inline void local_daif_restore(unsigned long flags) bool irq_disabled = flags & PSR_I_BIT; WARN_ON(system_has_prio_mask_debugging() && - !(read_sysreg(daif) & PSR_I_BIT)); + (read_sysreg(daif) & (PSR_I_BIT | PSR_F_BIT)) != (PSR_I_BIT | PSR_F_BIT)); if (!irq_disabled) { trace_hardirqs_on(); @@ -85,7 +85,7 @@ static inline void local_daif_restore(unsigned long flags) * If interrupts are disabled but we can take * asynchronous errors, we can take NMIs */ - flags &= ~PSR_I_BIT; + flags &= ~(PSR_I_BIT | PSR_F_BIT); pmr = GIC_PRIO_IRQOFF; } else { pmr = GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET; diff --git a/arch/arm64/include/asm/irqflags.h b/arch/arm64/include/asm/irqflags.h index aa4b6521ef144..d1930bbd95e49 100644 --- a/arch/arm64/include/asm/irqflags.h +++ b/arch/arm64/include/asm/irqflags.h @@ -12,15 +12,13 @@ /* * Aarch64 has flags for masking: Debug, Asynchronous (serror), Interrupts and - * FIQ exceptions, in the 'daif' register. We mask and unmask them in 'dai' + * FIQ exceptions, in the 'daif' register. We mask and unmask them in 'daif' * order: * Masking debug exceptions causes all other exceptions to be masked too/ - * Masking SError masks irq, but not debug exceptions. Masking irqs has no - * side effects for other flags. Keeping to this order makes it easier for - * entry.S to know which exceptions should be unmasked. - * - * FIQ is never expected, but we mask it when we disable debug exceptions, and - * unmask it at all other times. + * Masking SError masks IRQ/FIQ, but not debug exceptions. IRQ and FIQ are + * always masked and unmasked together, and have no side effects for other + * flags. Keeping to this order makes it easier for entry.S to know which + * exceptions should be unmasked. */ /* @@ -35,7 +33,7 @@ static inline void arch_local_irq_enable(void) } asm volatile(ALTERNATIVE( - "msr daifclr, #2 // arch_local_irq_enable", + "msr daifclr, #3 // arch_local_irq_enable", __msr_s(SYS_ICC_PMR_EL1, "%0"), ARM64_HAS_IRQ_PRIO_MASKING) : @@ -54,7 +52,7 @@ static inline void arch_local_irq_disable(void) } asm volatile(ALTERNATIVE( - "msr daifset, #2 // arch_local_irq_disable", + "msr daifset, #3 // arch_local_irq_disable", __msr_s(SYS_ICC_PMR_EL1, "%0"), ARM64_HAS_IRQ_PRIO_MASKING) : diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index d6c60274c730a..4fcd65381f604 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -656,7 +656,7 @@ ENDPROC(el1_sync) el1_irq: kernel_entry 1 gic_prio_irq_setup pmr=x20, tmp=x1 - enable_da_f + enable_da #ifdef CONFIG_ARM64_PSEUDO_NMI test_irqs_unmasked res=x0, pmr=x20 @@ -675,8 +675,10 @@ el1_irq: ldr x24, [tsk, #TSK_TI_PREEMPT] // get preempt count alternative_if ARM64_HAS_IRQ_PRIO_MASKING /* - * DA_F were cleared at start of handling. If anything is set in DAIF, - * we come back from an NMI, so skip preemption + * DA were cleared at start of handling, and IF are cleared by + * the GIC irqchip driver using gic_arch_enable_irqs() for + * normal IRQs. If anything is set, it means we come back from + * an NMI instead of a normal IRQ, so skip preemption */ mrs x0, daif orr x24, x24, x0 @@ -822,7 +824,7 @@ el0_ia: mrs x26, far_el1 gic_prio_kentry_setup tmp=x0 ct_user_exit_irqoff - enable_da_f + enable_da #ifdef CONFIG_TRACE_IRQFLAGS bl trace_hardirqs_off #endif @@ -872,7 +874,7 @@ el0_sp_pc: */ gic_prio_kentry_setup tmp=x0 ct_user_exit_irqoff - enable_da_f + enable_da #ifdef CONFIG_TRACE_IRQFLAGS bl trace_hardirqs_off #endif @@ -912,7 +914,7 @@ el0_dbg: mov x1, x25 mov x2, sp bl do_debug_exception - enable_da_f + enable_da b ret_to_user el0_inv: ct_user_exit_irqoff @@ -930,7 +932,7 @@ el0_irq: el0_irq_naked: gic_prio_irq_setup pmr=x20, tmp=x0 ct_user_exit_irqoff - enable_da_f + enable_da #ifdef CONFIG_TRACE_IRQFLAGS bl trace_hardirqs_off @@ -969,7 +971,7 @@ el0_error_naked: mov x0, sp mov x1, x25 bl do_serror - enable_da_f + enable_da b ret_to_user ENDPROC(el0_error) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index c2fe6d549d056..c23510c69ec04 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -80,7 +80,7 @@ static void noinstr __cpu_do_idle_irqprio(void) unsigned long daif_bits; daif_bits = read_sysreg(daif); - write_sysreg(daif_bits | PSR_I_BIT, daif); + write_sysreg(daif_bits | PSR_I_BIT | PSR_F_BIT, daif); /* * Unmask PMR before going idle to make sure interrupts can diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 987220ac4cfef..f9a402436795c 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -182,6 +182,7 @@ static void init_gic_priority_masking(void) cpuflags = read_sysreg(daif); WARN_ON(!(cpuflags & PSR_I_BIT)); + WARN_ON(!(cpuflags & PSR_F_BIT)); gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); } From c70c14ab89b97392fad81ee6910cfee0772f8f7e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 15 Jun 2021 12:12:25 +0100 Subject: [PATCH 0235/5344] arm64: Convert cpu_do_idle() to using cpuidle context helpers commit d4dc10277255afc303de4f00cbee0b9ce74d870f upstream. Now that we have helpers that are aware of the pseudo-NMI feature, introduce them to cpu_do_idle(). This allows for some nice cleanup. No functional change intended. Tested-by: Valentin Schneider Reviewed-by: Lorenzo Pieralisi Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210615111227.2454465-3-maz@kernel.org Signed-off-by: Will Deacon Signed-off-by: Feng Zhou --- arch/arm64/kernel/process.c | 41 ++++++++----------------------------- 1 file changed, 9 insertions(+), 32 deletions(-) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index c23510c69ec04..09974216384fd 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -43,9 +43,9 @@ #include #include -#include #include #include +#include #include #include #include @@ -68,33 +68,6 @@ EXPORT_SYMBOL_GPL(pm_power_off); void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd); -static void noinstr __cpu_do_idle(void) -{ - dsb(sy); - wfi(); -} - -static void noinstr __cpu_do_idle_irqprio(void) -{ - unsigned long pmr; - unsigned long daif_bits; - - daif_bits = read_sysreg(daif); - write_sysreg(daif_bits | PSR_I_BIT | PSR_F_BIT, daif); - - /* - * Unmask PMR before going idle to make sure interrupts can - * be raised. - */ - pmr = gic_read_pmr(); - gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); - - __cpu_do_idle(); - - gic_write_pmr(pmr); - write_sysreg(daif_bits, daif); -} - /* * cpu_do_idle() * @@ -106,10 +79,14 @@ static void noinstr __cpu_do_idle_irqprio(void) */ void noinstr cpu_do_idle(void) { - if (system_uses_irq_prio_masking()) - __cpu_do_idle_irqprio(); - else - __cpu_do_idle(); + struct arm_cpuidle_irq_context context; + + arm_cpuidle_save_irq_context(&context); + + dsb(sy); + wfi(); + + arm_cpuidle_restore_irq_context(&context); } /* From eac5e43320b250561486157ba33854b67ab9b344 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 15 Jun 2021 12:12:26 +0100 Subject: [PATCH 0236/5344] PSCI: Use cpuidle context helpers in psci_cpu_suspend_enter() commit c9223b616298c3d0e6ff5dd20d14d65c2131c535 upstream. The PSCI CPU suspend code isn't aware of the PMR vs DAIF game, resulting in a system that locks up if entering CPU suspend with GICv3 pNMI enabled. To save the day, teach the suspend code about our new cpuidle context helpers, which will do everything that's required just like the usual WFI cpuidle code. This fixes my Altra system, which would otherwise lock-up at boot time when booted with irqchip.gicv3_pseudo_nmi=1. Tested-by: Valentin Schneider Reviewed-by: Lorenzo Pieralisi Signed-off-by: Marc Zyngier Reviewed-by: Sudeep Holla Link: https://lore.kernel.org/r/20210615111227.2454465-4-maz@kernel.org Signed-off-by: Will Deacon Signed-off-by: Feng Zhou --- drivers/firmware/psci/psci.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c index eb797081d1596..bbea136fa40be 100644 --- a/drivers/firmware/psci/psci.c +++ b/drivers/firmware/psci/psci.c @@ -303,10 +303,15 @@ int psci_cpu_suspend_enter(u32 state) { int ret; - if (!psci_power_state_loses_context(state)) + if (!psci_power_state_loses_context(state)) { + struct arm_cpuidle_irq_context context; + + arm_cpuidle_save_irq_context(&context); ret = psci_ops.cpu_suspend(state, 0); - else + arm_cpuidle_restore_irq_context(&context); + } else { ret = cpu_suspend(state, psci_suspend_finisher); + } return ret; } From dc18ff9451c2480531478482839e78978af2b07a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 15 Jun 2021 12:12:27 +0100 Subject: [PATCH 0237/5344] arm64: suspend: Use cpuidle context helpers in cpu_suspend() commit 77345ef70445a8f16e0685dade0d68bdf41f19d7 upstream. Use cpuidle context helpers to switch to using DAIF.IF instead of PMR to mask interrupts, ensuring that we suspend with interrupts being able to reach the CPU interface. Signed-off-by: Marc Zyngier Reviewed-by: Sudeep Holla Link: https://lore.kernel.org/r/20210615111227.2454465-5-maz@kernel.org Signed-off-by: Will Deacon Signed-off-by: Feng Zhou --- arch/arm64/kernel/suspend.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index 9405d1b7f4b03..9ef999949c66a 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -88,6 +89,7 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) int ret = 0; unsigned long flags; struct sleep_stack_data state; + struct arm_cpuidle_irq_context context; /* * From this point debug exceptions are disabled to prevent @@ -97,12 +99,18 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) flags = local_daif_save(); /* - * Function graph tracer state gets incosistent when the kernel + * Function graph tracer state gets inconsistent when the kernel * calls functions that never return (aka suspend finishers) hence * disable graph tracing during their execution. */ pause_graph_tracing(); + /* + * Switch to using DAIF.IF instead of PMR in order to reliably + * resume if we're using pseudo-NMIs. + */ + arm_cpuidle_save_irq_context(&context); + if (__cpu_suspend_enter(&state)) { /* Call the suspend finisher */ ret = fn(arg); @@ -120,6 +128,8 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) __cpu_suspend_exit(); } + arm_cpuidle_restore_irq_context(&context); + unpause_graph_tracing(); /* From 45096989acab6525ec14f52bd529a556de522587 Mon Sep 17 00:00:00 2001 From: Feng Zhou Date: Fri, 15 Apr 2022 11:28:55 +0800 Subject: [PATCH 0238/5344] bytedance: enable CONFIG_ARM64_PSEUDO_NMI for arm64 enable CONFIG_ARM64_PSEUDO_NMI for arm64. Signed-off-by: Feng Zhou --- config.aarch64 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.aarch64 b/config.aarch64 index 95d54dd58f8cf..497d4b9f794c3 100644 --- a/config.aarch64 +++ b/config.aarch64 @@ -420,7 +420,7 @@ CONFIG_ARM64_PTR_AUTH=y CONFIG_ARM64_SVE=y CONFIG_ARM64_MODULE_PLTS=y -# CONFIG_ARM64_PSEUDO_NMI is not set +CONFIG_ARM64_PSEUDO_NMI=y CONFIG_RELOCATABLE=y CONFIG_RANDOMIZE_BASE=y CONFIG_RANDOMIZE_MODULE_REGION_FULL=y From d96a235fe682e45ccd40aa1fe44621461136f5d8 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 9 Mar 2021 17:44:12 -0700 Subject: [PATCH 0239/5344] arm64: perf: Fix 64-bit event counter read truncation commit 7bb8bc6eb550116c504fb25af8678b9d7ca2abc5 upstream. Commit 0fdf1bb75953 ("arm64: perf: Avoid PMXEV* indirection") changed armv8pmu_read_evcntr() to return a u32 instead of u64. The result is silent truncation of the event counter when using 64-bit counters. Given the offending commit appears to have passed thru several folks, it seems likely this was a bad rebase after v8.5 PMU 64-bit counters landed. Cc: Alexandru Elisei Cc: Julien Thierry Cc: Mark Rutland Cc: Will Deacon Cc: Catalin Marinas Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Fixes: 0fdf1bb75953 ("arm64: perf: Avoid PMXEV* indirection") Signed-off-by: Rob Herring Acked-by: Mark Rutland Reviewed-by: Alexandru Elisei Link: https://lore.kernel.org/r/20210310004412.1450128-1-robh@kernel.org Signed-off-by: Will Deacon Signed-off-by: Feng Zhou --- arch/arm64/kernel/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index a084bb95363f7..38a09cd1992ae 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -480,7 +480,7 @@ static inline int armv8pmu_counter_has_overflowed(u32 pmnc, int idx) return pmnc & BIT(ARMV8_IDX_TO_COUNTER(idx)); } -static inline u32 armv8pmu_read_evcntr(int idx) +static inline u64 armv8pmu_read_evcntr(int idx) { u32 counter = ARMV8_IDX_TO_COUNTER(idx); From e65410012f771d47648cc69b2965542d96baba02 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 29 Apr 2022 14:36:59 -0700 Subject: [PATCH 0240/5344] memcg: introduce per-memcg reclaim interface commit 94968384dde15d48263bfc59d280cd71b1259d8c upstream. This patch series adds a memory.reclaim proactive reclaim interface. The rationale behind the interface and how it works are in the first patch. This patch (of 4): Introduce a memcg interface to trigger memory reclaim on a memory cgroup. Use case: Proactive Reclaim --------------------------- A userspace proactive reclaimer can continuously probe the memcg to reclaim a small amount of memory. This gives more accurate and up-to-date workingset estimation as the LRUs are continuously sorted and can potentially provide more deterministic memory overcommit behavior. The memory overcommit controller can provide more proactive response to the changing behavior of the running applications instead of being reactive. A userspace reclaimer's purpose in this case is not a complete replacement for kswapd or direct reclaim, it is to proactively identify memory savings opportunities and reclaim some amount of cold pages set by the policy to free up the memory for more demanding jobs or scheduling new jobs. A user space proactive reclaimer is used in Google data centers. Additionally, Meta's TMO paper recently referenced a very similar interface used for user space proactive reclaim: https://dl.acm.org/doi/pdf/10.1145/3503222.3507731 Benefits of a user space reclaimer: ----------------------------------- 1) More flexible on who should be charged for the cpu of the memory reclaim. For proactive reclaim, it makes more sense to be centralized. 2) More flexible on dedicating the resources (like cpu). The memory overcommit controller can balance the cost between the cpu usage and the memory reclaimed. 3) Provides a way to the applications to keep their LRUs sorted, so, under memory pressure better reclaim candidates are selected. This also gives more accurate and uptodate notion of working set for an application. Why memory.high is not enough? ------------------------------ - memory.high can be used to trigger reclaim in a memcg and can potentially be used for proactive reclaim. However there is a big downside in using memory.high. It can potentially introduce high reclaim stalls in the target application as the allocations from the processes or the threads of the application can hit the temporary memory.high limit. - Userspace proactive reclaimers usually use feedback loops to decide how much memory to proactively reclaim from a workload. The metrics used for this are usually either refaults or PSI, and these metrics will become messy if the application gets throttled by hitting the high limit. - memory.high is a stateful interface, if the userspace proactive reclaimer crashes for any reason while triggering reclaim it can leave the application in a bad state. - If a workload is rapidly expanding, setting memory.high to proactively reclaim memory can result in actually reclaiming more memory than intended. The benefits of such interface and shortcomings of existing interface were further discussed in this RFC thread: https://lore.kernel.org/linux-mm/5df21376-7dd1-bf81-8414-32a73cea45dd@google.com/ Interface: ---------- Introducing a very simple memcg interface 'echo 10M > memory.reclaim' to trigger reclaim in the target memory cgroup. The interface is introduced as a nested-keyed file to allow for future optional arguments to be easily added to configure the behavior of reclaim. Possible Extensions: -------------------- - This interface can be extended with an additional parameter or flags to allow specifying one or more types of memory to reclaim from (e.g. file, anon, ..). - The interface can also be extended with a node mask to reclaim from specific nodes. This has use cases for reclaim-based demotion in memory tiering systens. - A similar per-node interface can also be added to support proactive reclaim and reclaim-based demotion in systems without memcg. - Add a timeout parameter to make it easier for user space to call the interface without worrying about being blocked for an undefined amount of time. For now, let's keep things simple by adding the basic functionality. [yosryahmed@google.com: worked on versions v2 onwards, refreshed to current master, updated commit message based on recent discussions and use cases] Link: https://lkml.kernel.org/r/20220425190040.2475377-1-yosryahmed@google.com Link: https://lkml.kernel.org/r/20220425190040.2475377-2-yosryahmed@google.com Signed-off-by: Shakeel Butt Co-developed-by: Yosry Ahmed Signed-off-by: Yosry Ahmed Acked-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: Wei Xu Acked-by: Roman Gushchin Acked-by: David Rientjes Cc: Tejun Heo Cc: Zefan Li Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yu Zhao Cc: Dave Hansen Cc: Greg Thelen Cc: Chen Wandun Cc: Vaibhav Jain Cc: "Michal Koutn" Cc: Tim Chen Signed-off-by: Andrew Morton Signed-off-by: Qi Zheng --- Documentation/admin-guide/cgroup-v2.rst | 21 ++++++++++++ mm/memcontrol.c | 45 +++++++++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 65a4a213bd48c..3cc4222f631ae 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1174,6 +1174,27 @@ PAGE_SIZE multiple when read back. It will broke low boundary. Because it tries to reclaim the memory many times, until the memory drops to a certain level. + memory.reclaim + A write-only nested-keyed file which exists for all cgroups. + + This is a simple interface to trigger memory reclaim in the + target cgroup. + + This file accepts a single key, the number of bytes to reclaim. + No nested keys are currently supported. + + Example:: + + echo "1G" > memory.reclaim + + The interface can be later extended with nested keys to + configure the reclaim behavior. For example, specify the + type of memory to reclaim from (anon, file, ..). + + Please note that the kernel can over or under reclaim from + the target cgroup. If less bytes are reclaimed than the + specified amount, -EAGAIN is returned. + memory.oom.group A read-write single value file which exists on non-root cgroups. The default value is "0". diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4231d9ecf179d..f036e946077d6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6853,6 +6853,46 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of, return nbytes; } +static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned int nr_retries = MAX_RECLAIM_RETRIES; + unsigned long nr_to_reclaim, nr_reclaimed = 0; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "", &nr_to_reclaim); + if (err) + return err; + + while (nr_reclaimed < nr_to_reclaim) { + unsigned long reclaimed; + + if (signal_pending(current)) + return -EINTR; + + /* + * This is the final attempt, drain percpu lru caches in the + * hope of introducing more evictable pages for + * try_to_free_mem_cgroup_pages(). + */ + if (!nr_retries) + lru_add_drain_all(); + + reclaimed = try_to_free_mem_cgroup_pages(memcg, + nr_to_reclaim - nr_reclaimed, + GFP_KERNEL, true); + + if (!reclaimed && !nr_retries--) + return -EAGAIN; + + nr_reclaimed += reclaimed; + } + + return nbytes; +} + static struct cftype memory_files[] = { { .name = "current", @@ -6932,6 +6972,11 @@ static struct cftype memory_files[] = { .seq_show = memory_wmark_read, }, #endif + { + .name = "reclaim", + .flags = CFTYPE_NS_DELEGATABLE, + .write = memory_reclaim, + }, { } /* terminate */ }; From 5c9978ab56a5840e9ed5165b8d0a5c2b92c5bb8e Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 29 Apr 2022 14:36:59 -0700 Subject: [PATCH 0241/5344] selftests: cgroup: return -errno from cg_read()/cg_write() on failure commit 6c26df84e1f2f9181c0741865105a53537da842c upstream. Currently, cg_read()/cg_write() returns 0 on success and -1 on failure. Modify them to return the -errno on failure. Link: https://lkml.kernel.org/r/20220425190040.2475377-3-yosryahmed@google.com Signed-off-by: Yosry Ahmed Acked-by: Shakeel Butt Acked-by: David Rientjes Acked-by: Roman Gushchin Cc: Chen Wandun Cc: Dave Hansen Cc: Greg Thelen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michal Hocko Cc: "Michal Koutn" Cc: Shuah Khan Cc: Tejun Heo Cc: Tim Chen Cc: Vaibhav Jain Cc: Wei Xu Cc: Yu Zhao Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Qi Zheng --- tools/testing/selftests/cgroup/cgroup_util.c | 44 +++++++++----------- 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c index 5e939ff1e3f95..0240dba89d76d 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.c +++ b/tools/testing/selftests/cgroup/cgroup_util.c @@ -16,6 +16,7 @@ #include "cgroup_util.h" +/* Returns read len on success, or -errno on failure. */ static ssize_t read_text(const char *path, char *buf, size_t max_len) { ssize_t len; @@ -23,35 +24,29 @@ static ssize_t read_text(const char *path, char *buf, size_t max_len) fd = open(path, O_RDONLY); if (fd < 0) - return fd; + return -errno; len = read(fd, buf, max_len - 1); - if (len < 0) - goto out; - buf[len] = 0; -out: + if (len >= 0) + buf[len] = 0; + close(fd); - return len; + return len < 0 ? -errno : len; } +/* Returns written len on success, or -errno on failure. */ static ssize_t write_text(const char *path, char *buf, ssize_t len) { int fd; fd = open(path, O_WRONLY | O_APPEND); if (fd < 0) - return fd; + return -errno; len = write(fd, buf, len); - if (len < 0) { - close(fd); - return len; - } - close(fd); - - return len; + return len < 0 ? -errno : len; } char *cg_name(const char *root, const char *name) @@ -84,16 +79,16 @@ char *cg_control(const char *cgroup, const char *control) return ret; } +/* Returns 0 on success, or -errno on failure. */ int cg_read(const char *cgroup, const char *control, char *buf, size_t len) { char path[PATH_MAX]; + ssize_t ret; snprintf(path, sizeof(path), "%s/%s", cgroup, control); - if (read_text(path, buf, len) >= 0) - return 0; - - return -1; + ret = read_text(path, buf, len); + return ret >= 0 ? 0 : ret; } int cg_read_strcmp(const char *cgroup, const char *control, @@ -158,17 +153,15 @@ long cg_read_key_long(const char *cgroup, const char *control, const char *key) return atol(ptr + strlen(key)); } +/* Returns 0 on success, or -errno on failure. */ int cg_write(const char *cgroup, const char *control, char *buf) { char path[PATH_MAX]; - ssize_t len = strlen(buf); + ssize_t len = strlen(buf), ret; snprintf(path, sizeof(path), "%s/%s", cgroup, control); - - if (write_text(path, buf, len) == len) - return 0; - - return -1; + ret = write_text(path, buf, len); + return ret == len ? 0 : ret; } int cg_find_unified_root(char *root, size_t len) @@ -416,5 +409,6 @@ char proc_read_text(int pid, const char *item, char *buf, size_t size) snprintf(path, sizeof(path), "/proc/%d/%s", pid, item); - return read_text(path, buf, size); + size = read_text(path, buf, size); + return size < 0 ? -1 : size; } From 2275777bdb5cdc6f0e30c020f43220f6a7a24507 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 29 Apr 2022 14:36:59 -0700 Subject: [PATCH 0242/5344] selftests: cgroup: fix alloc_anon_noexit() instantly freeing memory commit a3622a53e620700053b648478dbc638ad373be3b upstream. Currently, alloc_anon_noexit() calls alloc_anon() which instantly frees the allocated memory. alloc_anon_noexit() is usually used with cg_run_nowait() to run a process in the background that allocates memory. It makes sense for the background process to keep the memory allocated and not instantly free it (otherwise there is no point of running it in the background). Link: https://lkml.kernel.org/r/20220425190040.2475377-4-yosryahmed@google.com Signed-off-by: Yosry Ahmed Acked-by: Roman Gushchin Acked-by: Shakeel Butt Acked-by: David Rientjes Cc: Chen Wandun Cc: Dave Hansen Cc: Greg Thelen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michal Hocko Cc: "Michal Koutn" Cc: Shuah Khan Cc: Tejun Heo Cc: Tim Chen Cc: Vaibhav Jain Cc: Wei Xu Cc: Yu Zhao Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Qi Zheng --- tools/testing/selftests/cgroup/test_memcontrol.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index c19a97dd02d49..9f5cdcf0593f7 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -210,13 +210,17 @@ static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg) static int alloc_anon_noexit(const char *cgroup, void *arg) { int ppid = getppid(); + size_t size = (unsigned long)arg; + char *buf, *ptr; - if (alloc_anon(cgroup, arg)) - return -1; + buf = malloc(size); + for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE) + *ptr = 0; while (getppid() == ppid) sleep(1); + free(buf); return 0; } From 4870668cc4353942bd43819c1e9888863ce32d95 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 29 Apr 2022 14:37:00 -0700 Subject: [PATCH 0243/5344] selftests: cgroup: add a selftest for memory.reclaim commit eae3cb2e87ff84547e66211b81301a8f9122840f upstream. Add a new test for memory.reclaim that verifies that the interface correctly reclaims memory as intended, from both anon and file pages. Link: https://lkml.kernel.org/r/20220425190040.2475377-5-yosryahmed@google.com Signed-off-by: Yosry Ahmed Acked-by: Roman Gushchin Acked-by: David Rientjes Cc: Chen Wandun Cc: Dave Hansen Cc: Greg Thelen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michal Hocko Cc: "Michal Koutn" Cc: Shakeel Butt Cc: Shuah Khan Cc: Tejun Heo Cc: Tim Chen Cc: Vaibhav Jain Cc: Wei Xu Cc: Yu Zhao Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Qi Zheng --- .../selftests/cgroup/test_memcontrol.c | 106 ++++++++++++++++++ 1 file changed, 106 insertions(+) diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index 9f5cdcf0593f7..94e16e383bcf8 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -683,6 +683,111 @@ static int test_memcg_max(const char *root) return ret; } +/* + * This test checks that memory.reclaim reclaims the given + * amount of memory (from both anon and file, if possible). + */ +static int test_memcg_reclaim(const char *root) +{ + int ret = KSFT_FAIL, fd, retries; + char *memcg; + long current, expected_usage, to_reclaim; + char buf[64]; + + memcg = cg_name(root, "memcg_test"); + if (!memcg) + goto cleanup; + + if (cg_create(memcg)) + goto cleanup; + + current = cg_read_long(memcg, "memory.current"); + if (current != 0) + goto cleanup; + + fd = get_temp_fd(); + if (fd < 0) + goto cleanup; + + cg_run_nowait(memcg, alloc_pagecache_50M_noexit, (void *)(long)fd); + + /* + * If swap is enabled, try to reclaim from both anon and file, else try + * to reclaim from file only. + */ + if (is_swap_enabled()) { + cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(50)); + expected_usage = MB(100); + } else + expected_usage = MB(50); + + /* + * Wait until current usage reaches the expected usage (or we run out of + * retries). + */ + retries = 5; + while (!values_close(cg_read_long(memcg, "memory.current"), + expected_usage, 10)) { + if (retries--) { + sleep(1); + continue; + } else { + fprintf(stderr, + "failed to allocate %ld for memcg reclaim test\n", + expected_usage); + goto cleanup; + } + } + + /* + * Reclaim until current reaches 30M, this makes sure we hit both anon + * and file if swap is enabled. + */ + retries = 5; + while (true) { + int err; + + current = cg_read_long(memcg, "memory.current"); + to_reclaim = current - MB(30); + + /* + * We only keep looping if we get EAGAIN, which means we could + * not reclaim the full amount. + */ + if (to_reclaim <= 0) + goto cleanup; + + + snprintf(buf, sizeof(buf), "%ld", to_reclaim); + err = cg_write(memcg, "memory.reclaim", buf); + if (!err) { + /* + * If writing succeeds, then the written amount should have been + * fully reclaimed (and maybe more). + */ + current = cg_read_long(memcg, "memory.current"); + if (!values_close(current, MB(30), 3) && current > MB(30)) + goto cleanup; + break; + } + + /* The kernel could not reclaim the full amount, try again. */ + if (err == -EAGAIN && retries--) + continue; + + /* We got an unexpected error or ran out of retries. */ + goto cleanup; + } + + ret = KSFT_PASS; +cleanup: + cg_destroy(memcg); + free(memcg); + close(fd); + + return ret; +} + static int alloc_anon_50M_check_swap(const char *cgroup, void *arg) { long mem_max = (long)arg; @@ -1185,6 +1290,7 @@ struct memcg_test { T(test_memcg_low), T(test_memcg_high), T(test_memcg_max), + T(test_memcg_reclaim), T(test_memcg_oom_events), T(test_memcg_swap_max), T(test_memcg_sock), From 88212dd354201fda4ce14e91993016699e8de1cd Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 17 Jun 2022 07:56:22 +0000 Subject: [PATCH 0244/5344] Revert "selftests/bpf: Test BPF_CGROUP_INET_SOCK_RELEASE" This reverts commit 4e81c6803ca84ea9c3af8c4f72b253cdd28b2064. Signed-off-by: Qi Zheng --- .../selftests/bpf/prog_tests/udp_limit.c | 75 ------------------- tools/testing/selftests/bpf/progs/udp_limit.c | 42 ----------- 2 files changed, 117 deletions(-) delete mode 100644 tools/testing/selftests/bpf/prog_tests/udp_limit.c delete mode 100644 tools/testing/selftests/bpf/progs/udp_limit.c diff --git a/tools/testing/selftests/bpf/prog_tests/udp_limit.c b/tools/testing/selftests/bpf/prog_tests/udp_limit.c deleted file mode 100644 index 2aba09d4d01bf..0000000000000 --- a/tools/testing/selftests/bpf/prog_tests/udp_limit.c +++ /dev/null @@ -1,75 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include "udp_limit.skel.h" - -#include -#include - -static int duration; - -void test_udp_limit(void) -{ - struct udp_limit *skel; - int fd1 = -1, fd2 = -1; - int cgroup_fd; - - cgroup_fd = test__join_cgroup("/udp_limit"); - if (CHECK(cgroup_fd < 0, "cg-join", "errno %d", errno)) - return; - - skel = udp_limit__open_and_load(); - if (CHECK(!skel, "skel-load", "errno %d", errno)) - goto close_cgroup_fd; - - skel->links.sock = bpf_program__attach_cgroup(skel->progs.sock, cgroup_fd); - skel->links.sock_release = bpf_program__attach_cgroup(skel->progs.sock_release, cgroup_fd); - if (CHECK(IS_ERR(skel->links.sock) || IS_ERR(skel->links.sock_release), - "cg-attach", "sock %ld sock_release %ld", - PTR_ERR(skel->links.sock), - PTR_ERR(skel->links.sock_release))) - goto close_skeleton; - - /* BPF program enforces a single UDP socket per cgroup, - * verify that. - */ - fd1 = socket(AF_INET, SOCK_DGRAM, 0); - if (CHECK(fd1 < 0, "fd1", "errno %d", errno)) - goto close_skeleton; - - fd2 = socket(AF_INET, SOCK_DGRAM, 0); - if (CHECK(fd2 >= 0, "fd2", "errno %d", errno)) - goto close_skeleton; - - /* We can reopen again after close. */ - close(fd1); - fd1 = -1; - - fd1 = socket(AF_INET, SOCK_DGRAM, 0); - if (CHECK(fd1 < 0, "fd1-again", "errno %d", errno)) - goto close_skeleton; - - /* Make sure the program was invoked the expected - * number of times: - * - open fd1 - BPF_CGROUP_INET_SOCK_CREATE - * - attempt to openfd2 - BPF_CGROUP_INET_SOCK_CREATE - * - close fd1 - BPF_CGROUP_INET_SOCK_RELEASE - * - open fd1 again - BPF_CGROUP_INET_SOCK_CREATE - */ - if (CHECK(skel->bss->invocations != 4, "bss-invocations", - "invocations=%d", skel->bss->invocations)) - goto close_skeleton; - - /* We should still have a single socket in use */ - if (CHECK(skel->bss->in_use != 1, "bss-in_use", - "in_use=%d", skel->bss->in_use)) - goto close_skeleton; - -close_skeleton: - if (fd1 >= 0) - close(fd1); - if (fd2 >= 0) - close(fd2); - udp_limit__destroy(skel); -close_cgroup_fd: - close(cgroup_fd); -} diff --git a/tools/testing/selftests/bpf/progs/udp_limit.c b/tools/testing/selftests/bpf/progs/udp_limit.c deleted file mode 100644 index 8429b22525a79..0000000000000 --- a/tools/testing/selftests/bpf/progs/udp_limit.c +++ /dev/null @@ -1,42 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only - -#include -#include -#include - -int invocations = 0, in_use = 0; - -SEC("cgroup/sock_create") -int sock(struct bpf_sock *ctx) -{ - __u32 key; - - if (ctx->type != SOCK_DGRAM) - return 1; - - __sync_fetch_and_add(&invocations, 1); - - if (in_use > 0) { - /* BPF_CGROUP_INET_SOCK_RELEASE is _not_ called - * when we return an error from the BPF - * program! - */ - return 0; - } - - __sync_fetch_and_add(&in_use, 1); - return 1; -} - -SEC("cgroup/sock_release") -int sock_release(struct bpf_sock *ctx) -{ - __u32 key; - - if (ctx->type != SOCK_DGRAM) - return 1; - - __sync_fetch_and_add(&invocations, 1); - __sync_fetch_and_add(&in_use, -1); - return 1; -} From 10bf5b28fc043e84a7d2af58ffcab12f72799ac8 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 17 Jun 2022 07:58:31 +0000 Subject: [PATCH 0245/5344] Revert "bpf, selftests: Add redirect_peer selftest" This reverts commit 39238139a321353156cf82741e6a7ecd3175a7b2. Signed-off-by: Qi Zheng --- .../selftests/bpf/progs/test_tc_peer.c | 45 ------------------- .../testing/selftests/bpf/test_tc_redirect.sh | 25 ++++------- 2 files changed, 9 insertions(+), 61 deletions(-) delete mode 100644 tools/testing/selftests/bpf/progs/test_tc_peer.c diff --git a/tools/testing/selftests/bpf/progs/test_tc_peer.c b/tools/testing/selftests/bpf/progs/test_tc_peer.c deleted file mode 100644 index fc84a7685aa2c..0000000000000 --- a/tools/testing/selftests/bpf/progs/test_tc_peer.c +++ /dev/null @@ -1,45 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include - -#include -#include -#include - -#include - -enum { - dev_src, - dev_dst, -}; - -struct bpf_map_def SEC("maps") ifindex_map = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(int), - .max_entries = 2, -}; - -static __always_inline int get_dev_ifindex(int which) -{ - int *ifindex = bpf_map_lookup_elem(&ifindex_map, &which); - - return ifindex ? *ifindex : 0; -} - -SEC("chk_egress") int tc_chk(struct __sk_buff *skb) -{ - return TC_ACT_SHOT; -} - -SEC("dst_ingress") int tc_dst(struct __sk_buff *skb) -{ - return bpf_redirect_peer(get_dev_ifindex(dev_src), 0); -} - -SEC("src_ingress") int tc_src(struct __sk_buff *skb) -{ - return bpf_redirect_peer(get_dev_ifindex(dev_dst), 0); -} - -char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_tc_redirect.sh b/tools/testing/selftests/bpf/test_tc_redirect.sh index 6d74825621401..6ad4414051323 100755 --- a/tools/testing/selftests/bpf/test_tc_redirect.sh +++ b/tools/testing/selftests/bpf/test_tc_redirect.sh @@ -4,10 +4,9 @@ # This test sets up 3 netns (src <-> fwd <-> dst). There is no direct veth link # between src and dst. The netns fwd has veth links to each src and dst. The # client is in src and server in dst. The test installs a TC BPF program to each -# host facing veth in fwd which calls into i) bpf_redirect_neigh() to perform the -# neigh addr population and redirect or ii) bpf_redirect_peer() for namespace -# switch from ingress side; it also installs a checker prog on the egress side -# to drop unexpected traffic. +# host facing veth in fwd which calls into bpf_redirect_peer() to perform the +# neigh addr population and redirect; it also installs a dropper prog on the +# egress side to drop skbs if neigh addrs were not populated. if [[ $EUID -ne 0 ]]; then echo "This script must be run as root" @@ -167,17 +166,15 @@ hex_mem_str() perl -e 'print join(" ", unpack("(H2)8", pack("L", @ARGV)))' $1 } -netns_setup_bpf() +netns_setup_neigh() { - local obj=$1 - ip netns exec ${NS_FWD} tc qdisc add dev veth_src_fwd clsact - ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd ingress bpf da obj $obj sec src_ingress - ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd egress bpf da obj $obj sec chk_egress + ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd ingress bpf da obj test_tc_neigh.o sec src_ingress + ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd egress bpf da obj test_tc_neigh.o sec chk_egress ip netns exec ${NS_FWD} tc qdisc add dev veth_dst_fwd clsact - ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd ingress bpf da obj $obj sec dst_ingress - ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd egress bpf da obj $obj sec chk_egress + ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd ingress bpf da obj test_tc_neigh.o sec dst_ingress + ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd egress bpf da obj test_tc_neigh.o sec chk_egress veth_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/ifindex) veth_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/ifindex) @@ -196,9 +193,5 @@ trap netns_cleanup EXIT set -e netns_setup -netns_setup_bpf test_tc_neigh.o -netns_test_connectivity -netns_cleanup -netns_setup -netns_setup_bpf test_tc_peer.o +netns_setup_neigh netns_test_connectivity From 2e196967a252894992bbed7412ab7f2451515eee Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 17 Jun 2022 08:01:57 +0000 Subject: [PATCH 0246/5344] Revert "bpf, selftests: Make redirect_neigh test more extensible" This reverts commit be68291edc4a950c3332adaa4f808c6d2264e147. Signed-off-by: Qi Zheng --- .../selftests/bpf/progs/test_tc_neigh.c | 40 ++-- tools/testing/selftests/bpf/test_tc_neigh.sh | 168 +++++++++++++++ .../testing/selftests/bpf/test_tc_redirect.sh | 197 ------------------ 3 files changed, 186 insertions(+), 219 deletions(-) create mode 100755 tools/testing/selftests/bpf/test_tc_neigh.sh delete mode 100755 tools/testing/selftests/bpf/test_tc_redirect.sh diff --git a/tools/testing/selftests/bpf/progs/test_tc_neigh.c b/tools/testing/selftests/bpf/progs/test_tc_neigh.c index fe182616b112a..889a72c3024fd 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_neigh.c +++ b/tools/testing/selftests/bpf/progs/test_tc_neigh.c @@ -13,10 +13,17 @@ #include #include +#ifndef barrier_data +# define barrier_data(ptr) asm volatile("": :"r"(ptr) :"memory") +#endif + #ifndef ctx_ptr # define ctx_ptr(field) (void *)(long)(field) #endif +#define dst_to_src_tmp 0xeeddddeeU +#define src_to_dst_tmp 0xeeffffeeU + #define ip4_src 0xac100164 /* 172.16.1.100 */ #define ip4_dst 0xac100264 /* 172.16.2.100 */ @@ -32,18 +39,6 @@ a.s6_addr32[3] == b.s6_addr32[3]) #endif -enum { - dev_src, - dev_dst, -}; - -struct bpf_map_def SEC("maps") ifindex_map = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(int), - .max_entries = 2, -}; - static __always_inline bool is_remote_ep_v4(struct __sk_buff *skb, __be32 addr) { @@ -78,14 +73,7 @@ static __always_inline bool is_remote_ep_v6(struct __sk_buff *skb, return v6_equal(ip6h->daddr, addr); } -static __always_inline int get_dev_ifindex(int which) -{ - int *ifindex = bpf_map_lookup_elem(&ifindex_map, &which); - - return ifindex ? *ifindex : 0; -} - -SEC("chk_egress") int tc_chk(struct __sk_buff *skb) +SEC("chk_neigh") int tc_chk(struct __sk_buff *skb) { void *data_end = ctx_ptr(skb->data_end); void *data = ctx_ptr(skb->data); @@ -99,6 +87,7 @@ SEC("chk_egress") int tc_chk(struct __sk_buff *skb) SEC("dst_ingress") int tc_dst(struct __sk_buff *skb) { + int idx = dst_to_src_tmp; __u8 zero[ETH_ALEN * 2]; bool redirect = false; @@ -114,15 +103,19 @@ SEC("dst_ingress") int tc_dst(struct __sk_buff *skb) if (!redirect) return TC_ACT_OK; + barrier_data(&idx); + idx = bpf_ntohl(idx); + __builtin_memset(&zero, 0, sizeof(zero)); if (bpf_skb_store_bytes(skb, 0, &zero, sizeof(zero), 0) < 0) return TC_ACT_SHOT; - return bpf_redirect_neigh(get_dev_ifindex(dev_src), 0); + return bpf_redirect_neigh(idx, 0); } SEC("src_ingress") int tc_src(struct __sk_buff *skb) { + int idx = src_to_dst_tmp; __u8 zero[ETH_ALEN * 2]; bool redirect = false; @@ -138,11 +131,14 @@ SEC("src_ingress") int tc_src(struct __sk_buff *skb) if (!redirect) return TC_ACT_OK; + barrier_data(&idx); + idx = bpf_ntohl(idx); + __builtin_memset(&zero, 0, sizeof(zero)); if (bpf_skb_store_bytes(skb, 0, &zero, sizeof(zero), 0) < 0) return TC_ACT_SHOT; - return bpf_redirect_neigh(get_dev_ifindex(dev_dst), 0); + return bpf_redirect_neigh(idx, 0); } char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_tc_neigh.sh b/tools/testing/selftests/bpf/test_tc_neigh.sh new file mode 100755 index 0000000000000..31d8c3df8b242 --- /dev/null +++ b/tools/testing/selftests/bpf/test_tc_neigh.sh @@ -0,0 +1,168 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# This test sets up 3 netns (src <-> fwd <-> dst). There is no direct veth link +# between src and dst. The netns fwd has veth links to each src and dst. The +# client is in src and server in dst. The test installs a TC BPF program to each +# host facing veth in fwd which calls into bpf_redirect_peer() to perform the +# neigh addr population and redirect; it also installs a dropper prog on the +# egress side to drop skbs if neigh addrs were not populated. + +if [[ $EUID -ne 0 ]]; then + echo "This script must be run as root" + echo "FAIL" + exit 1 +fi + +# check that nc, dd, ping, ping6 and timeout are present +command -v nc >/dev/null 2>&1 || \ + { echo >&2 "nc is not available"; exit 1; } +command -v dd >/dev/null 2>&1 || \ + { echo >&2 "dd is not available"; exit 1; } +command -v timeout >/dev/null 2>&1 || \ + { echo >&2 "timeout is not available"; exit 1; } +command -v ping >/dev/null 2>&1 || \ + { echo >&2 "ping is not available"; exit 1; } +command -v ping6 >/dev/null 2>&1 || \ + { echo >&2 "ping6 is not available"; exit 1; } + +readonly GREEN='\033[0;92m' +readonly RED='\033[0;31m' +readonly NC='\033[0m' # No Color + +readonly PING_ARG="-c 3 -w 10 -q" + +readonly TIMEOUT=10 + +readonly NS_SRC="ns-src-$(mktemp -u XXXXXX)" +readonly NS_FWD="ns-fwd-$(mktemp -u XXXXXX)" +readonly NS_DST="ns-dst-$(mktemp -u XXXXXX)" + +readonly IP4_SRC="172.16.1.100" +readonly IP4_DST="172.16.2.100" + +readonly IP6_SRC="::1:dead:beef:cafe" +readonly IP6_DST="::2:dead:beef:cafe" + +readonly IP4_SLL="169.254.0.1" +readonly IP4_DLL="169.254.0.2" +readonly IP4_NET="169.254.0.0" + +cleanup() +{ + ip netns del ${NS_SRC} + ip netns del ${NS_FWD} + ip netns del ${NS_DST} +} + +trap cleanup EXIT + +set -e + +ip netns add "${NS_SRC}" +ip netns add "${NS_FWD}" +ip netns add "${NS_DST}" + +ip link add veth_src type veth peer name veth_src_fwd +ip link add veth_dst type veth peer name veth_dst_fwd + +ip link set veth_src netns ${NS_SRC} +ip link set veth_src_fwd netns ${NS_FWD} + +ip link set veth_dst netns ${NS_DST} +ip link set veth_dst_fwd netns ${NS_FWD} + +ip -netns ${NS_SRC} addr add ${IP4_SRC}/32 dev veth_src +ip -netns ${NS_DST} addr add ${IP4_DST}/32 dev veth_dst + +# The fwd netns automatically get a v6 LL address / routes, but also needs v4 +# one in order to start ARP probing. IP4_NET route is added to the endpoints +# so that the ARP processing will reply. + +ip -netns ${NS_FWD} addr add ${IP4_SLL}/32 dev veth_src_fwd +ip -netns ${NS_FWD} addr add ${IP4_DLL}/32 dev veth_dst_fwd + +ip -netns ${NS_SRC} addr add ${IP6_SRC}/128 dev veth_src nodad +ip -netns ${NS_DST} addr add ${IP6_DST}/128 dev veth_dst nodad + +ip -netns ${NS_SRC} link set dev veth_src up +ip -netns ${NS_FWD} link set dev veth_src_fwd up + +ip -netns ${NS_DST} link set dev veth_dst up +ip -netns ${NS_FWD} link set dev veth_dst_fwd up + +ip -netns ${NS_SRC} route add ${IP4_DST}/32 dev veth_src scope global +ip -netns ${NS_SRC} route add ${IP4_NET}/16 dev veth_src scope global +ip -netns ${NS_FWD} route add ${IP4_SRC}/32 dev veth_src_fwd scope global + +ip -netns ${NS_SRC} route add ${IP6_DST}/128 dev veth_src scope global +ip -netns ${NS_FWD} route add ${IP6_SRC}/128 dev veth_src_fwd scope global + +ip -netns ${NS_DST} route add ${IP4_SRC}/32 dev veth_dst scope global +ip -netns ${NS_DST} route add ${IP4_NET}/16 dev veth_dst scope global +ip -netns ${NS_FWD} route add ${IP4_DST}/32 dev veth_dst_fwd scope global + +ip -netns ${NS_DST} route add ${IP6_SRC}/128 dev veth_dst scope global +ip -netns ${NS_FWD} route add ${IP6_DST}/128 dev veth_dst_fwd scope global + +fmac_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/address) +fmac_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/address) + +ip -netns ${NS_SRC} neigh add ${IP4_DST} dev veth_src lladdr $fmac_src +ip -netns ${NS_DST} neigh add ${IP4_SRC} dev veth_dst lladdr $fmac_dst + +ip -netns ${NS_SRC} neigh add ${IP6_DST} dev veth_src lladdr $fmac_src +ip -netns ${NS_DST} neigh add ${IP6_SRC} dev veth_dst lladdr $fmac_dst + +veth_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/ifindex | awk '{printf "%08x\n", $1}') +veth_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/ifindex | awk '{printf "%08x\n", $1}') + +xxd -p < test_tc_neigh.o | sed "s/eeddddee/$veth_src/g" | xxd -r -p > test_tc_neigh.x.o +xxd -p < test_tc_neigh.x.o | sed "s/eeffffee/$veth_dst/g" | xxd -r -p > test_tc_neigh.y.o + +ip netns exec ${NS_FWD} tc qdisc add dev veth_src_fwd clsact +ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd ingress bpf da obj test_tc_neigh.y.o sec src_ingress +ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd egress bpf da obj test_tc_neigh.y.o sec chk_neigh + +ip netns exec ${NS_FWD} tc qdisc add dev veth_dst_fwd clsact +ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd ingress bpf da obj test_tc_neigh.y.o sec dst_ingress +ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd egress bpf da obj test_tc_neigh.y.o sec chk_neigh + +rm -f test_tc_neigh.x.o test_tc_neigh.y.o + +ip netns exec ${NS_DST} bash -c "nc -4 -l -p 9004 &" +ip netns exec ${NS_DST} bash -c "nc -6 -l -p 9006 &" + +set +e + +TEST="TCPv4 connectivity test" +ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP4_DST}/9004" +if [ $? -ne 0 ]; then + echo -e "${TEST}: ${RED}FAIL${NC}" + exit 1 +fi +echo -e "${TEST}: ${GREEN}PASS${NC}" + +TEST="TCPv6 connectivity test" +ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP6_DST}/9006" +if [ $? -ne 0 ]; then + echo -e "${TEST}: ${RED}FAIL${NC}" + exit 1 +fi +echo -e "${TEST}: ${GREEN}PASS${NC}" + +TEST="ICMPv4 connectivity test" +ip netns exec ${NS_SRC} ping $PING_ARG ${IP4_DST} +if [ $? -ne 0 ]; then + echo -e "${TEST}: ${RED}FAIL${NC}" + exit 1 +fi +echo -e "${TEST}: ${GREEN}PASS${NC}" + +TEST="ICMPv6 connectivity test" +ip netns exec ${NS_SRC} ping6 $PING_ARG ${IP6_DST} +if [ $? -ne 0 ]; then + echo -e "${TEST}: ${RED}FAIL${NC}" + exit 1 +fi +echo -e "${TEST}: ${GREEN}PASS${NC}" diff --git a/tools/testing/selftests/bpf/test_tc_redirect.sh b/tools/testing/selftests/bpf/test_tc_redirect.sh deleted file mode 100755 index 6ad4414051323..0000000000000 --- a/tools/testing/selftests/bpf/test_tc_redirect.sh +++ /dev/null @@ -1,197 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# This test sets up 3 netns (src <-> fwd <-> dst). There is no direct veth link -# between src and dst. The netns fwd has veth links to each src and dst. The -# client is in src and server in dst. The test installs a TC BPF program to each -# host facing veth in fwd which calls into bpf_redirect_peer() to perform the -# neigh addr population and redirect; it also installs a dropper prog on the -# egress side to drop skbs if neigh addrs were not populated. - -if [[ $EUID -ne 0 ]]; then - echo "This script must be run as root" - echo "FAIL" - exit 1 -fi - -# check that needed tools are present -command -v nc >/dev/null 2>&1 || \ - { echo >&2 "nc is not available"; exit 1; } -command -v dd >/dev/null 2>&1 || \ - { echo >&2 "dd is not available"; exit 1; } -command -v timeout >/dev/null 2>&1 || \ - { echo >&2 "timeout is not available"; exit 1; } -command -v ping >/dev/null 2>&1 || \ - { echo >&2 "ping is not available"; exit 1; } -command -v ping6 >/dev/null 2>&1 || \ - { echo >&2 "ping6 is not available"; exit 1; } -command -v perl >/dev/null 2>&1 || \ - { echo >&2 "perl is not available"; exit 1; } -command -v jq >/dev/null 2>&1 || \ - { echo >&2 "jq is not available"; exit 1; } -command -v bpftool >/dev/null 2>&1 || \ - { echo >&2 "bpftool is not available"; exit 1; } - -readonly GREEN='\033[0;92m' -readonly RED='\033[0;31m' -readonly NC='\033[0m' # No Color - -readonly PING_ARG="-c 3 -w 10 -q" - -readonly TIMEOUT=10 - -readonly NS_SRC="ns-src-$(mktemp -u XXXXXX)" -readonly NS_FWD="ns-fwd-$(mktemp -u XXXXXX)" -readonly NS_DST="ns-dst-$(mktemp -u XXXXXX)" - -readonly IP4_SRC="172.16.1.100" -readonly IP4_DST="172.16.2.100" - -readonly IP6_SRC="::1:dead:beef:cafe" -readonly IP6_DST="::2:dead:beef:cafe" - -readonly IP4_SLL="169.254.0.1" -readonly IP4_DLL="169.254.0.2" -readonly IP4_NET="169.254.0.0" - -netns_cleanup() -{ - ip netns del ${NS_SRC} - ip netns del ${NS_FWD} - ip netns del ${NS_DST} -} - -netns_setup() -{ - ip netns add "${NS_SRC}" - ip netns add "${NS_FWD}" - ip netns add "${NS_DST}" - - ip link add veth_src type veth peer name veth_src_fwd - ip link add veth_dst type veth peer name veth_dst_fwd - - ip link set veth_src netns ${NS_SRC} - ip link set veth_src_fwd netns ${NS_FWD} - - ip link set veth_dst netns ${NS_DST} - ip link set veth_dst_fwd netns ${NS_FWD} - - ip -netns ${NS_SRC} addr add ${IP4_SRC}/32 dev veth_src - ip -netns ${NS_DST} addr add ${IP4_DST}/32 dev veth_dst - - # The fwd netns automatically get a v6 LL address / routes, but also - # needs v4 one in order to start ARP probing. IP4_NET route is added - # to the endpoints so that the ARP processing will reply. - - ip -netns ${NS_FWD} addr add ${IP4_SLL}/32 dev veth_src_fwd - ip -netns ${NS_FWD} addr add ${IP4_DLL}/32 dev veth_dst_fwd - - ip -netns ${NS_SRC} addr add ${IP6_SRC}/128 dev veth_src nodad - ip -netns ${NS_DST} addr add ${IP6_DST}/128 dev veth_dst nodad - - ip -netns ${NS_SRC} link set dev veth_src up - ip -netns ${NS_FWD} link set dev veth_src_fwd up - - ip -netns ${NS_DST} link set dev veth_dst up - ip -netns ${NS_FWD} link set dev veth_dst_fwd up - - ip -netns ${NS_SRC} route add ${IP4_DST}/32 dev veth_src scope global - ip -netns ${NS_SRC} route add ${IP4_NET}/16 dev veth_src scope global - ip -netns ${NS_FWD} route add ${IP4_SRC}/32 dev veth_src_fwd scope global - - ip -netns ${NS_SRC} route add ${IP6_DST}/128 dev veth_src scope global - ip -netns ${NS_FWD} route add ${IP6_SRC}/128 dev veth_src_fwd scope global - - ip -netns ${NS_DST} route add ${IP4_SRC}/32 dev veth_dst scope global - ip -netns ${NS_DST} route add ${IP4_NET}/16 dev veth_dst scope global - ip -netns ${NS_FWD} route add ${IP4_DST}/32 dev veth_dst_fwd scope global - - ip -netns ${NS_DST} route add ${IP6_SRC}/128 dev veth_dst scope global - ip -netns ${NS_FWD} route add ${IP6_DST}/128 dev veth_dst_fwd scope global - - fmac_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/address) - fmac_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/address) - - ip -netns ${NS_SRC} neigh add ${IP4_DST} dev veth_src lladdr $fmac_src - ip -netns ${NS_DST} neigh add ${IP4_SRC} dev veth_dst lladdr $fmac_dst - - ip -netns ${NS_SRC} neigh add ${IP6_DST} dev veth_src lladdr $fmac_src - ip -netns ${NS_DST} neigh add ${IP6_SRC} dev veth_dst lladdr $fmac_dst -} - -netns_test_connectivity() -{ - set +e - - ip netns exec ${NS_DST} bash -c "nc -4 -l -p 9004 &" - ip netns exec ${NS_DST} bash -c "nc -6 -l -p 9006 &" - - TEST="TCPv4 connectivity test" - ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP4_DST}/9004" - if [ $? -ne 0 ]; then - echo -e "${TEST}: ${RED}FAIL${NC}" - exit 1 - fi - echo -e "${TEST}: ${GREEN}PASS${NC}" - - TEST="TCPv6 connectivity test" - ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP6_DST}/9006" - if [ $? -ne 0 ]; then - echo -e "${TEST}: ${RED}FAIL${NC}" - exit 1 - fi - echo -e "${TEST}: ${GREEN}PASS${NC}" - - TEST="ICMPv4 connectivity test" - ip netns exec ${NS_SRC} ping $PING_ARG ${IP4_DST} - if [ $? -ne 0 ]; then - echo -e "${TEST}: ${RED}FAIL${NC}" - exit 1 - fi - echo -e "${TEST}: ${GREEN}PASS${NC}" - - TEST="ICMPv6 connectivity test" - ip netns exec ${NS_SRC} ping6 $PING_ARG ${IP6_DST} - if [ $? -ne 0 ]; then - echo -e "${TEST}: ${RED}FAIL${NC}" - exit 1 - fi - echo -e "${TEST}: ${GREEN}PASS${NC}" - - set -e -} - -hex_mem_str() -{ - perl -e 'print join(" ", unpack("(H2)8", pack("L", @ARGV)))' $1 -} - -netns_setup_neigh() -{ - ip netns exec ${NS_FWD} tc qdisc add dev veth_src_fwd clsact - ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd ingress bpf da obj test_tc_neigh.o sec src_ingress - ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd egress bpf da obj test_tc_neigh.o sec chk_egress - - ip netns exec ${NS_FWD} tc qdisc add dev veth_dst_fwd clsact - ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd ingress bpf da obj test_tc_neigh.o sec dst_ingress - ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd egress bpf da obj test_tc_neigh.o sec chk_egress - - veth_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/ifindex) - veth_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/ifindex) - - progs=$(ip netns exec ${NS_FWD} bpftool net --json | jq -r '.[] | .tc | map(.id) | .[]') - for prog in $progs; do - map=$(bpftool prog show id $prog --json | jq -r '.map_ids | .? | .[]') - if [ ! -z "$map" ]; then - bpftool map update id $map key hex $(hex_mem_str 0) value hex $(hex_mem_str $veth_src) - bpftool map update id $map key hex $(hex_mem_str 1) value hex $(hex_mem_str $veth_dst) - fi - done -} - -trap netns_cleanup EXIT -set -e - -netns_setup -netns_setup_neigh -netns_test_connectivity From f46ef39c206658109fd4a398d86f350f3b758761 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 17 Jun 2022 08:02:36 +0000 Subject: [PATCH 0247/5344] Revert "bpf, selftests: Add redirect_neigh selftest" This reverts commit 67f9efd20c0174fa435b1bd2b5f66abc70c44e89. Signed-off-by: Qi Zheng --- .../selftests/bpf/progs/test_tc_neigh.c | 144 --------------- tools/testing/selftests/bpf/test_tc_neigh.sh | 168 ------------------ 2 files changed, 312 deletions(-) delete mode 100644 tools/testing/selftests/bpf/progs/test_tc_neigh.c delete mode 100755 tools/testing/selftests/bpf/test_tc_neigh.sh diff --git a/tools/testing/selftests/bpf/progs/test_tc_neigh.c b/tools/testing/selftests/bpf/progs/test_tc_neigh.c deleted file mode 100644 index 889a72c3024fd..0000000000000 --- a/tools/testing/selftests/bpf/progs/test_tc_neigh.c +++ /dev/null @@ -1,144 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#ifndef barrier_data -# define barrier_data(ptr) asm volatile("": :"r"(ptr) :"memory") -#endif - -#ifndef ctx_ptr -# define ctx_ptr(field) (void *)(long)(field) -#endif - -#define dst_to_src_tmp 0xeeddddeeU -#define src_to_dst_tmp 0xeeffffeeU - -#define ip4_src 0xac100164 /* 172.16.1.100 */ -#define ip4_dst 0xac100264 /* 172.16.2.100 */ - -#define ip6_src { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ - 0x00, 0x01, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe } -#define ip6_dst { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ - 0x00, 0x02, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe } - -#ifndef v6_equal -# define v6_equal(a, b) (a.s6_addr32[0] == b.s6_addr32[0] && \ - a.s6_addr32[1] == b.s6_addr32[1] && \ - a.s6_addr32[2] == b.s6_addr32[2] && \ - a.s6_addr32[3] == b.s6_addr32[3]) -#endif - -static __always_inline bool is_remote_ep_v4(struct __sk_buff *skb, - __be32 addr) -{ - void *data_end = ctx_ptr(skb->data_end); - void *data = ctx_ptr(skb->data); - struct iphdr *ip4h; - - if (data + sizeof(struct ethhdr) > data_end) - return false; - - ip4h = (struct iphdr *)(data + sizeof(struct ethhdr)); - if ((void *)(ip4h + 1) > data_end) - return false; - - return ip4h->daddr == addr; -} - -static __always_inline bool is_remote_ep_v6(struct __sk_buff *skb, - struct in6_addr addr) -{ - void *data_end = ctx_ptr(skb->data_end); - void *data = ctx_ptr(skb->data); - struct ipv6hdr *ip6h; - - if (data + sizeof(struct ethhdr) > data_end) - return false; - - ip6h = (struct ipv6hdr *)(data + sizeof(struct ethhdr)); - if ((void *)(ip6h + 1) > data_end) - return false; - - return v6_equal(ip6h->daddr, addr); -} - -SEC("chk_neigh") int tc_chk(struct __sk_buff *skb) -{ - void *data_end = ctx_ptr(skb->data_end); - void *data = ctx_ptr(skb->data); - __u32 *raw = data; - - if (data + sizeof(struct ethhdr) > data_end) - return TC_ACT_SHOT; - - return !raw[0] && !raw[1] && !raw[2] ? TC_ACT_SHOT : TC_ACT_OK; -} - -SEC("dst_ingress") int tc_dst(struct __sk_buff *skb) -{ - int idx = dst_to_src_tmp; - __u8 zero[ETH_ALEN * 2]; - bool redirect = false; - - switch (skb->protocol) { - case __bpf_constant_htons(ETH_P_IP): - redirect = is_remote_ep_v4(skb, __bpf_constant_htonl(ip4_src)); - break; - case __bpf_constant_htons(ETH_P_IPV6): - redirect = is_remote_ep_v6(skb, (struct in6_addr)ip6_src); - break; - } - - if (!redirect) - return TC_ACT_OK; - - barrier_data(&idx); - idx = bpf_ntohl(idx); - - __builtin_memset(&zero, 0, sizeof(zero)); - if (bpf_skb_store_bytes(skb, 0, &zero, sizeof(zero), 0) < 0) - return TC_ACT_SHOT; - - return bpf_redirect_neigh(idx, 0); -} - -SEC("src_ingress") int tc_src(struct __sk_buff *skb) -{ - int idx = src_to_dst_tmp; - __u8 zero[ETH_ALEN * 2]; - bool redirect = false; - - switch (skb->protocol) { - case __bpf_constant_htons(ETH_P_IP): - redirect = is_remote_ep_v4(skb, __bpf_constant_htonl(ip4_dst)); - break; - case __bpf_constant_htons(ETH_P_IPV6): - redirect = is_remote_ep_v6(skb, (struct in6_addr)ip6_dst); - break; - } - - if (!redirect) - return TC_ACT_OK; - - barrier_data(&idx); - idx = bpf_ntohl(idx); - - __builtin_memset(&zero, 0, sizeof(zero)); - if (bpf_skb_store_bytes(skb, 0, &zero, sizeof(zero), 0) < 0) - return TC_ACT_SHOT; - - return bpf_redirect_neigh(idx, 0); -} - -char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_tc_neigh.sh b/tools/testing/selftests/bpf/test_tc_neigh.sh deleted file mode 100755 index 31d8c3df8b242..0000000000000 --- a/tools/testing/selftests/bpf/test_tc_neigh.sh +++ /dev/null @@ -1,168 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# This test sets up 3 netns (src <-> fwd <-> dst). There is no direct veth link -# between src and dst. The netns fwd has veth links to each src and dst. The -# client is in src and server in dst. The test installs a TC BPF program to each -# host facing veth in fwd which calls into bpf_redirect_peer() to perform the -# neigh addr population and redirect; it also installs a dropper prog on the -# egress side to drop skbs if neigh addrs were not populated. - -if [[ $EUID -ne 0 ]]; then - echo "This script must be run as root" - echo "FAIL" - exit 1 -fi - -# check that nc, dd, ping, ping6 and timeout are present -command -v nc >/dev/null 2>&1 || \ - { echo >&2 "nc is not available"; exit 1; } -command -v dd >/dev/null 2>&1 || \ - { echo >&2 "dd is not available"; exit 1; } -command -v timeout >/dev/null 2>&1 || \ - { echo >&2 "timeout is not available"; exit 1; } -command -v ping >/dev/null 2>&1 || \ - { echo >&2 "ping is not available"; exit 1; } -command -v ping6 >/dev/null 2>&1 || \ - { echo >&2 "ping6 is not available"; exit 1; } - -readonly GREEN='\033[0;92m' -readonly RED='\033[0;31m' -readonly NC='\033[0m' # No Color - -readonly PING_ARG="-c 3 -w 10 -q" - -readonly TIMEOUT=10 - -readonly NS_SRC="ns-src-$(mktemp -u XXXXXX)" -readonly NS_FWD="ns-fwd-$(mktemp -u XXXXXX)" -readonly NS_DST="ns-dst-$(mktemp -u XXXXXX)" - -readonly IP4_SRC="172.16.1.100" -readonly IP4_DST="172.16.2.100" - -readonly IP6_SRC="::1:dead:beef:cafe" -readonly IP6_DST="::2:dead:beef:cafe" - -readonly IP4_SLL="169.254.0.1" -readonly IP4_DLL="169.254.0.2" -readonly IP4_NET="169.254.0.0" - -cleanup() -{ - ip netns del ${NS_SRC} - ip netns del ${NS_FWD} - ip netns del ${NS_DST} -} - -trap cleanup EXIT - -set -e - -ip netns add "${NS_SRC}" -ip netns add "${NS_FWD}" -ip netns add "${NS_DST}" - -ip link add veth_src type veth peer name veth_src_fwd -ip link add veth_dst type veth peer name veth_dst_fwd - -ip link set veth_src netns ${NS_SRC} -ip link set veth_src_fwd netns ${NS_FWD} - -ip link set veth_dst netns ${NS_DST} -ip link set veth_dst_fwd netns ${NS_FWD} - -ip -netns ${NS_SRC} addr add ${IP4_SRC}/32 dev veth_src -ip -netns ${NS_DST} addr add ${IP4_DST}/32 dev veth_dst - -# The fwd netns automatically get a v6 LL address / routes, but also needs v4 -# one in order to start ARP probing. IP4_NET route is added to the endpoints -# so that the ARP processing will reply. - -ip -netns ${NS_FWD} addr add ${IP4_SLL}/32 dev veth_src_fwd -ip -netns ${NS_FWD} addr add ${IP4_DLL}/32 dev veth_dst_fwd - -ip -netns ${NS_SRC} addr add ${IP6_SRC}/128 dev veth_src nodad -ip -netns ${NS_DST} addr add ${IP6_DST}/128 dev veth_dst nodad - -ip -netns ${NS_SRC} link set dev veth_src up -ip -netns ${NS_FWD} link set dev veth_src_fwd up - -ip -netns ${NS_DST} link set dev veth_dst up -ip -netns ${NS_FWD} link set dev veth_dst_fwd up - -ip -netns ${NS_SRC} route add ${IP4_DST}/32 dev veth_src scope global -ip -netns ${NS_SRC} route add ${IP4_NET}/16 dev veth_src scope global -ip -netns ${NS_FWD} route add ${IP4_SRC}/32 dev veth_src_fwd scope global - -ip -netns ${NS_SRC} route add ${IP6_DST}/128 dev veth_src scope global -ip -netns ${NS_FWD} route add ${IP6_SRC}/128 dev veth_src_fwd scope global - -ip -netns ${NS_DST} route add ${IP4_SRC}/32 dev veth_dst scope global -ip -netns ${NS_DST} route add ${IP4_NET}/16 dev veth_dst scope global -ip -netns ${NS_FWD} route add ${IP4_DST}/32 dev veth_dst_fwd scope global - -ip -netns ${NS_DST} route add ${IP6_SRC}/128 dev veth_dst scope global -ip -netns ${NS_FWD} route add ${IP6_DST}/128 dev veth_dst_fwd scope global - -fmac_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/address) -fmac_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/address) - -ip -netns ${NS_SRC} neigh add ${IP4_DST} dev veth_src lladdr $fmac_src -ip -netns ${NS_DST} neigh add ${IP4_SRC} dev veth_dst lladdr $fmac_dst - -ip -netns ${NS_SRC} neigh add ${IP6_DST} dev veth_src lladdr $fmac_src -ip -netns ${NS_DST} neigh add ${IP6_SRC} dev veth_dst lladdr $fmac_dst - -veth_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/ifindex | awk '{printf "%08x\n", $1}') -veth_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/ifindex | awk '{printf "%08x\n", $1}') - -xxd -p < test_tc_neigh.o | sed "s/eeddddee/$veth_src/g" | xxd -r -p > test_tc_neigh.x.o -xxd -p < test_tc_neigh.x.o | sed "s/eeffffee/$veth_dst/g" | xxd -r -p > test_tc_neigh.y.o - -ip netns exec ${NS_FWD} tc qdisc add dev veth_src_fwd clsact -ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd ingress bpf da obj test_tc_neigh.y.o sec src_ingress -ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd egress bpf da obj test_tc_neigh.y.o sec chk_neigh - -ip netns exec ${NS_FWD} tc qdisc add dev veth_dst_fwd clsact -ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd ingress bpf da obj test_tc_neigh.y.o sec dst_ingress -ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd egress bpf da obj test_tc_neigh.y.o sec chk_neigh - -rm -f test_tc_neigh.x.o test_tc_neigh.y.o - -ip netns exec ${NS_DST} bash -c "nc -4 -l -p 9004 &" -ip netns exec ${NS_DST} bash -c "nc -6 -l -p 9006 &" - -set +e - -TEST="TCPv4 connectivity test" -ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP4_DST}/9004" -if [ $? -ne 0 ]; then - echo -e "${TEST}: ${RED}FAIL${NC}" - exit 1 -fi -echo -e "${TEST}: ${GREEN}PASS${NC}" - -TEST="TCPv6 connectivity test" -ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP6_DST}/9006" -if [ $? -ne 0 ]; then - echo -e "${TEST}: ${RED}FAIL${NC}" - exit 1 -fi -echo -e "${TEST}: ${GREEN}PASS${NC}" - -TEST="ICMPv4 connectivity test" -ip netns exec ${NS_SRC} ping $PING_ARG ${IP4_DST} -if [ $? -ne 0 ]; then - echo -e "${TEST}: ${RED}FAIL${NC}" - exit 1 -fi -echo -e "${TEST}: ${GREEN}PASS${NC}" - -TEST="ICMPv6 connectivity test" -ip netns exec ${NS_SRC} ping6 $PING_ARG ${IP6_DST} -if [ $? -ne 0 ]; then - echo -e "${TEST}: ${RED}FAIL${NC}" - exit 1 -fi -echo -e "${TEST}: ${GREEN}PASS${NC}" From 9df7636737dfc3af57127e06488aa7ef2c7fa9f9 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 17 Jun 2022 08:06:13 +0000 Subject: [PATCH 0248/5344] Revert "selftests/bpf: Fix endianness issue in sk_assign" This reverts commit 52ac4203297031659af0d9526e924921b4b79832. Signed-off-by: Qi Zheng --- tools/testing/selftests/bpf/prog_tests/sk_assign.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c index 3a469099f30d8..a49a26f95a8bb 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_assign.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c @@ -265,7 +265,7 @@ void test_sk_assign(void) TEST("ipv6 udp port redir", AF_INET6, SOCK_DGRAM, false), TEST("ipv6 udp addr redir", AF_INET6, SOCK_DGRAM, true), }; - __s64 server = -1; + int server = -1; int server_map; int self_net; int i; From 7b6b3472769fa7ae838fcbead27feda68309f65a Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 17 Jun 2022 08:08:00 +0000 Subject: [PATCH 0249/5344] Revert "selftests/bpf: Enable tc verbose mode for test_sk_assign" This reverts commit 1367f7fae4ecb7af8d70109e149073bdf6a18f37. Signed-off-by: Qi Zheng --- tools/testing/selftests/bpf/prog_tests/sk_assign.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c index a49a26f95a8bb..d43038d2b9e15 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_assign.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c @@ -49,7 +49,7 @@ configure_stack(void) sprintf(tc_cmd, "%s %s %s %s", "tc filter add dev lo ingress bpf", "direct-action object-file ./test_sk_assign.o", "section classifier/sk_assign_test", - (env.verbosity < VERBOSE_VERY) ? " 2>/dev/null" : "verbose"); + (env.verbosity < VERBOSE_VERY) ? " 2>/dev/null" : ""); if (CHECK(system(tc_cmd), "BPF load failed;", "run with -vv for more info\n")) return false; From 6dcdda838bd26cda67f9f4e89422d559f1d3ce70 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 17 Jun 2022 08:08:26 +0000 Subject: [PATCH 0250/5344] Revert "selftests/bpf: Fix two minor compilation warnings reported by GCC 4.9" This reverts commit e4c489a9ec58c7349a12d85e8a151d5251a8eac4. Signed-off-by: Qi Zheng --- tools/testing/selftests/bpf/prog_tests/sk_assign.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c index d43038d2b9e15..47fa04adc1471 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_assign.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c @@ -268,7 +268,6 @@ void test_sk_assign(void) int server = -1; int server_map; int self_net; - int i; self_net = open(NS_SELF, O_RDONLY); if (CHECK_FAIL(self_net < 0)) { @@ -287,7 +286,7 @@ void test_sk_assign(void) goto cleanup; } - for (i = 0; i < ARRAY_SIZE(tests) && !READ_ONCE(stop); i++) { + for (int i = 0; i < ARRAY_SIZE(tests) && !READ_ONCE(stop); i++) { struct test_sk_cfg *test = &tests[i]; const struct sockaddr *addr; const int zero = 0; From d8ab66c57a2d6e31738fefea1915838ded1f8c34 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 17 Jun 2022 08:08:51 +0000 Subject: [PATCH 0251/5344] Revert "selftests/bpf: Use SOCKMAP for server sockets in bpf_sk_assign test" This reverts commit f8ef29b6f7361ecf4b5a6f9b1396ad2559e646ed. Signed-off-by: Qi Zheng --- .../selftests/bpf/prog_tests/sk_assign.c | 21 +---- .../selftests/bpf/progs/test_sk_assign.c | 82 +++++++++++-------- 2 files changed, 51 insertions(+), 52 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c index 47fa04adc1471..d572e1a2c297a 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_assign.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c @@ -20,7 +20,6 @@ #define CONNECT_PORT 4321 #define TEST_DADDR (0xC0A80203) #define NS_SELF "/proc/self/ns/net" -#define SERVER_MAP_PATH "/sys/fs/bpf/tc/globals/server_map" static const struct timeval timeo_sec = { .tv_sec = 3 }; static const size_t timeo_optlen = sizeof(timeo_sec); @@ -266,7 +265,6 @@ void test_sk_assign(void) TEST("ipv6 udp addr redir", AF_INET6, SOCK_DGRAM, true), }; int server = -1; - int server_map; int self_net; self_net = open(NS_SELF, O_RDONLY); @@ -280,17 +278,9 @@ void test_sk_assign(void) goto cleanup; } - server_map = bpf_obj_get(SERVER_MAP_PATH); - if (CHECK_FAIL(server_map < 0)) { - perror("Unable to open " SERVER_MAP_PATH); - goto cleanup; - } - for (int i = 0; i < ARRAY_SIZE(tests) && !READ_ONCE(stop); i++) { struct test_sk_cfg *test = &tests[i]; const struct sockaddr *addr; - const int zero = 0; - int err; if (!test__start_subtest(test->name)) continue; @@ -298,13 +288,7 @@ void test_sk_assign(void) addr = (const struct sockaddr *)test->addr; server = start_server(addr, test->len, test->type); if (server == -1) - goto close; - - err = bpf_map_update_elem(server_map, &zero, &server, BPF_ANY); - if (CHECK_FAIL(err)) { - perror("Unable to update server_map"); - goto close; - } + goto cleanup; /* connect to unbound ports */ prepare_addr(test->addr, test->family, CONNECT_PORT, @@ -318,10 +302,7 @@ void test_sk_assign(void) close: close(server); - close(server_map); cleanup: - if (CHECK_FAIL(unlink(SERVER_MAP_PATH))) - perror("Unable to unlink " SERVER_MAP_PATH); if (CHECK_FAIL(setns(self_net, CLONE_NEWNET))) perror("Failed to setns("NS_SELF")"); close(self_net); diff --git a/tools/testing/selftests/bpf/progs/test_sk_assign.c b/tools/testing/selftests/bpf/progs/test_sk_assign.c index 1ecd987005d2c..8f530843b4da8 100644 --- a/tools/testing/selftests/bpf/progs/test_sk_assign.c +++ b/tools/testing/selftests/bpf/progs/test_sk_assign.c @@ -16,26 +16,6 @@ #include #include -/* Pin map under /sys/fs/bpf/tc/globals/ */ -#define PIN_GLOBAL_NS 2 - -/* Must match struct bpf_elf_map layout from iproute2 */ -struct { - __u32 type; - __u32 size_key; - __u32 size_value; - __u32 max_elem; - __u32 flags; - __u32 id; - __u32 pinning; -} server_map SEC("maps") = { - .type = BPF_MAP_TYPE_SOCKMAP, - .size_key = sizeof(int), - .size_value = sizeof(__u64), - .max_elem = 1, - .pinning = PIN_GLOBAL_NS, -}; - int _version SEC("version") = 1; char _license[] SEC("license") = "GPL"; @@ -92,9 +72,7 @@ handle_udp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4) { struct bpf_sock_tuple ln = {0}; struct bpf_sock *sk; - const int zero = 0; size_t tuple_len; - __be16 dport; int ret; tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6); @@ -105,11 +83,32 @@ handle_udp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4) if (sk) goto assign; - dport = ipv4 ? tuple->ipv4.dport : tuple->ipv6.dport; - if (dport != bpf_htons(4321)) - return TC_ACT_OK; + if (ipv4) { + if (tuple->ipv4.dport != bpf_htons(4321)) + return TC_ACT_OK; + + ln.ipv4.daddr = bpf_htonl(0x7f000001); + ln.ipv4.dport = bpf_htons(1234); + + sk = bpf_sk_lookup_udp(skb, &ln, sizeof(ln.ipv4), + BPF_F_CURRENT_NETNS, 0); + } else { + if (tuple->ipv6.dport != bpf_htons(4321)) + return TC_ACT_OK; + + /* Upper parts of daddr are already zero. */ + ln.ipv6.daddr[3] = bpf_htonl(0x1); + ln.ipv6.dport = bpf_htons(1234); + + sk = bpf_sk_lookup_udp(skb, &ln, sizeof(ln.ipv6), + BPF_F_CURRENT_NETNS, 0); + } - sk = bpf_map_lookup_elem(&server_map, &zero); + /* workaround: We can't do a single socket lookup here, because then + * the compiler will likely spill tuple_len to the stack. This makes it + * lose all bounds information in the verifier, which then rejects the + * call as unsafe. + */ if (!sk) return TC_ACT_SHOT; @@ -124,9 +123,7 @@ handle_tcp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4) { struct bpf_sock_tuple ln = {0}; struct bpf_sock *sk; - const int zero = 0; size_t tuple_len; - __be16 dport; int ret; tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6); @@ -140,11 +137,32 @@ handle_tcp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4) bpf_sk_release(sk); } - dport = ipv4 ? tuple->ipv4.dport : tuple->ipv6.dport; - if (dport != bpf_htons(4321)) - return TC_ACT_OK; + if (ipv4) { + if (tuple->ipv4.dport != bpf_htons(4321)) + return TC_ACT_OK; - sk = bpf_map_lookup_elem(&server_map, &zero); + ln.ipv4.daddr = bpf_htonl(0x7f000001); + ln.ipv4.dport = bpf_htons(1234); + + sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv4), + BPF_F_CURRENT_NETNS, 0); + } else { + if (tuple->ipv6.dport != bpf_htons(4321)) + return TC_ACT_OK; + + /* Upper parts of daddr are already zero. */ + ln.ipv6.daddr[3] = bpf_htonl(0x1); + ln.ipv6.dport = bpf_htons(1234); + + sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv6), + BPF_F_CURRENT_NETNS, 0); + } + + /* workaround: We can't do a single socket lookup here, because then + * the compiler will likely spill tuple_len to the stack. This makes it + * lose all bounds information in the verifier, which then rejects the + * call as unsafe. + */ if (!sk) return TC_ACT_SHOT; From 44c38c044f8da956e4f7fd09414fc8e5fa98f2f0 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 17 Jun 2022 08:09:10 +0000 Subject: [PATCH 0252/5344] Revert "selftests: bpf: Extend sk_assign tests for UDP" This reverts commit 7b43fb81e9665bf092696bd8a4afa777280acb0d. Signed-off-by: Qi Zheng --- .../selftests/bpf/prog_tests/sk_assign.c | 47 ++----------- .../selftests/bpf/progs/test_sk_assign.c | 69 ++----------------- 2 files changed, 11 insertions(+), 105 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c index d572e1a2c297a..25f17fe7d6785 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_assign.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c @@ -69,7 +69,7 @@ start_server(const struct sockaddr *addr, socklen_t len, int type) goto close_out; if (CHECK_FAIL(bind(fd, addr, len) == -1)) goto close_out; - if (type == SOCK_STREAM && CHECK_FAIL(listen(fd, 128) == -1)) + if (CHECK_FAIL(listen(fd, 128) == -1)) goto close_out; goto out; @@ -125,20 +125,6 @@ get_port(int fd) return port; } -static ssize_t -rcv_msg(int srv_client, int type) -{ - struct sockaddr_storage ss; - char buf[BUFSIZ]; - socklen_t slen; - - if (type == SOCK_STREAM) - return read(srv_client, &buf, sizeof(buf)); - else - return recvfrom(srv_client, &buf, sizeof(buf), 0, - (struct sockaddr *)&ss, &slen); -} - static int run_test(int server_fd, const struct sockaddr *addr, socklen_t len, int type) { @@ -153,20 +139,16 @@ run_test(int server_fd, const struct sockaddr *addr, socklen_t len, int type) goto out; } - if (type == SOCK_STREAM) { - srv_client = accept(server_fd, NULL, NULL); - if (CHECK_FAIL(srv_client == -1)) { - perror("Can't accept connection"); - goto out; - } - } else { - srv_client = server_fd; + srv_client = accept(server_fd, NULL, NULL); + if (CHECK_FAIL(srv_client == -1)) { + perror("Can't accept connection"); + goto out; } if (CHECK_FAIL(write(client, buf, sizeof(buf)) != sizeof(buf))) { perror("Can't write on client"); goto out; } - if (CHECK_FAIL(rcv_msg(srv_client, type) != sizeof(buf))) { + if (CHECK_FAIL(read(srv_client, &buf, sizeof(buf)) != sizeof(buf))) { perror("Can't read on server"); goto out; } @@ -174,20 +156,9 @@ run_test(int server_fd, const struct sockaddr *addr, socklen_t len, int type) port = get_port(srv_client); if (CHECK_FAIL(!port)) goto out; - /* SOCK_STREAM is connected via accept(), so the server's local address - * will be the CONNECT_PORT rather than the BIND port that corresponds - * to the listen socket. SOCK_DGRAM on the other hand is connectionless - * so we can't really do the same check there; the server doesn't ever - * create a socket with CONNECT_PORT. - */ - if (type == SOCK_STREAM && - CHECK(port != htons(CONNECT_PORT), "Expected", "port %u but got %u", + if (CHECK(port != htons(CONNECT_PORT), "Expected", "port %u but got %u", CONNECT_PORT, ntohs(port))) goto out; - else if (type == SOCK_DGRAM && - CHECK(port != htons(BIND_PORT), "Expected", - "port %u but got %u", BIND_PORT, ntohs(port))) - goto out; ret = 0; out: @@ -259,10 +230,6 @@ void test_sk_assign(void) TEST("ipv4 tcp addr redir", AF_INET, SOCK_STREAM, true), TEST("ipv6 tcp port redir", AF_INET6, SOCK_STREAM, false), TEST("ipv6 tcp addr redir", AF_INET6, SOCK_STREAM, true), - TEST("ipv4 udp port redir", AF_INET, SOCK_DGRAM, false), - TEST("ipv4 udp addr redir", AF_INET, SOCK_DGRAM, true), - TEST("ipv6 udp port redir", AF_INET6, SOCK_DGRAM, false), - TEST("ipv6 udp addr redir", AF_INET6, SOCK_DGRAM, true), }; int server = -1; int self_net; diff --git a/tools/testing/selftests/bpf/progs/test_sk_assign.c b/tools/testing/selftests/bpf/progs/test_sk_assign.c index 8f530843b4da8..2d9549126e51e 100644 --- a/tools/testing/selftests/bpf/progs/test_sk_assign.c +++ b/tools/testing/selftests/bpf/progs/test_sk_assign.c @@ -21,7 +21,7 @@ char _license[] SEC("license") = "GPL"; /* Fill 'tuple' with L3 info, and attempt to find L4. On fail, return NULL. */ static inline struct bpf_sock_tuple * -get_tuple(struct __sk_buff *skb, bool *ipv4, bool *tcp) +get_tuple(struct __sk_buff *skb, bool *ipv4) { void *data_end = (void *)(long)skb->data_end; void *data = (void *)(long)skb->data; @@ -60,64 +60,12 @@ get_tuple(struct __sk_buff *skb, bool *ipv4, bool *tcp) return (struct bpf_sock_tuple *)data; } - if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) + if (result + 1 > data_end || proto != IPPROTO_TCP) return NULL; - *tcp = (proto == IPPROTO_TCP); return result; } -static inline int -handle_udp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4) -{ - struct bpf_sock_tuple ln = {0}; - struct bpf_sock *sk; - size_t tuple_len; - int ret; - - tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6); - if ((void *)tuple + tuple_len > (void *)(long)skb->data_end) - return TC_ACT_SHOT; - - sk = bpf_sk_lookup_udp(skb, tuple, tuple_len, BPF_F_CURRENT_NETNS, 0); - if (sk) - goto assign; - - if (ipv4) { - if (tuple->ipv4.dport != bpf_htons(4321)) - return TC_ACT_OK; - - ln.ipv4.daddr = bpf_htonl(0x7f000001); - ln.ipv4.dport = bpf_htons(1234); - - sk = bpf_sk_lookup_udp(skb, &ln, sizeof(ln.ipv4), - BPF_F_CURRENT_NETNS, 0); - } else { - if (tuple->ipv6.dport != bpf_htons(4321)) - return TC_ACT_OK; - - /* Upper parts of daddr are already zero. */ - ln.ipv6.daddr[3] = bpf_htonl(0x1); - ln.ipv6.dport = bpf_htons(1234); - - sk = bpf_sk_lookup_udp(skb, &ln, sizeof(ln.ipv6), - BPF_F_CURRENT_NETNS, 0); - } - - /* workaround: We can't do a single socket lookup here, because then - * the compiler will likely spill tuple_len to the stack. This makes it - * lose all bounds information in the verifier, which then rejects the - * call as unsafe. - */ - if (!sk) - return TC_ACT_SHOT; - -assign: - ret = bpf_sk_assign(skb, sk, 0); - bpf_sk_release(sk); - return ret; -} - static inline int handle_tcp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4) { @@ -182,23 +130,14 @@ int bpf_sk_assign_test(struct __sk_buff *skb) { struct bpf_sock_tuple *tuple, ln = {0}; bool ipv4 = false; - bool tcp = false; int tuple_len; int ret = 0; - tuple = get_tuple(skb, &ipv4, &tcp); + tuple = get_tuple(skb, &ipv4); if (!tuple) return TC_ACT_SHOT; - /* Note that the verifier socket return type for bpf_skc_lookup_tcp() - * differs from bpf_sk_lookup_udp(), so even though the C-level type is - * the same here, if we try to share the implementations they will - * fail to verify because we're crossing pointer types. - */ - if (tcp) - ret = handle_tcp(skb, tuple, ipv4); - else - ret = handle_udp(skb, tuple, ipv4); + ret = handle_tcp(skb, tuple, ipv4); return ret == 0 ? TC_ACT_OK : TC_ACT_SHOT; } From 3e9e76caf33f870c275bab9ff603394267069c75 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 17 Jun 2022 08:09:29 +0000 Subject: [PATCH 0253/5344] Revert "selftests: bpf: Add test for sk_assign" This reverts commit 0f571fccc2f8bb06e622d04a087c253ac1cc8dff. Signed-off-by: Qi Zheng --- .../selftests/bpf/prog_tests/sk_assign.c | 276 ------------------ .../selftests/bpf/progs/test_sk_assign.c | 143 --------- 2 files changed, 419 deletions(-) delete mode 100644 tools/testing/selftests/bpf/prog_tests/sk_assign.c delete mode 100644 tools/testing/selftests/bpf/progs/test_sk_assign.c diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c deleted file mode 100644 index 25f17fe7d6785..0000000000000 --- a/tools/testing/selftests/bpf/prog_tests/sk_assign.c +++ /dev/null @@ -1,276 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -// Copyright (c) 2018 Facebook -// Copyright (c) 2019 Cloudflare -// Copyright (c) 2020 Isovalent, Inc. -/* - * Test that the socket assign program is able to redirect traffic towards a - * socket, regardless of whether the port or address destination of the traffic - * matches the port. - */ - -#define _GNU_SOURCE -#include -#include -#include -#include - -#include "test_progs.h" - -#define BIND_PORT 1234 -#define CONNECT_PORT 4321 -#define TEST_DADDR (0xC0A80203) -#define NS_SELF "/proc/self/ns/net" - -static const struct timeval timeo_sec = { .tv_sec = 3 }; -static const size_t timeo_optlen = sizeof(timeo_sec); -static int stop, duration; - -static bool -configure_stack(void) -{ - char tc_cmd[BUFSIZ]; - - /* Move to a new networking namespace */ - if (CHECK_FAIL(unshare(CLONE_NEWNET))) - return false; - - /* Configure necessary links, routes */ - if (CHECK_FAIL(system("ip link set dev lo up"))) - return false; - if (CHECK_FAIL(system("ip route add local default dev lo"))) - return false; - if (CHECK_FAIL(system("ip -6 route add local default dev lo"))) - return false; - - /* Load qdisc, BPF program */ - if (CHECK_FAIL(system("tc qdisc add dev lo clsact"))) - return false; - sprintf(tc_cmd, "%s %s %s %s", "tc filter add dev lo ingress bpf", - "direct-action object-file ./test_sk_assign.o", - "section classifier/sk_assign_test", - (env.verbosity < VERBOSE_VERY) ? " 2>/dev/null" : ""); - if (CHECK(system(tc_cmd), "BPF load failed;", - "run with -vv for more info\n")) - return false; - - return true; -} - -static int -start_server(const struct sockaddr *addr, socklen_t len, int type) -{ - int fd; - - fd = socket(addr->sa_family, type, 0); - if (CHECK_FAIL(fd == -1)) - goto out; - if (CHECK_FAIL(setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &timeo_sec, - timeo_optlen))) - goto close_out; - if (CHECK_FAIL(bind(fd, addr, len) == -1)) - goto close_out; - if (CHECK_FAIL(listen(fd, 128) == -1)) - goto close_out; - - goto out; -close_out: - close(fd); - fd = -1; -out: - return fd; -} - -static int -connect_to_server(const struct sockaddr *addr, socklen_t len, int type) -{ - int fd = -1; - - fd = socket(addr->sa_family, type, 0); - if (CHECK_FAIL(fd == -1)) - goto out; - if (CHECK_FAIL(setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &timeo_sec, - timeo_optlen))) - goto close_out; - if (CHECK_FAIL(connect(fd, addr, len))) - goto close_out; - - goto out; -close_out: - close(fd); - fd = -1; -out: - return fd; -} - -static in_port_t -get_port(int fd) -{ - struct sockaddr_storage ss; - socklen_t slen = sizeof(ss); - in_port_t port = 0; - - if (CHECK_FAIL(getsockname(fd, (struct sockaddr *)&ss, &slen))) - return port; - - switch (ss.ss_family) { - case AF_INET: - port = ((struct sockaddr_in *)&ss)->sin_port; - break; - case AF_INET6: - port = ((struct sockaddr_in6 *)&ss)->sin6_port; - break; - default: - CHECK(1, "Invalid address family", "%d\n", ss.ss_family); - } - return port; -} - -static int -run_test(int server_fd, const struct sockaddr *addr, socklen_t len, int type) -{ - int client = -1, srv_client = -1; - char buf[] = "testing"; - in_port_t port; - int ret = 1; - - client = connect_to_server(addr, len, type); - if (client == -1) { - perror("Cannot connect to server"); - goto out; - } - - srv_client = accept(server_fd, NULL, NULL); - if (CHECK_FAIL(srv_client == -1)) { - perror("Can't accept connection"); - goto out; - } - if (CHECK_FAIL(write(client, buf, sizeof(buf)) != sizeof(buf))) { - perror("Can't write on client"); - goto out; - } - if (CHECK_FAIL(read(srv_client, &buf, sizeof(buf)) != sizeof(buf))) { - perror("Can't read on server"); - goto out; - } - - port = get_port(srv_client); - if (CHECK_FAIL(!port)) - goto out; - if (CHECK(port != htons(CONNECT_PORT), "Expected", "port %u but got %u", - CONNECT_PORT, ntohs(port))) - goto out; - - ret = 0; -out: - close(client); - if (srv_client != server_fd) - close(srv_client); - if (ret) - WRITE_ONCE(stop, 1); - return ret; -} - -static void -prepare_addr(struct sockaddr *addr, int family, __u16 port, bool rewrite_addr) -{ - struct sockaddr_in *addr4; - struct sockaddr_in6 *addr6; - - switch (family) { - case AF_INET: - addr4 = (struct sockaddr_in *)addr; - memset(addr4, 0, sizeof(*addr4)); - addr4->sin_family = family; - addr4->sin_port = htons(port); - if (rewrite_addr) - addr4->sin_addr.s_addr = htonl(TEST_DADDR); - else - addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK); - break; - case AF_INET6: - addr6 = (struct sockaddr_in6 *)addr; - memset(addr6, 0, sizeof(*addr6)); - addr6->sin6_family = family; - addr6->sin6_port = htons(port); - addr6->sin6_addr = in6addr_loopback; - if (rewrite_addr) - addr6->sin6_addr.s6_addr32[3] = htonl(TEST_DADDR); - break; - default: - fprintf(stderr, "Invalid family %d", family); - } -} - -struct test_sk_cfg { - const char *name; - int family; - struct sockaddr *addr; - socklen_t len; - int type; - bool rewrite_addr; -}; - -#define TEST(NAME, FAMILY, TYPE, REWRITE) \ -{ \ - .name = NAME, \ - .family = FAMILY, \ - .addr = (FAMILY == AF_INET) ? (struct sockaddr *)&addr4 \ - : (struct sockaddr *)&addr6, \ - .len = (FAMILY == AF_INET) ? sizeof(addr4) : sizeof(addr6), \ - .type = TYPE, \ - .rewrite_addr = REWRITE, \ -} - -void test_sk_assign(void) -{ - struct sockaddr_in addr4; - struct sockaddr_in6 addr6; - struct test_sk_cfg tests[] = { - TEST("ipv4 tcp port redir", AF_INET, SOCK_STREAM, false), - TEST("ipv4 tcp addr redir", AF_INET, SOCK_STREAM, true), - TEST("ipv6 tcp port redir", AF_INET6, SOCK_STREAM, false), - TEST("ipv6 tcp addr redir", AF_INET6, SOCK_STREAM, true), - }; - int server = -1; - int self_net; - - self_net = open(NS_SELF, O_RDONLY); - if (CHECK_FAIL(self_net < 0)) { - perror("Unable to open "NS_SELF); - return; - } - - if (!configure_stack()) { - perror("configure_stack"); - goto cleanup; - } - - for (int i = 0; i < ARRAY_SIZE(tests) && !READ_ONCE(stop); i++) { - struct test_sk_cfg *test = &tests[i]; - const struct sockaddr *addr; - - if (!test__start_subtest(test->name)) - continue; - prepare_addr(test->addr, test->family, BIND_PORT, false); - addr = (const struct sockaddr *)test->addr; - server = start_server(addr, test->len, test->type); - if (server == -1) - goto cleanup; - - /* connect to unbound ports */ - prepare_addr(test->addr, test->family, CONNECT_PORT, - test->rewrite_addr); - if (run_test(server, addr, test->len, test->type)) - goto close; - - close(server); - server = -1; - } - -close: - close(server); -cleanup: - if (CHECK_FAIL(setns(self_net, CLONE_NEWNET))) - perror("Failed to setns("NS_SELF")"); - close(self_net); -} diff --git a/tools/testing/selftests/bpf/progs/test_sk_assign.c b/tools/testing/selftests/bpf/progs/test_sk_assign.c deleted file mode 100644 index 2d9549126e51e..0000000000000 --- a/tools/testing/selftests/bpf/progs/test_sk_assign.c +++ /dev/null @@ -1,143 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -// Copyright (c) 2019 Cloudflare Ltd. -// Copyright (c) 2020 Isovalent, Inc. - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -int _version SEC("version") = 1; -char _license[] SEC("license") = "GPL"; - -/* Fill 'tuple' with L3 info, and attempt to find L4. On fail, return NULL. */ -static inline struct bpf_sock_tuple * -get_tuple(struct __sk_buff *skb, bool *ipv4) -{ - void *data_end = (void *)(long)skb->data_end; - void *data = (void *)(long)skb->data; - struct bpf_sock_tuple *result; - struct ethhdr *eth; - __u64 tuple_len; - __u8 proto = 0; - __u64 ihl_len; - - eth = (struct ethhdr *)(data); - if (eth + 1 > data_end) - return NULL; - - if (eth->h_proto == bpf_htons(ETH_P_IP)) { - struct iphdr *iph = (struct iphdr *)(data + sizeof(*eth)); - - if (iph + 1 > data_end) - return NULL; - if (iph->ihl != 5) - /* Options are not supported */ - return NULL; - ihl_len = iph->ihl * 4; - proto = iph->protocol; - *ipv4 = true; - result = (struct bpf_sock_tuple *)&iph->saddr; - } else if (eth->h_proto == bpf_htons(ETH_P_IPV6)) { - struct ipv6hdr *ip6h = (struct ipv6hdr *)(data + sizeof(*eth)); - - if (ip6h + 1 > data_end) - return NULL; - ihl_len = sizeof(*ip6h); - proto = ip6h->nexthdr; - *ipv4 = false; - result = (struct bpf_sock_tuple *)&ip6h->saddr; - } else { - return (struct bpf_sock_tuple *)data; - } - - if (result + 1 > data_end || proto != IPPROTO_TCP) - return NULL; - - return result; -} - -static inline int -handle_tcp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4) -{ - struct bpf_sock_tuple ln = {0}; - struct bpf_sock *sk; - size_t tuple_len; - int ret; - - tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6); - if ((void *)tuple + tuple_len > (void *)(long)skb->data_end) - return TC_ACT_SHOT; - - sk = bpf_skc_lookup_tcp(skb, tuple, tuple_len, BPF_F_CURRENT_NETNS, 0); - if (sk) { - if (sk->state != BPF_TCP_LISTEN) - goto assign; - bpf_sk_release(sk); - } - - if (ipv4) { - if (tuple->ipv4.dport != bpf_htons(4321)) - return TC_ACT_OK; - - ln.ipv4.daddr = bpf_htonl(0x7f000001); - ln.ipv4.dport = bpf_htons(1234); - - sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv4), - BPF_F_CURRENT_NETNS, 0); - } else { - if (tuple->ipv6.dport != bpf_htons(4321)) - return TC_ACT_OK; - - /* Upper parts of daddr are already zero. */ - ln.ipv6.daddr[3] = bpf_htonl(0x1); - ln.ipv6.dport = bpf_htons(1234); - - sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv6), - BPF_F_CURRENT_NETNS, 0); - } - - /* workaround: We can't do a single socket lookup here, because then - * the compiler will likely spill tuple_len to the stack. This makes it - * lose all bounds information in the verifier, which then rejects the - * call as unsafe. - */ - if (!sk) - return TC_ACT_SHOT; - - if (sk->state != BPF_TCP_LISTEN) { - bpf_sk_release(sk); - return TC_ACT_SHOT; - } - -assign: - ret = bpf_sk_assign(skb, sk, 0); - bpf_sk_release(sk); - return ret; -} - -SEC("classifier/sk_assign_test") -int bpf_sk_assign_test(struct __sk_buff *skb) -{ - struct bpf_sock_tuple *tuple, ln = {0}; - bool ipv4 = false; - int tuple_len; - int ret = 0; - - tuple = get_tuple(skb, &ipv4); - if (!tuple) - return TC_ACT_SHOT; - - ret = handle_tcp(skb, tuple, ipv4); - - return ret == 0 ? TC_ACT_OK : TC_ACT_SHOT; -} From 9cb3cd4e5337f1e0940e5cfb30ebc8d1799ff0f5 Mon Sep 17 00:00:00 2001 From: Xiaochen Shen Date: Sat, 28 May 2022 20:45:56 +0800 Subject: [PATCH 0254/5344] Revert "x86/resctrl: Fix memory bandwidth counter width for AMD" Backport summary: Backport "wider MBM counters" feature for 5.4.143 LTS on ICX. This reverts commit 215e562251bbac9227c89496bcab13a4ed4e2bd1. Align with following 3 backporting upstream commits: 46637d4570e1 ("x86/resctrl: Maintain MBM counter width per resource") f3d44f18b066 ("x86/resctrl: Support CPUID enumeration of MBM counter width") 0c4d5ba1b998 ("x86/resctrl: Support wider MBM counters") And then this upstream commit will be backported again: 2c18bd525c47 ("x86/resctrl: Fix memory bandwidth counter width for AMD") Signed-off-by: Xiaochen Shen Signed-off-by: Gang Li --- arch/x86/kernel/cpu/resctrl/core.c | 2 -- arch/x86/kernel/cpu/resctrl/internal.h | 3 --- arch/x86/kernel/cpu/resctrl/monitor.c | 3 +-- 3 files changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 21f61fafa974d..cffa38c80c6a4 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -260,7 +260,6 @@ static bool __get_mem_config_intel(struct rdt_resource *r) r->num_closid = edx.split.cos_max + 1; r->membw.max_delay = eax.split.max_delay + 1; r->default_ctrl = MAX_MBA_BW; - r->membw.mbm_width = MBM_CNTR_WIDTH; if (ecx & MBA_IS_LINEAR) { r->membw.delay_linear = true; r->membw.min_bw = MAX_MBA_BW - r->membw.max_delay; @@ -290,7 +289,6 @@ static bool __rdt_get_mem_config_amd(struct rdt_resource *r) /* AMD does not use delay */ r->membw.delay_linear = false; - r->membw.mbm_width = MBM_CNTR_WIDTH_AMD; r->membw.min_bw = 0; r->membw.bw_gran = 1; /* Max value is 2048, Data width should be 4 in decimal */ diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 1665ac4df9060..e920edcb9c35a 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -32,7 +32,6 @@ #define CQM_LIMBOCHECK_INTERVAL 1000 #define MBM_CNTR_WIDTH 24 -#define MBM_CNTR_WIDTH_AMD 44 #define MBM_OVERFLOW_INTERVAL 1000 #define MAX_MBA_BW 100u #define MBA_IS_LINEAR 0x4 @@ -367,7 +366,6 @@ struct rdt_cache { * @min_bw: Minimum memory bandwidth percentage user can request * @bw_gran: Granularity at which the memory bandwidth is allocated * @delay_linear: True if memory B/W delay is in linear scale - * @mbm_width: memory B/W monitor counter width * @mba_sc: True if MBA software controller(mba_sc) is enabled * @mb_map: Mapping of memory B/W percentage to memory B/W delay */ @@ -376,7 +374,6 @@ struct rdt_membw { u32 min_bw; u32 bw_gran; u32 delay_linear; - u32 mbm_width; bool mba_sc; u32 *mb_map; }; diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 8c7a6f31e9329..ba4159fba5f08 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -280,9 +280,8 @@ void free_rmid(u32 rmid) static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr) { - u64 shift, chunks; + u64 shift = 64 - MBM_CNTR_WIDTH, chunks; - shift = 64 - rdt_resources_all[RDT_RESOURCE_MBA].membw.mbm_width; chunks = (cur_msr << shift) - (prev_msr << shift); return chunks >>= shift; } From bae995e992fcd931de6bcac77c7b191e515eca0f Mon Sep 17 00:00:00 2001 From: Xiaochen Shen Date: Sat, 28 May 2022 21:14:00 +0800 Subject: [PATCH 0255/5344] x86/resctrl: Maintain MBM counter width per resource commit 46637d4570e108d1f6721cfa2cca1d078882761a upstream. Backport summary: Backport "wider MBM counters" feature for 5.4.143 LTS on ICX. The original Memory Bandwidth Monitoring (MBM) architectural definition defines counters of up to 62 bits in the IA32_QM_CTR MSR, and the first-generation MBM implementation uses 24 bit counters. Software is required to poll at 1 second or faster to ensure that data is retrieved before a counter rollover occurs more than once under worst conditions. As system bandwidths scale the software requirement is maintained with the introduction of a per-resource enumerable MBM counter width. In preparation for supporting hardware with an enumerable MBM counter width the current globally static MBM counter width is moved to a per-resource MBM counter width. Currently initialized to 24 always to result in no functional change. In essence there is one function, mbm_overflow_count() that needs to know the counter width to handle rollovers. The static value used within mbm_overflow_count() will be replaced with a value discovered from the hardware. Support for learning the MBM counter width from hardware is added in the change that follows. Signed-off-by: Reinette Chatre Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/e36743b9800f16ce600f86b89127391f61261f23.1588715690.git.reinette.chatre@intel.com Signed-off-by: Xiaochen Shen Signed-off-by: Gang Li --- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 8 +++++--- arch/x86/kernel/cpu/resctrl/internal.h | 7 +++++-- arch/x86/kernel/cpu/resctrl/monitor.c | 21 +++++++++++++-------- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 2 +- 4 files changed, 24 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 055c8613b5317..934c8fb8a64a7 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -495,14 +495,16 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, return ret; } -void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, - struct rdtgroup *rdtgrp, int evtid, int first) +void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, + struct rdt_domain *d, struct rdtgroup *rdtgrp, + int evtid, int first) { /* * setup the parameters to send to the IPI to read the data. */ rr->rgrp = rdtgrp; rr->evtid = evtid; + rr->r = r; rr->d = d; rr->val = 0; rr->first = first; @@ -539,7 +541,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) goto out; } - mon_event_read(&rr, d, rdtgrp, evtid, false); + mon_event_read(&rr, r, d, rdtgrp, evtid, false); if (rr.val & RMID_VAL_ERROR) seq_puts(m, "Error\n"); diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index e920edcb9c35a..e5d20b1d37379 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -87,6 +87,7 @@ union mon_data_bits { struct rmid_read { struct rdtgroup *rgrp; + struct rdt_resource *r; struct rdt_domain *d; int evtid; bool first; @@ -458,6 +459,7 @@ struct rdt_resource { struct list_head evt_list; int num_rmid; unsigned int mon_scale; + unsigned int mbm_width; unsigned long fflags; }; @@ -585,8 +587,9 @@ void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id); void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, struct rdt_domain *d); -void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, - struct rdtgroup *rdtgrp, int evtid, int first); +void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, + struct rdt_domain *d, struct rdtgroup *rdtgrp, + int evtid, int first); void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms); void mbm_handle_overflow(struct work_struct *work); diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index ba4159fba5f08..53b4c7915b57a 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -278,9 +278,9 @@ void free_rmid(u32 rmid) list_add_tail(&entry->list, &rmid_free_lru); } -static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr) +static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) { - u64 shift = 64 - MBM_CNTR_WIDTH, chunks; + u64 shift = 64 - width, chunks; chunks = (cur_msr << shift) - (prev_msr << shift); return chunks >>= shift; @@ -319,7 +319,7 @@ static u64 __mon_event_count(u32 rmid, struct rmid_read *rr) return 0; } - chunks = mbm_overflow_count(m->prev_msr, tval); + chunks = mbm_overflow_count(m->prev_msr, tval, rr->r->mbm_width); m->chunks += chunks; m->prev_msr = tval; @@ -342,7 +342,7 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr) if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) return; - chunks = mbm_overflow_count(m->prev_bw_msr, tval); + chunks = mbm_overflow_count(m->prev_bw_msr, tval, rr->r->mbm_width); cur_bw = (get_corrected_mbm_count(rmid, chunks) * r->mon_scale) >> 20; if (m->delta_comp) @@ -501,11 +501,12 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) } } -static void mbm_update(struct rdt_domain *d, int rmid) +static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) { struct rmid_read rr; rr.first = false; + rr.r = r; rr.d = d; /* @@ -577,6 +578,7 @@ void mbm_handle_overflow(struct work_struct *work) struct rdtgroup *prgrp, *crgrp; int cpu = smp_processor_id(); struct list_head *head; + struct rdt_resource *r; struct rdt_domain *d; mutex_lock(&rdtgroup_mutex); @@ -584,16 +586,18 @@ void mbm_handle_overflow(struct work_struct *work) if (!static_branch_likely(&rdt_mon_enable_key)) goto out_unlock; - d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]); + r = &rdt_resources_all[RDT_RESOURCE_L3]; + + d = get_domain_from_cpu(cpu, r); if (!d) goto out_unlock; list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - mbm_update(d, prgrp->mon.rmid); + mbm_update(r, d, prgrp->mon.rmid); head = &prgrp->mon.crdtgrp_list; list_for_each_entry(crgrp, head, mon.crdtgrp_list) - mbm_update(d, crgrp->mon.rmid); + mbm_update(r, d, crgrp->mon.rmid); if (is_mba_sc(NULL)) update_mba_bw(prgrp, d); @@ -686,6 +690,7 @@ int rdt_get_mon_l3_config(struct rdt_resource *r) r->mon_scale = boot_cpu_data.x86_cache_occ_scale; r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1; + r->mbm_width = MBM_CNTR_WIDTH; /* * A reasonable upper limit on the max threshold is the number diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index a910cf12180f6..0472ccd37324c 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -2365,7 +2365,7 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, goto out_destroy; if (is_mbm_event(mevt->evtid)) - mon_event_read(&rr, d, prgrp, mevt->evtid, true); + mon_event_read(&rr, r, d, prgrp, mevt->evtid, true); } kernfs_activate(kn); return 0; From b1bc795329caba5b981f3c05e37932129ed72fcb Mon Sep 17 00:00:00 2001 From: Xiaochen Shen Date: Sat, 28 May 2022 21:21:58 +0800 Subject: [PATCH 0256/5344] x86/resctrl: Support CPUID enumeration of MBM counter width commit f3d44f18b0662327c42128b9d3604489bdb6e36f upstream. Backport summary: Backport "wider MBM counters" feature for 5.4.143 LTS on ICX. The original Memory Bandwidth Monitoring (MBM) architectural definition defines counters of up to 62 bits in the IA32_QM_CTR MSR while the first-generation MBM implementation uses statically defined 24 bit counters. Expand the MBM CPUID enumeration properties to include the MBM counter width. The previously undefined EAX output register contains, in bits [7:0], the MBM counter width encoded as an offset from 24 bits. Enumerating this property is only specified for Intel CPUs. Suggested-by: Borislav Petkov Signed-off-by: Reinette Chatre Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/afa3af2f753f6bc301fb743bc8944e749cb24afa.1588715690.git.reinette.chatre@intel.com Signed-off-by: Xiaochen Shen Signed-off-by: Gang Li --- arch/x86/include/asm/processor.h | 3 ++- arch/x86/kernel/cpu/common.c | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 528b2dffc2775..16c6ee556f017 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -99,9 +99,10 @@ struct cpuinfo_x86 { /* in KB - valid for CPUS which support this call: */ unsigned int x86_cache_size; int x86_cache_alignment; /* In bytes */ - /* Cache QoS architectural values: */ + /* Cache QoS architectural values, valid only on the BSP: */ int x86_cache_max_rmid; /* max index */ int x86_cache_occ_scale; /* scale to bytes */ + int x86_cache_mbm_width_offset; int x86_power; unsigned long loops_per_jiffy; /* cpuid returned max cores value: */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f961a56e9da3f..05c81939e8185 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -877,6 +877,7 @@ static void init_cqm(struct cpuinfo_x86 *c) if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { c->x86_cache_max_rmid = -1; c->x86_cache_occ_scale = -1; + c->x86_cache_mbm_width_offset = -1; return; } @@ -893,6 +894,10 @@ static void init_cqm(struct cpuinfo_x86 *c) c->x86_cache_max_rmid = ecx; c->x86_cache_occ_scale = ebx; + if (c->x86_vendor == X86_VENDOR_INTEL) + c->x86_cache_mbm_width_offset = eax & 0xff; + else + c->x86_cache_mbm_width_offset = -1; } } From bf90bd09d7e23f3235431f28141e0661f862f403 Mon Sep 17 00:00:00 2001 From: Xiaochen Shen Date: Sat, 28 May 2022 21:25:02 +0800 Subject: [PATCH 0257/5344] x86/resctrl: Support wider MBM counters commit 0c4d5ba1b998e713815b7790d3db6ced0ae49489 upstream. Backport summary: Backport "wider MBM counters" feature for 5.4.143 LTS on ICX. The original Memory Bandwidth Monitoring (MBM) architectural definition defines counters of up to 62 bits in the IA32_QM_CTR MSR while the first-generation MBM implementation uses statically defined 24 bit counters. The MBM CPUID enumeration properties have been expanded to include the MBM counter width, encoded as an offset from 24 bits. While eight bits are available for the counter width offset IA32_QM_CTR MSR only supports 62 bit counters. Add a sanity check, with warning printed when encountered, to ensure counters cannot exceed the 62 bit limit. Signed-off-by: Reinette Chatre Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/69d52abd5b14794d3a0f05ba7c755ed1f4c0d5ed.1588715690.git.reinette.chatre@intel.com Signed-off-by: Xiaochen Shen Signed-off-by: Gang Li --- arch/x86/kernel/cpu/resctrl/internal.h | 8 +++++++- arch/x86/kernel/cpu/resctrl/monitor.c | 8 +++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index e5d20b1d37379..cf3293c4e8dc7 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -31,7 +31,7 @@ #define CQM_LIMBOCHECK_INTERVAL 1000 -#define MBM_CNTR_WIDTH 24 +#define MBM_CNTR_WIDTH_BASE 24 #define MBM_OVERFLOW_INTERVAL 1000 #define MAX_MBA_BW 100u #define MBA_IS_LINEAR 0x4 @@ -40,6 +40,12 @@ #define RMID_VAL_ERROR BIT_ULL(63) #define RMID_VAL_UNAVAIL BIT_ULL(62) +/* + * With the above fields in use 62 bits remain in MSR_IA32_QM_CTR for + * data to be returned. The counter width is discovered from the hardware + * as an offset from MBM_CNTR_WIDTH_BASE. + */ +#define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE) struct rdt_fs_context { diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 53b4c7915b57a..1e57d83e01f29 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -685,12 +685,18 @@ static void l3_mon_evt_init(struct rdt_resource *r) int rdt_get_mon_l3_config(struct rdt_resource *r) { + unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; unsigned int cl_size = boot_cpu_data.x86_cache_size; int ret; r->mon_scale = boot_cpu_data.x86_cache_occ_scale; r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1; - r->mbm_width = MBM_CNTR_WIDTH; + r->mbm_width = MBM_CNTR_WIDTH_BASE; + + if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX) + r->mbm_width += mbm_offset; + else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX) + pr_warn("Ignoring impossible MBM counter offset\n"); /* * A reasonable upper limit on the max threshold is the number From 1e669e72b74abac40e16479e139c8ce575073071 Mon Sep 17 00:00:00 2001 From: Xiaochen Shen Date: Sat, 28 May 2022 21:32:44 +0800 Subject: [PATCH 0258/5344] x86/resctrl: Fix memory bandwidth counter width for AMD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 2c18bd525c47f882f033b0a813ecd09c93e1ecdf upstream. Backport summary: Backport "wider MBM counters" feature for 5.4.143 LTS on ICX. Memory bandwidth is calculated reading the monitoring counter at two intervals and calculating the delta. It is the software’s responsibility to read the count often enough to avoid having the count roll over _twice_ between reads. The current code hardcodes the bandwidth monitoring counter's width to 24 bits for AMD. This is due to default base counter width which is 24. Currently, AMD does not implement the CPUID 0xF.[ECX=1]:EAX to adjust the counter width. But, the AMD hardware supports much wider bandwidth counter with the default width of 44 bits. Kernel reads these monitoring counters every 1 second and adjusts the counter value for overflow. With 24 bits and scale value of 64 for AMD, it can only measure up to 1GB/s without overflowing. For the rates above 1GB/s this will fail to measure the bandwidth. Fix the issue setting the default width to 44 bits by adjusting the offset. AMD future products will implement CPUID 0xF.[ECX=1]:EAX. [ bp: Let the line stick out and drop {}-brackets around a single statement. ] Fixes: 4d05bf71f157 ("x86/resctrl: Introduce AMD QOS feature") Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/159129975546.62538.5656031125604254041.stgit@naples-babu.amd.com Signed-off-by: Xiaochen Shen Signed-off-by: Gang Li --- arch/x86/kernel/cpu/common.c | 9 +++++---- arch/x86/kernel/cpu/resctrl/internal.h | 1 + 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 05c81939e8185..1c055b22b432d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -59,6 +59,7 @@ #endif #include "cpu.h" +#include "resctrl/internal.h" u32 elf_hwcap2 __read_mostly; @@ -894,10 +895,10 @@ static void init_cqm(struct cpuinfo_x86 *c) c->x86_cache_max_rmid = ecx; c->x86_cache_occ_scale = ebx; - if (c->x86_vendor == X86_VENDOR_INTEL) - c->x86_cache_mbm_width_offset = eax & 0xff; - else - c->x86_cache_mbm_width_offset = -1; + c->x86_cache_mbm_width_offset = eax & 0xff; + + if (c->x86_vendor == X86_VENDOR_AMD && !c->x86_cache_mbm_width_offset) + c->x86_cache_mbm_width_offset = MBM_CNTR_WIDTH_OFFSET_AMD; } } diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index cf3293c4e8dc7..5dcbb1a3ee00b 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -37,6 +37,7 @@ #define MBA_IS_LINEAR 0x4 #define MBA_MAX_MBPS U32_MAX #define MAX_MBA_BW_AMD 0x800 +#define MBM_CNTR_WIDTH_OFFSET_AMD 20 #define RMID_VAL_ERROR BIT_ULL(63) #define RMID_VAL_UNAVAIL BIT_ULL(62) From 043d5acc735a9cc0815080f3065adbf07c99c86f Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 8 Apr 2022 20:19:14 +0800 Subject: [PATCH 0259/5344] sched/psi: report zeroes for CPU full at the system level commit 890d550d7dbac7a31ecaa78732aa22be282bb6b8 upstream. Martin find it confusing when look at the /proc/pressure/cpu output, and found no hint about that CPU "full" line in psi Documentation. % cat /proc/pressure/cpu some avg10=0.92 avg60=0.91 avg300=0.73 total=933490489 full avg10=0.22 avg60=0.23 avg300=0.16 total=358783277 The PSI_CPU_FULL state is introduced by commit e7fcd7622823 ("psi: Add PSI_CPU_FULL state"), which mainly for cgroup level, but also counted at the system level as a side effect. Naturally, the FULL state doesn't exist for the CPU resource at the system level. These "full" numbers can come from CPU idle schedule latency. For example, t1 is the time when task wakeup on an idle CPU, t2 is the time when CPU pick and switch to it. The delta of (t2 - t1) will be in CPU_FULL state. Another case all processes can be stalled is when all cgroups have been throttled at the same time, which unlikely to happen. Anyway, CPU_FULL metric is meaningless and confusing at the system level. So this patch will report zeroes for CPU full at the system level, and update psi Documentation accordingly. Fixes: e7fcd7622823 ("psi: Add PSI_CPU_FULL state") Reported-by: Martin Steigerwald Suggested-by: Johannes Weiner Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220408121914.82855-1-zhouchengming@bytedance.com --- Documentation/accounting/psi.rst | 9 ++++----- kernel/sched/psi.c | 15 +++++++++------ 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/Documentation/accounting/psi.rst b/Documentation/accounting/psi.rst index 621111ce57401..4a87e1eb12a6c 100644 --- a/Documentation/accounting/psi.rst +++ b/Documentation/accounting/psi.rst @@ -35,11 +35,7 @@ Pressure interface Pressure information for each resource is exported through the respective file in /proc/pressure/ -- cpu, memory, and io. -The format for CPU is as such:: - - some avg10=0.00 avg60=0.00 avg300=0.00 total=0 - -and for memory and IO:: +The format is as such:: some avg10=0.00 avg60=0.00 avg300=0.00 total=0 full avg10=0.00 avg60=0.00 avg300=0.00 total=0 @@ -56,6 +52,9 @@ situation from a state where some tasks are stalled but the CPU is still doing productive work. As such, time spent in this subset of the stall state is tracked separately and exported in the "full" averages. +CPU full is undefined at the system level, but has been reported +since 5.13, so it is set to zero for backward compatibility. + The ratios (in %) are tracked as recent trends over ten, sixty, and three hundred second windows, which gives insight into short term events as well as medium and long term trends. The total absolute stall time diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 5a8dfcb3fcc0a..d1104f07daca1 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1045,14 +1045,17 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) mutex_unlock(&group->avgs_lock); for (full = 0; full < 2; full++) { - unsigned long avg[3]; - u64 total; + unsigned long avg[3] = { 0, }; + u64 total = 0; int w; - for (w = 0; w < 3; w++) - avg[w] = group->avg[res * 2 + full][w]; - total = div_u64(group->total[PSI_AVGS][res * 2 + full], - NSEC_PER_USEC); + /* CPU FULL is undefined at the system level */ + if (!(group == &psi_system && res == PSI_CPU && full)) { + for (w = 0; w < 3; w++) + avg[w] = group->avg[res * 2 + full][w]; + total = div_u64(group->total[PSI_AVGS][res * 2 + full], + NSEC_PER_USEC); + } seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", full ? "full" : "some", From d310b249f1a6e30cb3a2c084264313fa79a5dd76 Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Thu, 19 Sep 2019 17:11:20 +0300 Subject: [PATCH 0260/5344] fuse: optimize writepages search commit 6b2fb79963fbed7db3ef850926d913518fd5c62f upstream. Re-work fi->writepages, replacing list with rb-tree. This improves performance because kernel fuse iterates through fi->writepages for each writeback page and typical number of entries is about 800 (for 100MB of fuse writeback). Before patch: 10240+0 records in 10240+0 records out 10737418240 bytes (11 GB) copied, 41.3473 s, 260 MB/s 2 1 0 57445400 40416 6323676 0 0 33 374743 8633 19210 1 8 88 3 0 29.86% [kernel] [k] _raw_spin_lock 26.62% [fuse] [k] fuse_page_is_writeback After patch: 10240+0 records in 10240+0 records out 10737418240 bytes (11 GB) copied, 21.4954 s, 500 MB/s 2 9 0 53676040 31744 10265984 0 0 64 854790 10956 48387 1 6 88 6 0 23.55% [kernel] [k] copy_user_enhanced_fast_string 9.87% [kernel] [k] __memcpy 3.10% [kernel] [k] _raw_spin_lock Signed-off-by: Maxim Patlasov Signed-off-by: Vasily Averin Signed-off-by: Miklos Szeredi Signed-off-by: Jiachen Zhang --- fs/fuse/file.c | 62 ++++++++++++++++++++++++++++++++++++++---------- fs/fuse/fuse_i.h | 2 +- 2 files changed, 50 insertions(+), 14 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 0cc44f3ed3475..4c75209c2ab7a 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -364,7 +364,7 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) struct fuse_writepage_args { struct fuse_io_args ia; - struct list_head writepages_entry; + struct rb_node writepages_entry; struct list_head queue_entry; struct fuse_writepage_args *next; struct inode *inode; @@ -373,17 +373,23 @@ struct fuse_writepage_args { static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, pgoff_t idx_from, pgoff_t idx_to) { - struct fuse_writepage_args *wpa; + struct rb_node *n; + + n = fi->writepages.rb_node; - list_for_each_entry(wpa, &fi->writepages, writepages_entry) { + while (n) { + struct fuse_writepage_args *wpa; pgoff_t curr_index; + wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry); WARN_ON(get_fuse_inode(wpa->inode) != fi); curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT; - if (idx_from < curr_index + wpa->ia.ap.num_pages && - curr_index <= idx_to) { + if (idx_from >= curr_index + wpa->ia.ap.num_pages) + n = n->rb_right; + else if (idx_to < curr_index) + n = n->rb_left; + else return wpa; - } } return NULL; } @@ -1660,7 +1666,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct backing_dev_info *bdi = inode_to_bdi(inode); int i; - list_del(&wpa->writepages_entry); + rb_erase(&wpa->writepages_entry, &fi->writepages); for (i = 0; i < ap->num_pages; i++) { dec_wb_stat(&bdi->wb, WB_WRITEBACK); dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP); @@ -1748,6 +1754,36 @@ __acquires(fi->lock) } } +static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa) +{ + pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT; + pgoff_t idx_to = idx_from + wpa->ia.ap.num_pages - 1; + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + + WARN_ON(!wpa->ia.ap.num_pages); + while (*p) { + struct fuse_writepage_args *curr; + pgoff_t curr_index; + + parent = *p; + curr = rb_entry(parent, struct fuse_writepage_args, + writepages_entry); + WARN_ON(curr->inode != wpa->inode); + curr_index = curr->ia.write.in.offset >> PAGE_SHIFT; + + if (idx_from >= curr_index + curr->ia.ap.num_pages) + p = &(*p)->rb_right; + else if (idx_to < curr_index) + p = &(*p)->rb_left; + else + return (void) WARN_ON(true); + } + + rb_link_node(&wpa->writepages_entry, parent, p); + rb_insert_color(&wpa->writepages_entry, root); +} + static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args, int error) { @@ -1774,7 +1810,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args, wpa->next = next->next; next->next = NULL; next->ia.ff = fuse_file_get(wpa->ia.ff); - list_add(&next->writepages_entry, &fi->writepages); + tree_insert(&fi->writepages, next); /* * Skip fuse_flush_writepages() to make it easy to crop requests @@ -1909,7 +1945,7 @@ static int fuse_writepage_locked(struct page *page) inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); spin_lock(&fi->lock); - list_add(&wpa->writepages_entry, &fi->writepages); + tree_insert(&fi->writepages, wpa); list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); spin_unlock(&fi->lock); @@ -2021,10 +2057,10 @@ static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa, WARN_ON(new_ap->num_pages != 0); spin_lock(&fi->lock); - list_del(&new_wpa->writepages_entry); + rb_erase(&new_wpa->writepages_entry, &fi->writepages); old_wpa = fuse_find_writeback(fi, page->index, page->index); if (!old_wpa) { - list_add(&new_wpa->writepages_entry, &fi->writepages); + tree_insert(&fi->writepages, new_wpa); spin_unlock(&fi->lock); return false; } @@ -2139,7 +2175,7 @@ static int fuse_writepages_fill(struct page *page, wpa->inode = inode; spin_lock(&fi->lock); - list_add(&wpa->writepages_entry, &fi->writepages); + tree_insert(&fi->writepages, wpa); spin_unlock(&fi->lock); data->wpa = wpa; @@ -3476,5 +3512,5 @@ void fuse_init_file_inode(struct inode *inode) INIT_LIST_HEAD(&fi->queued_writes); fi->writectr = 0; init_waitqueue_head(&fi->page_waitq); - INIT_LIST_HEAD(&fi->writepages); + fi->writepages = RB_ROOT; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 4d64994903775..1323ddc38f2b4 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -114,7 +114,7 @@ struct fuse_inode { wait_queue_head_t page_waitq; /* List of writepage requestst (pending or sent) */ - struct list_head writepages; + struct rb_root writepages; }; /* readdir cache (directory only) */ From e389719bd708e9db7ed58718f681a273a587c7b2 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 14 Jul 2020 14:45:41 +0200 Subject: [PATCH 0261/5344] fuse: fix warning in tree_insert() and clean up writepage insertion commit c146024ec44c2946de7c6c45ddd3402abcab17f9 upstream. fuse_writepages_fill() calls tree_insert() with ap->num_pages = 0 which triggers the following warning: WARNING: CPU: 1 PID: 17211 at fs/fuse/file.c:1728 tree_insert+0xab/0xc0 [fuse] RIP: 0010:tree_insert+0xab/0xc0 [fuse] Call Trace: fuse_writepages_fill+0x5da/0x6a0 [fuse] write_cache_pages+0x171/0x470 fuse_writepages+0x8a/0x100 [fuse] do_writepages+0x43/0xe0 Fix up the warning and clean up the code around rb-tree insertion: - Rename tree_insert() to fuse_insert_writeback() and make it return the conflicting entry in case of failure - Re-add tree_insert() as a wrapper around fuse_insert_writeback() - Rename fuse_writepage_in_flight() to fuse_writepage_add() and reverse the meaning of the return value to mean + "true" in case the writepage entry was successfully added + "false" in case it was in-fligt queued on an existing writepage entry's auxiliary list or the existing writepage entry's temporary page updated Switch from fuse_find_writeback() + tree_insert() to fuse_insert_writeback() - Move setting orig_pages to before inserting/updating the entry; this may result in the orig_pages value being discarded later in case of an in-flight request - In case of a new writepage entry use fuse_writepage_add() unconditionally, only set data->wpa if the entry was added. Fixes: 6b2fb79963fb ("fuse: optimize writepages search") Reported-by: kernel test robot Original-path-by: Vasily Averin Signed-off-by: Miklos Szeredi Signed-off-by: Jiachen Zhang --- fs/fuse/file.c | 62 ++++++++++++++++++++++++-------------------------- 1 file changed, 30 insertions(+), 32 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 4c75209c2ab7a..416905e49551d 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1754,7 +1754,8 @@ __acquires(fi->lock) } } -static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa) +static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root, + struct fuse_writepage_args *wpa) { pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT; pgoff_t idx_to = idx_from + wpa->ia.ap.num_pages - 1; @@ -1777,11 +1778,17 @@ static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa) else if (idx_to < curr_index) p = &(*p)->rb_left; else - return (void) WARN_ON(true); + return curr; } rb_link_node(&wpa->writepages_entry, parent, p); rb_insert_color(&wpa->writepages_entry, root); + return NULL; +} + +static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa) +{ + WARN_ON(fuse_insert_writeback(root, wpa)); } static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args, @@ -2040,14 +2047,14 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data) } /* - * First recheck under fi->lock if the offending offset is still under - * writeback. If yes, then iterate auxiliary write requests, to see if there's + * Check under fi->lock if the page is under writeback, and insert it onto the + * rb_tree if not. Otherwise iterate auxiliary write requests, to see if there's * one already added for a page at this offset. If there's none, then insert * this new request onto the auxiliary list, otherwise reuse the existing one by - * copying the new page contents over to the old temporary page. + * swapping the new temp page with the old one. */ -static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa, - struct page *page) +static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa, + struct page *page) { struct fuse_inode *fi = get_fuse_inode(new_wpa->inode); struct fuse_writepage_args *tmp; @@ -2055,17 +2062,15 @@ static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa, struct fuse_args_pages *new_ap = &new_wpa->ia.ap; WARN_ON(new_ap->num_pages != 0); + new_ap->num_pages = 1; spin_lock(&fi->lock); - rb_erase(&new_wpa->writepages_entry, &fi->writepages); - old_wpa = fuse_find_writeback(fi, page->index, page->index); + old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa); if (!old_wpa) { - tree_insert(&fi->writepages, new_wpa); spin_unlock(&fi->lock); - return false; + return true; } - new_ap->num_pages = 1; for (tmp = old_wpa->next; tmp; tmp = tmp->next) { pgoff_t curr_index; @@ -2094,7 +2099,7 @@ static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa, fuse_writepage_free(new_wpa); } - return true; + return false; } static int fuse_writepages_fill(struct page *page, @@ -2173,12 +2178,6 @@ static int fuse_writepages_fill(struct page *page, ap->args.end = fuse_writepage_end; ap->num_pages = 0; wpa->inode = inode; - - spin_lock(&fi->lock); - tree_insert(&fi->writepages, wpa); - spin_unlock(&fi->lock); - - data->wpa = wpa; } set_page_writeback(page); @@ -2186,26 +2185,25 @@ static int fuse_writepages_fill(struct page *page, ap->pages[ap->num_pages] = tmp_page; ap->descs[ap->num_pages].offset = 0; ap->descs[ap->num_pages].length = PAGE_SIZE; + data->orig_pages[ap->num_pages] = page; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); err = 0; - if (is_writeback && fuse_writepage_in_flight(wpa, page)) { + if (data->wpa) { + /* + * Protected by fi->lock against concurrent access by + * fuse_page_is_writeback(). + */ + spin_lock(&fi->lock); + ap->num_pages++; + spin_unlock(&fi->lock); + } else if (fuse_writepage_add(wpa, page)) { + data->wpa = wpa; + } else { end_page_writeback(page); - data->wpa = NULL; - goto out_unlock; } - data->orig_pages[ap->num_pages] = page; - - /* - * Protected by fi->lock against concurrent access by - * fuse_page_is_writeback(). - */ - spin_lock(&fi->lock); - ap->num_pages++; - spin_unlock(&fi->lock); - out_unlock: unlock_page(page); From a9a39e5127753efcf7c137466365de37946c1272 Mon Sep 17 00:00:00 2001 From: Jiachen Zhang Date: Wed, 8 Jun 2022 14:29:38 +0800 Subject: [PATCH 0262/5344] bytedance: fuse: wb_trust_server enhancements According to the suggested writeback V2 patch in upstream mailing-list[1], this commit enhances the wb_trust_server flag: 1. Ensure c/mtime are not updated from kernel to server. 2. Simplify the wb_try_update condition with the with i_state flags and the RB_EMPTY_ROOT helper. 3. Skip mtime-based revalidation when fc->auto_inval_data is set with fc->wb_trust_server. Also make sure i_size is not updated if the inode is ever dirtied, by increasing attr_version after writeback for wb_trust_server. In that case, we should also ensure the ordering of the attr_version updating and the fi->writepages rb-tree updating. So that if an fuse page writeback happens during fuse_change_attributes(), either the fi->writepages is not empty, or the attr_version is increased. So we never mistakenly update a stale file size from server to kernel in such cases. [1] https://lore.kernel.org/linux-fsdevel/Ymfu8fGbfYi4FxQ4@miu.piliscsaba.redhat.com/ Signed-off-by: Jiachen Zhang --- fs/fuse/dir.c | 3 +- fs/fuse/file.c | 24 +++++++------- fs/fuse/fuse_i.h | 3 +- fs/fuse/inode.c | 67 ++++++++++++++++++++++++++++----------- include/uapi/linux/fuse.h | 3 +- 5 files changed, 65 insertions(+), 35 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 60378f3baaae1..9fc2a1a68f1cb 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1612,7 +1612,8 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, } fuse_change_attributes_common(inode, &outarg.attr, - attr_timeout(&outarg)); + attr_timeout(&outarg), + !trust_local_cmtime); oldsize = inode->i_size; /* see the comment in fuse_change_attributes() */ if (!is_wb || is_truncate || !S_ISREG(inode->i_mode)) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 416905e49551d..feb7f4ebbdb09 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -394,18 +394,6 @@ static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, return NULL; } -/* - * Check if any page of this file is under writeback. - * - * The fuse_inode lock should be held by the caller. - */ -bool fuse_file_is_writeback_locked(struct inode *inode) -{ - struct fuse_inode *fi = get_fuse_inode(inode); - - return fuse_find_writeback(fi, 0, ULONG_MAX); -} - /* * Check if any page in a range is under writeback * @@ -1844,6 +1832,15 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args, */ fuse_send_writepage(fc, next, inarg->offset + inarg->size); } + + if (fc->wb_trust_server) + fi->attr_version = atomic64_inc_return(&fc->attr_version); + /* + * Ensure attr_version increases before the page is move out of the + * writepages rb-tree. + */ + smp_mb(); + fi->writectr--; fuse_writepage_finish(fc, wpa); spin_unlock(&fi->lock); @@ -1881,6 +1878,9 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) struct fuse_file *ff; int err; + if (fc->wb_trust_server) + return 0; + ff = __fuse_write_file_get(fc, fi); err = fuse_flush_times(inode, ff); if (ff) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 1323ddc38f2b4..0fce3c4df08ee 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -923,7 +923,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, u64 attr_valid, u64 attr_version); void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, - u64 attr_valid); + u64 attr_valid, bool update_cmtime); /** * Initialize the client device @@ -1117,6 +1117,5 @@ unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args); */ u64 fuse_get_unique(struct fuse_iqueue *fiq); void fuse_free_conn(struct fuse_conn *fc); -bool fuse_file_is_writeback_locked(struct inode *inode); #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 9fd6abb11796a..39d25d6bd3fa5 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -146,7 +146,7 @@ static ino_t fuse_squash_ino(u64 ino64) } void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, - u64 attr_valid) + u64 attr_valid, bool update_cmtime) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); @@ -166,7 +166,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, inode->i_atime.tv_sec = attr->atime; inode->i_atime.tv_nsec = attr->atimensec; /* mtime from server may be stale due to local buffered write */ - if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) { + if (update_cmtime || !S_ISREG(inode->i_mode)) { inode->i_mtime.tv_sec = attr->mtime; inode->i_mtime.tv_nsec = attr->mtimensec; inode->i_ctime.tv_sec = attr->ctime; @@ -199,10 +199,14 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, loff_t oldsize; struct timespec64 old_mtime; bool try_wb_update = false; + bool wb_update = false; + bool inval = false; + bool update_cmtime = is_wb; - if (is_wb && fc->wb_trust_server && S_ISREG(inode->i_mode) && - inode_trylock(inode)) + if (is_wb && fc->wb_trust_server && S_ISREG(inode->i_mode)) { + inode_lock(inode); try_wb_update = true; + } spin_lock(&fi->lock); /* @@ -210,13 +214,26 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, * may update i_size. In these cases trust the cached value in the * inode. * - * One expection is if the page cache is clean and there is no in-flight - * fuse writeback request. The c/mtime and file size are allowed to be - * updated from server, so clear is_wb in that case. + * But with wb_trust_server, if all the following conditions are met, + * then we allow the attributes to be refreshed: + * + * - inode is not in the process of being written (I_SYNC) + * - inode has no dirty pages (I_DIRTY_PAGES) + * - inode data-related attributes are clean (I_DIRTY_DATASYNC) + * - inode does not have any page writeback in progress + * + * Note: checking PAGECACHE_TAG_WRITEBACK is not sufficient in fuse, + * since inode can appear to have no PageWriteback pages, yet still have + * outstanding write request. */ - if (try_wb_update && !fuse_file_is_writeback_locked(inode) && - !filemap_range_needs_writeback(inode->i_mapping, 0, LLONG_MAX)) - is_wb = 0; + if (try_wb_update && !(inode->i_state & (I_DIRTY_PAGES | I_SYNC | + I_DIRTY_DATASYNC)) && RB_EMPTY_ROOT(&fi->writepages)) + wb_update = true; + /* + * Ensure the ordering of cleanness checking and following attr_version + * comparison. + */ + smp_mb(); if ((attr_version != 0 && fi->attr_version > attr_version) || test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) { @@ -226,8 +243,15 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, return; } + if (fc->wb_trust_server && attr->should_inval) { + update_cmtime = true; + if (S_ISREG(inode->i_mode)) + inval = true; + } + old_mtime = inode->i_mtime; - fuse_change_attributes_common(inode, attr, attr_valid); + fuse_change_attributes_common(inode, attr, attr_valid, + update_cmtime); oldsize = inode->i_size; /* @@ -235,18 +259,22 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, * extend local i_size without keeping userspace server in sync. So, * attr->size coming from server can be stale. We cannot trust it. */ - if (!is_wb || !S_ISREG(inode->i_mode)) + if (!is_wb || wb_update || !S_ISREG(inode->i_mode)) i_size_write(inode, attr->size); spin_unlock(&fi->lock); - if (!is_wb && S_ISREG(inode->i_mode)) { - bool inval = false; - + if ((!is_wb || wb_update) && S_ISREG(inode->i_mode)) { if (oldsize != attr->size) { truncate_pagecache(inode, attr->size); if (!fc->explicit_inval_data) inval = true; - } else if (fc->auto_inval_data) { + } else if (!fc->wb_trust_server && fc->auto_inval_data) { + /* + * When fc->wb_trust_server is set, the old_mtime + * can be generated by kernel and must not equal to + * new_mtime generated by server. So skip in such + * case. + */ struct timespec64 new_mtime = { .tv_sec = attr->mtime, .tv_nsec = attr->mtimensec, @@ -259,10 +287,11 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, if (!timespec64_equal(&old_mtime, &new_mtime)) inval = true; } - - if (inval) - invalidate_inode_pages2(inode->i_mapping); } + + if (inval) + invalidate_inode_pages2(inode->i_mapping); + if (try_wb_update) inode_unlock(inode); } diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index ffcc561c4dc54..e5dcfa9883f72 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -231,7 +231,8 @@ struct fuse_attr { uint32_t gid; uint32_t rdev; uint32_t blksize; - uint32_t padding; + uint32_t padding:31; + uint32_t should_inval:1; }; struct fuse_kstatfs { From 51e3805c3d5444a62d979c611f9c732c54c87c4d Mon Sep 17 00:00:00 2001 From: Jiachen Zhang Date: Wed, 22 Jun 2022 20:11:07 +0800 Subject: [PATCH 0263/5344] bytedance: fuse: add FOPEN_INVAL_ATTR So that the fuse daemon can ask kernel to invalidate the attr cache on file open. Signed-off-by: Jiachen Zhang --- fs/fuse/file.c | 3 +++ include/uapi/linux/fuse.h | 2 ++ 2 files changed, 5 insertions(+) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index feb7f4ebbdb09..c7f43c1de21dd 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -213,6 +213,9 @@ void fuse_finish_open(struct inode *inode, struct file *file) invalidate_inode_pages2(inode->i_mapping); } + if (ff->open_flags & FOPEN_INVAL_ATTR) + fuse_invalidate_attr(inode); + if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) fuse_link_write_file(file); } diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index e5dcfa9883f72..6a8050e4abfa1 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -278,12 +278,14 @@ struct fuse_file_lock { * FOPEN_NONSEEKABLE: the file is not seekable * FOPEN_CACHE_DIR: allow caching this directory * FOPEN_STREAM: the file is stream-like (no file position at all) + * FOPEN_INVAL_ATTR: invalidate the attr cache on open */ #define FOPEN_DIRECT_IO (1 << 0) #define FOPEN_KEEP_CACHE (1 << 1) #define FOPEN_NONSEEKABLE (1 << 2) #define FOPEN_CACHE_DIR (1 << 3) #define FOPEN_STREAM (1 << 4) +#define FOPEN_INVAL_ATTR (1 << 31) /** * INIT request/reply flags From 79ef6d549150d769ef8828374028e516b318a55b Mon Sep 17 00:00:00 2001 From: Jiachen Zhang Date: Wed, 8 Jun 2022 10:42:33 +0800 Subject: [PATCH 0264/5344] Revert "mm: provide filemap_range_needs_writeback() helper" This reverts commit 20f02563e985a24ecca78fdd5f5ec267155c2f51. Signed-off-by: Jiachen Zhang --- include/linux/fs.h | 2 -- mm/filemap.c | 43 ------------------------------------------- 2 files changed, 45 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index c4438a53d7891..203a3fbae8e7f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2767,8 +2767,6 @@ static inline int filemap_fdatawait(struct address_space *mapping) extern bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend); extern int filemap_write_and_wait(struct address_space *mapping); -extern bool filemap_range_needs_writeback(struct address_space *, - loff_t lstart, loff_t lend); extern int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend); extern int __filemap_fdatawrite_range(struct address_space *mapping, diff --git a/mm/filemap.c b/mm/filemap.c index 282bff6ae177b..810e6e0a0c5c7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -659,49 +659,6 @@ int filemap_write_and_wait(struct address_space *mapping) } EXPORT_SYMBOL(filemap_write_and_wait); -/** - * filemap_range_needs_writeback - check if range potentially needs writeback - * @mapping: address space within which to check - * @start_byte: offset in bytes where the range starts - * @end_byte: offset in bytes where the range ends (inclusive) - * - * Find at least one page in the range supplied, usually used to check if - * direct writing in this range will trigger a writeback. Used by O_DIRECT - * read/write with IOCB_NOWAIT, to see if the caller needs to do - * filemap_write_and_wait_range() before proceeding. - * - * Return: %true if the caller should do filemap_write_and_wait_range() before - * doing O_DIRECT to a page in this range, %false otherwise. - */ -bool filemap_range_needs_writeback(struct address_space *mapping, - loff_t start_byte, loff_t end_byte) -{ - XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); - pgoff_t max = end_byte >> PAGE_SHIFT; - struct page *page; - - if (!mapping_needs_writeback(mapping)) - return false; - if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && - !mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) - return false; - if (end_byte < start_byte) - return false; - - rcu_read_lock(); - xas_for_each(&xas, page, max) { - if (xas_retry(&xas, page)) - continue; - if (xa_is_value(page)) - continue; - if (PageDirty(page) || PageLocked(page) || PageWriteback(page)) - break; - } - rcu_read_unlock(); - return page != NULL; -} -EXPORT_SYMBOL_GPL(filemap_range_needs_writeback); - /** * filemap_write_and_wait_range - write out & wait on a file range * @mapping: the address_space for the pages From b953f1fb9aa30dfbef4ab3dfb14ed7cd5e0e254a Mon Sep 17 00:00:00 2001 From: Jiachen Zhang Date: Tue, 28 Jun 2022 13:48:42 +0800 Subject: [PATCH 0265/5344] bytedance: fuse: fix wrong value assignment in fuse_change_attributes() Generally, we should not update c/mtime in writeback_cache mode. This commit fixes it. Fixes: 6f6eec0a70c6 ("bytedance: fuse: wb_trust_server enhancements") Signed-off-by: Jiachen Zhang --- fs/fuse/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 39d25d6bd3fa5..654504403f0f5 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -201,7 +201,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, bool try_wb_update = false; bool wb_update = false; bool inval = false; - bool update_cmtime = is_wb; + bool update_cmtime = !is_wb; if (is_wb && fc->wb_trust_server && S_ISREG(inode->i_mode)) { inode_lock(inode); From 719e56670ceb841c655343398bdb94b6aee9ca23 Mon Sep 17 00:00:00 2001 From: Jiachen Zhang Date: Mon, 27 Jun 2022 21:06:49 +0800 Subject: [PATCH 0266/5344] bytedance: fuse: fix recursive inode_lock() Some code path of fuse_change_attributes() already get inode lock, this commit explicitly marks them to avoid recursive locking. Fixes: 6f6eec0a70c6 ("bytedance: fuse: wb_trust_server enhancements") Signed-off-by: Jiachen Zhang --- fs/fuse/dir.c | 25 ++++++++++++++---------- fs/fuse/file.c | 10 ++++++---- fs/fuse/fuse_i.h | 12 ++++++++++-- fs/fuse/inode.c | 48 +++++++++++++++++++++++++++++++++++++++-------- fs/fuse/readdir.c | 5 +++-- 5 files changed, 74 insertions(+), 26 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 9fc2a1a68f1cb..3994d24f8bb13 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -255,7 +255,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) forget_all_cached_acls(inode); fuse_change_attributes(inode, &outarg.attr, entry_attr_timeout(&outarg), - attr_version); + attr_version, FUSE_INODE_UNLOCKED); fuse_change_entry_timeout(entry, &outarg); } else if (inode) { fi = get_fuse_inode(inode); @@ -939,7 +939,7 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, } static int fuse_do_getattr(struct inode *inode, struct kstat *stat, - struct file *file) + struct file *file, enum inode_lock_state lock_state) { int err; struct fuse_getattr_in inarg; @@ -976,7 +976,8 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, } else { fuse_change_attributes(inode, &outarg.attr, attr_timeout(&outarg), - attr_version); + attr_version, + lock_state); if (stat) fuse_fillattr(inode, &outarg.attr, stat); } @@ -986,7 +987,8 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, static int fuse_update_get_attr(struct inode *inode, struct file *file, struct kstat *stat, u32 request_mask, - unsigned int flags) + unsigned int flags, + enum inode_lock_state lock_state) { struct fuse_inode *fi = get_fuse_inode(inode); int err = 0; @@ -1003,7 +1005,7 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, if (sync) { forget_all_cached_acls(inode); - err = fuse_do_getattr(inode, stat, file); + err = fuse_do_getattr(inode, stat, file, lock_state); } else if (stat) { generic_fillattr(inode, stat); stat->mode = fi->orig_i_mode; @@ -1013,11 +1015,13 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, return err; } -int fuse_update_attributes(struct inode *inode, struct file *file) +int fuse_update_attributes(struct inode *inode, struct file *file, + enum inode_lock_state lock_state) { /* Do *not* need to get atime for internal purposes */ return fuse_update_get_attr(inode, file, NULL, - STATX_BASIC_STATS & ~STATX_ATIME, 0); + STATX_BASIC_STATS & ~STATX_ATIME, 0, + lock_state); } int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, @@ -1151,7 +1155,7 @@ static int fuse_perm_getattr(struct inode *inode, int mask) return -ECHILD; forget_all_cached_acls(inode); - return fuse_do_getattr(inode, NULL, NULL); + return fuse_do_getattr(inode, NULL, NULL, FUSE_INODE_MAY_LOCKED); } /* @@ -1674,7 +1678,7 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr) * ia_mode calculation may have used stale i_mode. * Refresh and recalculate. */ - ret = fuse_do_getattr(inode, NULL, file); + ret = fuse_do_getattr(inode, NULL, file, FUSE_INODE_LOCKED); if (ret) return ret; @@ -1720,7 +1724,8 @@ static int fuse_getattr(const struct path *path, struct kstat *stat, if (!fuse_allow_current_process(fc)) return -EACCES; - return fuse_update_get_attr(inode, NULL, stat, request_mask, flags); + return fuse_update_get_attr(inode, NULL, stat, request_mask, flags, + FUSE_INODE_UNLOCKED); } static const struct inode_operations fuse_dir_inode_operations = { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index c7f43c1de21dd..52a29d55a684f 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1034,7 +1034,8 @@ static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to) if (fc->auto_inval_data || (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) { int err; - err = fuse_update_attributes(inode, iocb->ki_filp); + err = fuse_update_attributes(inode, iocb->ki_filp, + FUSE_INODE_UNLOCKED); if (err) return err; } @@ -1320,7 +1321,8 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) if (get_fuse_conn(inode)->writeback_cache) { /* Update size (EOF optimization) and mode (SUID clearing) */ - err = fuse_update_attributes(mapping->host, file); + err = fuse_update_attributes(mapping->host, file, + FUSE_INODE_UNLOCKED); if (err) return err; @@ -2623,7 +2625,7 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes); fallback: - err = fuse_update_attributes(inode, file); + err = fuse_update_attributes(inode, file, FUSE_INODE_LOCKED); if (!err) return generic_file_llseek(file, offset, whence); else @@ -2643,7 +2645,7 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence) break; case SEEK_END: inode_lock(inode); - retval = fuse_update_attributes(inode, file); + retval = fuse_update_attributes(inode, file, FUSE_INODE_LOCKED); if (!retval) retval = generic_file_llseek(file, offset, whence); inode_unlock(inode); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 0fce3c4df08ee..0674ab6dad965 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -916,11 +916,18 @@ void fuse_init_dir(struct inode *inode); */ void fuse_init_symlink(struct inode *inode); +enum inode_lock_state { + FUSE_INODE_LOCKED, + FUSE_INODE_MAY_LOCKED, + FUSE_INODE_UNLOCKED, +}; + /** * Change attributes of an inode */ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, - u64 attr_valid, u64 attr_version); + u64 attr_valid, u64 attr_version, + enum inode_lock_state lock_state); void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, u64 attr_valid, bool update_cmtime); @@ -1028,7 +1035,8 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id); void fuse_update_ctime(struct inode *inode); -int fuse_update_attributes(struct inode *inode, struct file *file); +int fuse_update_attributes(struct inode *inode, struct file *file, + enum inode_lock_state lock_state); void fuse_flush_writepages(struct inode *inode); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 654504403f0f5..9bc7872330b2a 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -191,7 +191,8 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, } void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, - u64 attr_valid, u64 attr_version) + u64 attr_valid, u64 attr_version, + enum inode_lock_state lock_state) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); @@ -202,10 +203,32 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, bool wb_update = false; bool inval = false; bool update_cmtime = !is_wb; + int res = 0; + bool attr_is_inval = false; + bool should_unlock_inode = false; - if (is_wb && fc->wb_trust_server && S_ISREG(inode->i_mode)) { - inode_lock(inode); + if (fc->wb_trust_server && S_ISREG(inode->i_mode)) { try_wb_update = true; + switch (lock_state) { + case FUSE_INODE_LOCKED: + break; + case FUSE_INODE_MAY_LOCKED: + if (!inode_trylock(inode)) + try_wb_update = false; + else + should_unlock_inode = true; + break; + case FUSE_INODE_UNLOCKED: + res = down_write_killable(&inode->i_rwsem); + if (res) + try_wb_update = false; + else + should_unlock_inode = true; + break; + default: + WARN_ON(1); + try_wb_update = false; + } } spin_lock(&fi->lock); @@ -238,9 +261,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, if ((attr_version != 0 && fi->attr_version > attr_version) || test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) { spin_unlock(&fi->lock); - if (try_wb_update) - inode_unlock(inode); - return; + goto out_unlock_inode; } if (fc->wb_trust_server && attr->should_inval) { @@ -250,8 +271,17 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, } old_mtime = inode->i_mtime; + attr_is_inval = !!(STATX_BASIC_STATS & ~STATX_ATIME & + READ_ONCE(fi->inval_mask)); fuse_change_attributes_common(inode, attr, attr_valid, update_cmtime); + /* + * If we did not get the inode lock in this function, then we should + * re-invalidate the attributes and try it later. + */ + if (fc->wb_trust_server && S_ISREG(inode->i_mode) && !wb_update && + attr_is_inval) + fuse_invalidate_attr(inode); oldsize = inode->i_size; /* @@ -292,7 +322,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, if (inval) invalidate_inode_pages2(inode->i_mapping); - if (try_wb_update) +out_unlock_inode: + if (should_unlock_inode) inode_unlock(inode); } @@ -367,7 +398,8 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, spin_lock(&fi->lock); fi->nlookup++; spin_unlock(&fi->lock); - fuse_change_attributes(inode, attr, attr_valid, attr_version); + fuse_change_attributes(inode, attr, attr_valid, attr_version, + FUSE_INODE_UNLOCKED); return inode; } diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 9f596a4ec4b8c..6c8c3d4c96881 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -220,7 +220,7 @@ static int fuse_direntplus_link(struct file *file, forget_all_cached_acls(inode); fuse_change_attributes(inode, &o->attr, entry_attr_timeout(o), - attr_version); + attr_version, FUSE_INODE_UNLOCKED); /* * The other branch comes via fuse_iget() * which bumps nlookup inside @@ -468,7 +468,8 @@ static int fuse_readdir_cached(struct file *file, struct dir_context *ctx) * cache; both cases require an up-to-date mtime value. */ if (!ctx->pos && fc->auto_inval_data) { - int err = fuse_update_attributes(inode, file); + int err = fuse_update_attributes(inode, file, + FUSE_INODE_LOCKED); if (err) return err; From 0d2eb45b89aed13116c17b09e4136f428a00732e Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Thu, 26 May 2022 15:04:55 +0800 Subject: [PATCH 0267/5344] bytedance: vduse: Fix virtiofs dead timeout handler Now the virtiofs driver would not check the unique field from response. And it even doesn't need any reply for high priority queue. So let's fix the virtiofs dead timeout handler. This also adds some logs for inflight I/O replay for further debug. Signed-off-by: Xie Yongji --- drivers/vdpa/vdpa_user/vduse_dev.c | 35 ++++++++++-------------------- 1 file changed, 12 insertions(+), 23 deletions(-) diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index 64027a285d0f9..35f0a0532c811 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -1489,7 +1489,6 @@ static int vduse_fs_timeout_handler(struct vduse_dev *dev, ssize_t bytes; unsigned short head; int ret; - struct fuse_in_header in; struct fuse_out_header out; ret = vringh_getdesc_iotlb(&vq->vring, &vq->out_iov, &vq->in_iov, @@ -1497,37 +1496,22 @@ static int vduse_fs_timeout_handler(struct vduse_dev *dev, if (ret != 1) return ret; - if (vq->out_iov.used < 1) { - pr_err("VDUSE: missing headers - out_iov: %u\n", - vq->out_iov.used); + len = vringh_kiov_length(&vq->in_iov); + if (!len) goto out; - } - if (vq->out_iov.iov[0].iov_len < sizeof(struct fuse_in_header)) { - pr_err("VDUSE: request in header too short\n"); + if (vq->index == 0) { + pr_err("VDUSE: invalid req in virtiofs high priority queue\n"); goto out; } - bytes = vringh_iov_pull_iotlb(&vq->vring, &vq->out_iov, - &in, sizeof(struct fuse_in_header)); - if (bytes != sizeof(struct fuse_in_header)) { - ret = (bytes >= 0) ? -EINVAL : bytes; - pr_err("VDUSE: read fuse header failed: %ld\n", bytes); - goto err; - } - - len = vringh_kiov_length(&vq->in_iov); - if (!len) - goto out; - - out.unique = in.unique; out.error = -EIO; - bytes = vringh_iov_push_iotlb(&vq->vring, &vq->in_iov, &out, sizeof(struct fuse_out_header)); if (bytes != sizeof(struct fuse_out_header)) { ret = (bytes >= 0) ? -EINVAL : bytes; - pr_err("VDUSE: write fuse header failed: %ld\n", bytes); + pr_err("VDUSE: write fuse header failed in (%u, %u): %ld\n", + head, vq->vring.last_avail_idx, bytes); goto err; } @@ -1536,7 +1520,8 @@ static int vduse_fs_timeout_handler(struct vduse_dev *dev, out: ret = vringh_complete_iotlb(&vq->vring, head, len); if (ret) { - pr_err("VDUSE: update used vring failed\n"); + pr_err("VDUSE: update used vring failed in (%u, %u): %ld\n", + head, vq->vring.last_avail_idx, bytes); goto err; } @@ -1590,6 +1575,10 @@ static void vduse_vq_check_inflights(struct vduse_dev *dev, int index) if (inflight->desc[i].inflight) vringh_recover_desc_iotlb(&vq->vring, idx++, i); } + + pr_info("VDUSE: get vq%d I/O for %s, desc %d num %d idx %u inuse %d\n", + index, dev_name(&dev->dev), desc_num, vq->vring.vring.num, + vq->vring.last_avail_idx, idx - vq->vring.last_avail_idx); } static void vduse_dev_timeout_work(struct work_struct *work) From b6cc3c8b1b0c2538cae93162a8e6b27b73614989 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Wed, 1 Jun 2022 19:30:12 +0800 Subject: [PATCH 0268/5344] bytedance: vduse: Add default dead timeout handler Add default dead timeout handler for virtio devices other than block device and fs device. Signed-off-by: Xie Yongji --- drivers/vdpa/vdpa_user/vduse_dev.c | 48 +++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index 35f0a0532c811..40991f40327c3 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -101,6 +101,7 @@ struct vduse_dev { u8 status; struct delayed_work timeout_work; u16 dead_timeout; + int (*dead_handler)(struct vduse_dev *dev, struct vduse_virtqueue *vq); void *shm_addr; u8 dev_shm_size; u8 vq_shm_off; @@ -1531,6 +1532,30 @@ static int vduse_fs_timeout_handler(struct vduse_dev *dev, return ret; } +static int vduse_default_timeout_handler(struct vduse_dev *dev, + struct vduse_virtqueue *vq) +{ + size_t len = 0; + unsigned short head; + int ret; + + ret = vringh_getdesc_iotlb(&vq->vring, &vq->out_iov, &vq->in_iov, + &head, GFP_ATOMIC); + if (ret != 1) + return ret; + + ret = vringh_complete_iotlb(&vq->vring, head, len); + if (ret) { + pr_err("VDUSE: update used vring failed\n"); + goto err; + } + + return 1; +err: + vringh_abandon_iotlb(&vq->vring, 1); + return ret; +} + /* Returns 0 if there was no request, 1 if there was, or -errno. */ static int vduse_req_timeout_handler(struct vduse_dev *dev, struct vduse_virtqueue *vq) @@ -1545,11 +1570,7 @@ static int vduse_req_timeout_handler(struct vduse_dev *dev, return ret; } - if (dev->device_id == VIRTIO_ID_BLOCK) - ret = vduse_blk_timeout_handler(dev, vq); - else if (dev->device_id == VIRTIO_ID_FS) - ret = vduse_fs_timeout_handler(dev, vq); - + ret = dev->dead_handler(dev, vq); if (ret < 0 && do_retry) { do_retry = false; goto retry; @@ -1609,12 +1630,8 @@ static void vduse_dev_timeout_work(struct work_struct *work) } spin_unlock(&dev->msg_lock); - if (dev->device_id != VIRTIO_ID_BLOCK && - dev->device_id != VIRTIO_ID_FS) { - pr_warn("VDUSE: unsupported device type %d in %s\n", - dev->device_id, dev_name(&dev->dev)); + if (!dev->dead_handler) goto unlock; - } if (!dev->shm_addr && check_inflight) { check_inflight = false; @@ -1647,6 +1664,16 @@ static void vduse_dev_timeout_work(struct work_struct *work) mutex_unlock(&dev->lock); } +static void vduse_set_dead_handler(struct vduse_dev *dev) +{ + if (dev->device_id == VIRTIO_ID_FS) + dev->dead_handler = vduse_fs_timeout_handler; + else if (dev->device_id == VIRTIO_ID_BLOCK) + dev->dead_handler = vduse_blk_timeout_handler; + else + dev->dead_handler = vduse_default_timeout_handler; +} + static ssize_t abort_conn_store(struct device *device, struct device_attribute *attr, const char *buf, size_t count) @@ -1806,6 +1833,7 @@ static int vduse_create_dev(struct vduse_dev_config *config, if (ret) goto err_vqs; + vduse_set_dead_handler(dev); list_add(&dev->list, &vduse_devs); __module_get(THIS_MODULE); From c75a2a62765c14fd291a13b25595c48283ce5de8 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Wed, 1 Jun 2022 19:39:47 +0800 Subject: [PATCH 0269/5344] bytedance: vduse: Add a device attribute to enable dead timeout handler Add a device attribute to enable dead timeout handler. This might be used by some device drivers which can handle the I/O timeout cases by self. Signed-off-by: Xie Yongji --- drivers/vdpa/vdpa_user/vduse_dev.c | 34 ++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index 40991f40327c3..e45a1e5279126 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -1688,8 +1688,42 @@ static ssize_t abort_conn_store(struct device *device, static DEVICE_ATTR_WO(abort_conn); +static ssize_t enable_dead_handler_show(struct device *device, + struct device_attribute *attr, + char *buf) +{ + struct vduse_dev *dev = container_of(device, struct vduse_dev, dev); + + return sprintf(buf, "%d\n", dev->dead_handler ? 1 : 0); +} + +static ssize_t enable_dead_handler_store(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct vduse_dev *dev = container_of(device, struct vduse_dev, dev); + bool enable; + int ret; + + ret = kstrtobool(buf, &enable); + if (ret) + return ret; + + mutex_lock(&dev->lock); + if (enable) + vduse_set_dead_handler(dev); + else + dev->dead_handler = NULL; + mutex_unlock(&dev->lock); + + return count; +} + +static DEVICE_ATTR_RW(enable_dead_handler); + static struct attribute *vduse_dev_attrs[] = { &dev_attr_abort_conn.attr, + &dev_attr_enable_dead_handler.attr, NULL }; From 51ab0ca60a31df4d6e95cfcd248e64aa3d66faf3 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Thu, 16 Jun 2022 21:20:07 +0800 Subject: [PATCH 0270/5344] bytedance: vduse: Add sysfs interface for dead timeout Add a sysfs interface to allow user to set dead timeout explicitly. Signed-off-by: Xie Yongji --- drivers/vdpa/vdpa_user/vduse_dev.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index e45a1e5279126..3af8701675e64 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -1721,9 +1721,35 @@ static ssize_t enable_dead_handler_store(struct device *device, static DEVICE_ATTR_RW(enable_dead_handler); +static ssize_t dead_timeout_show(struct device *device, + struct device_attribute *attr, + char *buf) +{ + struct vduse_dev *dev = container_of(device, struct vduse_dev, dev); + + return sprintf(buf, "%u\n", dev->dead_timeout); +} + +static ssize_t dead_timeout_store(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct vduse_dev *dev = container_of(device, struct vduse_dev, dev); + int ret; + + ret = kstrtou16(buf, 0, &dev->dead_timeout); + if (ret) + return ret; + + return count; +} + +static DEVICE_ATTR_RW(dead_timeout); + static struct attribute *vduse_dev_attrs[] = { &dev_attr_abort_conn.attr, &dev_attr_enable_dead_handler.attr, + &dev_attr_dead_timeout.attr, NULL }; From 9b6bd6a71836ff1a7046515cb3e3251cdee8a675 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Tue, 24 May 2022 21:51:58 +0800 Subject: [PATCH 0271/5344] fuse: allow skipping abort interface for virtiofs The commit 15c8e72e88e0 ("fuse: allow skipping control interface and forced unmount") tries to remove the control interface for virtio-fs since it does not support aborting requests which are being processed. But it doesn't work now. This commits fix the bug, but only remove the abort interface instead since other interfaces should be useful. Link: https://lore.kernel.org/linux-fsdevel/20220607110504.198-1-xieyongji@bytedance.com/ Fixes: 15c8e72e88e0 ("fuse: allow skipping control interface and forced unmount") Signed-off-by: Xie Yongji --- fs/fuse/control.c | 4 ++-- fs/fuse/fuse_i.h | 6 +++--- fs/fuse/inode.c | 2 +- fs/fuse/virtio_fs.c | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/fuse/control.c b/fs/fuse/control.c index c23f6f243ad42..71a2ecf50c241 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -279,8 +279,8 @@ int fuse_ctl_add_conn(struct fuse_conn *fc) if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1, NULL, &fuse_ctl_waiting_ops) || - !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1, - NULL, &fuse_ctl_abort_ops) || + (!fc->no_abort_control && !fuse_ctl_add_dentry(parent, fc, "abort", + S_IFREG | 0200, 1, NULL, &fuse_ctl_abort_ops)) || !fuse_ctl_add_dentry(parent, fc, "max_background", S_IFREG | 0600, 1, NULL, &fuse_conn_max_background_ops) || !fuse_ctl_add_dentry(parent, fc, "congestion_threshold", diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 0674ab6dad965..5c42acce32e75 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -485,7 +485,7 @@ struct fuse_fs_context { bool default_permissions:1; bool allow_other:1; bool destroy:1; - bool no_control:1; + bool no_abort_control:1; bool no_force_umount:1; bool no_mount_options:1; bool delete_stale:1; @@ -724,8 +724,8 @@ struct fuse_conn { /* Delete dentries that have gone stale */ unsigned int delete_stale:1; - /** Do not create entry in fusectl fs */ - unsigned int no_control:1; + /** Do not create abort entry in fusectl fs */ + unsigned int no_abort_control:1; /** Do not allow MNT_FORCE umount */ unsigned int no_force_umount:1; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 9bc7872330b2a..aee1460c9cce0 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1298,7 +1298,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) fc->group_id = ctx->group_id; fc->max_read = max_t(unsigned, 4096, ctx->max_read); fc->destroy = ctx->destroy; - fc->no_control = ctx->no_control; + fc->no_abort_control = ctx->no_abort_control; fc->no_force_umount = ctx->no_force_umount; fc->no_mount_options = ctx->no_mount_options; diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 01d2e67eb163e..f0a7d2a07e96a 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1132,7 +1132,7 @@ static inline void virtio_fs_ctx_set_defaults(struct fuse_fs_context *ctx) ctx->max_read = UINT_MAX; ctx->blksize = 512; ctx->destroy = true; - ctx->no_control = true; + ctx->no_abort_control = true; } static int virtio_fs_fill_super(struct super_block *sb, struct fs_context *fsc) From 9e6bce23d0b2845c86098c3da234801d06956abe Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Fri, 29 Apr 2022 18:27:31 +0800 Subject: [PATCH 0272/5344] bytedance: virtiofs: Support I/O timeout This add I/O timeout support for virtio-fs. We will abort all inflight I/O request after timeout. To keep compatibility, this feature is disabled by default. Signed-off-by: Xie Yongji --- fs/fuse/virtio_fs.c | 62 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index f0a7d2a07e96a..77b6ffbfafd3a 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -14,6 +14,12 @@ #include #include "fuse_i.h" +/* Disabled by default */ +#define IO_TIMEOUT_MS_DEFAULT 0 + +static unsigned long io_timeout_ms = IO_TIMEOUT_MS_DEFAULT; +module_param(io_timeout_ms, ulong, 0644); + /* List of virtio-fs device instances and a lock for the list. Also provides * mutual exclusion in device removal and mounting path */ @@ -167,12 +173,23 @@ static void virtio_fs_fiq_release(struct fuse_iqueue *fiq) mutex_unlock(&virtio_fs_mutex); } +static void virtio_fs_forgot_requests_abort(struct virtio_fs_vq *fsvq); +static void virtio_fs_requests_abort(struct virtio_fs_vq *fsvq); + static void virtio_fs_drain_queue(struct virtio_fs_vq *fsvq) { + ktime_t start = ktime_get(); + u64 timeout_ms; + WARN_ON(fsvq->in_flight < 0); /* Wait for in flight requests to finish.*/ while (1) { + timeout_ms = READ_ONCE(io_timeout_ms); + if (timeout_ms && + ktime_after(ktime_sub_ms(ktime_get(), timeout_ms), start)) + break; + spin_lock(&fsvq->lock); if (!fsvq->in_flight) { spin_unlock(&fsvq->lock); @@ -182,7 +199,12 @@ static void virtio_fs_drain_queue(struct virtio_fs_vq *fsvq) /* TODO use completion instead of timeout */ usleep_range(1000, 2000); } - + if (fsvq->in_flight) { + if (fsvq->vq->index == VQ_HIPRIO) + virtio_fs_forgot_requests_abort(fsvq); + else + virtio_fs_requests_abort(fsvq); + } flush_work(&fsvq->done_work); flush_delayed_work(&fsvq->dispatch_work); } @@ -599,6 +621,44 @@ static void virtio_fs_requests_done_work(struct work_struct *work) } } +static void virtio_fs_forgot_requests_abort(struct virtio_fs_vq *fsvq) +{ + struct virtqueue *vq = fsvq->vq; + void *req; + + /* Free completed FUSE_FORGET requests */ + spin_lock(&fsvq->lock); + while ((req = virtqueue_detach_unused_buf(vq)) != NULL) { + kfree(req); + dec_in_flight_req(fsvq); + } + spin_unlock(&fsvq->lock); +} + +static void virtio_fs_requests_abort(struct virtio_fs_vq *fsvq) +{ + struct fuse_pqueue *fpq = &fsvq->fud->pq; + struct virtqueue *vq = fsvq->vq; + struct fuse_req *req; + struct fuse_req *next; + LIST_HEAD(reqs); + + spin_lock(&fsvq->lock); + while ((req = virtqueue_detach_unused_buf(vq)) != NULL) { + spin_lock(&fpq->lock); + list_move_tail(&req->list, &reqs); + spin_unlock(&fpq->lock); + } + spin_unlock(&fsvq->lock); + + list_for_each_entry_safe(req, next, &reqs, list) { + pr_err("Abort virtiofs request: %u\n", req->in.h.opcode); + list_del_init(&req->list); + req->out.h.error = -ECONNABORTED; + virtio_fs_request_complete(req, fsvq); + } +} + /* Virtqueue interrupt handler */ static void virtio_fs_vq_done(struct virtqueue *vq) { From a0336ecaaac620d264041e1351dfd2c373a6ed12 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Tue, 24 May 2022 14:48:59 +0800 Subject: [PATCH 0273/5344] bytedance: virtiofs: Abort requests if virtqueue is broken The inflight requests would never be completed if virtqueue is broken. So let's abort them. Signed-off-by: Xie Yongji --- fs/fuse/virtio_fs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 77b6ffbfafd3a..64d3ab963c8ea 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -186,8 +186,8 @@ static void virtio_fs_drain_queue(struct virtio_fs_vq *fsvq) /* Wait for in flight requests to finish.*/ while (1) { timeout_ms = READ_ONCE(io_timeout_ms); - if (timeout_ms && - ktime_after(ktime_sub_ms(ktime_get(), timeout_ms), start)) + if (virtqueue_is_broken(fsvq->vq) || (timeout_ms && + ktime_after(ktime_sub_ms(ktime_get(), timeout_ms), start))) break; spin_lock(&fsvq->lock); From a4d33a6fdc9d1388723c2cb530c554fe597805bc Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Fri, 27 May 2022 13:29:10 +0800 Subject: [PATCH 0274/5344] bytedance: vduse: Add irq_use_wq sysfs interface Add irq_use_wq sysfs interface to control whether use workqueue to inject irq or not. Signed-off-by: Xie Yongji --- drivers/vdpa/vdpa_user/vduse_dev.c | 31 ++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index 3af8701675e64..cd7b9e7a3954f 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -52,6 +52,7 @@ struct vduse_virtqueue { struct work_struct inject; struct kobject kobj; int irq_affinity; + bool irq_use_wq; u16 avail_index; u32 num; u64 desc_addr; @@ -1124,8 +1125,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, vq = dev->vqs[vq_index]; ret = 0; - /* virtio-fs driver already uses workqueue in irq handler */ - if (dev->device_id == VIRTIO_ID_FS) { + if (!vq->irq_use_wq) { spin_lock_irq(&vq->irq_lock); if (vq->ready && vq->cb.callback) vq->cb.callback(vq->cb.private); @@ -1256,6 +1256,26 @@ static const struct file_operations vduse_dev_fops = { .mmap = vduse_dev_mmap, }; +static ssize_t irq_use_wq_show(struct vduse_virtqueue *vq, char *buf) +{ + return sprintf(buf, "%d\n", vq->irq_use_wq); +} + +static ssize_t irq_use_wq_store(struct vduse_virtqueue *vq, + const char *buf, size_t count) +{ + bool enabled; + int ret; + + ret = kstrtobool(buf, &enabled); + if (ret) + return ret; + + vq->irq_use_wq = enabled; + + return count; +} + static ssize_t irq_affinity_show(struct vduse_virtqueue *vq, char *buf) { return sprintf(buf, "%d\n", vq->irq_affinity); @@ -1313,10 +1333,13 @@ static struct vq_sysfs_entry irq_inject_attr = __ATTR_WO(irq_inject); static struct vq_sysfs_entry kick_attr = __ATTR_WO(kick); +static struct vq_sysfs_entry irq_use_wq_attr = __ATTR_RW(irq_use_wq); + static struct attribute *vq_attrs[] = { &irq_affinity_attr.attr, &irq_inject_attr.attr, &kick_attr.attr, + &irq_use_wq_attr.attr, NULL, }; @@ -1407,6 +1430,10 @@ static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, dev->vqs[i]->index = i; dev->vqs[i]->dev = dev; dev->vqs[i]->irq_affinity = -1; + /* virtio-fs driver already uses workqueue in irq handler */ + dev->vqs[i]->irq_use_wq = (dev->device_id == VIRTIO_ID_FS) ? + false : true; + vringh_set_iotlb(&dev->vqs[i]->vring, dev->domain->iotlb, &dev->domain->iotlb_lock); vringh_kiov_init(&dev->vqs[i]->out_iov, NULL, 0); From 982925398c19405b352378b5cd9d49ca1b272f7e Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Fri, 27 May 2022 16:11:02 +0800 Subject: [PATCH 0275/5344] bytedance: vduse: Allow concurrent irq processing The irq callback is not handled in irq context in VDUSE cases, so we try to allow the concurrent irq processing. It will be used by some new drivers or mechanism designed only for VDUSE. The old driver would be protected by the virtqueue lock, so it should be safe. Signed-off-by: Xie Yongji --- drivers/vdpa/vdpa_user/vduse_dev.c | 51 +++++++++++++++--------------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index cd7b9e7a3954f..b52a1611fcf5b 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -46,7 +46,7 @@ struct vduse_virtqueue { u16 index; bool ready; spinlock_t kick_lock; - spinlock_t irq_lock; + struct rw_semaphore irq_lock; struct eventfd_ctx *kickfd; struct vdpa_callback cb; struct work_struct inject; @@ -551,11 +551,11 @@ static void vduse_dev_reset(struct vduse_dev *dev) vq->device_addr = 0; vq->avail_index = 0; vq->num = 0; - spin_lock(&vq->irq_lock); + down_write(&vq->irq_lock); vq->ready = false; vq->cb.callback = NULL; vq->cb.private = NULL; - spin_unlock(&vq->irq_lock); + up_write(&vq->irq_lock); } } @@ -605,10 +605,10 @@ static void vduse_vdpa_set_vq_cb(struct vdpa_device *vdpa, u16 idx, struct vduse_dev *dev = vdpa_to_vduse(vdpa); struct vduse_virtqueue *vq = dev->vqs[idx]; - spin_lock(&vq->irq_lock); + down_write(&vq->irq_lock); vq->cb.callback = cb->callback; vq->cb.private = cb->private; - spin_unlock(&vq->irq_lock); + up_write(&vq->irq_lock); } static void vduse_vdpa_set_vq_num(struct vdpa_device *vdpa, u16 idx, u32 num) @@ -992,23 +992,28 @@ static void vduse_dev_irq_inject(struct work_struct *work) dev->name, dev->status); } -static void vduse_vq_irq_inject(struct work_struct *work) +static void vduse_vq_irq_inject(struct vduse_virtqueue *vq) { - struct vduse_virtqueue *vq = container_of(work, - struct vduse_virtqueue, inject); struct vduse_dev *dev = vq->dev; - bool missed = true; - spin_lock_irq(&vq->irq_lock); - if (dev && (dev->status & VIRTIO_CONFIG_S_DRIVER_OK) && - vq->cb.callback) { + if (!dev) + return; + + down_read(&vq->irq_lock); + if ((dev->status & VIRTIO_CONFIG_S_DRIVER_OK) && + vq->cb.callback) vq->cb.callback(vq->cb.private); - missed = false; - } - spin_unlock_irq(&vq->irq_lock); - if (missed && dev) + else pr_err_ratelimited("Miss vduse [%s] vq%d irq, status: %d\n", dev->name, vq->index, dev->status); + up_read(&vq->irq_lock); +} + +static void vduse_vq_irq_inject_work(struct work_struct *work) +{ + struct vduse_virtqueue *vq = container_of(work, + struct vduse_virtqueue, inject); + vduse_vq_irq_inject(vq); } static long vduse_dev_ioctl(struct file *file, unsigned int cmd, @@ -1126,10 +1131,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, ret = 0; if (!vq->irq_use_wq) { - spin_lock_irq(&vq->irq_lock); - if (vq->ready && vq->cb.callback) - vq->cb.callback(vq->cb.private); - spin_unlock_irq(&vq->irq_lock); + vduse_vq_irq_inject(vq); break; } if (vq->irq_affinity == -1) @@ -1438,9 +1440,9 @@ static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, &dev->domain->iotlb_lock); vringh_kiov_init(&dev->vqs[i]->out_iov, NULL, 0); vringh_kiov_init(&dev->vqs[i]->in_iov, NULL, 0); - INIT_WORK(&dev->vqs[i]->inject, vduse_vq_irq_inject); + INIT_WORK(&dev->vqs[i]->inject, vduse_vq_irq_inject_work); spin_lock_init(&dev->vqs[i]->kick_lock); - spin_lock_init(&dev->vqs[i]->irq_lock); + init_rwsem(&dev->vqs[i]->irq_lock); kobject_init(&dev->vqs[i]->kobj, &vq_attr_type); ret = kobject_add(&dev->vqs[i]->kobj, &dev->dev.kobj, "vq%d", i); @@ -1682,10 +1684,7 @@ static void vduse_dev_timeout_work(struct work_struct *work) dev->vqs[i])) == 1); } while (!vringh_notify_enable_iotlb(&dev->vqs[i]->vring) && ret >= 0); - spin_lock_irq(&dev->vqs[i]->irq_lock); - if (dev->vqs[i]->cb.callback) - dev->vqs[i]->cb.callback(dev->vqs[i]->cb.private); - spin_unlock_irq(&dev->vqs[i]->irq_lock); + vduse_vq_irq_inject(dev->vqs[i]); } unlock: mutex_unlock(&dev->lock); From 8020b0442d608591de1e393f2aa274e74925ede9 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Sun, 5 Jun 2022 19:13:46 +0800 Subject: [PATCH 0276/5344] bytedance: virtiofs: Allow disabling workqueue in irq handler In VDUSE case, we don't need to use workqueue for irq callback since it doesn't happen in irq context. So let's add a module parameter to disable workqueue to support concurrent irq processing. Signed-off-by: Xie Yongji --- fs/fuse/virtio_fs.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 64d3ab963c8ea..96e79e139a0c5 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -20,6 +20,9 @@ static unsigned long io_timeout_ms = IO_TIMEOUT_MS_DEFAULT; module_param(io_timeout_ms, ulong, 0644); +static bool irq_use_wq = true; +module_param(irq_use_wq, bool, 0644); + /* List of virtio-fs device instances and a lock for the list. Also provides * mutual exclusion in device removal and mounting path */ @@ -659,6 +662,28 @@ static void virtio_fs_requests_abort(struct virtio_fs_vq *fsvq) } } +static void virtio_fs_requests_done(struct virtio_fs_vq *fsvq) +{ + struct fuse_pqueue *fpq = &fsvq->fud->pq; + struct virtqueue *vq = fsvq->vq; + struct fuse_req *req; + unsigned int len; + + do { + spin_lock(&fsvq->lock); + req = virtqueue_get_buf(vq, &len); + spin_unlock(&fsvq->lock); + + if (!req) + break; + + spin_lock(&fpq->lock); + list_del_init(&req->list); + spin_unlock(&fpq->lock); + virtio_fs_request_complete(req, fsvq); + } while (likely(!virtqueue_is_broken(vq))); +} + /* Virtqueue interrupt handler */ static void virtio_fs_vq_done(struct virtqueue *vq) { @@ -666,7 +691,10 @@ static void virtio_fs_vq_done(struct virtqueue *vq) dev_dbg(&vq->vdev->dev, "%s %s\n", __func__, fsvq->name); - queue_work(virtiofs_wq, &fsvq->done_work); + if (irq_use_wq || in_interrupt() || vq->index == VQ_HIPRIO) + queue_work(virtiofs_wq, &fsvq->done_work); + else + virtio_fs_requests_done(fsvq); } /* Initialize virtqueues */ From 35ad5407609c494eaaa5cdf9dd30d249ece15d8d Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Sun, 5 Jun 2022 17:06:09 +0800 Subject: [PATCH 0277/5344] bytedance: virtio_ring: Support DMA buffer premapping Export some functions to support DMA buffer premapping, this can reduce the critical section of virtqueue lock since VDUSE always do bounce-buffer copy during DMA mapping/unmapping. Signed-off-by: Xie Yongji --- drivers/virtio/virtio_ring.c | 371 +++++++++++++++++++++++++++++++++++ include/linux/virtio.h | 26 +++ 2 files changed, 397 insertions(+) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index c04efc9eec3f0..d0f4263812708 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -412,6 +412,218 @@ static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq, return desc; } +dma_addr_t virtio_dma_map_sg(struct virtio_device *dev, + struct scatterlist *sg, + enum dma_data_direction direction) +{ + if (!vring_use_dma_api(dev)) + return (dma_addr_t)sg_phys(sg); + + /* + * We can't use dma_map_sg, because we don't use scatterlists in + * the way it expects (we don't guarantee that the scatterlist + * will exist for the lifetime of the mapping). + */ + return dma_map_page(dev->dev.parent, + sg_page(sg), sg->offset, sg->length, + direction); +} +EXPORT_SYMBOL_GPL(virtio_dma_map_sg); + +dma_addr_t virtio_dma_map(struct virtio_device *dev, void *addr, + unsigned int length, enum dma_data_direction dir) +{ + struct page *page; + size_t offset; + + page = virt_to_page(addr); + offset = offset_in_page(addr); + + if (!vring_use_dma_api(dev)) + return page_to_phys(page) + offset; + + return dma_map_page(dev->dev.parent, page, offset, length, dir); +} +EXPORT_SYMBOL_GPL(virtio_dma_map); + +int virtio_dma_mapping_error(struct virtio_device *dev, dma_addr_t addr) +{ + if (!vring_use_dma_api(dev)) + return 0; + + return dma_mapping_error(dev->dev.parent, addr); +} +EXPORT_SYMBOL_GPL(virtio_dma_mapping_error); + +void virtio_dma_unmap(struct virtio_device *dev, dma_addr_t dma, + unsigned int length, enum dma_data_direction dir) +{ + if (!vring_use_dma_api(dev)) + return; + + dma_unmap_page(dev->dev.parent, dma, length, dir); +} +EXPORT_SYMBOL_GPL(virtio_dma_unmap); + +int virtqueue_add_mapped_buf_split(struct virtqueue *_vq, + struct scatterlist *sgs[], + unsigned int total_sg, + unsigned int out_sgs, + unsigned int in_sgs, + void *data, void *ctx, + gfp_t gfp) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + struct scatterlist *sg; + struct vring_desc *desc; + unsigned int i, n, avail, descs_used, uninitialized_var(prev); + int head; + bool indirect; + + START_USE(vq); + + BUG_ON(data == NULL); + BUG_ON(ctx && vq->indirect); + + if (unlikely(vq->broken)) { + END_USE(vq); + return -EIO; + } + + LAST_ADD_TIME_UPDATE(vq); + + BUG_ON(total_sg == 0); + + head = vq->free_head; + + if (virtqueue_use_indirect(_vq, total_sg)) + desc = alloc_indirect_split(_vq, total_sg, gfp); + else { + desc = NULL; + WARN_ON_ONCE(total_sg > vq->split.vring.num && !vq->indirect); + } + + if (desc) { + /* Use a single buffer which doesn't continue */ + indirect = true; + /* Set up rest to use this indirect table. */ + i = 0; + descs_used = 1; + } else { + indirect = false; + desc = vq->split.vring.desc; + i = head; + descs_used = total_sg; + } + + if (vq->vq.num_free < descs_used) { + pr_debug("Can't add buf len %i - avail = %i\n", + descs_used, vq->vq.num_free); + /* FIXME: for historical reasons, we force a notify here if + * there are outgoing parts to the buffer. Presumably the + * host should service the ring ASAP. + */ + if (out_sgs) + vq->notify(&vq->vq); + if (indirect) + kfree(desc); + END_USE(vq); + return -ENOSPC; + } + + for (n = 0; n < out_sgs; n++) { + for (sg = sgs[n]; sg; sg = sg_next(sg)) { + dma_addr_t addr = sg_dma_address(sg); + + desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT); + desc[i].addr = cpu_to_virtio64(_vq->vdev, addr); + desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length); + prev = i; + i = virtio16_to_cpu(_vq->vdev, desc[i].next); + } + } + for (; n < (out_sgs + in_sgs); n++) { + for (sg = sgs[n]; sg; sg = sg_next(sg)) { + dma_addr_t addr = sg_dma_address(sg); + + desc[i].flags = cpu_to_virtio16(_vq->vdev, + VRING_DESC_F_NEXT | VRING_DESC_F_WRITE); + desc[i].addr = cpu_to_virtio64(_vq->vdev, addr); + desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length); + prev = i; + i = virtio16_to_cpu(_vq->vdev, desc[i].next); + } + } + /* Last one doesn't continue. */ + desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT); + + if (indirect) { + /* Now that the indirect table is filled in, map it. */ + dma_addr_t addr = vring_map_single( + vq, desc, total_sg * sizeof(struct vring_desc), + DMA_TO_DEVICE); + if (vring_mapping_error(vq, addr)) + goto unmap_release; + + vq->split.vring.desc[head].flags = cpu_to_virtio16(_vq->vdev, + VRING_DESC_F_INDIRECT); + vq->split.vring.desc[head].addr = cpu_to_virtio64(_vq->vdev, + addr); + + vq->split.vring.desc[head].len = cpu_to_virtio32(_vq->vdev, + total_sg * sizeof(struct vring_desc)); + } + + /* We're using some buffers from the free list. */ + vq->vq.num_free -= descs_used; + + /* Update free pointer */ + if (indirect) + vq->free_head = virtio16_to_cpu(_vq->vdev, + vq->split.vring.desc[head].next); + else + vq->free_head = i; + + /* Store token and indirect buffer state. */ + vq->split.desc_state[head].data = data; + if (indirect) + vq->split.desc_state[head].indir_desc = desc; + else + vq->split.desc_state[head].indir_desc = ctx; + + /* Put entry in available array (but don't update avail->idx until they + * do sync). + */ + avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); + vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head); + + /* Descriptors and available array need to be set before we expose the + * new available array entries. + */ + virtio_wmb(vq->weak_barriers); + vq->split.avail_idx_shadow++; + vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, + vq->split.avail_idx_shadow); + vq->num_added++; + + pr_debug("Added buffer head %i to %p\n", head, vq); + END_USE(vq); + + /* This is very unlikely, but theoretically possible. Kick + * just in case. + */ + if (unlikely(vq->num_added == (1 << 16) - 1)) + virtqueue_kick(_vq); + + return 0; + +unmap_release: + kfree(desc); + END_USE(vq); + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(virtqueue_add_mapped_buf_split); + static inline int virtqueue_add_split(struct virtqueue *_vq, struct scatterlist *sgs[], unsigned int total_sg, @@ -675,6 +887,137 @@ static inline bool more_used_split(const struct vring_virtqueue *vq) vq->split.vring.used->idx); } +static void vring_unmap_indirect_split(const struct vring_virtqueue *vq, + struct vring_desc *desc) +{ + u16 flags; + + if (!vq->use_dma_api) + return; + + flags = virtio16_to_cpu(vq->vq.vdev, desc->flags); + + if (!(flags & VRING_DESC_F_INDIRECT)) + return; + + dma_unmap_single(vring_dma_dev(vq), + virtio64_to_cpu(vq->vq.vdev, desc->addr), + virtio32_to_cpu(vq->vq.vdev, desc->len), + (flags & VRING_DESC_F_WRITE) ? + DMA_FROM_DEVICE : DMA_TO_DEVICE); +} + +static void detach_mapped_buf_split(struct vring_virtqueue *vq, unsigned int head, + void **ctx) +{ + unsigned int i; + __virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT); + + /* Clear data ptr. */ + vq->split.desc_state[head].data = NULL; + + /* Put back on free list: unmap first-level descriptors and find end */ + i = head; + + while (vq->split.vring.desc[i].flags & nextflag) { + vring_unmap_indirect_split(vq, &vq->split.vring.desc[i]); + i = virtio16_to_cpu(vq->vq.vdev, vq->split.vring.desc[i].next); + vq->vq.num_free++; + } + + vring_unmap_indirect_split(vq, &vq->split.vring.desc[i]); + vq->split.vring.desc[i].next = cpu_to_virtio16(vq->vq.vdev, + vq->free_head); + vq->free_head = head; + + /* Plus final descriptor */ + vq->vq.num_free++; + + if (vq->indirect) { + struct vring_desc *indir_desc = + vq->split.desc_state[head].indir_desc; + u32 len; + + /* Free the indirect table, if any, now that it's unmapped. */ + if (!indir_desc) + return; + + len = virtio32_to_cpu(vq->vq.vdev, + vq->split.vring.desc[head].len); + + BUG_ON(!(vq->split.vring.desc[head].flags & + cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT))); + BUG_ON(len == 0 || len % sizeof(struct vring_desc)); + + kfree(indir_desc); + vq->split.desc_state[head].indir_desc = NULL; + } else if (ctx) { + *ctx = vq->split.desc_state[head].indir_desc; + } +} + +void *virtqueue_get_mapped_buf_split(struct virtqueue *_vq, + unsigned int *len, + void **ctx) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + void *ret; + unsigned int i; + u16 last_used; + + START_USE(vq); + + if (unlikely(vq->broken)) { + END_USE(vq); + return NULL; + } + + if (!more_used_split(vq)) { + pr_debug("No more buffers in queue\n"); + END_USE(vq); + return NULL; + } + + /* Only get used array entries after they have been exposed by host. */ + virtio_rmb(vq->weak_barriers); + + last_used = (vq->last_used_idx & (vq->split.vring.num - 1)); + i = virtio32_to_cpu(_vq->vdev, + vq->split.vring.used->ring[last_used].id); + *len = virtio32_to_cpu(_vq->vdev, + vq->split.vring.used->ring[last_used].len); + + if (unlikely(i >= vq->split.vring.num)) { + BAD_RING(vq, "id %u out of range, usd_idx: %u\n", i, + vq->last_used_idx); + return NULL; + } + if (unlikely(!vq->split.desc_state[i].data)) { + BAD_RING(vq, "id %u is not a head! used_idx: %u\n", i, + vq->last_used_idx); + return NULL; + } + + /* detach_buf_split clears data, so grab it now. */ + ret = vq->split.desc_state[i].data; + detach_mapped_buf_split(vq, i, ctx); + vq->last_used_idx++; + /* If we expect an interrupt for the next entry, tell host + * by writing event index and flush out the write before + * the read in the next get_buf call. + */ + if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) + virtio_store_mb(vq->weak_barriers, + &vring_used_event(&vq->split.vring), + cpu_to_virtio16(_vq->vdev, vq->last_used_idx)); + + LAST_ADD_TIME_INVALID(vq); + + END_USE(vq); + return ret; +} +EXPORT_SYMBOL_GPL(virtqueue_get_mapped_buf_split); + static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq, unsigned int *len, void **ctx) @@ -817,6 +1160,34 @@ static bool virtqueue_enable_cb_delayed_split(struct virtqueue *_vq) return true; } +void *virtqueue_detach_unused_mapped_buf_split(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + unsigned int i; + void *buf; + + START_USE(vq); + + for (i = 0; i < vq->split.vring.num; i++) { + if (!vq->split.desc_state[i].data) + continue; + /* detach_buf_split clears data, so grab it now. */ + buf = vq->split.desc_state[i].data; + detach_mapped_buf_split(vq, i, NULL); + vq->split.avail_idx_shadow--; + vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, + vq->split.avail_idx_shadow); + END_USE(vq); + return buf; + } + /* That should have freed everything. */ + BUG_ON(vq->vq.num_free != vq->split.vring.num); + + END_USE(vq); + return NULL; +} +EXPORT_SYMBOL_GPL(virtqueue_detach_unused_mapped_buf_split); + static void *virtqueue_detach_unused_buf_split(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 7c075463c7f2b..622194ff29143 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -58,6 +58,14 @@ int virtqueue_add_sgs(struct virtqueue *vq, void *data, gfp_t gfp); +int virtqueue_add_mapped_buf_split(struct virtqueue *_vq, + struct scatterlist *sgs[], + unsigned int total_sg, + unsigned int out_sgs, + unsigned int in_sgs, + void *data, void *ctx, + gfp_t gfp); + bool virtqueue_kick(struct virtqueue *vq); bool virtqueue_kick_prepare(struct virtqueue *vq); @@ -69,6 +77,10 @@ void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len); void *virtqueue_get_buf_ctx(struct virtqueue *vq, unsigned int *len, void **ctx); +void *virtqueue_get_mapped_buf_split(struct virtqueue *_vq, + unsigned int *len, + void **ctx); + void virtqueue_disable_cb(struct virtqueue *vq); bool virtqueue_enable_cb(struct virtqueue *vq); @@ -81,6 +93,8 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *vq); void *virtqueue_detach_unused_buf(struct virtqueue *vq); +void *virtqueue_detach_unused_mapped_buf_split(struct virtqueue *_vq); + unsigned int virtqueue_get_vring_size(struct virtqueue *vq); bool virtqueue_is_broken(struct virtqueue *vq); @@ -143,6 +157,18 @@ int virtio_device_restore(struct virtio_device *dev); size_t virtio_max_dma_size(struct virtio_device *vdev); +dma_addr_t virtio_dma_map_sg(struct virtio_device *dev, + struct scatterlist *sg, + enum dma_data_direction direction); + +dma_addr_t virtio_dma_map(struct virtio_device *dev, void *addr, + unsigned int length, enum dma_data_direction dir); + +int virtio_dma_mapping_error(struct virtio_device *dev, dma_addr_t addr); + +void virtio_dma_unmap(struct virtio_device *dev, dma_addr_t dma, + unsigned int length, enum dma_data_direction dir); + #define virtio_device_for_each_vq(vdev, vq) \ list_for_each_entry(vq, &vdev->vqs, list) From f36b6f202bb163b0a9ad9bec2f16916d2cb64172 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Sun, 5 Jun 2022 17:52:50 +0800 Subject: [PATCH 0278/5344] bytedance: virtiofs: Support DMA buffer premapping This adds a module parameter to support DMA buffer premapping. Since it's not needed in VM case, we disable it by default. Signed-off-by: Xie Yongji --- fs/fuse/fuse_i.h | 6 +++ fs/fuse/virtio_fs.c | 112 +++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 112 insertions(+), 6 deletions(-) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 5c42acce32e75..1f8d757d1e1ce 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -364,6 +364,12 @@ struct fuse_req { #if IS_ENABLED(CONFIG_VIRTIO_FS) /** virtio-fs's physically contiguous buffer for in and out args */ void *argbuf; + /** dma sg list */ + struct scatterlist *sg; + /** number of out sg list */ + unsigned int out_sgs; + /** number of in sg list */ + unsigned int in_sgs; #endif }; diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 96e79e139a0c5..2ff315ce30549 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -23,6 +23,9 @@ module_param(io_timeout_ms, ulong, 0644); static bool irq_use_wq = true; module_param(irq_use_wq, bool, 0644); +static bool enable_dma_premap; +module_param(enable_dma_premap, bool, 0444); + /* List of virtio-fs device instances and a lock for the list. Also provides * mutual exclusion in device removal and mounting path */ @@ -528,6 +531,62 @@ static void copy_args_from_argbuf(struct fuse_args *args, struct fuse_req *req) req->argbuf = NULL; } +static int virtio_fs_request_map(struct fuse_req *req, + struct virtio_fs_vq *fsvq, + struct scatterlist *sg, + unsigned int out_sgs, + unsigned int in_sgs) +{ + dma_addr_t addr; + unsigned int i, total_sgs = out_sgs + in_sgs; + + req->sg = kmalloc_array(total_sgs, sizeof(*sg), GFP_ATOMIC); + if (!req->sg) + return -ENOMEM; + + for (i = 0; i < total_sgs; i++) { + addr = virtio_dma_map_sg(fsvq->vq->vdev, &sg[i], + i < out_sgs ? DMA_TO_DEVICE : + DMA_FROM_DEVICE); + if (virtio_dma_mapping_error(fsvq->vq->vdev, addr)) + goto err; + + sg[i].dma_address = addr; + } + req->out_sgs = out_sgs; + req->in_sgs = in_sgs; + memcpy(req->sg, sg, sizeof(*sg) * total_sgs); + + return 0; +err: + while (i--) + virtio_dma_unmap(fsvq->vq->vdev, sg[i].dma_address, + sg[i].length, i < out_sgs ? + DMA_TO_DEVICE : DMA_FROM_DEVICE); + kfree(req->sg); + req->sg = NULL; + return -ENOMEM; +} + +static void virtio_fs_request_unmap(struct fuse_req *req, + struct virtio_fs_vq *fsvq) +{ + unsigned int i, total_sgs = req->out_sgs + req->in_sgs; + + if (!req->sg) + return; + + for (i = 0; i < total_sgs; i++) { + virtio_dma_unmap(fsvq->vq->vdev, + req->sg[i].dma_address, + req->sg[i].length, + i < req->out_sgs ? DMA_TO_DEVICE : + DMA_FROM_DEVICE); + } + kfree(req->sg); + req->sg = NULL; +} + /* Work function for request completion */ static void virtio_fs_request_complete(struct fuse_req *req, struct virtio_fs_vq *fsvq) @@ -539,6 +598,8 @@ static void virtio_fs_request_complete(struct fuse_req *req, unsigned int len, i, thislen; struct page *page; + virtio_fs_request_unmap(req, fsvq); + /* * TODO verify that server properly follows FUSE protocol * (oh.uniq, oh.len) @@ -581,6 +642,19 @@ static void virtio_fs_complete_req_work(struct work_struct *work) kfree(w); } +static struct fuse_req *virtio_fs_get_buf(struct virtqueue *vq) +{ + struct fuse_req *req; + unsigned int len; + + if (enable_dma_premap) + req = virtqueue_get_mapped_buf_split(vq, &len, NULL); + else + req = virtqueue_get_buf(vq, &len); + + return req; +} + static void virtio_fs_requests_done_work(struct work_struct *work) { struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, @@ -589,7 +663,6 @@ static void virtio_fs_requests_done_work(struct work_struct *work) struct virtqueue *vq = fsvq->vq; struct fuse_req *req; struct fuse_req *next; - unsigned int len; LIST_HEAD(reqs); /* Collect completed requests off the virtqueue */ @@ -597,7 +670,7 @@ static void virtio_fs_requests_done_work(struct work_struct *work) do { virtqueue_disable_cb(vq); - while ((req = virtqueue_get_buf(vq, &len)) != NULL) { + while ((req = virtio_fs_get_buf(vq)) != NULL) { spin_lock(&fpq->lock); list_move_tail(&req->list, &reqs); spin_unlock(&fpq->lock); @@ -638,6 +711,18 @@ static void virtio_fs_forgot_requests_abort(struct virtio_fs_vq *fsvq) spin_unlock(&fsvq->lock); } +static struct fuse_req *virtio_fs_detach_buf(struct virtqueue *vq) +{ + struct fuse_req *req; + + if (enable_dma_premap) + req = virtqueue_detach_unused_mapped_buf_split(vq); + else + req = virtqueue_detach_unused_buf(vq); + + return req; +} + static void virtio_fs_requests_abort(struct virtio_fs_vq *fsvq) { struct fuse_pqueue *fpq = &fsvq->fud->pq; @@ -647,7 +732,7 @@ static void virtio_fs_requests_abort(struct virtio_fs_vq *fsvq) LIST_HEAD(reqs); spin_lock(&fsvq->lock); - while ((req = virtqueue_detach_unused_buf(vq)) != NULL) { + while ((req = virtio_fs_detach_buf(vq)) != NULL) { spin_lock(&fpq->lock); list_move_tail(&req->list, &reqs); spin_unlock(&fpq->lock); @@ -657,6 +742,7 @@ static void virtio_fs_requests_abort(struct virtio_fs_vq *fsvq) list_for_each_entry_safe(req, next, &reqs, list) { pr_err("Abort virtiofs request: %u\n", req->in.h.opcode); list_del_init(&req->list); + virtio_fs_request_unmap(req, fsvq); req->out.h.error = -ECONNABORTED; virtio_fs_request_complete(req, fsvq); } @@ -667,11 +753,10 @@ static void virtio_fs_requests_done(struct virtio_fs_vq *fsvq) struct fuse_pqueue *fpq = &fsvq->fud->pq; struct virtqueue *vq = fsvq->vq; struct fuse_req *req; - unsigned int len; do { spin_lock(&fsvq->lock); - req = virtqueue_get_buf(vq, &len); + req = virtio_fs_get_buf(vq); spin_unlock(&fsvq->lock); if (!req) @@ -1071,6 +1156,7 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, bool notify; struct fuse_pqueue *fpq; + req->sg = NULL; /* Does the sglist fit on the stack? */ total_sgs = sg_count_fuse_req(req); if (total_sgs > ARRAY_SIZE(stack_sgs)) { @@ -1118,7 +1204,18 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, } vq = fsvq->vq; - ret = virtqueue_add_sgs(vq, sgs, out_sgs, in_sgs, req, GFP_ATOMIC); + if (enable_dma_premap) { + ret = virtio_fs_request_map(req, fsvq, sg, out_sgs, in_sgs); + if (ret) { + spin_unlock(&fsvq->lock); + goto out; + } + ret = virtqueue_add_mapped_buf_split(vq, sgs, total_sgs, + out_sgs, in_sgs, req, + NULL, GFP_ATOMIC); + } else + ret = virtqueue_add_sgs(vq, sgs, out_sgs, in_sgs, + req, GFP_ATOMIC); if (ret < 0) { spin_unlock(&fsvq->lock); goto out; @@ -1143,6 +1240,9 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, virtqueue_notify(vq); out: + if (ret < 0 && req->sg) + virtio_fs_request_unmap(req, fsvq); + if (ret < 0 && req->argbuf) { kfree(req->argbuf); req->argbuf = NULL; From 4e9ff13933acf66327c480f9b8baf97e71656301 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Mon, 13 Jun 2022 14:52:23 +0800 Subject: [PATCH 0279/5344] bytedance: fuse: Add sysfs interface for inode invalidation Some userspace daemon might want to invalidate inode's cache explicitly. This patch adds a sysfs interface to support that. Signed-off-by: Xie Yongji --- fs/fuse/control.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++- fs/fuse/fuse_i.h | 2 +- 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 71a2ecf50c241..66e291b64096c 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -191,6 +191,45 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file, return ret; } +static ssize_t fuse_conn_invalidate_inode_write(struct file *file, + const char __user *buf, + size_t count, loff_t *ppos) +{ + char tmp[64]; + struct fuse_conn *fc; + uint64_t ino; + int64_t off, len; + int ret; + + if (*ppos != 0) + return -EINVAL; + + if (count >= sizeof(tmp)) + return -ENOSPC; + + ret = simple_write_to_buffer(tmp, sizeof(tmp) - 1, ppos, buf, count); + if (ret != count) + return ret < 0 ? ret : -EIO; + + tmp[ret] = '\0'; + ret = sscanf(tmp, "%llu:%lld:%lld", &ino, &off, &len); + if (ret != 3) + return -EINVAL; + + fc = fuse_ctl_file_conn_get(file); + if (!fc) + return -ENODEV; + + down_read(&fc->killsb); + ret = -ENOENT; + if (fc->sb) + ret = fuse_reverse_inval_inode(fc->sb, ino, off, len); + up_read(&fc->killsb); + fuse_conn_put(fc); + + return ret < 0 ? ret : count; +} + static const struct file_operations fuse_ctl_abort_ops = { .open = nonseekable_open, .write = fuse_conn_abort_write, @@ -217,6 +256,12 @@ static const struct file_operations fuse_conn_congestion_threshold_ops = { .llseek = no_llseek, }; +static const struct file_operations fuse_ctl_invalidate_inode_ops = { + .open = nonseekable_open, + .write = fuse_conn_invalidate_inode_write, + .llseek = no_llseek, +}; + static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, struct fuse_conn *fc, const char *name, @@ -285,7 +330,10 @@ int fuse_ctl_add_conn(struct fuse_conn *fc) 1, NULL, &fuse_conn_max_background_ops) || !fuse_ctl_add_dentry(parent, fc, "congestion_threshold", S_IFREG | 0600, 1, NULL, - &fuse_conn_congestion_threshold_ops)) + &fuse_conn_congestion_threshold_ops) || + !fuse_ctl_add_dentry(parent, fc, "invalidate_inode", + S_IFREG | 0200, 1, NULL, + &fuse_ctl_invalidate_inode_ops)) goto err; return 0; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 1f8d757d1e1ce..a190f7abec69e 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -48,7 +48,7 @@ #define FUSE_NAME_MAX 1024 /** Number of dentries for each connection in the control filesystem */ -#define FUSE_CTL_NUM_DENTRIES 5 +#define FUSE_CTL_NUM_DENTRIES 6 /** List of active connections */ extern struct list_head fuse_conn_list; From a1f6beb8c3093c41489a3243384a8f911a5699f8 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 18 Oct 2019 15:26:28 +0200 Subject: [PATCH 0280/5344] sched/fair: Clean up asym packing commit 490ba971d8b498ba3a47999ab94c6a0d1830ad41 upstream. Clean up asym packing to follow the default load balance behavior: - classify the group by creating a group_asym_packing field. - calculate the imbalance in calculate_imbalance() instead of bypassing it. We don't need to test twice same conditions anymore to detect asym packing and we consolidate the calculation of imbalance in calculate_imbalance(). There is no functional changes. Signed-off-by: Vincent Guittot Acked-by: Rik van Riel Cc: Ben Segall Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: hdanton@sina.com Cc: parth@linux.ibm.com Cc: pauld@redhat.com Cc: quentin.perret@arm.com Cc: srikar@linux.vnet.ibm.com Cc: valentin.schneider@arm.com Link: https://lkml.kernel.org/r/1571405198-27570-2-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Signed-off-by: Chengming Zhou --- kernel/sched/fair.c | 63 ++++++++++++--------------------------------- 1 file changed, 16 insertions(+), 47 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1f34252b748a5..c01525a152997 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7766,6 +7766,7 @@ struct sg_lb_stats { unsigned int group_weight; enum group_type group_type; int group_no_capacity; + unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; @@ -8222,9 +8223,17 @@ static bool update_sd_pick_busiest(struct lb_env *env, * ASYM_PACKING needs to move all the work to the highest * prority CPUs in the group, therefore mark all groups * of lower priority than ourself as busy. + * + * This is primarily intended to used at the sibling level. Some + * cores like POWER7 prefer to use lower numbered SMT threads. In the + * case of POWER7, it can move to lower SMT modes only when higher + * threads are idle. When in lower SMT modes, the threads will + * perform better since they share less core resources. Hence when we + * have idle threads, we want them to be the higher ones. */ if (sgs->sum_nr_running && sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) { + sgs->group_asym_packing = 1; if (!sds->busiest) return true; @@ -8365,51 +8374,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd } } -/** - * check_asym_packing - Check to see if the group is packed into the - * sched domain. - * - * This is primarily intended to used at the sibling level. Some - * cores like POWER7 prefer to use lower numbered SMT threads. In the - * case of POWER7, it can move to lower SMT modes only when higher - * threads are idle. When in lower SMT modes, the threads will - * perform better since they share less core resources. Hence when we - * have idle threads, we want them to be the higher ones. - * - * This packing function is run on idle threads. It checks to see if - * the busiest CPU in this domain (core in the P7 case) has a higher - * CPU number than the packing function is being run on. Here we are - * assuming lower CPU number will be equivalent to lower a SMT thread - * number. - * - * Return: 1 when packing is required and a task should be moved to - * this CPU. The amount of the imbalance is returned in env->imbalance. - * - * @env: The load balancing environment. - * @sds: Statistics of the sched_domain which is to be packed - */ -static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) -{ - int busiest_cpu; - - if (!(env->sd->flags & SD_ASYM_PACKING)) - return 0; - - if (env->idle == CPU_NOT_IDLE) - return 0; - - if (!sds->busiest) - return 0; - - busiest_cpu = sds->busiest->asym_prefer_cpu; - if (sched_asym_prefer(busiest_cpu, env->dst_cpu)) - return 0; - - env->imbalance = sds->busiest_stat.group_load; - - return 1; -} - /** * fix_small_imbalance - Calculate the minor imbalance that exists * amongst the groups of a sched_domain, during @@ -8494,6 +8458,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s local = &sds->local_stat; busiest = &sds->busiest_stat; + if (busiest->group_asym_packing) { + env->imbalance = busiest->group_load; + return; + } + if (busiest->group_type == group_imbalanced) { /* * In the group_imb case we cannot rely on group-wide averages @@ -8598,8 +8567,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) busiest = &sds.busiest_stat; /* ASYM feature bypasses nice load balance check */ - if (check_asym_packing(env, &sds)) - return sds.busiest; + if (busiest->group_asym_packing) + goto force_balance; /* There is no busy sibling group to pull tasks from */ if (!sds.busiest || busiest->sum_nr_running == 0) From 322d539eb852382cf647f872578410c41b33c1d4 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 18 Oct 2019 15:26:29 +0200 Subject: [PATCH 0281/5344] sched/fair: Rename sg_lb_stats::sum_nr_running to sum_h_nr_running commit a34983470301018324f0110791da452fee1318c2 upstream. Rename sum_nr_running to sum_h_nr_running because it effectively tracks cfs->h_nr_running so we can use sum_nr_running to track rq->nr_running when needed. There are no functional changes. Signed-off-by: Vincent Guittot Reviewed-by: Valentin Schneider Acked-by: Rik van Riel Cc: Ben Segall Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: hdanton@sina.com Cc: parth@linux.ibm.com Cc: pauld@redhat.com Cc: quentin.perret@arm.com Cc: srikar@linux.vnet.ibm.com Link: https://lkml.kernel.org/r/1571405198-27570-3-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Signed-off-by: Chengming Zhou --- kernel/sched/fair.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c01525a152997..56274150fd3ff 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7761,7 +7761,7 @@ struct sg_lb_stats { unsigned long load_per_task; unsigned long group_capacity; unsigned long group_util; /* Total utilization of the group */ - unsigned int sum_nr_running; /* Nr tasks running in the group */ + unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */ unsigned int idle_cpus; unsigned int group_weight; enum group_type group_type; @@ -7806,7 +7806,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) .total_capacity = 0UL, .busiest_stat = { .avg_load = 0UL, - .sum_nr_running = 0, + .sum_h_nr_running = 0, .group_type = group_other, }, }; @@ -7999,7 +7999,7 @@ static inline int sg_imbalanced(struct sched_group *group) static inline bool group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) { - if (sgs->sum_nr_running < sgs->group_weight) + if (sgs->sum_h_nr_running < sgs->group_weight) return true; if ((sgs->group_capacity * 100) > @@ -8020,7 +8020,7 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) static inline bool group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) { - if (sgs->sum_nr_running <= sgs->group_weight) + if (sgs->sum_h_nr_running <= sgs->group_weight) return false; if ((sgs->group_capacity * 100) < @@ -8112,7 +8112,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_load += cpu_runnable_load(rq); sgs->group_util += cpu_util(i); - sgs->sum_nr_running += rq->cfs.h_nr_running; + sgs->sum_h_nr_running += rq->cfs.h_nr_running; nr_running = rq->nr_running; if (nr_running > 1) @@ -8142,8 +8142,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_capacity = group->sgc->capacity; sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; - if (sgs->sum_nr_running) - sgs->load_per_task = sgs->group_load / sgs->sum_nr_running; + if (sgs->sum_h_nr_running) + sgs->load_per_task = sgs->group_load / sgs->sum_h_nr_running; sgs->group_weight = group->group_weight; @@ -8200,7 +8200,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, * capable CPUs may harm throughput. Maximize throughput, * power/energy consequences are not considered. */ - if (sgs->sum_nr_running <= sgs->group_weight && + if (sgs->sum_h_nr_running <= sgs->group_weight && group_smaller_min_cpu_capacity(sds->local, sg)) return false; @@ -8231,7 +8231,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, * perform better since they share less core resources. Hence when we * have idle threads, we want them to be the higher ones. */ - if (sgs->sum_nr_running && + if (sgs->sum_h_nr_running && sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) { sgs->group_asym_packing = 1; if (!sds->busiest) @@ -8249,9 +8249,9 @@ static bool update_sd_pick_busiest(struct lb_env *env, #ifdef CONFIG_NUMA_BALANCING static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) { - if (sgs->sum_nr_running > sgs->nr_numa_running) + if (sgs->sum_h_nr_running > sgs->nr_numa_running) return regular; - if (sgs->sum_nr_running > sgs->nr_preferred_running) + if (sgs->sum_h_nr_running > sgs->nr_preferred_running) return remote; return all; } @@ -8326,7 +8326,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd */ if (prefer_sibling && sds->local && group_has_capacity(env, local) && - (sgs->sum_nr_running > local->sum_nr_running + 1)) { + (sgs->sum_h_nr_running > local->sum_h_nr_running + 1)) { sgs->group_no_capacity = 1; sgs->group_type = group_classify(sg, sgs); } @@ -8338,7 +8338,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd next_group: /* Now, start updating sd_lb_stats */ - sds->total_running += sgs->sum_nr_running; + sds->total_running += sgs->sum_h_nr_running; sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity; @@ -8392,7 +8392,7 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) local = &sds->local_stat; busiest = &sds->busiest_stat; - if (!local->sum_nr_running) + if (!local->sum_h_nr_running) local->load_per_task = cpu_avg_load_per_task(env->dst_cpu); else if (busiest->load_per_task > local->load_per_task) imbn = 1; @@ -8490,7 +8490,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ if (busiest->group_type == group_overloaded && local->group_type == group_overloaded) { - load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE; + load_above_capacity = busiest->sum_h_nr_running * SCHED_CAPACITY_SCALE; if (load_above_capacity > busiest->group_capacity) { load_above_capacity -= busiest->group_capacity; load_above_capacity *= scale_load_down(NICE_0_LOAD); @@ -8571,7 +8571,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto force_balance; /* There is no busy sibling group to pull tasks from */ - if (!sds.busiest || busiest->sum_nr_running == 0) + if (!sds.busiest || busiest->sum_h_nr_running == 0) goto out_balanced; /* XXX broken for overlapping NUMA groups */ From 8afd0d989d05287ded5aca59695e693aa72892ca Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 18 Oct 2019 15:26:30 +0200 Subject: [PATCH 0282/5344] sched/fair: Remove meaningless imbalance calculation commit fcf0553db6f4c79387864f6e4ab4a891601f395e upstream. Clean up load_balance() and remove meaningless calculation and fields before adding a new algorithm. Signed-off-by: Vincent Guittot Acked-by: Rik van Riel Cc: Ben Segall Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: hdanton@sina.com Cc: parth@linux.ibm.com Cc: pauld@redhat.com Cc: quentin.perret@arm.com Cc: srikar@linux.vnet.ibm.com Cc: valentin.schneider@arm.com Link: https://lkml.kernel.org/r/1571405198-27570-4-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Signed-off-by: Chengming Zhou --- kernel/sched/fair.c | 105 +------------------------------------------- 1 file changed, 1 insertion(+), 104 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 56274150fd3ff..9553c51cc7221 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5435,18 +5435,6 @@ static unsigned long capacity_of(int cpu) return cpu_rq(cpu)->cpu_capacity; } -static unsigned long cpu_avg_load_per_task(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); - unsigned long load_avg = cpu_runnable_load(rq); - - if (nr_running) - return load_avg / nr_running; - - return 0; -} - static void record_wakee(struct task_struct *p) { /* @@ -7758,7 +7746,6 @@ static void update_blocked_averages(int cpu) struct sg_lb_stats { unsigned long avg_load; /*Avg load across the CPUs of the group */ unsigned long group_load; /* Total load over the CPUs of the group */ - unsigned long load_per_task; unsigned long group_capacity; unsigned long group_util; /* Total utilization of the group */ unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */ @@ -8142,9 +8129,6 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_capacity = group->sgc->capacity; sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; - if (sgs->sum_h_nr_running) - sgs->load_per_task = sgs->group_load / sgs->sum_h_nr_running; - sgs->group_weight = group->group_weight; sgs->group_no_capacity = group_is_overloaded(env, sgs); @@ -8374,76 +8358,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd } } -/** - * fix_small_imbalance - Calculate the minor imbalance that exists - * amongst the groups of a sched_domain, during - * load balancing. - * @env: The load balancing environment. - * @sds: Statistics of the sched_domain whose imbalance is to be calculated. - */ -static inline -void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) -{ - unsigned long tmp, capa_now = 0, capa_move = 0; - unsigned int imbn = 2; - unsigned long scaled_busy_load_per_task; - struct sg_lb_stats *local, *busiest; - - local = &sds->local_stat; - busiest = &sds->busiest_stat; - - if (!local->sum_h_nr_running) - local->load_per_task = cpu_avg_load_per_task(env->dst_cpu); - else if (busiest->load_per_task > local->load_per_task) - imbn = 1; - - scaled_busy_load_per_task = - (busiest->load_per_task * SCHED_CAPACITY_SCALE) / - busiest->group_capacity; - - if (busiest->avg_load + scaled_busy_load_per_task >= - local->avg_load + (scaled_busy_load_per_task * imbn)) { - env->imbalance = busiest->load_per_task; - return; - } - - /* - * OK, we don't have enough imbalance to justify moving tasks, - * however we may be able to increase total CPU capacity used by - * moving them. - */ - - capa_now += busiest->group_capacity * - min(busiest->load_per_task, busiest->avg_load); - capa_now += local->group_capacity * - min(local->load_per_task, local->avg_load); - capa_now /= SCHED_CAPACITY_SCALE; - - /* Amount of load we'd subtract */ - if (busiest->avg_load > scaled_busy_load_per_task) { - capa_move += busiest->group_capacity * - min(busiest->load_per_task, - busiest->avg_load - scaled_busy_load_per_task); - } - - /* Amount of load we'd add */ - if (busiest->avg_load * busiest->group_capacity < - busiest->load_per_task * SCHED_CAPACITY_SCALE) { - tmp = (busiest->avg_load * busiest->group_capacity) / - local->group_capacity; - } else { - tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) / - local->group_capacity; - } - capa_move += local->group_capacity * - min(local->load_per_task, local->avg_load + tmp); - capa_move /= SCHED_CAPACITY_SCALE; - - /* Move if we gain throughput */ - if (capa_move > capa_now) - env->imbalance = busiest->load_per_task; -} - /** * calculate_imbalance - Calculate the amount of imbalance present within the * groups of a given sched_domain during load balance. @@ -8463,15 +8377,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s return; } - if (busiest->group_type == group_imbalanced) { - /* - * In the group_imb case we cannot rely on group-wide averages - * to ensure CPU-load equilibrium, look at wider averages. XXX - */ - busiest->load_per_task = - min(busiest->load_per_task, sds->avg_load); - } - /* * Avg load of busiest sg can be less and avg load of local sg can * be greater than avg load across all sgs of sd because avg load @@ -8482,7 +8387,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s (busiest->avg_load <= sds->avg_load || local->avg_load >= sds->avg_load)) { env->imbalance = 0; - return fix_small_imbalance(env, sds); + return; } /* @@ -8520,14 +8425,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s busiest->group_misfit_task_load); } - /* - * if *imbalance is less than the average load per runnable task - * there is no guarantee that any tasks will be moved so we'll have - * a think about bumping its value to force at least one task to be - * moved - */ - if (env->imbalance < busiest->load_per_task) - return fix_small_imbalance(env, sds); } /******* find_busiest_group() helpers end here *********************/ From e9f272a4144662cd5de5dc696aea4499de145bb2 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Mon, 13 Sep 2021 10:59:23 +0800 Subject: [PATCH 0283/5344] Revert "sched/fair: handle case of task_h_load() returning 0" This reverts commit 6aae92ed2c42a94c578fd478cfee0573dab3506b. Signed-off-by: Chengming Zhou --- kernel/sched/fair.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9553c51cc7221..3cd8e28e062c7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3831,11 +3831,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) return; } - /* - * Make sure that misfit_task_load will not be null even if - * task_h_load() returns 0. - */ - rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1); + rq->misfit_task_load = task_h_load(p); } #else /* CONFIG_SMP */ @@ -7430,15 +7426,7 @@ static int detach_tasks(struct lb_env *env) if (!can_migrate_task(p, env)) goto next; - /* - * Depending of the number of CPUs and tasks and the - * cgroup hierarchy, task_h_load() can return a null - * value. Make sure that env->imbalance decreases - * otherwise detach_tasks() will stop only after - * detaching up to loop_max tasks. - */ - load = max_t(unsigned long, task_h_load(p), 1); - + load = task_h_load(p); if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) goto next; From 18107e82cd00ebbd26bc5c18cc4c06272f850ac6 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 18 Oct 2019 15:26:31 +0200 Subject: [PATCH 0284/5344] sched/fair: Rework load_balance() commit 0b0695f2b34a4afa3f6e9aa1ff0e5336d8dad912 upstream. The load_balance() algorithm contains some heuristics which have become meaningless since the rework of the scheduler's metrics like the introduction of PELT. Furthermore, load is an ill-suited metric for solving certain task placement imbalance scenarios. For instance, in the presence of idle CPUs, we should simply try to get at least one task per CPU, whereas the current load-based algorithm can actually leave idle CPUs alone simply because the load is somewhat balanced. The current algorithm ends up creating virtual and meaningless values like the avg_load_per_task or tweaks the state of a group to make it overloaded whereas it's not, in order to try to migrate tasks. load_balance() should better qualify the imbalance of the group and clearly define what has to be moved to fix this imbalance. The type of sched_group has been extended to better reflect the type of imbalance. We now have: group_has_spare group_fully_busy group_misfit_task group_asym_packing group_imbalanced group_overloaded Based on the type of sched_group, load_balance now sets what it wants to move in order to fix the imbalance. It can be some load as before but also some utilization, a number of task or a type of task: migrate_task migrate_util migrate_load migrate_misfit This new load_balance() algorithm fixes several pending wrong tasks placement: - the 1 task per CPU case with asymmetric system - the case of cfs task preempted by other class - the case of tasks not evenly spread on groups with spare capacity Also the load balance decisions have been consolidated in the 3 functions below after removing the few bypasses and hacks of the current code: - update_sd_pick_busiest() select the busiest sched_group. - find_busiest_group() checks if there is an imbalance between local and busiest group. - calculate_imbalance() decides what have to be moved. Finally, the now unused field total_running of struct sd_lb_stats has been removed. Signed-off-by: Vincent Guittot Cc: Ben Segall Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: hdanton@sina.com Cc: parth@linux.ibm.com Cc: pauld@redhat.com Cc: quentin.perret@arm.com Cc: riel@surriel.com Cc: srikar@linux.vnet.ibm.com Cc: valentin.schneider@arm.com Link: https://lkml.kernel.org/r/1571405198-27570-5-git-send-email-vincent.guittot@linaro.org [ Small readability and spelling updates. ] Signed-off-by: Ingo Molnar Signed-off-by: Chengming Zhou --- kernel/sched/fair.c | 611 +++++++++++++++++++++++++++++--------------- 1 file changed, 402 insertions(+), 209 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3cd8e28e062c7..10c92919e5f16 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7120,11 +7120,26 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; enum fbq_type { regular, remote, all }; +/* + * group_type describes the group of CPUs at the moment of the load balance. + * The enum is ordered by pulling priority, with the group with lowest priority + * first so the groupe_type can be simply compared when selecting the busiest + * group. see update_sd_pick_busiest(). + */ enum group_type { - group_other = 0, + group_has_spare = 0, + group_fully_busy, group_misfit_task, + group_asym_packing, group_imbalanced, - group_overloaded, + group_overloaded +}; + +enum migration_type { + migrate_load = 0, + migrate_util, + migrate_task, + migrate_misfit }; #define LBF_ALL_PINNED 0x01 @@ -7157,7 +7172,7 @@ struct lb_env { unsigned int loop_max; enum fbq_type fbq_type; - enum group_type src_grp_type; + enum migration_type migration_type; struct list_head tasks; }; @@ -7384,7 +7399,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) static const unsigned int sched_nr_migrate_break = 32; /* - * detach_tasks() -- tries to detach up to imbalance runnable load from + * detach_tasks() -- tries to detach up to imbalance load/util/tasks from * busiest_rq, as part of a balancing operation within domain "sd". * * Returns number of detached tasks if successful and 0 otherwise. @@ -7392,8 +7407,8 @@ static const unsigned int sched_nr_migrate_break = 32; static int detach_tasks(struct lb_env *env) { struct list_head *tasks = &env->src_rq->cfs_tasks; + unsigned long util, load; struct task_struct *p; - unsigned long load; int detached = 0; lockdep_assert_held(&env->src_rq->lock); @@ -7426,19 +7441,51 @@ static int detach_tasks(struct lb_env *env) if (!can_migrate_task(p, env)) goto next; - load = task_h_load(p); + switch (env->migration_type) { + case migrate_load: + load = task_h_load(p); - if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) - goto next; + if (sched_feat(LB_MIN) && + load < 16 && !env->sd->nr_balance_failed) + goto next; - if ((load / 2) > env->imbalance) - goto next; + if (load/2 > env->imbalance) + goto next; + + env->imbalance -= load; + break; + + case migrate_util: + util = task_util_est(p); + + if (util > env->imbalance) + goto next; + + env->imbalance -= util; + break; + + case migrate_task: + env->imbalance--; + break; + + case migrate_misfit: + load = task_h_load(p); + + /* + * Load of misfit task might decrease a bit since it has + * been recorded. Be conservative in the condition. + */ + if (load/2 < env->imbalance) + goto next; + + env->imbalance = 0; + break; + } detach_task(p, env); list_add(&p->se.group_node, &env->tasks); detached++; - env->imbalance -= load; #ifdef CONFIG_PREEMPTION /* @@ -7452,7 +7499,7 @@ static int detach_tasks(struct lb_env *env) /* * We only want to steal up to the prescribed amount of - * runnable load. + * load/util/tasks. */ if (env->imbalance <= 0) break; @@ -7740,7 +7787,6 @@ struct sg_lb_stats { unsigned int idle_cpus; unsigned int group_weight; enum group_type group_type; - int group_no_capacity; unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ #ifdef CONFIG_NUMA_BALANCING @@ -7756,10 +7802,10 @@ struct sg_lb_stats { struct sd_lb_stats { struct sched_group *busiest; /* Busiest group in this sd */ struct sched_group *local; /* Local group in this sd */ - unsigned long total_running; unsigned long total_load; /* Total load of all groups in sd */ unsigned long total_capacity; /* Total capacity of all groups in sd */ unsigned long avg_load; /* Average load across all groups in sd */ + unsigned int prefer_sibling; /* tasks should go to sibling first */ struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ struct sg_lb_stats local_stat; /* Statistics of the local group */ @@ -7770,19 +7816,18 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) /* * Skimp on the clearing to avoid duplicate work. We can avoid clearing * local_stat because update_sg_lb_stats() does a full clear/assignment. - * We must however clear busiest_stat::avg_load because - * update_sd_pick_busiest() reads this before assignment. + * We must however set busiest_stat::group_type and + * busiest_stat::idle_cpus to the worst busiest group because + * update_sd_pick_busiest() reads these before assignment. */ *sds = (struct sd_lb_stats){ .busiest = NULL, .local = NULL, - .total_running = 0UL, .total_load = 0UL, .total_capacity = 0UL, .busiest_stat = { - .avg_load = 0UL, - .sum_h_nr_running = 0, - .group_type = group_other, + .idle_cpus = UINT_MAX, + .group_type = group_has_spare, }, }; } @@ -8026,19 +8071,26 @@ group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref) } static inline enum -group_type group_classify(struct sched_group *group, +group_type group_classify(struct lb_env *env, + struct sched_group *group, struct sg_lb_stats *sgs) { - if (sgs->group_no_capacity) + if (group_is_overloaded(env, sgs)) return group_overloaded; if (sg_imbalanced(group)) return group_imbalanced; + if (sgs->group_asym_packing) + return group_asym_packing; + if (sgs->group_misfit_task_load) return group_misfit_task; - return group_other; + if (!group_has_capacity(env, sgs)) + return group_fully_busy; + + return group_has_spare; } static bool update_nohz_stats(struct rq *rq, bool force) @@ -8075,10 +8127,12 @@ static inline void update_sg_lb_stats(struct lb_env *env, struct sg_lb_stats *sgs, int *sg_status) { - int i, nr_running; + int i, nr_running, local_group; memset(sgs, 0, sizeof(*sgs)); + local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group)); + for_each_cpu_and(i, sched_group_span(group), env->cpus) { struct rq *rq = cpu_rq(i); @@ -8103,9 +8157,16 @@ static inline void update_sg_lb_stats(struct lb_env *env, /* * No need to call idle_cpu() if nr_running is not 0 */ - if (!nr_running && idle_cpu(i)) + if (!nr_running && idle_cpu(i)) { sgs->idle_cpus++; + /* Idle cpu can't have misfit task */ + continue; + } + + if (local_group) + continue; + /* Check for a misfit task on the cpu */ if (env->sd->flags & SD_ASYM_CPUCAPACITY && sgs->group_misfit_task_load < rq->misfit_task_load) { sgs->group_misfit_task_load = rq->misfit_task_load; @@ -8113,14 +8174,24 @@ static inline void update_sg_lb_stats(struct lb_env *env, } } - /* Adjust by relative CPU capacity of the group */ + /* Check if dst CPU is idle and preferred to this group */ + if (env->sd->flags & SD_ASYM_PACKING && + env->idle != CPU_NOT_IDLE && + sgs->sum_h_nr_running && + sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) { + sgs->group_asym_packing = 1; + } + sgs->group_capacity = group->sgc->capacity; - sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; sgs->group_weight = group->group_weight; - sgs->group_no_capacity = group_is_overloaded(env, sgs); - sgs->group_type = group_classify(group, sgs); + sgs->group_type = group_classify(env, group, sgs); + + /* Computing avg_load makes sense only when group is overloaded */ + if (sgs->group_type == group_overloaded) + sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / + sgs->group_capacity; } /** @@ -8143,6 +8214,10 @@ static bool update_sd_pick_busiest(struct lb_env *env, { struct sg_lb_stats *busiest = &sds->busiest_stat; + /* Make sure that there is at least one task to pull */ + if (!sgs->sum_h_nr_running) + return false; + /* * Don't try to pull misfit tasks we can't help. * We can use max_capacity here as reduction in capacity on some @@ -8151,7 +8226,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, */ if (sgs->group_type == group_misfit_task && (!group_smaller_max_cpu_capacity(sg, sds->local) || - !group_has_capacity(env, &sds->local_stat))) + sds->local_stat.group_type != group_has_spare)) return false; if (sgs->group_type > busiest->group_type) @@ -8160,62 +8235,80 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (sgs->group_type < busiest->group_type) return false; - if (sgs->avg_load <= busiest->avg_load) - return false; - - if (!(env->sd->flags & SD_ASYM_CPUCAPACITY)) - goto asym_packing; - /* - * Candidate sg has no more than one task per CPU and - * has higher per-CPU capacity. Migrating tasks to less - * capable CPUs may harm throughput. Maximize throughput, - * power/energy consequences are not considered. + * The candidate and the current busiest group are the same type of + * group. Let check which one is the busiest according to the type. */ - if (sgs->sum_h_nr_running <= sgs->group_weight && - group_smaller_min_cpu_capacity(sds->local, sg)) - return false; - /* - * If we have more than one misfit sg go with the biggest misfit. - */ - if (sgs->group_type == group_misfit_task && - sgs->group_misfit_task_load < busiest->group_misfit_task_load) + switch (sgs->group_type) { + case group_overloaded: + /* Select the overloaded group with highest avg_load. */ + if (sgs->avg_load <= busiest->avg_load) + return false; + break; + + case group_imbalanced: + /* + * Select the 1st imbalanced group as we don't have any way to + * choose one more than another. + */ return false; -asym_packing: - /* This is the busiest node in its class. */ - if (!(env->sd->flags & SD_ASYM_PACKING)) - return true; + case group_asym_packing: + /* Prefer to move from lowest priority CPU's work */ + if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu)) + return false; + break; - /* No ASYM_PACKING if target CPU is already busy */ - if (env->idle == CPU_NOT_IDLE) - return true; - /* - * ASYM_PACKING needs to move all the work to the highest - * prority CPUs in the group, therefore mark all groups - * of lower priority than ourself as busy. - * - * This is primarily intended to used at the sibling level. Some - * cores like POWER7 prefer to use lower numbered SMT threads. In the - * case of POWER7, it can move to lower SMT modes only when higher - * threads are idle. When in lower SMT modes, the threads will - * perform better since they share less core resources. Hence when we - * have idle threads, we want them to be the higher ones. - */ - if (sgs->sum_h_nr_running && - sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) { - sgs->group_asym_packing = 1; - if (!sds->busiest) - return true; + case group_misfit_task: + /* + * If we have more than one misfit sg go with the biggest + * misfit. + */ + if (sgs->group_misfit_task_load < busiest->group_misfit_task_load) + return false; + break; - /* Prefer to move from lowest priority CPU's work */ - if (sched_asym_prefer(sds->busiest->asym_prefer_cpu, - sg->asym_prefer_cpu)) - return true; + case group_fully_busy: + /* + * Select the fully busy group with highest avg_load. In + * theory, there is no need to pull task from such kind of + * group because tasks have all compute capacity that they need + * but we can still improve the overall throughput by reducing + * contention when accessing shared HW resources. + * + * XXX for now avg_load is not computed and always 0 so we + * select the 1st one. + */ + if (sgs->avg_load <= busiest->avg_load) + return false; + break; + + case group_has_spare: + /* + * Select not overloaded group with lowest number of + * idle cpus. We could also compare the spare capacity + * which is more stable but it can end up that the + * group has less spare capacity but finally more idle + * CPUs which means less opportunity to pull tasks. + */ + if (sgs->idle_cpus >= busiest->idle_cpus) + return false; + break; } - return false; + /* + * Candidate sg has no more than one task per CPU and has higher + * per-CPU capacity. Migrating tasks to less capable CPUs may harm + * throughput. Maximize throughput, power/energy consequences are not + * considered. + */ + if ((env->sd->flags & SD_ASYM_CPUCAPACITY) && + (sgs->group_type <= group_fully_busy) && + (group_smaller_min_cpu_capacity(sds->local, sg))) + return false; + + return true; } #ifdef CONFIG_NUMA_BALANCING @@ -8253,13 +8346,13 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) * @env: The load balancing environment. * @sds: variable to hold the statistics for this sched_domain. */ + static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) { struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; - bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING; int sg_status = 0; #ifdef CONFIG_NO_HZ_COMMON @@ -8286,22 +8379,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd if (local_group) goto next_group; - /* - * In case the child domain prefers tasks go to siblings - * first, lower the sg capacity so that we'll try - * and move all the excess tasks away. We lower the capacity - * of a group only if the local group has the capacity to fit - * these excess tasks. The extra check prevents the case where - * you always pull from the heaviest group when it is already - * under-utilized (possible with a large weight task outweighs - * the tasks on the system). - */ - if (prefer_sibling && sds->local && - group_has_capacity(env, local) && - (sgs->sum_h_nr_running > local->sum_h_nr_running + 1)) { - sgs->group_no_capacity = 1; - sgs->group_type = group_classify(sg, sgs); - } if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; @@ -8310,13 +8387,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd next_group: /* Now, start updating sd_lb_stats */ - sds->total_running += sgs->sum_h_nr_running; sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity; sg = sg->next; } while (sg != env->sd->groups); + /* Tag domain that child domain prefers tasks go to siblings first */ + sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING; + #ifdef CONFIG_NO_HZ_COMMON if ((env->flags & LBF_NOHZ_AGAIN) && cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) { @@ -8354,69 +8433,149 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd */ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) { - unsigned long max_pull, load_above_capacity = ~0UL; struct sg_lb_stats *local, *busiest; local = &sds->local_stat; busiest = &sds->busiest_stat; - if (busiest->group_asym_packing) { - env->imbalance = busiest->group_load; + if (busiest->group_type == group_misfit_task) { + /* Set imbalance to allow misfit tasks to be balanced. */ + env->migration_type = migrate_misfit; + env->imbalance = busiest->group_misfit_task_load; + return; + } + + if (busiest->group_type == group_asym_packing) { + /* + * In case of asym capacity, we will try to migrate all load to + * the preferred CPU. + */ + env->migration_type = migrate_task; + env->imbalance = busiest->sum_h_nr_running; + return; + } + + if (busiest->group_type == group_imbalanced) { + /* + * In the group_imb case we cannot rely on group-wide averages + * to ensure CPU-load equilibrium, try to move any task to fix + * the imbalance. The next load balance will take care of + * balancing back the system. + */ + env->migration_type = migrate_task; + env->imbalance = 1; return; } /* - * Avg load of busiest sg can be less and avg load of local sg can - * be greater than avg load across all sgs of sd because avg load - * factors in sg capacity and sgs with smaller group_type are - * skipped when updating the busiest sg: + * Try to use spare capacity of local group without overloading it or + * emptying busiest */ - if (busiest->group_type != group_misfit_task && - (busiest->avg_load <= sds->avg_load || - local->avg_load >= sds->avg_load)) { - env->imbalance = 0; + if (local->group_type == group_has_spare) { + if (busiest->group_type > group_fully_busy) { + /* + * If busiest is overloaded, try to fill spare + * capacity. This might end up creating spare capacity + * in busiest or busiest still being overloaded but + * there is no simple way to directly compute the + * amount of load to migrate in order to balance the + * system. + */ + env->migration_type = migrate_util; + env->imbalance = max(local->group_capacity, local->group_util) - + local->group_util; + + /* + * In some cases, the group's utilization is max or even + * higher than capacity because of migrations but the + * local CPU is (newly) idle. There is at least one + * waiting task in this overloaded busiest group. Let's + * try to pull it. + */ + if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) { + env->migration_type = migrate_task; + env->imbalance = 1; + } + + return; + } + + if (busiest->group_weight == 1 || sds->prefer_sibling) { + unsigned int nr_diff = busiest->sum_h_nr_running; + /* + * When prefer sibling, evenly spread running tasks on + * groups. + */ + env->migration_type = migrate_task; + lsub_positive(&nr_diff, local->sum_h_nr_running); + env->imbalance = nr_diff >> 1; + return; + } + + /* + * If there is no overload, we just want to even the number of + * idle cpus. + */ + env->migration_type = migrate_task; + env->imbalance = max_t(long, 0, (local->idle_cpus - + busiest->idle_cpus) >> 1); return; } /* - * If there aren't any idle CPUs, avoid creating some. + * Local is fully busy but has to take more load to relieve the + * busiest group */ - if (busiest->group_type == group_overloaded && - local->group_type == group_overloaded) { - load_above_capacity = busiest->sum_h_nr_running * SCHED_CAPACITY_SCALE; - if (load_above_capacity > busiest->group_capacity) { - load_above_capacity -= busiest->group_capacity; - load_above_capacity *= scale_load_down(NICE_0_LOAD); - load_above_capacity /= busiest->group_capacity; - } else - load_above_capacity = ~0UL; + if (local->group_type < group_overloaded) { + /* + * Local will become overloaded so the avg_load metrics are + * finally needed. + */ + + local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) / + local->group_capacity; + + sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / + sds->total_capacity; } /* - * We're trying to get all the CPUs to the average_load, so we don't - * want to push ourselves above the average load, nor do we wish to - * reduce the max loaded CPU below the average load. At the same time, - * we also don't want to reduce the group load below the group - * capacity. Thus we look for the minimum possible imbalance. + * Both group are or will become overloaded and we're trying to get all + * the CPUs to the average_load, so we don't want to push ourselves + * above the average load, nor do we wish to reduce the max loaded CPU + * below the average load. At the same time, we also don't want to + * reduce the group load below the group capacity. Thus we look for + * the minimum possible imbalance. */ - max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity); - - /* How much load to actually move to equalise the imbalance */ + env->migration_type = migrate_load; env->imbalance = min( - max_pull * busiest->group_capacity, + (busiest->avg_load - sds->avg_load) * busiest->group_capacity, (sds->avg_load - local->avg_load) * local->group_capacity ) / SCHED_CAPACITY_SCALE; - - /* Boost imbalance to allow misfit task to be balanced. */ - if (busiest->group_type == group_misfit_task) { - env->imbalance = max_t(long, env->imbalance, - busiest->group_misfit_task_load); - } - } /******* find_busiest_group() helpers end here *********************/ +/* + * Decision matrix according to the local and busiest group type: + * + * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded + * has_spare nr_idle balanced N/A N/A balanced balanced + * fully_busy nr_idle nr_idle N/A N/A balanced balanced + * misfit_task force N/A N/A N/A force force + * asym_packing force force N/A N/A force force + * imbalanced force force N/A N/A force force + * overloaded force force N/A N/A force avg_load + * + * N/A : Not Applicable because already filtered while updating + * statistics. + * balanced : The system is balanced for these 2 groups. + * force : Calculate the imbalance as load migration is probably needed. + * avg_load : Only if imbalance is significant enough. + * nr_idle : dst_cpu is not busy and the number of idle CPUs is quite + * different in groups. + */ + /** * find_busiest_group - Returns the busiest group within the sched_domain * if there is an imbalance. @@ -8451,17 +8610,17 @@ static struct sched_group *find_busiest_group(struct lb_env *env) local = &sds.local_stat; busiest = &sds.busiest_stat; - /* ASYM feature bypasses nice load balance check */ - if (busiest->group_asym_packing) - goto force_balance; - /* There is no busy sibling group to pull tasks from */ - if (!sds.busiest || busiest->sum_h_nr_running == 0) + if (!sds.busiest) goto out_balanced; - /* XXX broken for overlapping NUMA groups */ - sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load) - / sds.total_capacity; + /* Misfit tasks should be dealt with regardless of the avg load */ + if (busiest->group_type == group_misfit_task) + goto force_balance; + + /* ASYM feature bypasses nice load balance check */ + if (busiest->group_type == group_asym_packing) + goto force_balance; /* * If the busiest group is imbalanced the below checks don't @@ -8471,56 +8630,65 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (busiest->group_type == group_imbalanced) goto force_balance; - /* - * When dst_cpu is idle, prevent SMP nice and/or asymmetric group - * capacities from resulting in underutilization due to avg_load. - */ - if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) && - busiest->group_no_capacity) - goto force_balance; - - /* Misfit tasks should be dealt with regardless of the avg load */ - if (busiest->group_type == group_misfit_task) - goto force_balance; - /* * If the local group is busier than the selected busiest group * don't try and pull any tasks. */ - if (local->avg_load >= busiest->avg_load) + if (local->group_type > busiest->group_type) goto out_balanced; /* - * Don't pull any tasks if this group is already above the domain - * average load. + * When groups are overloaded, use the avg_load to ensure fairness + * between tasks. */ - if (local->avg_load >= sds.avg_load) - goto out_balanced; + if (local->group_type == group_overloaded) { + /* + * If the local group is more loaded than the selected + * busiest group don't try to pull any tasks. + */ + if (local->avg_load >= busiest->avg_load) + goto out_balanced; + + /* XXX broken for overlapping NUMA groups */ + sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) / + sds.total_capacity; - if (env->idle == CPU_IDLE) { /* - * This CPU is idle. If the busiest group is not overloaded - * and there is no imbalance between this and busiest group - * wrt idle CPUs, it is balanced. The imbalance becomes - * significant if the diff is greater than 1 otherwise we - * might end up to just move the imbalance on another group + * Don't pull any tasks if this group is already above the + * domain average load. */ - if ((busiest->group_type != group_overloaded) && - (local->idle_cpus <= (busiest->idle_cpus + 1))) + if (local->avg_load >= sds.avg_load) goto out_balanced; - } else { + /* - * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use - * imbalance_pct to be conservative. + * If the busiest group is more loaded, use imbalance_pct to be + * conservative. */ if (100 * busiest->avg_load <= env->sd->imbalance_pct * local->avg_load) goto out_balanced; } + /* Try to move all excess tasks to child's sibling domain */ + if (sds.prefer_sibling && local->group_type == group_has_spare && + busiest->sum_h_nr_running > local->sum_h_nr_running + 1) + goto force_balance; + + if (busiest->group_type != group_overloaded && + (env->idle == CPU_NOT_IDLE || + local->idle_cpus <= (busiest->idle_cpus + 1))) + /* + * If the busiest group is not overloaded + * and there is no imbalance between this and busiest group + * wrt. idle CPUs, it is balanced. The imbalance + * becomes significant if the diff is greater than 1 otherwise + * we might end up just moving the imbalance to another + * group. + */ + goto out_balanced; + force_balance: /* Looks like there is an imbalance. Compute it */ - env->src_grp_type = busiest->group_type; calculate_imbalance(env, &sds); return env->imbalance ? sds.busiest : NULL; @@ -8536,11 +8704,13 @@ static struct rq *find_busiest_queue(struct lb_env *env, struct sched_group *group) { struct rq *busiest = NULL, *rq; - unsigned long busiest_load = 0, busiest_capacity = 1; + unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1; + unsigned int busiest_nr = 0; int i; for_each_cpu_and(i, sched_group_span(group), env->cpus) { - unsigned long capacity, load; + unsigned long capacity, load, util; + unsigned int nr_running; enum fbq_type rt; rq = cpu_rq(i); @@ -8568,20 +8738,8 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (rt > env->fbq_type) continue; - /* - * For ASYM_CPUCAPACITY domains with misfit tasks we simply - * seek the "biggest" misfit task. - */ - if (env->src_grp_type == group_misfit_task) { - if (rq->misfit_task_load > busiest_load) { - busiest_load = rq->misfit_task_load; - busiest = rq; - } - - continue; - } - capacity = capacity_of(i); + nr_running = rq->cfs.h_nr_running; /* * For ASYM_CPUCAPACITY domains, don't pick a CPU that could @@ -8591,35 +8749,70 @@ static struct rq *find_busiest_queue(struct lb_env *env, */ if (env->sd->flags & SD_ASYM_CPUCAPACITY && capacity_of(env->dst_cpu) < capacity && - rq->nr_running == 1) + nr_running == 1) continue; - load = cpu_runnable_load(rq); + switch (env->migration_type) { + case migrate_load: + /* + * When comparing with load imbalance, use + * cpu_runnable_load() which is not scaled with the CPU + * capacity. + */ + load = cpu_runnable_load(rq); - /* - * When comparing with imbalance, use cpu_runnable_load() - * which is not scaled with the CPU capacity. - */ + if (nr_running == 1 && load > env->imbalance && + !check_cpu_capacity(rq, env->sd)) + break; - if (rq->nr_running == 1 && load > env->imbalance && - !check_cpu_capacity(rq, env->sd)) - continue; + /* + * For the load comparisons with the other CPUs, + * consider the cpu_runnable_load() scaled with the CPU + * capacity, so that the load can be moved away from + * the CPU that is potentially running at a lower + * capacity. + * + * Thus we're looking for max(load_i / capacity_i), + * crosswise multiplication to rid ourselves of the + * division works out to: + * load_i * capacity_j > load_j * capacity_i; + * where j is our previous maximum. + */ + if (load * busiest_capacity > busiest_load * capacity) { + busiest_load = load; + busiest_capacity = capacity; + busiest = rq; + } + break; + + case migrate_util: + util = cpu_util(cpu_of(rq)); + + if (busiest_util < util) { + busiest_util = util; + busiest = rq; + } + break; + + case migrate_task: + if (busiest_nr < nr_running) { + busiest_nr = nr_running; + busiest = rq; + } + break; + + case migrate_misfit: + /* + * For ASYM_CPUCAPACITY domains with misfit tasks we + * simply seek the "biggest" misfit task. + */ + if (rq->misfit_task_load > busiest_load) { + busiest_load = rq->misfit_task_load; + busiest = rq; + } + + break; - /* - * For the load comparisons with the other CPU's, consider - * the cpu_runnable_load() scaled with the CPU capacity, so - * that the load can be moved away from the CPU that is - * potentially running at a lower capacity. - * - * Thus we're looking for max(load_i / capacity_i), crosswise - * multiplication to rid ourselves of the division works out - * to: load_i * capacity_j > load_j * capacity_i; where j is - * our previous maximum. - */ - if (load * busiest_capacity > busiest_load * capacity) { - busiest_load = load; - busiest_capacity = capacity; - busiest = rq; } } @@ -8665,7 +8858,7 @@ voluntary_active_balance(struct lb_env *env) return 1; } - if (env->src_grp_type == group_misfit_task) + if (env->migration_type == migrate_misfit) return 1; return 0; From 28398d15c497e9153bc08ce23d1ff8bc443ce63f Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 18 Oct 2019 15:26:32 +0200 Subject: [PATCH 0285/5344] sched/fair: Use rq->nr_running when balancing load commit 5e23e474431529b7d1480f649ce33d0e9c1b2e48 upstream. CFS load_balance() only takes care of CFS tasks whereas CPUs can be used by other scheduling classes. Typically, a CFS task preempted by an RT or deadline task will not get a chance to be pulled by another CPU because load_balance() doesn't take into account tasks from other classes. Add sum of nr_running in the statistics and use it to detect such situations. Signed-off-by: Vincent Guittot Cc: Ben Segall Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: hdanton@sina.com Cc: parth@linux.ibm.com Cc: pauld@redhat.com Cc: quentin.perret@arm.com Cc: riel@surriel.com Cc: srikar@linux.vnet.ibm.com Cc: valentin.schneider@arm.com Link: https://lkml.kernel.org/r/1571405198-27570-6-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Signed-off-by: Chengming Zhou --- kernel/sched/fair.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 10c92919e5f16..28e3e348bc666 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7783,6 +7783,7 @@ struct sg_lb_stats { unsigned long group_load; /* Total load over the CPUs of the group */ unsigned long group_capacity; unsigned long group_util; /* Total utilization of the group */ + unsigned int sum_nr_running; /* Nr of tasks running in the group */ unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */ unsigned int idle_cpus; unsigned int group_weight; @@ -8019,7 +8020,7 @@ static inline int sg_imbalanced(struct sched_group *group) static inline bool group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) { - if (sgs->sum_h_nr_running < sgs->group_weight) + if (sgs->sum_nr_running < sgs->group_weight) return true; if ((sgs->group_capacity * 100) > @@ -8040,7 +8041,7 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) static inline bool group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) { - if (sgs->sum_h_nr_running <= sgs->group_weight) + if (sgs->sum_nr_running <= sgs->group_weight) return false; if ((sgs->group_capacity * 100) < @@ -8144,6 +8145,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->sum_h_nr_running += rq->cfs.h_nr_running; nr_running = rq->nr_running; + sgs->sum_nr_running += nr_running; + if (nr_running > 1) *sg_status |= SG_OVERLOAD; @@ -8501,13 +8504,13 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s } if (busiest->group_weight == 1 || sds->prefer_sibling) { - unsigned int nr_diff = busiest->sum_h_nr_running; + unsigned int nr_diff = busiest->sum_nr_running; /* * When prefer sibling, evenly spread running tasks on * groups. */ env->migration_type = migrate_task; - lsub_positive(&nr_diff, local->sum_h_nr_running); + lsub_positive(&nr_diff, local->sum_nr_running); env->imbalance = nr_diff >> 1; return; } @@ -8671,7 +8674,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) /* Try to move all excess tasks to child's sibling domain */ if (sds.prefer_sibling && local->group_type == group_has_spare && - busiest->sum_h_nr_running > local->sum_h_nr_running + 1) + busiest->sum_nr_running > local->sum_nr_running + 1) goto force_balance; if (busiest->group_type != group_overloaded && From ae509b2e1668557b5227964efc6ba2cd402571cf Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 18 Oct 2019 15:26:33 +0200 Subject: [PATCH 0286/5344] sched/fair: Use load instead of runnable load in load_balance() commit b0fb1eb4f04ae4768231b9731efb1134e22053a4 upstream. 'runnable load' was originally introduced to take into account the case where blocked load biases the load balance decision which was selecting underutilized groups with huge blocked load whereas other groups were overloaded. The load is now only used when groups are overloaded. In this case, it's worth being conservative and taking into account the sleeping tasks that might wake up on the CPU. Signed-off-by: Vincent Guittot Cc: Ben Segall Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: hdanton@sina.com Cc: parth@linux.ibm.com Cc: pauld@redhat.com Cc: quentin.perret@arm.com Cc: riel@surriel.com Cc: srikar@linux.vnet.ibm.com Cc: valentin.schneider@arm.com Link: https://lkml.kernel.org/r/1571405198-27570-7-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Signed-off-by: Chengming Zhou --- kernel/sched/fair.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 28e3e348bc666..79957a65a9f71 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5426,6 +5426,11 @@ static unsigned long cpu_runnable_load(struct rq *rq) return cfs_rq_runnable_load_avg(&rq->cfs); } +static unsigned long cpu_load(struct rq *rq) +{ + return cfs_rq_load_avg(&rq->cfs); +} + static unsigned long capacity_of(int cpu) { return cpu_rq(cpu)->cpu_capacity; @@ -8140,7 +8145,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) env->flags |= LBF_NOHZ_AGAIN; - sgs->group_load += cpu_runnable_load(rq); + sgs->group_load += cpu_load(rq); sgs->group_util += cpu_util(i); sgs->sum_h_nr_running += rq->cfs.h_nr_running; @@ -8598,7 +8603,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) init_sd_lb_stats(&sds); /* - * Compute the various statistics relavent for load balancing at + * Compute the various statistics relevant for load balancing at * this level. */ update_sd_lb_stats(env, &sds); @@ -8758,11 +8763,10 @@ static struct rq *find_busiest_queue(struct lb_env *env, switch (env->migration_type) { case migrate_load: /* - * When comparing with load imbalance, use - * cpu_runnable_load() which is not scaled with the CPU - * capacity. + * When comparing with load imbalance, use cpu_load() + * which is not scaled with the CPU capacity. */ - load = cpu_runnable_load(rq); + load = cpu_load(rq); if (nr_running == 1 && load > env->imbalance && !check_cpu_capacity(rq, env->sd)) @@ -8770,10 +8774,10 @@ static struct rq *find_busiest_queue(struct lb_env *env, /* * For the load comparisons with the other CPUs, - * consider the cpu_runnable_load() scaled with the CPU - * capacity, so that the load can be moved away from - * the CPU that is potentially running at a lower - * capacity. + * consider the cpu_load() scaled with the CPU + * capacity, so that the load can be moved away + * from the CPU that is potentially running at a + * lower capacity. * * Thus we're looking for max(load_i / capacity_i), * crosswise multiplication to rid ourselves of the From 84ef65fae8ee156d66d7ae385c3c136604a7306e Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 18 Oct 2019 15:26:34 +0200 Subject: [PATCH 0287/5344] sched/fair: Spread out tasks evenly when not overloaded commit 2ab4092fc82d6001fdd9d51dbba27d04dec967e0 upstream. When there is only one CPU per group, using the idle CPUs to evenly spread tasks doesn't make sense and nr_running is a better metrics. Signed-off-by: Vincent Guittot Cc: Ben Segall Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: hdanton@sina.com Cc: parth@linux.ibm.com Cc: pauld@redhat.com Cc: quentin.perret@arm.com Cc: riel@surriel.com Cc: srikar@linux.vnet.ibm.com Cc: valentin.schneider@arm.com Link: https://lkml.kernel.org/r/1571405198-27570-8-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Signed-off-by: Chengming Zhou --- kernel/sched/fair.c | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 79957a65a9f71..2618302fb43c3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8682,18 +8682,34 @@ static struct sched_group *find_busiest_group(struct lb_env *env) busiest->sum_nr_running > local->sum_nr_running + 1) goto force_balance; - if (busiest->group_type != group_overloaded && - (env->idle == CPU_NOT_IDLE || - local->idle_cpus <= (busiest->idle_cpus + 1))) - /* - * If the busiest group is not overloaded - * and there is no imbalance between this and busiest group - * wrt. idle CPUs, it is balanced. The imbalance - * becomes significant if the diff is greater than 1 otherwise - * we might end up just moving the imbalance to another - * group. - */ - goto out_balanced; + if (busiest->group_type != group_overloaded) { + if (env->idle == CPU_NOT_IDLE) + /* + * If the busiest group is not overloaded (and as a + * result the local one too) but this CPU is already + * busy, let another idle CPU try to pull task. + */ + goto out_balanced; + + if (busiest->group_weight > 1 && + local->idle_cpus <= (busiest->idle_cpus + 1)) + /* + * If the busiest group is not overloaded + * and there is no imbalance between this and busiest + * group wrt idle CPUs, it is balanced. The imbalance + * becomes significant if the diff is greater than 1 + * otherwise we might end up to just move the imbalance + * on another group. Of course this applies only if + * there is more than 1 CPU per group. + */ + goto out_balanced; + + if (busiest->sum_h_nr_running == 1) + /* + * busiest doesn't have any tasks waiting to run + */ + goto out_balanced; + } force_balance: /* Looks like there is an imbalance. Compute it */ From 347ba3c31b29133907c13ef6546f4a5821109b64 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 18 Oct 2019 15:26:35 +0200 Subject: [PATCH 0288/5344] sched/fair: Use utilization to select misfit task commit c63be7be59de65d12ff7b4329acea99cf734d6de upstream. Utilization is used to detect a misfit task but the load is then used to select the task on the CPU which can lead to select a small task with high weight instead of the task that triggered the misfit migration. Check that task can't fit the CPU's capacity when selecting the misfit task instead of using the load. Signed-off-by: Vincent Guittot Acked-by: Valentin Schneider Cc: Ben Segall Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: hdanton@sina.com Cc: parth@linux.ibm.com Cc: pauld@redhat.com Cc: quentin.perret@arm.com Cc: riel@surriel.com Cc: srikar@linux.vnet.ibm.com Link: https://lkml.kernel.org/r/1571405198-27570-9-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Signed-off-by: Chengming Zhou --- kernel/sched/fair.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2618302fb43c3..b5e092eb84063 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7474,13 +7474,8 @@ static int detach_tasks(struct lb_env *env) break; case migrate_misfit: - load = task_h_load(p); - - /* - * Load of misfit task might decrease a bit since it has - * been recorded. Be conservative in the condition. - */ - if (load/2 < env->imbalance) + /* This is not a misfit task */ + if (task_fits_capacity(p, capacity_of(env->src_cpu))) goto next; env->imbalance = 0; @@ -8449,7 +8444,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s if (busiest->group_type == group_misfit_task) { /* Set imbalance to allow misfit tasks to be balanced. */ env->migration_type = migrate_misfit; - env->imbalance = busiest->group_misfit_task_load; + env->imbalance = 1; return; } From 1810da1d77c7f186e2b1fc28a8d8b9a187988a39 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 18 Oct 2019 15:26:36 +0200 Subject: [PATCH 0289/5344] sched/fair: Use load instead of runnable load in wakeup path commit 11f10e5420f6cecac7d4823638bff040c257aba9 upstream. Runnable load was originally introduced to take into account the case where blocked load biases the wake up path which may end to select an overloaded CPU with a large number of runnable tasks instead of an underutilized CPU with a huge blocked load. Tha wake up path now starts looking for idle CPUs before comparing runnable load and it's worth aligning the wake up path with the load_balance() logic. Signed-off-by: Vincent Guittot Cc: Ben Segall Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: hdanton@sina.com Cc: parth@linux.ibm.com Cc: pauld@redhat.com Cc: quentin.perret@arm.com Cc: riel@surriel.com Cc: srikar@linux.vnet.ibm.com Cc: valentin.schneider@arm.com Link: https://lkml.kernel.org/r/1571405198-27570-10-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Signed-off-by: Chengming Zhou --- kernel/sched/fair.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b5e092eb84063..13d3be2b631bd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1475,7 +1475,12 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; } -static unsigned long cpu_runnable_load(struct rq *rq); +static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); + +static unsigned long cpu_runnable_load(struct rq *rq) +{ + return cfs_rq_runnable_load_avg(&rq->cfs); +} /* Cached statistics for all CPUs within a node */ struct numa_stats { @@ -5421,11 +5426,6 @@ static int sched_idle_cpu(int cpu) rq->nr_running); } -static unsigned long cpu_runnable_load(struct rq *rq) -{ - return cfs_rq_runnable_load_avg(&rq->cfs); -} - static unsigned long cpu_load(struct rq *rq) { return cfs_rq_load_avg(&rq->cfs); @@ -5526,7 +5526,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, s64 this_eff_load, prev_eff_load; unsigned long task_load; - this_eff_load = cpu_runnable_load(cpu_rq(this_cpu)); + this_eff_load = cpu_load(cpu_rq(this_cpu)); if (sync) { unsigned long current_load = task_h_load(current); @@ -5544,7 +5544,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, this_eff_load *= 100; this_eff_load *= capacity_of(prev_cpu); - prev_eff_load = cpu_runnable_load(cpu_rq(prev_cpu)); + prev_eff_load = cpu_load(cpu_rq(prev_cpu)); prev_eff_load -= task_load; if (sched_feat(WA_BIAS)) prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; @@ -5632,7 +5632,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, max_spare_cap = 0; for_each_cpu(i, sched_group_span(group)) { - load = cpu_runnable_load(cpu_rq(i)); + load = cpu_load(cpu_rq(i)); runnable_load += load; avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); @@ -5773,7 +5773,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this continue; } - load = cpu_runnable_load(cpu_rq(i)); + load = cpu_load(cpu_rq(i)); if (load < min_load) { min_load = load; least_loaded_cpu = i; From d590e60a4a43c20faf4ec744d032d7437fa4cfba Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 18 Oct 2019 15:26:37 +0200 Subject: [PATCH 0290/5344] sched/fair: Optimize find_idlest_group() commit fc1273f4cefe6670d528715581c848abf64f391c upstream. find_idlest_group() now reads CPU's load_avg in two different ways. Consolidate the function to read and use load_avg only once and simplify the algorithm to only look for the group with lowest load_avg. Signed-off-by: Vincent Guittot Cc: Ben Segall Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: hdanton@sina.com Cc: parth@linux.ibm.com Cc: pauld@redhat.com Cc: quentin.perret@arm.com Cc: riel@surriel.com Cc: srikar@linux.vnet.ibm.com Cc: valentin.schneider@arm.com Link: https://lkml.kernel.org/r/1571405198-27570-11-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Signed-off-by: Chengming Zhou --- kernel/sched/fair.c | 50 +++++++++++++-------------------------------- 1 file changed, 14 insertions(+), 36 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 13d3be2b631bd..a4128ea08f890 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5601,16 +5601,14 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, { struct sched_group *idlest = NULL, *group = sd->groups; struct sched_group *most_spare_sg = NULL; - unsigned long min_runnable_load = ULONG_MAX; - unsigned long this_runnable_load = ULONG_MAX; - unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX; + unsigned long min_load = ULONG_MAX, this_load = ULONG_MAX; unsigned long most_spare = 0, this_spare = 0; int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; unsigned long imbalance = scale_load_down(NICE_0_LOAD) * (sd->imbalance_pct-100) / 100; do { - unsigned long load, avg_load, runnable_load; + unsigned long load; unsigned long spare_cap, max_spare_cap; int local_group; int i; @@ -5627,15 +5625,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, * Tally up the load of all CPUs in the group and find * the group containing the CPU with most spare capacity. */ - avg_load = 0; - runnable_load = 0; + load = 0; max_spare_cap = 0; for_each_cpu(i, sched_group_span(group)) { - load = cpu_load(cpu_rq(i)); - runnable_load += load; - - avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); + load += cpu_load(cpu_rq(i)); spare_cap = capacity_spare_without(i, p); @@ -5644,31 +5638,15 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, } /* Adjust by relative CPU capacity of the group */ - avg_load = (avg_load * SCHED_CAPACITY_SCALE) / - group->sgc->capacity; - runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) / + load = (load * SCHED_CAPACITY_SCALE) / group->sgc->capacity; if (local_group) { - this_runnable_load = runnable_load; - this_avg_load = avg_load; + this_load = load; this_spare = max_spare_cap; } else { - if (min_runnable_load > (runnable_load + imbalance)) { - /* - * The runnable load is significantly smaller - * so we can pick this new CPU: - */ - min_runnable_load = runnable_load; - min_avg_load = avg_load; - idlest = group; - } else if ((runnable_load < (min_runnable_load + imbalance)) && - (100*min_avg_load > imbalance_scale*avg_load)) { - /* - * The runnable loads are close so take the - * blocked load into account through avg_load: - */ - min_avg_load = avg_load; + if (load < min_load) { + min_load = load; idlest = group; } @@ -5709,18 +5687,18 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, * local domain to be very lightly loaded relative to the remote * domains but "imbalance" skews the comparison making remote CPUs * look much more favourable. When considering cross-domain, add - * imbalance to the runnable load on the remote node and consider - * staying local. + * imbalance to the load on the remote node and consider staying + * local. */ if ((sd->flags & SD_NUMA) && - min_runnable_load + imbalance >= this_runnable_load) + min_load + imbalance >= this_load) return NULL; - if (min_runnable_load > (this_runnable_load + imbalance)) + if (min_load >= this_load + imbalance) return NULL; - if ((this_runnable_load < (min_runnable_load + imbalance)) && - (100*this_avg_load < imbalance_scale*min_avg_load)) + if ((this_load < (min_load + imbalance)) && + (100*this_load < imbalance_scale*min_load)) return NULL; return idlest; From 4e444def9655b1ad49ea90f976f5257156165869 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 18 Oct 2019 15:26:38 +0200 Subject: [PATCH 0291/5344] sched/fair: Rework find_idlest_group() commit 57abff067a084889b6e06137e61a3dc3458acd56 upstream. The slow wake up path computes per sched_group statisics to select the idlest group, which is quite similar to what load_balance() is doing for selecting busiest group. Rework find_idlest_group() to classify the sched_group and select the idlest one following the same steps as load_balance(). Signed-off-by: Vincent Guittot Cc: Ben Segall Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: hdanton@sina.com Cc: parth@linux.ibm.com Cc: pauld@redhat.com Cc: quentin.perret@arm.com Cc: riel@surriel.com Cc: srikar@linux.vnet.ibm.com Cc: valentin.schneider@arm.com Link: https://lkml.kernel.org/r/1571405198-27570-12-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Signed-off-by: Chengming Zhou --- kernel/sched/fair.c | 384 +++++++++++++++++++++++++++++--------------- 1 file changed, 256 insertions(+), 128 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a4128ea08f890..b13a91daa1fe2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5582,127 +5582,9 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, return target; } -static unsigned long cpu_util_without(int cpu, struct task_struct *p); - -static unsigned long capacity_spare_without(int cpu, struct task_struct *p) -{ - return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0); -} - -/* - * find_idlest_group finds and returns the least busy CPU group within the - * domain. - * - * Assumes p is allowed on at least one CPU in sd. - */ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int sd_flag) -{ - struct sched_group *idlest = NULL, *group = sd->groups; - struct sched_group *most_spare_sg = NULL; - unsigned long min_load = ULONG_MAX, this_load = ULONG_MAX; - unsigned long most_spare = 0, this_spare = 0; - int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; - unsigned long imbalance = scale_load_down(NICE_0_LOAD) * - (sd->imbalance_pct-100) / 100; - - do { - unsigned long load; - unsigned long spare_cap, max_spare_cap; - int local_group; - int i; - - /* Skip over this group if it has no CPUs allowed */ - if (!cpumask_intersects(sched_group_span(group), - p->cpus_ptr)) - continue; - - local_group = cpumask_test_cpu(this_cpu, - sched_group_span(group)); - - /* - * Tally up the load of all CPUs in the group and find - * the group containing the CPU with most spare capacity. - */ - load = 0; - max_spare_cap = 0; - - for_each_cpu(i, sched_group_span(group)) { - load += cpu_load(cpu_rq(i)); - - spare_cap = capacity_spare_without(i, p); - - if (spare_cap > max_spare_cap) - max_spare_cap = spare_cap; - } - - /* Adjust by relative CPU capacity of the group */ - load = (load * SCHED_CAPACITY_SCALE) / - group->sgc->capacity; - - if (local_group) { - this_load = load; - this_spare = max_spare_cap; - } else { - if (load < min_load) { - min_load = load; - idlest = group; - } - - if (most_spare < max_spare_cap) { - most_spare = max_spare_cap; - most_spare_sg = group; - } - } - } while (group = group->next, group != sd->groups); - - /* - * The cross-over point between using spare capacity or least load - * is too conservative for high utilization tasks on partially - * utilized systems if we require spare_capacity > task_util(p), - * so we allow for some task stuffing by using - * spare_capacity > task_util(p)/2. - * - * Spare capacity can't be used for fork because the utilization has - * not been set yet, we must first select a rq to compute the initial - * utilization. - */ - if (sd_flag & SD_BALANCE_FORK) - goto skip_spare; - - if (this_spare > task_util(p) / 2 && - imbalance_scale*this_spare > 100*most_spare) - return NULL; - - if (most_spare > task_util(p) / 2) - return most_spare_sg; - -skip_spare: - if (!idlest) - return NULL; - - /* - * When comparing groups across NUMA domains, it's possible for the - * local domain to be very lightly loaded relative to the remote - * domains but "imbalance" skews the comparison making remote CPUs - * look much more favourable. When considering cross-domain, add - * imbalance to the load on the remote node and consider staying - * local. - */ - if ((sd->flags & SD_NUMA) && - min_load + imbalance >= this_load) - return NULL; - - if (min_load >= this_load + imbalance) - return NULL; - - if ((this_load < (min_load + imbalance)) && - (100*this_load < imbalance_scale*min_load)) - return NULL; - - return idlest; -} + int this_cpu, int sd_flag); /* * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group. @@ -5775,7 +5657,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p return prev_cpu; /* - * We need task's util for capacity_spare_without, sync it up to + * We need task's util for cpu_util_without, sync it up to * prev_cpu's last_update_time. */ if (!(sd_flag & SD_BALANCE_FORK)) @@ -7996,13 +7878,13 @@ static inline int sg_imbalanced(struct sched_group *group) * any benefit for the load balance. */ static inline bool -group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) +group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs) { if (sgs->sum_nr_running < sgs->group_weight) return true; if ((sgs->group_capacity * 100) > - (sgs->group_util * env->sd->imbalance_pct)) + (sgs->group_util * imbalance_pct)) return true; return false; @@ -8017,13 +7899,13 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) * false. */ static inline bool -group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) +group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs) { if (sgs->sum_nr_running <= sgs->group_weight) return false; if ((sgs->group_capacity * 100) < - (sgs->group_util * env->sd->imbalance_pct)) + (sgs->group_util * imbalance_pct)) return true; return false; @@ -8050,11 +7932,11 @@ group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref) } static inline enum -group_type group_classify(struct lb_env *env, +group_type group_classify(unsigned int imbalance_pct, struct sched_group *group, struct sg_lb_stats *sgs) { - if (group_is_overloaded(env, sgs)) + if (group_is_overloaded(imbalance_pct, sgs)) return group_overloaded; if (sg_imbalanced(group)) @@ -8066,7 +7948,7 @@ group_type group_classify(struct lb_env *env, if (sgs->group_misfit_task_load) return group_misfit_task; - if (!group_has_capacity(env, sgs)) + if (!group_has_capacity(imbalance_pct, sgs)) return group_fully_busy; return group_has_spare; @@ -8167,7 +8049,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_weight = group->group_weight; - sgs->group_type = group_classify(env, group, sgs); + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); /* Computing avg_load makes sense only when group is overloaded */ if (sgs->group_type == group_overloaded) @@ -8322,6 +8204,252 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) } #endif /* CONFIG_NUMA_BALANCING */ + +struct sg_lb_stats; + +/* + * update_sg_wakeup_stats - Update sched_group's statistics for wakeup. + * @denv: The ched_domain level to look for idlest group. + * @group: sched_group whose statistics are to be updated. + * @sgs: variable to hold the statistics for this group. + */ +static inline void update_sg_wakeup_stats(struct sched_domain *sd, + struct sched_group *group, + struct sg_lb_stats *sgs, + struct task_struct *p) +{ + int i, nr_running; + + memset(sgs, 0, sizeof(*sgs)); + + for_each_cpu(i, sched_group_span(group)) { + struct rq *rq = cpu_rq(i); + + sgs->group_load += cpu_load(rq); + sgs->group_util += cpu_util_without(i, p); + sgs->sum_h_nr_running += rq->cfs.h_nr_running; + + nr_running = rq->nr_running; + sgs->sum_nr_running += nr_running; + + /* + * No need to call idle_cpu() if nr_running is not 0 + */ + if (!nr_running && idle_cpu(i)) + sgs->idle_cpus++; + + + } + + /* Check if task fits in the group */ + if (sd->flags & SD_ASYM_CPUCAPACITY && + !task_fits_capacity(p, group->sgc->max_capacity)) { + sgs->group_misfit_task_load = 1; + } + + sgs->group_capacity = group->sgc->capacity; + + sgs->group_type = group_classify(sd->imbalance_pct, group, sgs); + + /* + * Computing avg_load makes sense only when group is fully busy or + * overloaded + */ + if (sgs->group_type < group_fully_busy) + sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / + sgs->group_capacity; +} + +static bool update_pick_idlest(struct sched_group *idlest, + struct sg_lb_stats *idlest_sgs, + struct sched_group *group, + struct sg_lb_stats *sgs) +{ + if (sgs->group_type < idlest_sgs->group_type) + return true; + + if (sgs->group_type > idlest_sgs->group_type) + return false; + + /* + * The candidate and the current idlest group are the same type of + * group. Let check which one is the idlest according to the type. + */ + + switch (sgs->group_type) { + case group_overloaded: + case group_fully_busy: + /* Select the group with lowest avg_load. */ + if (idlest_sgs->avg_load <= sgs->avg_load) + return false; + break; + + case group_imbalanced: + case group_asym_packing: + /* Those types are not used in the slow wakeup path */ + return false; + + case group_misfit_task: + /* Select group with the highest max capacity */ + if (idlest->sgc->max_capacity >= group->sgc->max_capacity) + return false; + break; + + case group_has_spare: + /* Select group with most idle CPUs */ + if (idlest_sgs->idle_cpus >= sgs->idle_cpus) + return false; + break; + } + + return true; +} + +/* + * find_idlest_group() finds and returns the least busy CPU group within the + * domain. + * + * Assumes p is allowed on at least one CPU in sd. + */ +static struct sched_group * +find_idlest_group(struct sched_domain *sd, struct task_struct *p, + int this_cpu, int sd_flag) +{ + struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups; + struct sg_lb_stats local_sgs, tmp_sgs; + struct sg_lb_stats *sgs; + unsigned long imbalance; + struct sg_lb_stats idlest_sgs = { + .avg_load = UINT_MAX, + .group_type = group_overloaded, + }; + + imbalance = scale_load_down(NICE_0_LOAD) * + (sd->imbalance_pct-100) / 100; + + do { + int local_group; + + /* Skip over this group if it has no CPUs allowed */ + if (!cpumask_intersects(sched_group_span(group), + p->cpus_ptr)) + continue; + + local_group = cpumask_test_cpu(this_cpu, + sched_group_span(group)); + + if (local_group) { + sgs = &local_sgs; + local = group; + } else { + sgs = &tmp_sgs; + } + + update_sg_wakeup_stats(sd, group, sgs, p); + + if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) { + idlest = group; + idlest_sgs = *sgs; + } + + } while (group = group->next, group != sd->groups); + + + /* There is no idlest group to push tasks to */ + if (!idlest) + return NULL; + + /* + * If the local group is idler than the selected idlest group + * don't try and push the task. + */ + if (local_sgs.group_type < idlest_sgs.group_type) + return NULL; + + /* + * If the local group is busier than the selected idlest group + * try and push the task. + */ + if (local_sgs.group_type > idlest_sgs.group_type) + return idlest; + + switch (local_sgs.group_type) { + case group_overloaded: + case group_fully_busy: + /* + * When comparing groups across NUMA domains, it's possible for + * the local domain to be very lightly loaded relative to the + * remote domains but "imbalance" skews the comparison making + * remote CPUs look much more favourable. When considering + * cross-domain, add imbalance to the load on the remote node + * and consider staying local. + */ + + if ((sd->flags & SD_NUMA) && + ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load)) + return NULL; + + /* + * If the local group is less loaded than the selected + * idlest group don't try and push any tasks. + */ + if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance)) + return NULL; + + if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load) + return NULL; + break; + + case group_imbalanced: + case group_asym_packing: + /* Those type are not used in the slow wakeup path */ + return NULL; + + case group_misfit_task: + /* Select group with the highest max capacity */ + if (local->sgc->max_capacity >= idlest->sgc->max_capacity) + return NULL; + break; + + case group_has_spare: + if (sd->flags & SD_NUMA) { +#ifdef CONFIG_NUMA_BALANCING + int idlest_cpu; + /* + * If there is spare capacity at NUMA, try to select + * the preferred node + */ + if (cpu_to_node(this_cpu) == p->numa_preferred_nid) + return NULL; + + idlest_cpu = cpumask_first(sched_group_span(idlest)); + if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid) + return idlest; +#endif + /* + * Otherwise, keep the task on this node to stay close + * its wakeup source and improve locality. If there is + * a real need of migration, periodic load balance will + * take care of it. + */ + if (local_sgs.idle_cpus) + return NULL; + } + + /* + * Select group with highest number of idle CPUs. We could also + * compare the utilization which is more stable but it can end + * up that the group has less spare capacity but finally more + * idle CPUs which means more opportunity to run task. + */ + if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus) + return NULL; + break; + } + + return idlest; +} + /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @env: The load balancing environment. From 9a3c67f2890882fc0859da40d8a950657709ce56 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 10 Jul 2020 17:24:26 +0200 Subject: [PATCH 0292/5344] sched/fair: handle case of task_h_load() returning 0 commit 01cfcde9c26d8555f0e6e9aea9d6049f87683998 upstream. task_h_load() can return 0 in some situations like running stress-ng mmapfork, which forks thousands of threads, in a sched group on a 224 cores system. The load balance doesn't handle this correctly because env->imbalance never decreases and it will stop pulling tasks only after reaching loop_max, which can be equal to the number of running tasks of the cfs. Make sure that imbalance will be decreased by at least 1. misfit task is the other feature that doesn't handle correctly such situation although it's probably more difficult to face the problem because of the smaller number of CPUs and running tasks on heterogenous system. We can't simply ensure that task_h_load() returns at least one because it would imply to handle underflow in other places. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Cc: # v4.4+ Link: https://lkml.kernel.org/r/20200710152426.16981-1-vincent.guittot@linaro.org Signed-off-by: Chengming Zhou --- kernel/sched/fair.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b13a91daa1fe2..f13b961501c93 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3836,7 +3836,11 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) return; } - rq->misfit_task_load = task_h_load(p); + /* + * Make sure that misfit_task_load will not be null even if + * task_h_load() returns 0. + */ + rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1); } #else /* CONFIG_SMP */ @@ -7308,7 +7312,14 @@ static int detach_tasks(struct lb_env *env) switch (env->migration_type) { case migrate_load: - load = task_h_load(p); + /* + * Depending of the number of CPUs and tasks and the + * cgroup hierarchy, task_h_load() can return a null + * value. Make sure that env->imbalance decreases + * otherwise detach_tasks() will stop only after + * detaching up to loop_max tasks. + */ + load = max_t(unsigned long, task_h_load(p), 1); if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) From 3931f3f58e6dc3eea0b5547d14884e7520f76663 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 2 Nov 2020 11:24:57 +0100 Subject: [PATCH 0293/5344] sched/fair: Ensure tasks spreading in LLC during LB commit 16b0a7a1a0af9db6e008fecd195fe4d6cb366d83 upstream. schbench shows latency increase for 95 percentile above since: commit 0b0695f2b34a ("sched/fair: Rework load_balance()") Align the behavior of the load balancer with the wake up path, which tries to select an idle CPU which belongs to the LLC for a waking task. calculate_imbalance() will use nr_running instead of the spare capacity when CPUs share resources (ie cache) at the domain level. This will ensure a better spread of tasks on idle CPUs. Running schbench on a hikey (8cores arm64) shows the problem: tip/sched/core : schbench -m 2 -t 4 -s 10000 -c 1000000 -r 10 Latency percentiles (usec) 50.0th: 33 75.0th: 45 90.0th: 51 95.0th: 4152 *99.0th: 14288 99.5th: 14288 99.9th: 14288 min=0, max=14276 tip/sched/core + patch : schbench -m 2 -t 4 -s 10000 -c 1000000 -r 10 Latency percentiles (usec) 50.0th: 34 75.0th: 47 90.0th: 52 95.0th: 78 *99.0th: 94 99.5th: 94 99.9th: 94 min=0, max=94 Fixes: 0b0695f2b34a ("sched/fair: Rework load_balance()") Reported-by: Chris Mason Suggested-by: Rik van Riel Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Rik van Riel Tested-by: Rik van Riel Link: https://lkml.kernel.org/r/20201102102457.28808-1-vincent.guittot@linaro.org Signed-off-by: Chengming Zhou --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f13b961501c93..e501185d22441 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8592,7 +8592,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * emptying busiest */ if (local->group_type == group_has_spare) { - if (busiest->group_type > group_fully_busy) { + if ((busiest->group_type > group_fully_busy) && + !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) { /* * If busiest is overloaded, try to fill spare * capacity. This might end up creating spare capacity From 2394a3c9f9da9caf5fb29b9741b43d4c13777cf4 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 22 Oct 2020 15:15:50 +0200 Subject: [PATCH 0294/5344] sched/fair: Check for idle core in wake_affine commit d8fcb81f1acf651a0e50eacecca43d0524984f87 upstream. In the case of a thread wakeup, wake_affine determines whether a core will be chosen for the thread on the socket where the thread ran previously or on the socket of the waker. This is done primarily by comparing the load of the core where th thread ran previously (prev) and the load of the waker (this). commit 11f10e5420f6 ("sched/fair: Use load instead of runnable load in wakeup path") changed the load computation from the runnable load to the load average, where the latter includes the load of threads that have already blocked on the core. When a short-running daemon processes happens to run on prev, this change raised the situation that prev could appear to have a greater load than this, even when prev is actually idle. When prev and this are on the same socket, the idle prev is detected later, in select_idle_sibling. But if that does not hold, prev is completely ignored, causing the waking thread to move to the socket of the waker. In the case of N mostly active threads on N cores, this triggers other migrations and hurts performance. In contrast, before commit 11f10e5420f6, the load on an idle core was 0, and in the case of a non-idle waker core, the effect of wake_affine was to select prev as the target for searching for a core for the waking thread. To avoid unnecessary migrations, extend wake_affine_idle to check whether the core where the thread previously ran is currently idle, and if so simply return that core as the target. [1] commit 11f10e5420f6ce ("sched/fair: Use load instead of runnable load in wakeup path") This particularly has an impact when using the ondemand power manager, where kworkers run every 0.004 seconds on all cores, increasing the likelihood that an idle core will be considered to have a load. The following numbers were obtained with the benchmarking tool hyperfine (https://github.com/sharkdp/hyperfine) on the NAS parallel benchmarks (https://www.nas.nasa.gov/publications/npb.html). The tests were run on an 80-core Intel(R) Xeon(R) CPU E7-8870 v4 @ 2.10GHz. Active (intel_pstate) and passive (intel_cpufreq) power management were used. Times are in seconds. All experiments use all 160 hardware threads. v5.9/intel-pstate v5.9+patch/intel-pstate bt.C.c 24.725724+-0.962340 23.349608+-1.607214 lu.C.x 29.105952+-4.804203 25.249052+-5.561617 sp.C.x 31.220696+-1.831335 30.227760+-2.429792 ua.C.x 26.606118+-1.767384 25.778367+-1.263850 v5.9/ondemand v5.9+patch/ondemand bt.C.c 25.330360+-1.028316 23.544036+-1.020189 lu.C.x 35.872659+-4.872090 23.719295+-3.883848 sp.C.x 32.141310+-2.289541 29.125363+-0.872300 ua.C.x 29.024597+-1.667049 25.728888+-1.539772 On the smaller data sets (A and B) and on the other NAS benchmarks there is no impact on performance. This also has a major impact on the splash2x.volrend benchmark of the parsec benchmark suite that goes from 1m25 without this patch to 0m45, in active (intel_pstate) mode. Fixes: 11f10e5420f6 ("sched/fair: Use load instead of runnable load in wakeup path") Signed-off-by: Julia Lawall Signed-off-by: Peter Zijlstra (Intel) Reviewed-by Vincent Guittot Acked-by: Mel Gorman Link: https://lkml.kernel.org/r/1603372550-14680-1-git-send-email-Julia.Lawall@inria.fr Signed-off-by: Chengming Zhou --- kernel/sched/fair.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e501185d22441..46d9f6b7981d3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5520,6 +5520,9 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync) if (sync && cpu_rq(this_cpu)->nr_running == 1) return this_cpu; + if (available_idle_cpu(prev_cpu)) + return prev_cpu; + return nr_cpumask_bits; } From b7addaedfea842b061245f0ea0764d84650d3bdc Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 22 Oct 2019 18:46:38 +0200 Subject: [PATCH 0295/5344] sched/fair: Fix rework of find_idlest_group() commit 3318544b721d3072fdd1f85ee0f1f214c0b211ee upstream. The task, for which the scheduler looks for the idlest group of CPUs, must be discounted from all statistics in order to get a fair comparison between groups. This includes utilization, load, nr_running and idle_cpus. Such unfairness can be easily highlighted with the unixbench execl 1 task. This test continuously call execve() and the scheduler looks for the idlest group/CPU on which it should place the task. Because the task runs on the local group/CPU, the latter seems already busy even if there is nothing else running on it. As a result, the scheduler will always select another group/CPU than the local one. This recovers most of the performance regression on my system from the recent load-balancer rewrite. [ mingo: Minor cleanups. ] Reported-by: kernel test robot Tested-by: kernel test robot Signed-off-by: Vincent Guittot Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: hdanton@sina.com Cc: parth@linux.ibm.com Cc: pauld@redhat.com Cc: quentin.perret@arm.com Cc: riel@surriel.com Cc: srikar@linux.vnet.ibm.com Cc: valentin.schneider@arm.com Fixes: 57abff067a08 ("sched/fair: Rework find_idlest_group()") Link: https://lkml.kernel.org/r/1571762798-25900-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Signed-off-by: Chengming Zhou --- kernel/sched/fair.c | 91 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 84 insertions(+), 7 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 46d9f6b7981d3..ac836c889d182 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5435,6 +5435,37 @@ static unsigned long cpu_load(struct rq *rq) return cfs_rq_load_avg(&rq->cfs); } +/* + * cpu_load_without - compute CPU load without any contributions from *p + * @cpu: the CPU which load is requested + * @p: the task which load should be discounted + * + * The load of a CPU is defined by the load of tasks currently enqueued on that + * CPU as well as tasks which are currently sleeping after an execution on that + * CPU. + * + * This method returns the load of the specified CPU by discounting the load of + * the specified task, whenever the task is currently contributing to the CPU + * load. + */ +static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p) +{ + struct cfs_rq *cfs_rq; + unsigned int load; + + /* Task has no contribution or is new */ + if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) + return cpu_load(rq); + + cfs_rq = &rq->cfs; + load = READ_ONCE(cfs_rq->avg.load_avg); + + /* Discount task's util from CPU's util */ + lsub_positive(&load, task_h_load(p)); + + return load; +} + static unsigned long capacity_of(int cpu) { return cpu_rq(cpu)->cpu_capacity; @@ -8221,11 +8252,56 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) struct sg_lb_stats; +/* + * task_running_on_cpu - return 1 if @p is running on @cpu. + */ + +static unsigned int task_running_on_cpu(int cpu, struct task_struct *p) +{ + /* Task has no contribution or is new */ + if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) + return 0; + + if (task_on_rq_queued(p)) + return 1; + + return 0; +} + +/** + * idle_cpu_without - would a given CPU be idle without p ? + * @cpu: the processor on which idleness is tested. + * @p: task which should be ignored. + * + * Return: 1 if the CPU would be idle. 0 otherwise. + */ +static int idle_cpu_without(int cpu, struct task_struct *p) +{ + struct rq *rq = cpu_rq(cpu); + + if (rq->curr != rq->idle && rq->curr != p) + return 0; + + /* + * rq->nr_running can't be used but an updated version without the + * impact of p on cpu must be used instead. The updated nr_running + * be computed and tested before calling idle_cpu_without(). + */ + +#ifdef CONFIG_SMP + if (!llist_empty(&rq->wake_list)) + return 0; +#endif + + return 1; +} + /* * update_sg_wakeup_stats - Update sched_group's statistics for wakeup. - * @denv: The ched_domain level to look for idlest group. + * @sd: The sched_domain level to look for idlest group. * @group: sched_group whose statistics are to be updated. * @sgs: variable to hold the statistics for this group. + * @p: The task for which we look for the idlest group/CPU. */ static inline void update_sg_wakeup_stats(struct sched_domain *sd, struct sched_group *group, @@ -8238,21 +8314,22 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, for_each_cpu(i, sched_group_span(group)) { struct rq *rq = cpu_rq(i); + unsigned int local; - sgs->group_load += cpu_load(rq); + sgs->group_load += cpu_load_without(rq, p); sgs->group_util += cpu_util_without(i, p); - sgs->sum_h_nr_running += rq->cfs.h_nr_running; + local = task_running_on_cpu(i, p); + sgs->sum_h_nr_running += rq->cfs.h_nr_running - local; - nr_running = rq->nr_running; + nr_running = rq->nr_running - local; sgs->sum_nr_running += nr_running; /* - * No need to call idle_cpu() if nr_running is not 0 + * No need to call idle_cpu_without() if nr_running is not 0 */ - if (!nr_running && idle_cpu(i)) + if (!nr_running && idle_cpu_without(i, p)) sgs->idle_cpus++; - } /* Check if task fits in the group */ From 55171cbe5835e200b4e85faf7da7fe273572e1a6 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 4 Dec 2019 19:21:40 +0100 Subject: [PATCH 0296/5344] sched/fair: Fix find_idlest_group() to handle CPU affinity commit 7ed735c33104f3c6194fbc67e3a8b6e64ae84ad1 upstream. Because of CPU affinity, the local group can be skipped which breaks the assumption that statistics are always collected for local group. With uninitialized local_sgs, the comparison is meaningless and the behavior unpredictable. This can even end up to use local pointer which is to NULL in this case. If the local group has been skipped because of CPU affinity, we return the idlest group. Fixes: 57abff067a08 ("sched/fair: Rework find_idlest_group()") Reported-by: John Stultz Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Tested-by: John Stultz Cc: rostedt@goodmis.org Cc: valentin.schneider@arm.com Cc: mingo@redhat.com Cc: mgorman@suse.de Cc: juri.lelli@redhat.com Cc: dietmar.eggemann@arm.com Cc: bsegall@google.com Cc: qais.yousef@arm.com Link: https://lkml.kernel.org/r/1575483700-22153-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Chengming Zhou --- kernel/sched/fair.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ac836c889d182..df8a3435770a5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8450,6 +8450,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, if (!idlest) return NULL; + /* The local group has been skipped because of CPU affinity */ + if (!local) + return idlest; + /* * If the local group is idler than the selected idlest group * don't try and push the task. From 02a127b39241d4ee226617a591ef01a05a3427d5 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 18 Feb 2020 15:45:34 +0100 Subject: [PATCH 0297/5344] sched/fair: Fix statistics for find_idlest_group() commit 289de35984815576793f579ec27248609e75976e upstream. sgs->group_weight is not set while gathering statistics in update_sg_wakeup_stats(). This means that a group can be classified as fully busy with 0 running tasks if utilization is high enough. This path is mainly used for fork and exec. Fixes: 57abff067a08 ("sched/fair: Rework find_idlest_group()") Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Mel Gorman Link: https://lore.kernel.org/r/20200218144534.4564-1-vincent.guittot@linaro.org Signed-off-by: Chengming Zhou --- kernel/sched/fair.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index df8a3435770a5..b7af4a11cbaa1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8340,6 +8340,8 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, sgs->group_capacity = group->sgc->capacity; + sgs->group_weight = group->group_weight; + sgs->group_type = group_classify(sd->imbalance_pct, group, sgs); /* From 7185bfce4faeff618025bf5aa0825e08ca3dc751 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Thu, 19 Mar 2020 11:39:20 +0800 Subject: [PATCH 0298/5344] sched/fair: Fix condition of avg_load calculation commit 6c8116c914b65be5e4d6f66d69c8142eb0648c22 upstream. In update_sg_wakeup_stats(), the comment says: Computing avg_load makes sense only when group is fully busy or overloaded. But, the code below this comment does not check like this. From reading the code about avg_load in other functions, I confirm that avg_load should be calculated in fully busy or overloaded case. The comment is correct and the checking condition is wrong. So, change that condition. Fixes: 57abff067a08 ("sched/fair: Rework find_idlest_group()") Signed-off-by: Tao Zhou Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Acked-by: Mel Gorman Link: https://lkml.kernel.org/r/Message-ID: Signed-off-by: Chengming Zhou --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b7af4a11cbaa1..6a03c7e46dce3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8348,7 +8348,8 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, * Computing avg_load makes sense only when group is fully busy or * overloaded */ - if (sgs->group_type < group_fully_busy) + if (sgs->group_type == group_fully_busy || + sgs->group_type == group_overloaded) sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / sgs->group_capacity; } From 9dd62452ea98b62f488aea0cb401317e8a63e2ef Mon Sep 17 00:00:00 2001 From: Aubrey Li Date: Thu, 26 Mar 2020 13:42:29 +0800 Subject: [PATCH 0299/5344] sched/fair: Fix negative imbalance in imbalance calculation commit 111688ca1c4a43a7e482f5401f82c46326b8ed49 upstream. A negative imbalance value was observed after imbalance calculation, this happens when the local sched group type is group_fully_busy, and the average load of local group is greater than the selected busiest group. Fix this problem by comparing the average load of the local and busiest group before imbalance calculation formula. Suggested-by: Vincent Guittot Reviewed-by: Phil Auld Reviewed-by: Vincent Guittot Acked-by: Mel Gorman Signed-off-by: Aubrey Li Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/1585201349-70192-1-git-send-email-aubrey.li@intel.com Signed-off-by: Chengming Zhou --- kernel/sched/fair.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6a03c7e46dce3..7bad02f8fa05c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8745,6 +8745,14 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / sds->total_capacity; + /* + * If the local group is more loaded than the selected + * busiest group don't try to pull any tasks. + */ + if (local->avg_load >= busiest->avg_load) { + env->imbalance = 0; + return; + } } /* From 90162e21665dac431e3f6667de6142cdb83ce7d2 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 24 Feb 2020 09:52:15 +0000 Subject: [PATCH 0300/5344] sched/numa: Replace runnable_load_avg by load_avg commit 6499b1b2dd1b8d404a16b9fbbf1af6b9b3c1d83d upstream. Similarly to what has been done for the normal load balancer, we can replace runnable_load_avg by load_avg in numa load balancing and track the other statistics like the utilization and the number of running tasks to get to better view of the current state of a node. Signed-off-by: Vincent Guittot Signed-off-by: Mel Gorman Signed-off-by: Ingo Molnar Reviewed-by: "Dietmar Eggemann " Acked-by: Peter Zijlstra Cc: Juri Lelli Cc: Valentin Schneider Cc: Phil Auld Cc: Hillf Danton Link: https://lore.kernel.org/r/20200224095223.13361-6-mgorman@techsingularity.net Signed-off-by: Chengming Zhou --- kernel/sched/fair.c | 102 ++++++++++++++++++++++++++++++-------------- 1 file changed, 70 insertions(+), 32 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7bad02f8fa05c..a428dd4187d2a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1475,38 +1475,35 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; } -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); - -static unsigned long cpu_runnable_load(struct rq *rq) -{ - return cfs_rq_runnable_load_avg(&rq->cfs); -} +/* + * 'numa_type' describes the node at the moment of load balancing. + */ +enum numa_type { + /* The node has spare capacity that can be used to run more tasks. */ + node_has_spare = 0, + /* + * The node is fully used and the tasks don't compete for more CPU + * cycles. Nevertheless, some tasks might wait before running. + */ + node_fully_busy, + /* + * The node is overloaded and can't provide expected CPU cycles to all + * tasks. + */ + node_overloaded +}; /* Cached statistics for all CPUs within a node */ struct numa_stats { unsigned long load; - + unsigned long util; /* Total compute capacity of CPUs on a node */ unsigned long compute_capacity; + unsigned int nr_running; + unsigned int weight; + enum numa_type node_type; }; -/* - * XXX borrowed from update_sg_lb_stats - */ -static void update_numa_stats(struct numa_stats *ns, int nid) -{ - int cpu; - - memset(ns, 0, sizeof(*ns)); - for_each_cpu(cpu, cpumask_of_node(nid)) { - struct rq *rq = cpu_rq(cpu); - - ns->load += cpu_runnable_load(rq); - ns->compute_capacity += capacity_of(cpu); - } - -} - struct task_numa_env { struct task_struct *p; @@ -1523,6 +1520,47 @@ struct task_numa_env { int best_cpu; }; +static unsigned long cpu_load(struct rq *rq); +static unsigned long cpu_util(int cpu); + +static inline enum +numa_type numa_classify(unsigned int imbalance_pct, + struct numa_stats *ns) +{ + if ((ns->nr_running > ns->weight) && + ((ns->compute_capacity * 100) < (ns->util * imbalance_pct))) + return node_overloaded; + + if ((ns->nr_running < ns->weight) || + ((ns->compute_capacity * 100) > (ns->util * imbalance_pct))) + return node_has_spare; + + return node_fully_busy; +} + +/* + * XXX borrowed from update_sg_lb_stats + */ +static void update_numa_stats(struct task_numa_env *env, + struct numa_stats *ns, int nid) +{ + int cpu; + + memset(ns, 0, sizeof(*ns)); + for_each_cpu(cpu, cpumask_of_node(nid)) { + struct rq *rq = cpu_rq(cpu); + + ns->load += cpu_load(rq); + ns->util += cpu_util(cpu); + ns->nr_running += rq->cfs.h_nr_running; + ns->compute_capacity += capacity_of(cpu); + } + + ns->weight = cpumask_weight(cpumask_of_node(nid)); + + ns->node_type = numa_classify(env->imbalance_pct, ns); +} + static void task_numa_assign(struct task_numa_env *env, struct task_struct *p, long imp) { @@ -1558,6 +1596,11 @@ static bool load_too_imbalanced(long src_load, long dst_load, long orig_src_load, orig_dst_load; long src_capacity, dst_capacity; + + /* If dst node has spare capacity, there is no real load imbalance */ + if (env->dst_stats.node_type == node_has_spare) + return false; + /* * The load is corrected for the CPU capacity available on each node. * @@ -1790,10 +1833,10 @@ static int task_numa_migrate(struct task_struct *p) dist = env.dist = node_distance(env.src_nid, env.dst_nid); taskweight = task_weight(p, env.src_nid, dist); groupweight = group_weight(p, env.src_nid, dist); - update_numa_stats(&env.src_stats, env.src_nid); + update_numa_stats(&env, &env.src_stats, env.src_nid); taskimp = task_weight(p, env.dst_nid, dist) - taskweight; groupimp = group_weight(p, env.dst_nid, dist) - groupweight; - update_numa_stats(&env.dst_stats, env.dst_nid); + update_numa_stats(&env, &env.dst_stats, env.dst_nid); /* Try to find a spot on the preferred nid. */ task_numa_find_cpu(&env, taskimp, groupimp); @@ -1826,7 +1869,7 @@ static int task_numa_migrate(struct task_struct *p) env.dist = dist; env.dst_nid = nid; - update_numa_stats(&env.dst_stats, env.dst_nid); + update_numa_stats(&env, &env.dst_stats, env.dst_nid); task_numa_find_cpu(&env, taskimp, groupimp); } } @@ -3686,11 +3729,6 @@ static void remove_entity_load_avg(struct sched_entity *se) raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags); } -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) -{ - return cfs_rq->avg.runnable_load_avg; -} - static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) { return cfs_rq->avg.load_avg; From ffc46a233c3d6719669da513e5e512a070eb8c41 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 24 Feb 2020 09:52:17 +0000 Subject: [PATCH 0301/5344] sched/pelt: Remove unused runnable load average commit 0dacee1bfa70e171be3a12a30414c228453048d2 upstream. Now that runnable_load_avg is no more used, we can remove it to make space for a new signal. Signed-off-by: Vincent Guittot Signed-off-by: Mel Gorman Signed-off-by: Ingo Molnar Reviewed-by: "Dietmar Eggemann " Acked-by: Peter Zijlstra Cc: Juri Lelli Cc: Valentin Schneider Cc: Phil Auld Cc: Hillf Danton Link: https://lore.kernel.org/r/20200224095223.13361-8-mgorman@techsingularity.net Signed-off-by: Chengming Zhou --- include/linux/sched.h | 5 +- kernel/sched/core.c | 2 - kernel/sched/debug.c | 8 --- kernel/sched/fair.c | 129 ++++++------------------------------------ kernel/sched/pelt.c | 62 ++++++++------------ kernel/sched/sched.h | 7 +-- 6 files changed, 43 insertions(+), 170 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 56b72c64e2834..d2421ef4e5338 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -353,7 +353,7 @@ struct util_est { /* * The load_avg/util_avg accumulates an infinite geometric series - * (see __update_load_avg() in kernel/sched/fair.c). + * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c). * * [load_avg definition] * @@ -397,11 +397,9 @@ struct util_est { struct sched_avg { u64 last_update_time; u64 load_sum; - u64 runnable_load_sum; u32 util_sum; u32 period_contrib; unsigned long load_avg; - unsigned long runnable_load_avg; unsigned long util_avg; struct util_est util_est; } ____cacheline_aligned; @@ -445,7 +443,6 @@ struct sched_statistics { struct sched_entity { /* For load-balancing: */ struct load_weight load; - unsigned long runnable_weight; struct rb_node run_node; struct list_head group_node; unsigned int on_rq; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6e124673c6d38..b3ab371d5d430 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -759,7 +759,6 @@ static void set_load_weight(struct task_struct *p, bool update_load) if (task_has_idle_policy(p)) { load->weight = scale_load(WEIGHT_IDLEPRIO); load->inv_weight = WMULT_IDLEPRIO; - p->se.runnable_weight = load->weight; return; } @@ -772,7 +771,6 @@ static void set_load_weight(struct task_struct *p, bool update_load) } else { load->weight = scale_load(sched_prio_to_weight[prio]); load->inv_weight = sched_prio_to_wmult[prio]; - p->se.runnable_weight = load->weight; } } diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index faada713cfae8..95029b54d9c99 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -400,11 +400,9 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group } P(se->load.weight); - P(se->runnable_weight); #ifdef CONFIG_SMP P(se->avg.load_avg); P(se->avg.util_avg); - P(se->avg.runnable_load_avg); #endif #undef PN_SCHEDSTAT @@ -543,11 +541,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SMP - SEQ_printf(m, " .%-30s: %ld\n", "runnable_weight", cfs_rq->runnable_weight); SEQ_printf(m, " .%-30s: %lu\n", "load_avg", cfs_rq->avg.load_avg); - SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg", - cfs_rq->avg.runnable_load_avg); SEQ_printf(m, " .%-30s: %lu\n", "util_avg", cfs_rq->avg.util_avg); SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued", @@ -956,13 +951,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, "nr_involuntary_switches", (long long)p->nivcsw); P(se.load.weight); - P(se.runnable_weight); #ifdef CONFIG_SMP P(se.avg.load_sum); - P(se.avg.runnable_load_sum); P(se.avg.util_sum); P(se.avg.load_avg); - P(se.avg.runnable_load_avg); P(se.avg.util_avg); P(se.avg.last_update_time); P(se.avg.util_est.ewma); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a428dd4187d2a..9f7e48babb7f7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -743,9 +743,7 @@ void init_entity_runnable_average(struct sched_entity *se) * nothing has been attached to the task group yet. */ if (entity_is_task(se)) - sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight); - - se->runnable_weight = se->load.weight; + sa->load_avg = scale_load_down(se->load.weight); /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } @@ -2879,25 +2877,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) } while (0) #ifdef CONFIG_SMP -static inline void -enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - cfs_rq->runnable_weight += se->runnable_weight; - - cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg; - cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum; -} - -static inline void -dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - cfs_rq->runnable_weight -= se->runnable_weight; - - sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg); - sub_positive(&cfs_rq->avg.runnable_load_sum, - se_runnable(se) * se->avg.runnable_load_sum); -} - static inline void enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -2913,28 +2892,22 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) } #else static inline void -enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } -static inline void -dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } -static inline void enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } #endif static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - unsigned long weight, unsigned long runnable) + unsigned long weight) { if (se->on_rq) { /* commit outstanding execution time */ if (cfs_rq->curr == se) update_curr(cfs_rq); account_entity_dequeue(cfs_rq, se); - dequeue_runnable_load_avg(cfs_rq, se); } dequeue_load_avg(cfs_rq, se); - se->runnable_weight = runnable; update_load_set(&se->load, weight); #ifdef CONFIG_SMP @@ -2942,16 +2915,13 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib; se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); - se->avg.runnable_load_avg = - div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider); } while (0); #endif enqueue_load_avg(cfs_rq, se); - if (se->on_rq) { + if (se->on_rq) account_entity_enqueue(cfs_rq, se); - enqueue_runnable_load_avg(cfs_rq, se); - } + } void reweight_task(struct task_struct *p, int prio) @@ -2961,7 +2931,7 @@ void reweight_task(struct task_struct *p, int prio) struct load_weight *load = &se->load; unsigned long weight = scale_load(sched_prio_to_weight[prio]); - reweight_entity(cfs_rq, se, weight, weight); + reweight_entity(cfs_rq, se, weight); load->inv_weight = sched_prio_to_wmult[prio]; } @@ -3073,50 +3043,6 @@ static long calc_group_shares(struct cfs_rq *cfs_rq) */ return clamp_t(long, shares, MIN_SHARES, tg_shares); } - -/* - * This calculates the effective runnable weight for a group entity based on - * the group entity weight calculated above. - * - * Because of the above approximation (2), our group entity weight is - * an load_avg based ratio (3). This means that it includes blocked load and - * does not represent the runnable weight. - * - * Approximate the group entity's runnable weight per ratio from the group - * runqueue: - * - * grq->avg.runnable_load_avg - * ge->runnable_weight = ge->load.weight * -------------------------- (7) - * grq->avg.load_avg - * - * However, analogous to above, since the avg numbers are slow, this leads to - * transients in the from-idle case. Instead we use: - * - * ge->runnable_weight = ge->load.weight * - * - * max(grq->avg.runnable_load_avg, grq->runnable_weight) - * ----------------------------------------------------- (8) - * max(grq->avg.load_avg, grq->load.weight) - * - * Where these max() serve both to use the 'instant' values to fix the slow - * from-idle and avoid the /0 on to-idle, similar to (6). - */ -static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares) -{ - long runnable, load_avg; - - load_avg = max(cfs_rq->avg.load_avg, - scale_load_down(cfs_rq->load.weight)); - - runnable = max(cfs_rq->avg.runnable_load_avg, - scale_load_down(cfs_rq->runnable_weight)); - - runnable *= shares; - if (load_avg) - runnable /= load_avg; - - return clamp_t(long, runnable, MIN_SHARES, shares); -} #endif /* CONFIG_SMP */ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); @@ -3128,7 +3054,7 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); static void update_cfs_group(struct sched_entity *se) { struct cfs_rq *gcfs_rq = group_cfs_rq(se); - long shares, runnable; + long shares; if (!gcfs_rq) return; @@ -3137,16 +3063,15 @@ static void update_cfs_group(struct sched_entity *se) return; #ifndef CONFIG_SMP - runnable = shares = READ_ONCE(gcfs_rq->tg->shares); + shares = READ_ONCE(gcfs_rq->tg->shares); if (likely(se->load.weight == shares)) return; #else shares = calc_group_shares(gcfs_rq); - runnable = calc_group_runnable(gcfs_rq, shares); #endif - reweight_entity(cfs_rq_of(se), se, shares, runnable); + reweight_entity(cfs_rq_of(se), se, shares); } #else /* CONFIG_FAIR_GROUP_SCHED */ @@ -3271,11 +3196,11 @@ void set_task_rq_fair(struct sched_entity *se, * _IFF_ we look at the pure running and runnable sums. Because they * represent the very same entity, just at different points in the hierarchy. * - * Per the above update_tg_cfs_util() is trivial and simply copies the running - * sum over (but still wrong, because the group entity and group rq do not have - * their PELT windows aligned). + * Per the above update_tg_cfs_util() is trivial * and simply copies the + * running sum over (but still wrong, because the group entity and group rq do + * not have their PELT windows aligned). * - * However, update_tg_cfs_runnable() is more complex. So we have: + * However, update_tg_cfs_load() is more complex. So we have: * * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2) * @@ -3356,11 +3281,11 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq } static inline void -update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) +update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; - unsigned long runnable_load_avg, load_avg; - u64 runnable_load_sum, load_sum = 0; + unsigned long load_avg; + u64 load_sum = 0; s64 delta_sum; if (!runnable_sum) @@ -3408,19 +3333,6 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf se->avg.load_avg = load_avg; add_positive(&cfs_rq->avg.load_avg, delta_avg); add_positive(&cfs_rq->avg.load_sum, delta_sum); - - runnable_load_sum = (s64)se_runnable(se) * runnable_sum; - runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX); - delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum; - delta_avg = runnable_load_avg - se->avg.runnable_load_avg; - - se->avg.runnable_load_sum = runnable_sum; - se->avg.runnable_load_avg = runnable_load_avg; - - if (se->on_rq) { - add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg); - add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum); - } } static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) @@ -3448,7 +3360,7 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum); update_tg_cfs_util(cfs_rq, se, gcfs_rq); - update_tg_cfs_runnable(cfs_rq, se, gcfs_rq); + update_tg_cfs_load(cfs_rq, se, gcfs_rq); trace_pelt_cfs_tp(cfs_rq); trace_pelt_se_tp(se); @@ -3593,8 +3505,6 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se)); } - se->avg.runnable_load_sum = se->avg.load_sum; - enqueue_load_avg(cfs_rq, se); cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; @@ -4040,14 +3950,12 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* * When enqueuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. - * - Add its load to cfs_rq->runnable_avg * - For group_entity, update its weight to reflect the new share of * its group cfs_rq * - Add its new weight to cfs_rq->load.weight */ update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); update_cfs_group(se); - enqueue_runnable_load_avg(cfs_rq, se); account_entity_enqueue(cfs_rq, se); if (flags & ENQUEUE_WAKEUP) @@ -4130,13 +4038,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* * When dequeuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. - * - Subtract its load from the cfs_rq->runnable_avg. * - Subtract its previous weight from cfs_rq->load.weight. * - For group entity, update its weight to reflect the new share * of its group cfs_rq. */ update_load_avg(cfs_rq, se, UPDATE_TG); - dequeue_runnable_load_avg(cfs_rq, se); update_stats_dequeue(cfs_rq, se, flags); @@ -7587,9 +7493,6 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) if (cfs_rq->avg.util_sum) return false; - if (cfs_rq->avg.runnable_load_sum) - return false; - return true; } diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 51beee02f74f6..e781da2065798 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -106,7 +106,7 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) */ static __always_inline u32 accumulate_sum(u64 delta, struct sched_avg *sa, - unsigned long load, unsigned long runnable, int running) + unsigned long load, int running) { u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ u64 periods; @@ -119,8 +119,6 @@ accumulate_sum(u64 delta, struct sched_avg *sa, */ if (periods) { sa->load_sum = decay_load(sa->load_sum, periods); - sa->runnable_load_sum = - decay_load(sa->runnable_load_sum, periods); sa->util_sum = decay_load((u64)(sa->util_sum), periods); /* @@ -134,8 +132,6 @@ accumulate_sum(u64 delta, struct sched_avg *sa, if (load) sa->load_sum += load * contrib; - if (runnable) - sa->runnable_load_sum += runnable * contrib; if (running) sa->util_sum += contrib << SCHED_CAPACITY_SHIFT; @@ -172,7 +168,7 @@ accumulate_sum(u64 delta, struct sched_avg *sa, */ static __always_inline int ___update_load_sum(u64 now, struct sched_avg *sa, - unsigned long load, unsigned long runnable, int running) + unsigned long load, int running) { u64 delta; @@ -206,7 +202,7 @@ ___update_load_sum(u64 now, struct sched_avg *sa, * update_blocked_averages() */ if (!load) - runnable = running = 0; + running = 0; /* * Now we know we crossed measurement unit boundaries. The *_avg @@ -215,14 +211,14 @@ ___update_load_sum(u64 now, struct sched_avg *sa, * Step 1: accumulate *_sum since last_update_time. If we haven't * crossed period boundaries, finish. */ - if (!accumulate_sum(delta, sa, load, runnable, running)) + if (!accumulate_sum(delta, sa, load, running)) return 0; return 1; } static __always_inline void -___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable) +___update_load_avg(struct sched_avg *sa, unsigned long load) { u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; @@ -230,7 +226,6 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna * Step 2: update *_avg. */ sa->load_avg = div_u64(load * sa->load_sum, divider); - sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider); WRITE_ONCE(sa->util_avg, sa->util_sum / divider); } @@ -238,17 +233,13 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna * sched_entity: * * task: - * se_runnable() == se_weight() + * se_weight() = se->load.weight * * group: [ see update_cfs_group() ] * se_weight() = tg->weight * grq->load_avg / tg->load_avg - * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg * - * load_sum := runnable_sum - * load_avg = se_weight(se) * runnable_avg - * - * runnable_load_sum := runnable_sum - * runnable_load_avg = se_runnable(se) * runnable_avg + * load_sum := runnable + * load_avg = se_weight(se) * load_sum * * XXX collapse load_sum and runnable_load_sum * @@ -256,15 +247,12 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna * * load_sum = \Sum se_weight(se) * se->avg.load_sum * load_avg = \Sum se->avg.load_avg - * - * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum - * runnable_load_avg = \Sum se->avg.runable_load_avg */ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se) { - if (___update_load_sum(now, &se->avg, 0, 0, 0)) { - ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); + if (___update_load_sum(now, &se->avg, 0, 0)) { + ___update_load_avg(&se->avg, se_weight(se)); trace_pelt_se_tp(se); return 1; } @@ -274,10 +262,9 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se) int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (___update_load_sum(now, &se->avg, !!se->on_rq, !!se->on_rq, - cfs_rq->curr == se)) { + if (___update_load_sum(now, &se->avg, !!se->on_rq, cfs_rq->curr == se)) { - ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); + ___update_load_avg(&se->avg, se_weight(se)); cfs_se_util_change(&se->avg); trace_pelt_se_tp(se); return 1; @@ -290,10 +277,9 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq) { if (___update_load_sum(now, &cfs_rq->avg, scale_load_down(cfs_rq->load.weight), - scale_load_down(cfs_rq->runnable_weight), cfs_rq->curr != NULL)) { - ___update_load_avg(&cfs_rq->avg, 1, 1); + ___update_load_avg(&cfs_rq->avg, 1); trace_pelt_cfs_tp(cfs_rq); return 1; } @@ -306,20 +292,19 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq) * * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked * util_sum = cpu_scale * load_sum - * runnable_load_sum = load_sum + * runnable_sum = util_sum * - * load_avg and runnable_load_avg are not supported and meaningless. + * load_avg is not supported and meaningless. * */ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) { if (___update_load_sum(now, &rq->avg_rt, - running, running, running)) { - ___update_load_avg(&rq->avg_rt, 1, 1); + ___update_load_avg(&rq->avg_rt, 1); trace_pelt_rt_tp(rq); return 1; } @@ -332,18 +317,19 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) * * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked * util_sum = cpu_scale * load_sum - * runnable_load_sum = load_sum + * runnable_sum = util_sum + * + * load_avg is not supported and meaningless. * */ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) { if (___update_load_sum(now, &rq->avg_dl, - running, running, running)) { - ___update_load_avg(&rq->avg_dl, 1, 1); + ___update_load_avg(&rq->avg_dl, 1); trace_pelt_dl_tp(rq); return 1; } @@ -357,7 +343,9 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) * * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked * util_sum = cpu_scale * load_sum - * runnable_load_sum = load_sum + * runnable_sum = util_sum + * + * load_avg is not supported and meaningless. * */ @@ -385,16 +373,14 @@ int update_irq_load_avg(struct rq *rq, u64 running) * rq->clock += delta with delta >= running */ ret = ___update_load_sum(rq->clock - running, &rq->avg_irq, - 0, 0, 0); ret += ___update_load_sum(rq->clock, &rq->avg_irq, - 1, 1, 1); if (ret) { - ___update_load_avg(&rq->avg_irq, 1, 1); + ___update_load_avg(&rq->avg_irq, 1); trace_pelt_irq_tp(rq); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 42ed210b9552f..c7e73088efd33 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -493,7 +493,6 @@ struct cfs_bandwidth { }; /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; - unsigned long runnable_weight; unsigned int nr_running; unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */ unsigned int idle_h_nr_running; /* SCHED_IDLE */ @@ -692,8 +691,10 @@ struct dl_rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* An entity is a task if it doesn't "own" a runqueue */ #define entity_is_task(se) (!se->my_q) + #else #define entity_is_task(se) 1 + #endif #ifdef CONFIG_SMP @@ -705,10 +706,6 @@ static inline long se_weight(struct sched_entity *se) return scale_load_down(se->load.weight); } -static inline long se_runnable(struct sched_entity *se) -{ - return scale_load_down(se->runnable_weight); -} static inline bool sched_asym_prefer(int a, int b) { From a68ad151f74763ab6ac02e0bd567d4eca4ff2cd0 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 24 Feb 2020 09:52:18 +0000 Subject: [PATCH 0302/5344] sched/pelt: Add a new runnable average signal commit 9f68395333ad7f5bfe2f83473fed363d4229f11c upstream. Now that runnable_load_avg has been removed, we can replace it by a new signal that will highlight the runnable pressure on a cfs_rq. This signal track the waiting time of tasks on rq and can help to better define the state of rqs. At now, only util_avg is used to define the state of a rq: A rq with more that around 80% of utilization and more than 1 tasks is considered as overloaded. But the util_avg signal of a rq can become temporaly low after that a task migrated onto another rq which can bias the classification of the rq. When tasks compete for the same rq, their runnable average signal will be higher than util_avg as it will include the waiting time and we can use this signal to better classify cfs_rqs. The new runnable_avg will track the runnable time of a task which simply adds the waiting time to the running time. The runnable _avg of cfs_rq will be the /Sum of se's runnable_avg and the runnable_avg of group entity will follow the one of the rq similarly to util_avg. Signed-off-by: Vincent Guittot Signed-off-by: Mel Gorman Signed-off-by: Ingo Molnar Reviewed-by: "Dietmar Eggemann " Acked-by: Peter Zijlstra Cc: Juri Lelli Cc: Valentin Schneider Cc: Phil Auld Cc: Hillf Danton Link: https://lore.kernel.org/r/20200224095223.13361-9-mgorman@techsingularity.net Signed-off-by: Chengming Zhou --- include/linux/sched.h | 26 +++++++++------ kernel/sched/debug.c | 9 +++-- kernel/sched/fair.c | 77 +++++++++++++++++++++++++++++++++++++++---- kernel/sched/pelt.c | 39 +++++++++++++++------- kernel/sched/sched.h | 22 ++++++++++++- 5 files changed, 142 insertions(+), 31 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index d2421ef4e5338..da4f5f2baced5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -352,28 +352,30 @@ struct util_est { } __attribute__((__aligned__(sizeof(u64)))); /* - * The load_avg/util_avg accumulates an infinite geometric series + * The load/runnable/util_avg accumulates an infinite geometric series * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c). * * [load_avg definition] * * load_avg = runnable% * scale_load_down(load) * - * where runnable% is the time ratio that a sched_entity is runnable. - * For cfs_rq, it is the aggregated load_avg of all runnable and - * blocked sched_entities. + * [runnable_avg definition] + * + * runnable_avg = runnable% * SCHED_CAPACITY_SCALE * * [util_avg definition] * * util_avg = running% * SCHED_CAPACITY_SCALE * - * where running% is the time ratio that a sched_entity is running on - * a CPU. For cfs_rq, it is the aggregated util_avg of all runnable - * and blocked sched_entities. + * where runnable% is the time ratio that a sched_entity is runnable and + * running% the time ratio that a sched_entity is running. + * + * For cfs_rq, they are the aggregated values of all runnable and blocked + * sched_entities. * - * load_avg and util_avg don't direcly factor frequency scaling and CPU - * capacity scaling. The scaling is done through the rq_clock_pelt that - * is used for computing those signals (see update_rq_clock_pelt()) + * The load/runnable/util_avg doesn't direcly factor frequency scaling and CPU + * capacity scaling. The scaling is done through the rq_clock_pelt that is used + * for computing those signals (see update_rq_clock_pelt()) * * N.B., the above ratios (runnable% and running%) themselves are in the * range of [0, 1]. To do fixed point arithmetics, we therefore scale them @@ -397,9 +399,11 @@ struct util_est { struct sched_avg { u64 last_update_time; u64 load_sum; + u64 runnable_sum; u32 util_sum; u32 period_contrib; unsigned long load_avg; + unsigned long runnable_avg; unsigned long util_avg; struct util_est util_est; } ____cacheline_aligned; @@ -463,6 +467,8 @@ struct sched_entity { struct cfs_rq *cfs_rq; /* rq "owned" by this entity/group: */ struct cfs_rq *my_q; + /* cached value of my_q->h_nr_running */ + unsigned long runnable_weight; #endif #ifdef CONFIG_SMP diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 95029b54d9c99..8b5ef338f8c66 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -403,6 +403,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group #ifdef CONFIG_SMP P(se->avg.load_avg); P(se->avg.util_avg); + P(se->avg.runnable_avg); #endif #undef PN_SCHEDSTAT @@ -543,6 +544,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #ifdef CONFIG_SMP SEQ_printf(m, " .%-30s: %lu\n", "load_avg", cfs_rq->avg.load_avg); + SEQ_printf(m, " .%-30s: %lu\n", "runnable_avg", + cfs_rq->avg.runnable_avg); SEQ_printf(m, " .%-30s: %lu\n", "util_avg", cfs_rq->avg.util_avg); SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued", @@ -551,8 +554,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->removed.load_avg); SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg", cfs_rq->removed.util_avg); - SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_sum", - cfs_rq->removed.runnable_sum); + SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_avg", + cfs_rq->removed.runnable_avg); #ifdef CONFIG_FAIR_GROUP_SCHED SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib", cfs_rq->tg_load_avg_contrib); @@ -953,8 +956,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P(se.load.weight); #ifdef CONFIG_SMP P(se.avg.load_sum); + P(se.avg.runnable_sum); P(se.avg.util_sum); P(se.avg.load_avg); + P(se.avg.runnable_avg); P(se.avg.util_avg); P(se.avg.last_update_time); P(se.avg.util_est.ewma); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9f7e48babb7f7..57c1f09615dd8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -796,6 +796,8 @@ void post_init_entity_util_avg(struct task_struct *p) } } + sa->runnable_avg = cpu_scale; + if (p->sched_class != &fair_sched_class) { /* * For !fair tasks do: @@ -3196,9 +3198,9 @@ void set_task_rq_fair(struct sched_entity *se, * _IFF_ we look at the pure running and runnable sums. Because they * represent the very same entity, just at different points in the hierarchy. * - * Per the above update_tg_cfs_util() is trivial * and simply copies the - * running sum over (but still wrong, because the group entity and group rq do - * not have their PELT windows aligned). + * Per the above update_tg_cfs_util() and update_tg_cfs_runnable() are trivial + * and simply copies the running/runnable sum over (but still wrong, because + * the group entity and group rq do not have their PELT windows aligned). * * However, update_tg_cfs_load() is more complex. So we have: * @@ -3280,6 +3282,32 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX; } +static inline void +update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) +{ + long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; + + /* Nothing to update */ + if (!delta) + return; + + /* + * The relation between sum and avg is: + * + * LOAD_AVG_MAX - 1024 + sa->period_contrib + * + * however, the PELT windows are not aligned between grq and gse. + */ + + /* Set new sched_entity's runnable */ + se->avg.runnable_avg = gcfs_rq->avg.runnable_avg; + se->avg.runnable_sum = se->avg.runnable_avg * LOAD_AVG_MAX; + + /* Update parent cfs_rq runnable */ + add_positive(&cfs_rq->avg.runnable_avg, delta); + cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * LOAD_AVG_MAX; +} + static inline void update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { @@ -3360,6 +3388,7 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum); update_tg_cfs_util(cfs_rq, se, gcfs_rq); + update_tg_cfs_runnable(cfs_rq, se, gcfs_rq); update_tg_cfs_load(cfs_rq, se, gcfs_rq); trace_pelt_cfs_tp(cfs_rq); @@ -3430,7 +3459,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { - unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0; + unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0; struct sched_avg *sa = &cfs_rq->avg; int decayed = 0; @@ -3441,7 +3470,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) raw_spin_lock(&cfs_rq->removed.lock); swap(cfs_rq->removed.util_avg, removed_util); swap(cfs_rq->removed.load_avg, removed_load); - swap(cfs_rq->removed.runnable_sum, removed_runnable_sum); + swap(cfs_rq->removed.runnable_avg, removed_runnable); cfs_rq->removed.nr = 0; raw_spin_unlock(&cfs_rq->removed.lock); @@ -3453,7 +3482,16 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) sub_positive(&sa->util_avg, r); sub_positive(&sa->util_sum, r * divider); - add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum); + r = removed_runnable; + sub_positive(&sa->runnable_avg, r); + sub_positive(&sa->runnable_sum, r * divider); + + /* + * removed_runnable is the unweighted version of removed_load so we + * can use it to estimate removed_load_sum. + */ + add_tg_cfs_propagate(cfs_rq, + -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT); decayed = 1; } @@ -3499,6 +3537,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s */ se->avg.util_sum = se->avg.util_avg * divider; + se->avg.runnable_sum = se->avg.runnable_avg * divider; + se->avg.load_sum = divider; if (se_weight(se)) { se->avg.load_sum = @@ -3508,6 +3548,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s enqueue_load_avg(cfs_rq, se); cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; + cfs_rq->avg.runnable_avg += se->avg.runnable_avg; + cfs_rq->avg.runnable_sum += se->avg.runnable_sum; add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); @@ -3529,6 +3571,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s dequeue_load_avg(cfs_rq, se); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); + sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg); + sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum); add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); @@ -3635,10 +3679,15 @@ static void remove_entity_load_avg(struct sched_entity *se) ++cfs_rq->removed.nr; cfs_rq->removed.util_avg += se->avg.util_avg; cfs_rq->removed.load_avg += se->avg.load_avg; - cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */ + cfs_rq->removed.runnable_avg += se->avg.runnable_avg; raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags); } +static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq) +{ + return cfs_rq->avg.runnable_avg; +} + static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) { return cfs_rq->avg.load_avg; @@ -3950,11 +3999,13 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* * When enqueuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. + * - Add its load to cfs_rq->runnable_avg * - For group_entity, update its weight to reflect the new share of * its group cfs_rq * - Add its new weight to cfs_rq->load.weight */ update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); + se_update_runnable(se); update_cfs_group(se); account_entity_enqueue(cfs_rq, se); @@ -4038,11 +4089,13 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* * When dequeuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. + * - Subtract its load from the cfs_rq->runnable_avg. * - Subtract its previous weight from cfs_rq->load.weight. * - For group entity, update its weight to reflect the new share * of its group cfs_rq. */ update_load_avg(cfs_rq, se, UPDATE_TG); + se_update_runnable(se); update_stats_dequeue(cfs_rq, se, flags); @@ -5225,6 +5278,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq = cfs_rq_of(se); update_load_avg(cfs_rq, se, UPDATE_TG); + se_update_runnable(se); update_cfs_group(se); cfs_rq->h_nr_running++; @@ -5328,6 +5382,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq = cfs_rq_of(se); update_load_avg(cfs_rq, se, UPDATE_TG); + se_update_runnable(se); update_cfs_group(se); cfs_rq->h_nr_running--; @@ -5410,6 +5465,11 @@ static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p) return load; } +static unsigned long cpu_runnable(struct rq *rq) +{ + return cfs_rq_runnable_avg(&rq->cfs); +} + static unsigned long capacity_of(int cpu) { return cpu_rq(cpu)->cpu_capacity; @@ -7493,6 +7553,9 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) if (cfs_rq->avg.util_sum) return false; + if (cfs_rq->avg.runnable_sum) + return false; + return true; } diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index e781da2065798..7b8545ae60875 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -106,7 +106,7 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) */ static __always_inline u32 accumulate_sum(u64 delta, struct sched_avg *sa, - unsigned long load, int running) + unsigned long load, unsigned long runnable, int running) { u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ u64 periods; @@ -119,6 +119,8 @@ accumulate_sum(u64 delta, struct sched_avg *sa, */ if (periods) { sa->load_sum = decay_load(sa->load_sum, periods); + sa->runnable_sum = + decay_load(sa->runnable_sum, periods); sa->util_sum = decay_load((u64)(sa->util_sum), periods); /* @@ -132,6 +134,8 @@ accumulate_sum(u64 delta, struct sched_avg *sa, if (load) sa->load_sum += load * contrib; + if (runnable) + sa->runnable_sum += runnable * contrib << SCHED_CAPACITY_SHIFT; if (running) sa->util_sum += contrib << SCHED_CAPACITY_SHIFT; @@ -168,7 +172,7 @@ accumulate_sum(u64 delta, struct sched_avg *sa, */ static __always_inline int ___update_load_sum(u64 now, struct sched_avg *sa, - unsigned long load, int running) + unsigned long load, unsigned long runnable, int running) { u64 delta; @@ -202,7 +206,7 @@ ___update_load_sum(u64 now, struct sched_avg *sa, * update_blocked_averages() */ if (!load) - running = 0; + runnable = running = 0; /* * Now we know we crossed measurement unit boundaries. The *_avg @@ -211,7 +215,7 @@ ___update_load_sum(u64 now, struct sched_avg *sa, * Step 1: accumulate *_sum since last_update_time. If we haven't * crossed period boundaries, finish. */ - if (!accumulate_sum(delta, sa, load, running)) + if (!accumulate_sum(delta, sa, load, runnable, running)) return 0; return 1; @@ -226,6 +230,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load) * Step 2: update *_avg. */ sa->load_avg = div_u64(load * sa->load_sum, divider); + sa->runnable_avg = div_u64(sa->runnable_sum, divider); WRITE_ONCE(sa->util_avg, sa->util_sum / divider); } @@ -234,24 +239,30 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load) * * task: * se_weight() = se->load.weight + * se_runnable() = !!on_rq * * group: [ see update_cfs_group() ] * se_weight() = tg->weight * grq->load_avg / tg->load_avg + * se_runnable() = grq->h_nr_running + * + * runnable_sum = se_runnable() * runnable = grq->runnable_sum + * runnable_avg = runnable_sum * * load_sum := runnable * load_avg = se_weight(se) * load_sum * - * XXX collapse load_sum and runnable_load_sum - * * cfq_rq: * + * runnable_sum = \Sum se->avg.runnable_sum + * runnable_avg = \Sum se->avg.runnable_avg + * * load_sum = \Sum se_weight(se) * se->avg.load_sum * load_avg = \Sum se->avg.load_avg */ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se) { - if (___update_load_sum(now, &se->avg, 0, 0)) { + if (___update_load_sum(now, &se->avg, 0, 0, 0)) { ___update_load_avg(&se->avg, se_weight(se)); trace_pelt_se_tp(se); return 1; @@ -262,7 +273,8 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se) int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (___update_load_sum(now, &se->avg, !!se->on_rq, cfs_rq->curr == se)) { + if (___update_load_sum(now, &se->avg, !!se->on_rq, se_runnable(se), + cfs_rq->curr == se)) { ___update_load_avg(&se->avg, se_weight(se)); cfs_se_util_change(&se->avg); @@ -277,6 +289,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq) { if (___update_load_sum(now, &cfs_rq->avg, scale_load_down(cfs_rq->load.weight), + cfs_rq->h_nr_running, cfs_rq->curr != NULL)) { ___update_load_avg(&cfs_rq->avg, 1); @@ -294,13 +307,14 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq) * util_sum = cpu_scale * load_sum * runnable_sum = util_sum * - * load_avg is not supported and meaningless. + * load_avg and runnable_avg are not supported and meaningless. * */ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) { if (___update_load_sum(now, &rq->avg_rt, + running, running, running)) { @@ -319,13 +333,14 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) * util_sum = cpu_scale * load_sum * runnable_sum = util_sum * - * load_avg is not supported and meaningless. + * load_avg and runnable_avg are not supported and meaningless. * */ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) { if (___update_load_sum(now, &rq->avg_dl, + running, running, running)) { @@ -345,7 +360,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) * util_sum = cpu_scale * load_sum * runnable_sum = util_sum * - * load_avg is not supported and meaningless. + * load_avg and runnable_avg are not supported and meaningless. * */ @@ -373,9 +388,11 @@ int update_irq_load_avg(struct rq *rq, u64 running) * rq->clock += delta with delta >= running */ ret = ___update_load_sum(rq->clock - running, &rq->avg_irq, + 0, 0, 0); ret += ___update_load_sum(rq->clock, &rq->avg_irq, + 1, 1, 1); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c7e73088efd33..6040c3d2bffb2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -531,7 +531,7 @@ struct cfs_rq { int nr; unsigned long load_avg; unsigned long util_avg; - unsigned long runnable_sum; + unsigned long runnable_avg; } removed; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -692,9 +692,29 @@ struct dl_rq { /* An entity is a task if it doesn't "own" a runqueue */ #define entity_is_task(se) (!se->my_q) +static inline void se_update_runnable(struct sched_entity *se) +{ + if (!entity_is_task(se)) + se->runnable_weight = se->my_q->h_nr_running; +} + +static inline long se_runnable(struct sched_entity *se) +{ + if (entity_is_task(se)) + return !!se->on_rq; + else + return se->runnable_weight; +} + #else #define entity_is_task(se) 1 +static inline void se_update_runnable(struct sched_entity *se) {} + +static inline long se_runnable(struct sched_entity *se) +{ + return !!se->on_rq; +} #endif #ifdef CONFIG_SMP From b23d06c29fa9430ed40e329925e2947babfc70ee Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 24 Feb 2020 09:52:19 +0000 Subject: [PATCH 0303/5344] sched/fair: Take into account runnable_avg to classify group commit 070f5e860ee2bf588c99ef7b4c202451faa48236 upstream. Take into account the new runnable_avg signal to classify a group and to mitigate the volatility of util_avg in face of intensive migration or new task with random utilization. Signed-off-by: Vincent Guittot Signed-off-by: Mel Gorman Signed-off-by: Ingo Molnar Reviewed-by: "Dietmar Eggemann " Acked-by: Peter Zijlstra Cc: Juri Lelli Cc: Steven Rostedt Cc: Valentin Schneider Cc: Phil Auld Cc: Hillf Danton Link: https://lore.kernel.org/r/20200224095223.13361-10-mgorman@techsingularity.net Signed-off-by: Chengming Zhou --- kernel/sched/fair.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 57c1f09615dd8..d1201434836a8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5470,6 +5470,24 @@ static unsigned long cpu_runnable(struct rq *rq) return cfs_rq_runnable_avg(&rq->cfs); } +static unsigned long cpu_runnable_without(struct rq *rq, struct task_struct *p) +{ + struct cfs_rq *cfs_rq; + unsigned int runnable; + + /* Task has no contribution or is new */ + if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) + return cpu_runnable(rq); + + cfs_rq = &rq->cfs; + runnable = READ_ONCE(cfs_rq->avg.runnable_avg); + + /* Discount task's runnable from CPU's runnable */ + lsub_positive(&runnable, p->se.avg.runnable_avg); + + return runnable; +} + static unsigned long capacity_of(int cpu) { return cpu_rq(cpu)->cpu_capacity; @@ -7691,7 +7709,8 @@ struct sg_lb_stats { unsigned long avg_load; /*Avg load across the CPUs of the group */ unsigned long group_load; /* Total load over the CPUs of the group */ unsigned long group_capacity; - unsigned long group_util; /* Total utilization of the group */ + unsigned long group_util; /* Total utilization over the CPUs of the group */ + unsigned long group_runnable; /* Total runnable time over the CPUs of the group */ unsigned int sum_nr_running; /* Nr of tasks running in the group */ unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */ unsigned int idle_cpus; @@ -7932,6 +7951,10 @@ group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs) if (sgs->sum_nr_running < sgs->group_weight) return true; + if ((sgs->group_capacity * imbalance_pct) < + (sgs->group_runnable * 100)) + return false; + if ((sgs->group_capacity * 100) > (sgs->group_util * imbalance_pct)) return true; @@ -7957,6 +7980,10 @@ group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs) (sgs->group_util * imbalance_pct)) return true; + if ((sgs->group_capacity * imbalance_pct) < + (sgs->group_runnable * 100)) + return true; + return false; } @@ -8051,6 +8078,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_load += cpu_load(rq); sgs->group_util += cpu_util(i); + sgs->group_runnable += cpu_runnable(rq); sgs->sum_h_nr_running += rq->cfs.h_nr_running; nr_running = rq->nr_running; @@ -8322,6 +8350,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, sgs->group_load += cpu_load_without(rq, p); sgs->group_util += cpu_util_without(i, p); + sgs->group_runnable += cpu_runnable_without(rq, p); local = task_running_on_cpu(i, p); sgs->sum_h_nr_running += rq->cfs.h_nr_running - local; From 158d23079e908fc4ac36198721b8082eb386c7e9 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 27 Feb 2020 16:41:15 +0100 Subject: [PATCH 0304/5344] sched/fair: Fix runnable_avg for throttled cfs commit 6212437f0f6043e825e021e4afc5cd63e248a2b4 upstream. When a cfs_rq is throttled, its group entity is dequeued and its running tasks are removed. We must update runnable_avg with the old h_nr_running and update group_se->runnable_weight with the new h_nr_running at each level of the hierarchy. Reviewed-by: Ben Segall Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Fixes: 9f68395333ad ("sched/pelt: Add a new runnable average signal") Link: https://lkml.kernel.org/r/20200227154115.8332-1-vincent.guittot@linaro.org Signed-off-by: Chengming Zhou --- kernel/sched/fair.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d1201434836a8..1c6ee07ba3066 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4566,8 +4566,13 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) if (!se->on_rq) break; - if (dequeue) + if (dequeue) { dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); + } else { + update_load_avg(qcfs_rq, se, 0); + se_update_runnable(se); + } + qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; @@ -4630,6 +4635,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); + update_load_avg(cfs_rq, se, UPDATE_TG); + se_update_runnable(se); + cfs_rq->h_nr_running += task_delta; cfs_rq->idle_h_nr_running += idle_task_delta; From 0f819d7f23fc3de39683517e89f8a2f67b908de8 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 24 Jun 2020 17:44:22 +0200 Subject: [PATCH 0305/5344] sched/cfs: change initial value of runnable_avg commit e21cf43406a190adfcc4bfe592768066fb3aaa9b upstream. Some performance regression on reaim benchmark have been raised with commit 070f5e860ee2 ("sched/fair: Take into account runnable_avg to classify group") The problem comes from the init value of runnable_avg which is initialized with max value. This can be a problem if the newly forked task is finally a short task because the group of CPUs is wrongly set to overloaded and tasks are pulled less agressively. Set initial value of runnable_avg equals to util_avg to reflect that there is no waiting time so far. Fixes: 070f5e860ee2 ("sched/fair: Take into account runnable_avg to classify group") Reported-by: kernel test robot Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200624154422.29166-1-vincent.guittot@linaro.org Signed-off-by: Chengming Zhou --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1c6ee07ba3066..f2da637934aec 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -796,7 +796,7 @@ void post_init_entity_util_avg(struct task_struct *p) } } - sa->runnable_avg = cpu_scale; + sa->runnable_avg = sa->util_avg; if (p->sched_class != &fair_sched_class) { /* From 3fbe00efc037b6fa545d5979f18d8414ef3cca56 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 14 Nov 2019 16:19:27 +0530 Subject: [PATCH 0306/5344] sched/fair: Make sched-idle CPU selection consistent throughout commit 17346452b25b98acfb395d2a82ec2e4ad0cb7a01 upstream. There are instances where we keep searching for an idle CPU despite already having a sched-idle CPU (in find_idlest_group_cpu(), select_idle_smt() and select_idle_cpu() and then there are places where we don't necessarily do that and return a sched-idle CPU as soon as we find one (in select_idle_sibling()). This looks a bit inconsistent and it may be worth having the same policy everywhere. On the other hand, choosing a sched-idle CPU over a idle one shall be beneficial from performance and power point of view as well, as we don't need to get the CPU online from a deep idle state which wastes quite a lot of time and energy and delays the scheduling of the newly woken up task. This patch tries to simplify code around sched-idle CPU selection and make it consistent throughout. Testing is done with the help of rt-app on hikey board (ARM64 octa-core, 2 clusters, 0-3 and 4-7). The cpufreq governor was set to performance to avoid any side affects from CPU frequency. Following are the tests performed: Test 1: 1-cfs-task: A single SCHED_NORMAL task is pinned to CPU5 which runs for 2333 us out of 7777 us (so gives time for the cluster to go in deep idle state). Test 2: 1-cfs-1-idle-task: A single SCHED_NORMAL task is pinned on CPU5 and single SCHED_IDLE task is pinned on CPU6 (to make sure cluster 1 doesn't go in deep idle state). Test 3: 1-cfs-8-idle-task: A single SCHED_NORMAL task is pinned on CPU5 and eight SCHED_IDLE tasks are created which run forever (not pinned anywhere, so they run on all CPUs). Checked with kernelshark that as soon as NORMAL task sleeps, the SCHED_IDLE task starts running on CPU5. And here are the results on mean latency (in us), using the "st" tool. $ st 1-cfs-task/rt-app-cfs_thread-0.log N min max sum mean stddev 642 90 592 197180 307.134 109.906 $ st 1-cfs-1-idle-task/rt-app-cfs_thread-0.log N min max sum mean stddev 642 67 311 113850 177.336 41.4251 $ st 1-cfs-8-idle-task/rt-app-cfs_thread-0.log N min max sum mean stddev 643 29 173 41364 64.3297 13.2344 The mean latency when we need to: - wakeup from deep idle state is 307 us. - wakeup from shallow idle state is 177 us. - preempt a SCHED_IDLE task is 64 us. Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/b90cbcce608cef4e02a7bbfe178335f76d201bab.1573728344.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar Signed-off-by: Chengming Zhou --- kernel/sched/fair.c | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f2da637934aec..adc7627e7982b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5664,7 +5664,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this unsigned int min_exit_latency = UINT_MAX; u64 latest_idle_timestamp = 0; int least_loaded_cpu = this_cpu; - int shallowest_idle_cpu = -1, si_cpu = -1; + int shallowest_idle_cpu = -1; int i; /* Check if we have any choice: */ @@ -5673,6 +5673,9 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { + if (sched_idle_cpu(i)) + return i; + if (available_idle_cpu(i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); @@ -5695,12 +5698,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; } - } else if (shallowest_idle_cpu == -1 && si_cpu == -1) { - if (sched_idle_cpu(i)) { - si_cpu = i; - continue; - } - + } else if (shallowest_idle_cpu == -1) { load = cpu_load(cpu_rq(i)); if (load < min_load) { min_load = load; @@ -5709,11 +5707,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this } } - if (shallowest_idle_cpu != -1) - return shallowest_idle_cpu; - if (si_cpu != -1) - return si_cpu; - return least_loaded_cpu; + return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; } static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, @@ -5866,7 +5860,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int */ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) { - int cpu, si_cpu = -1; + int cpu; if (!static_branch_likely(&sched_smt_present)) return -1; @@ -5875,13 +5869,11 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t if (!cpumask_test_cpu(cpu, p->cpus_ptr) || !cpumask_test_cpu(cpu, sched_domain_span(sd))) continue; - if (available_idle_cpu(cpu)) + if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) return cpu; - if (si_cpu == -1 && sched_idle_cpu(cpu)) - si_cpu = cpu; } - return si_cpu; + return -1; } #else /* CONFIG_SCHED_SMT */ @@ -5911,7 +5903,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t u64 time, cost; s64 delta; int this = smp_processor_id(); - int cpu, nr = INT_MAX, si_cpu = -1; + int cpu, nr = INT_MAX; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); if (!this_sd) @@ -5941,11 +5933,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t for_each_cpu_wrap(cpu, cpus, target) { if (!--nr) - return si_cpu; - if (available_idle_cpu(cpu)) + return -1; + if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) break; - if (si_cpu == -1 && sched_idle_cpu(cpu)) - si_cpu = cpu; } time = cpu_clock(this) - time; From e65ddc286e584cd3f5960ab863f441570fbb0d3d Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 8 Jan 2020 13:57:04 +0530 Subject: [PATCH 0307/5344] sched/fair: Load balance aggressively for SCHED_IDLE CPUs commit 323af6deaf70f204880caf94678350802682e0dc upstream. The fair scheduler performs periodic load balance on every CPU to check if it can pull some tasks from other busy CPUs. The duration of this periodic load balance is set to sd->balance_interval for the idle CPUs and is calculated by multiplying the sd->balance_interval with the sd->busy_factor (set to 32 by default) for the busy CPUs. The multiplication is done for busy CPUs to avoid doing load balance too often and rather spend more time executing actual task. While that is the right thing to do for the CPUs busy with SCHED_OTHER or SCHED_BATCH tasks, it may not be the optimal thing for CPUs running only SCHED_IDLE tasks. With the recent enhancements in the fair scheduler around SCHED_IDLE CPUs, we now prefer to enqueue a newly-woken task to a SCHED_IDLE CPU instead of other busy or idle CPUs. The same reasoning should be applied to the load balancer as well to make it migrate tasks more aggressively to a SCHED_IDLE CPU, as that will reduce the scheduling latency of the migrated (SCHED_OTHER) tasks. This patch makes minimal changes to the fair scheduler to do the next load balance soon after the last non SCHED_IDLE task is dequeued from a runqueue, i.e. making the CPU SCHED_IDLE. Also the sd->busy_factor is ignored while calculating the balance_interval for such CPUs. This is done to avoid delaying the periodic load balance by few hundred milliseconds for SCHED_IDLE CPUs. This is tested on ARM64 Hikey620 platform (octa-core) with the help of rt-app and it is verified, using kernel traces, that the newly SCHED_IDLE CPU does load balancing shortly after it becomes SCHED_IDLE and pulls tasks from other busy CPUs. Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/e485827eb8fe7db0943d6f3f6e0f5a4a70272781.1578471925.git.viresh.kumar@linaro.org Signed-off-by: Chengming Zhou --- kernel/sched/fair.c | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index adc7627e7982b..a3db918b8f35b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5237,6 +5237,18 @@ static inline void update_overutilized_status(struct rq *rq) static inline void update_overutilized_status(struct rq *rq) { } #endif +/* Runqueue only has SCHED_IDLE tasks enqueued */ +static int sched_idle_rq(struct rq *rq) +{ + return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running && + rq->nr_running); +} + +static int sched_idle_cpu(int cpu) +{ + return sched_idle_rq(cpu_rq(cpu)); +} + /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@ -5359,6 +5371,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; int task_sleep = flags & DEQUEUE_SLEEP; int idle_h_nr_running = task_has_idle_policy(p); + bool was_sched_idle = sched_idle_rq(rq); for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); @@ -5406,6 +5419,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) sub_nr_running(rq, 1); + /* balance early to pull high priority tasks */ + if (unlikely(!was_sched_idle && sched_idle_rq(rq))) + rq->next_balance = jiffies; + util_est_dequeue(&rq->cfs, p, task_sleep); hrtick_update(rq); } @@ -5428,15 +5445,6 @@ static struct { #endif /* CONFIG_NO_HZ_COMMON */ -/* CPU only has SCHED_IDLE tasks enqueued */ -static int sched_idle_cpu(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running && - rq->nr_running); -} - static unsigned long cpu_load(struct rq *rq) { return cfs_rq_load_avg(&rq->cfs); @@ -9594,6 +9602,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) { int continue_balancing = 1; int cpu = rq->cpu; + int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); unsigned long interval; struct sched_domain *sd; /* Earliest time when we have to do rebalance again */ @@ -9630,7 +9639,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) break; } - interval = get_sd_balance_interval(sd, idle != CPU_IDLE); + interval = get_sd_balance_interval(sd, busy); need_serialize = sd->flags & SD_SERIALIZE; if (need_serialize) { @@ -9646,9 +9655,10 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) * state even if we migrated tasks. Update it. */ idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; + busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); } sd->last_balance = jiffies; - interval = get_sd_balance_interval(sd, idle != CPU_IDLE); + interval = get_sd_balance_interval(sd, busy); } if (need_serialize) spin_unlock(&balancing); From ff53bbd603a68ce7a134c76dc18f981f5d0d97e1 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 20 Jan 2020 11:29:05 +0530 Subject: [PATCH 0308/5344] sched/fair: Define sched_idle_cpu() only for SMP configurations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit afa70d941f663c69c9a64ec1021bbcfa82f0e54a upstream. sched_idle_cpu() isn't used for non SMP configuration and with a recent change, we have started getting following warning: kernel/sched/fair.c:5221:12: warning: ‘sched_idle_cpu’ defined but not used [-Wunused-function] Fix that by defining sched_idle_cpu() only for SMP configurations. Fixes: 323af6deaf70 ("sched/fair: Load balance aggressively for SCHED_IDLE CPUs") Reported-by: Stephen Rothwell Signed-off-by: Viresh Kumar Signed-off-by: Ingo Molnar Cc: Juri Lelli Cc: Vincent Guittot Cc: Dietmar Eggemann Link: https://lore.kernel.org/r/f0554f590687478b33914a4aff9f0e6a62886d44.1579499907.git.viresh.kumar@linaro.org Signed-off-by: Chengming Zhou --- kernel/sched/fair.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a3db918b8f35b..2956f20c8dfc6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5244,10 +5244,12 @@ static int sched_idle_rq(struct rq *rq) rq->nr_running); } +#ifdef CONFIG_SMP static int sched_idle_cpu(int cpu) { return sched_idle_rq(cpu_rq(cpu)); } +#endif /* * The enqueue_task method is called before nr_running is From 31c7d059ae555fe3cad5d4e6871f7eab960cd9e8 Mon Sep 17 00:00:00 2001 From: Peng Wang Date: Tue, 10 Nov 2020 10:11:59 +0800 Subject: [PATCH 0309/5344] sched/fair: Reorder throttle_cfs_rq() path commit b6d37a764a5b852db63101b3f2db0e699574b903 upstream. As commit: 39f23ce07b93 ("sched/fair: Fix unthrottle_cfs_rq() for leaf_cfs_rq list") does in unthrottle_cfs_rq(), throttle_cfs_rq() can also use the same pattern as dequeue_task_fair(). No functional changes. Signed-off-by: Peng Wang Signed-off-by: Ingo Molnar Cc: Vincent Guittot Cc: Peter Zijlstra (Intel) Cc: Phil Auld Cc: Ben Segall Link: https://lore.kernel.org/r/f11dd2e3ab35cc538e2eb57bf0c99b6eaffce127.1604973978.git.rocking@linux.alibaba.com Signed-off-by: Chengming Zhou --- kernel/sched/fair.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2956f20c8dfc6..1e1a7e6cf3901 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4564,25 +4564,37 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_rq *qcfs_rq = cfs_rq_of(se); /* throttled entity or throttle-on-deactivate */ if (!se->on_rq) - break; + goto done; - if (dequeue) { - dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); - } else { - update_load_avg(qcfs_rq, se, 0); - se_update_runnable(se); - } + dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; - if (qcfs_rq->load.weight) - dequeue = 0; + if (qcfs_rq->load.weight) { + /* Avoid re-evaluating load for this entity: */ + se = parent_entity(se); + break; + } } - if (!se) - sub_nr_running(rq, task_delta); + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + /* throttled entity or throttle-on-deactivate */ + if (!se->on_rq) + goto done; + + update_load_avg(qcfs_rq, se, 0); + se_update_runnable(se); + + qcfs_rq->h_nr_running -= task_delta; + qcfs_rq->idle_h_nr_running -= idle_task_delta; + } + /* At this point se is NULL and we are at root level*/ + sub_nr_running(rq, task_delta); + +done: /* * Note: distribution will already see us throttled via the * throttled-list. rq->lock protects completion. From a54e7eb5366d2964f8e3f53b7ef161e45036d17d Mon Sep 17 00:00:00 2001 From: Josh Don Date: Thu, 29 Jul 2021 19:00:18 -0700 Subject: [PATCH 0310/5344] sched: Cgroup SCHED_IDLE support commit 304000390f88d049c85e9a0958ac5567f38816ee upstream. This extends SCHED_IDLE to cgroups. Interface: cgroup/cpu.idle. 0: default behavior 1: SCHED_IDLE Extending SCHED_IDLE to cgroups means that we incorporate the existing aspects of SCHED_IDLE; a SCHED_IDLE cgroup will count all of its descendant threads towards the idle_h_nr_running count of all of its ancestor cgroups. Thus, sched_idle_rq() will work properly. Additionally, SCHED_IDLE cgroups are configured with minimum weight. There are two key differences between the per-task and per-cgroup SCHED_IDLE interface: - The cgroup interface allows tasks within a SCHED_IDLE hierarchy to maintain their relative weights. The entity that is "idle" is the cgroup, not the tasks themselves. - Since the idle entity is the cgroup, our SCHED_IDLE wakeup preemption decision is not made by comparing the current task with the woken task, but rather by comparing their matching sched_entity. A typical use-case for this is a user that creates an idle and a non-idle subtree. The non-idle subtree will dominate competition vs the idle subtree, but the idle subtree will still be high priority vs other users on the system. The latter is accomplished via comparing matching sched_entity in the waken preemption path (this could also be improved by making the sched_idle_rq() decision dependent on the perspective of a specific task). For now, we maintain the existing SCHED_IDLE semantics. Future patches may make improvements that extend how we treat SCHED_IDLE entities. The per-task_group idle field is an integer that currently only holds either a 0 or a 1. This is explicitly typed as an integer to allow for further extensions to this API. For example, a negative value may indicate a highly latency-sensitive cgroup that should be preferred for preemption/placement/etc. Signed-off-by: Josh Don Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20210730020019.1487127-2-joshdon@google.com Signed-off-by: Chengming Zhou --- kernel/sched/core.c | 25 ++++++ kernel/sched/debug.c | 3 + kernel/sched/fair.c | 197 +++++++++++++++++++++++++++++++++++++------ kernel/sched/sched.h | 8 ++ 4 files changed, 208 insertions(+), 25 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b3ab371d5d430..071fd42898905 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7867,6 +7867,20 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, } #endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_FAIR_GROUP_SCHED +static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return css_tg(css)->idle; +} + +static int cpu_idle_write_s64(struct cgroup_subsys_state *css, + struct cftype *cft, s64 idle) +{ + return sched_group_set_idle(css_tg(css), idle); +} +#endif + static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { @@ -7874,6 +7888,11 @@ static struct cftype cpu_legacy_files[] = { .read_u64 = cpu_shares_read_u64, .write_u64 = cpu_shares_write_u64, }, + { + .name = "idle", + .read_s64 = cpu_idle_read_s64, + .write_s64 = cpu_idle_write_s64, + }, #endif #ifdef CONFIG_CFS_BANDWIDTH { @@ -8081,6 +8100,12 @@ static struct cftype cpu_files[] = { .read_s64 = cpu_weight_nice_read_s64, .write_s64 = cpu_weight_nice_write_s64, }, + { + .name = "idle", + .flags = CFTYPE_NOT_ON_ROOT, + .read_s64 = cpu_idle_read_s64, + .write_s64 = cpu_idle_write_s64, + }, #endif #ifdef CONFIG_CFS_BANDWIDTH { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 8b5ef338f8c66..dcd75081a1e76 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -540,6 +540,9 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", cfs_rq->nr_spread_over); SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); + SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running); + SEQ_printf(m, " .%-30s: %d\n", "idle_h_nr_running", + cfs_rq->idle_h_nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SMP SEQ_printf(m, " .%-30s: %lu\n", "load_avg", diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1e1a7e6cf3901..6e3ad8e8c04de 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -434,6 +434,23 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) } } +static int tg_is_idle(struct task_group *tg) +{ + return tg->idle > 0; +} + +static int cfs_rq_is_idle(struct cfs_rq *cfs_rq) +{ + return cfs_rq->idle > 0; +} + +static int se_is_idle(struct sched_entity *se) +{ + if (entity_is_task(se)) + return task_has_idle_policy(task_of(se)); + return cfs_rq_is_idle(group_cfs_rq(se)); +} + #else /* !CONFIG_FAIR_GROUP_SCHED */ static inline struct task_struct *task_of(struct sched_entity *se) @@ -495,6 +512,21 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) { } +static int tg_is_idle(struct task_group *tg) +{ + return 0; +} + +static int cfs_rq_is_idle(struct cfs_rq *cfs_rq) +{ + return 0; +} + +static int se_is_idle(struct sched_entity *se) +{ + return 0; +} + #endif /* CONFIG_FAIR_GROUP_SCHED */ static __always_inline @@ -4568,6 +4600,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_task_delta = cfs_rq->h_nr_running; + qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; @@ -4587,6 +4622,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) update_load_avg(qcfs_rq, se, 0); se_update_runnable(se); + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_task_delta = cfs_rq->h_nr_running; + qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; } @@ -4631,39 +4669,45 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) task_delta = cfs_rq->h_nr_running; idle_task_delta = cfs_rq->idle_h_nr_running; for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + if (se->on_rq) break; - cfs_rq = cfs_rq_of(se); - enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); + enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); + + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_task_delta = cfs_rq->h_nr_running; - cfs_rq->h_nr_running += task_delta; - cfs_rq->idle_h_nr_running += idle_task_delta; + qcfs_rq->h_nr_running += task_delta; + qcfs_rq->idle_h_nr_running += idle_task_delta; /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) + if (cfs_rq_throttled(qcfs_rq)) goto unthrottle_throttle; } for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); + struct cfs_rq *qcfs_rq = cfs_rq_of(se); - update_load_avg(cfs_rq, se, UPDATE_TG); + update_load_avg(qcfs_rq, se, UPDATE_TG); se_update_runnable(se); - cfs_rq->h_nr_running += task_delta; - cfs_rq->idle_h_nr_running += idle_task_delta; + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_task_delta = cfs_rq->h_nr_running; + qcfs_rq->h_nr_running += task_delta; + qcfs_rq->idle_h_nr_running += idle_task_delta; /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) + if (cfs_rq_throttled(qcfs_rq)) goto unthrottle_throttle; /* * One parent has been throttled and cfs_rq removed from the * list. Add it back to not break the leaf list. */ - if (throttled_hierarchy(cfs_rq)) - list_add_leaf_cfs_rq(cfs_rq); + if (throttled_hierarchy(qcfs_rq)) + list_add_leaf_cfs_rq(qcfs_rq); } /* At this point se is NULL and we are at root level*/ @@ -4676,9 +4720,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) * assertion below. */ for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); + struct cfs_rq *qcfs_rq = cfs_rq_of(se); - if (list_add_leaf_cfs_rq(cfs_rq)) + if (list_add_leaf_cfs_rq(qcfs_rq)) break; } @@ -5301,6 +5345,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; + if (cfs_rq_is_idle(cfs_rq)) + idle_h_nr_running = 1; + /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) goto enqueue_throttle; @@ -5318,6 +5365,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; + if (cfs_rq_is_idle(cfs_rq)) + idle_h_nr_running = 1; + /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) goto enqueue_throttle; @@ -5394,6 +5444,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running--; cfs_rq->idle_h_nr_running -= idle_h_nr_running; + if (cfs_rq_is_idle(cfs_rq)) + idle_h_nr_running = 1; + /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) goto dequeue_throttle; @@ -5423,6 +5476,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running--; cfs_rq->idle_h_nr_running -= idle_h_nr_running; + if (cfs_rq_is_idle(cfs_rq)) + idle_h_nr_running = 1; + /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) goto dequeue_throttle; @@ -6618,24 +6674,22 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) static void set_last_buddy(struct sched_entity *se) { - if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se)))) - return; - for_each_sched_entity(se) { if (SCHED_WARN_ON(!se->on_rq)) return; + if (se_is_idle(se)) + return; cfs_rq_of(se)->last = se; } } static void set_next_buddy(struct sched_entity *se) { - if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se)))) - return; - for_each_sched_entity(se) { if (SCHED_WARN_ON(!se->on_rq)) return; + if (se_is_idle(se)) + return; cfs_rq_of(se)->next = se; } } @@ -6656,6 +6710,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ struct cfs_rq *cfs_rq = task_cfs_rq(curr); int scale = cfs_rq->nr_running >= sched_nr_latency; int next_buddy_marked = 0; + int cse_is_idle, pse_is_idle; if (unlikely(se == pse)) return; @@ -6700,8 +6755,21 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; find_matching_se(&se, &pse); - update_curr(cfs_rq_of(se)); BUG_ON(!pse); + + cse_is_idle = se_is_idle(se); + pse_is_idle = se_is_idle(pse); + + /* + * Preempt an idle group in favor of a non-idle group (and don't preempt + * in the inverse case). + */ + if (cse_is_idle && !pse_is_idle) + goto preempt; + if (cse_is_idle != pse_is_idle) + return; + + update_curr(cfs_rq_of(se)); if (wakeup_preempt_entity(se, pse) == 1) { /* * Bias pick_next to pick the sched entity that is @@ -10781,10 +10849,12 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, static DEFINE_MUTEX(shares_mutex); -int sched_group_set_shares(struct task_group *tg, unsigned long shares) +static int __sched_group_set_shares(struct task_group *tg, unsigned long shares) { int i; + lockdep_assert_held(&shares_mutex); + /* * We can't change the weight of the root cgroup. */ @@ -10793,9 +10863,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); - mutex_lock(&shares_mutex); if (tg->shares == shares) - goto done; + return 0; tg->shares = shares; for_each_possible_cpu(i) { @@ -10813,10 +10882,88 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) rq_unlock_irqrestore(rq, &rf); } -done: + return 0; +} + +int sched_group_set_shares(struct task_group *tg, unsigned long shares) +{ + int ret; + + mutex_lock(&shares_mutex); + if (tg_is_idle(tg)) + ret = -EINVAL; + else + ret = __sched_group_set_shares(tg, shares); + mutex_unlock(&shares_mutex); + + return ret; +} + +int sched_group_set_idle(struct task_group *tg, long idle) +{ + int i; + + if (tg == &root_task_group) + return -EINVAL; + + if (idle < 0 || idle > 1) + return -EINVAL; + + mutex_lock(&shares_mutex); + + if (tg->idle == idle) { + mutex_unlock(&shares_mutex); + return 0; + } + + tg->idle = idle; + + for_each_possible_cpu(i) { + struct rq *rq = cpu_rq(i); + struct sched_entity *se = tg->se[i]; + struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i]; + bool was_idle = cfs_rq_is_idle(grp_cfs_rq); + long idle_task_delta; + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + + grp_cfs_rq->idle = idle; + if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq))) + goto next_cpu; + + idle_task_delta = grp_cfs_rq->h_nr_running - + grp_cfs_rq->idle_h_nr_running; + if (!cfs_rq_is_idle(grp_cfs_rq)) + idle_task_delta *= -1; + + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (!se->on_rq) + break; + + cfs_rq->idle_h_nr_running += idle_task_delta; + + /* Already accounted at parent level and above. */ + if (cfs_rq_is_idle(cfs_rq)) + break; + } + +next_cpu: + rq_unlock_irqrestore(rq, &rf); + } + + /* Idle groups have minimum weight. */ + if (tg_is_idle(tg)) + __sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO)); + else + __sched_group_set_shares(tg, NICE_0_LOAD); + mutex_unlock(&shares_mutex); return 0; } + #else /* CONFIG_FAIR_GROUP_SCHED */ void free_fair_sched_group(struct task_group *tg) { } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6040c3d2bffb2..3d4d22aaf838d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -366,6 +366,9 @@ struct task_group { struct cfs_rq **cfs_rq; unsigned long shares; + /* A positive value indicates that this is a SCHED_IDLE group. */ + int idle; + #ifdef CONFIG_SMP /* * load_avg can be heavily contended at clock tick time, so put @@ -475,6 +478,8 @@ extern void sched_move_task(struct task_struct *tsk); #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); +extern int sched_group_set_idle(struct task_group *tg, long idle); + #ifdef CONFIG_SMP extern void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next); @@ -566,6 +571,9 @@ struct cfs_rq { struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ + /* Locally cached copy of our task_group's idle value */ + int idle; + #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; s64 runtime_remaining; From 8b805bdeae94180ab7abba90a791d77495e86607 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 26 Aug 2021 10:47:09 +0200 Subject: [PATCH 0311/5344] sched/fair: Mark tg_is_idle() an inline in the !CONFIG_FAIR_GROUP_SCHED case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 366e7ad6ba5f4cb2ffd0b7316e404d6ee9c0f401 upstream. It's not actually used in the !CONFIG_FAIR_GROUP_SCHED case: kernel/sched/fair.c:488:12: warning: ‘tg_is_idle’ defined but not used [-Wunused-function] Keep around a placeholder nevertheless, for API completeness. Mark it inline, so the compiler doesn't think it must be used. Fixes: 304000390f88: ("sched: Cgroup SCHED_IDLE support") Signed-off-by: Ingo Molnar Cc: Josh Don Cc: Peter Zijlstra (Intel) Cc: Vincent Guittot Signed-off-by: Chengming Zhou --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6e3ad8e8c04de..d38429eaab968 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -512,7 +512,7 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) { } -static int tg_is_idle(struct task_group *tg) +static inline int tg_is_idle(struct task_group *tg) { return 0; } From 28e583fc2837099d2353ede6346997a76604b9c2 Mon Sep 17 00:00:00 2001 From: Josh Don Date: Thu, 19 Aug 2021 18:04:01 -0700 Subject: [PATCH 0312/5344] sched: Account number of SCHED_IDLE entities on each cfs_rq commit a480addecc0d89c200ec0b41da62ae8ceddca8d7 upstream. Adds cfs_rq->idle_nr_running, which accounts the number of idle entities directly enqueued on the cfs_rq. Signed-off-by: Josh Don Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20210820010403.946838-3-joshdon@google.com Signed-off-by: Chengming Zhou --- kernel/sched/debug.c | 2 ++ kernel/sched/fair.c | 25 ++++++++++++++++++++++++- kernel/sched/sched.h | 1 + 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index dcd75081a1e76..6c12c913a497f 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -541,6 +541,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->nr_spread_over); SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running); + SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running", + cfs_rq->idle_nr_running); SEQ_printf(m, " .%-30s: %d\n", "idle_h_nr_running", cfs_rq->idle_h_nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d38429eaab968..1b22a337382e6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2847,6 +2847,8 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) } #endif cfs_rq->nr_running++; + if (se_is_idle(se)) + cfs_rq->idle_nr_running++; } static void @@ -2860,6 +2862,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) } #endif cfs_rq->nr_running--; + if (se_is_idle(se)) + cfs_rq->idle_nr_running--; } /* @@ -5300,6 +5304,17 @@ static int sched_idle_rq(struct rq *rq) rq->nr_running); } +/* + * Returns true if cfs_rq only has SCHED_IDLE entities enqueued. Note the use + * of idle_nr_running, which does not consider idle descendants of normal + * entities. + */ +static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq) +{ + return cfs_rq->nr_running && + cfs_rq->nr_running == cfs_rq->idle_nr_running; +} + #ifdef CONFIG_SMP static int sched_idle_cpu(int cpu) { @@ -10921,7 +10936,7 @@ int sched_group_set_idle(struct task_group *tg, long idle) for_each_possible_cpu(i) { struct rq *rq = cpu_rq(i); struct sched_entity *se = tg->se[i]; - struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i]; + struct cfs_rq *parent_cfs_rq, *grp_cfs_rq = tg->cfs_rq[i]; bool was_idle = cfs_rq_is_idle(grp_cfs_rq); long idle_task_delta; struct rq_flags rf; @@ -10932,6 +10947,14 @@ int sched_group_set_idle(struct task_group *tg, long idle) if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq))) goto next_cpu; + if (se->on_rq) { + parent_cfs_rq = cfs_rq_of(se); + if (cfs_rq_is_idle(grp_cfs_rq)) + parent_cfs_rq->idle_nr_running++; + else + parent_cfs_rq->idle_nr_running--; + } + idle_task_delta = grp_cfs_rq->h_nr_running - grp_cfs_rq->idle_h_nr_running; if (!cfs_rq_is_idle(grp_cfs_rq)) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3d4d22aaf838d..9e05e3870c47f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -500,6 +500,7 @@ struct cfs_rq { struct load_weight load; unsigned int nr_running; unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int idle_nr_running; /* SCHED_IDLE */ unsigned int idle_h_nr_running; /* SCHED_IDLE */ u64 exec_clock; From 0a29601855f4a1bd87b73080ccb75c1ea2db9965 Mon Sep 17 00:00:00 2001 From: Josh Don Date: Thu, 19 Aug 2021 18:04:03 -0700 Subject: [PATCH 0313/5344] sched: adjust sleeper credit for SCHED_IDLE entities commit 2cae3948edd488ebdef4deaf1d1043f92f47e665 upstream. Give reduced sleeper credit to SCHED_IDLE entities. As a result, woken SCHED_IDLE entities will take longer to preempt normal entities. The benefit of this change is to make it less likely that a newly woken SCHED_IDLE entity will preempt a short-running normal entity before it blocks. We still give a small sleeper credit to SCHED_IDLE entities, so that idle<->idle competition retains some fairness. Example: With HZ=1000, spawned four threads affined to one cpu, one of which was set to SCHED_IDLE. Without this patch, wakeup latency for the SCHED_IDLE thread was ~1-2ms, with the patch the wakeup latency was ~5ms. Signed-off-by: Josh Don Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Reviewed-by: Jiang Biao Link: https://lore.kernel.org/r/20210820010403.946838-5-joshdon@google.com Signed-off-by: Chengming Zhou --- kernel/sched/fair.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1b22a337382e6..73d07d4a10d6c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3938,7 +3938,12 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) /* sleeps up to a single latency don't count. */ if (!initial) { - unsigned long thresh = sysctl_sched_latency; + unsigned long thresh; + + if (se_is_idle(se)) + thresh = sysctl_sched_min_granularity; + else + thresh = sysctl_sched_latency; /* * Halve their sleep time's effect, to allow From 7140cf4120e78eb8e824dd0d0d3d27a26fcca252 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Mar 2021 13:44:46 +0100 Subject: [PATCH 0314/5344] sched,fair: Alternative sched_slice() commit 0c2de3f054a59f15e01804b75a04355c48de628c upstream. The current sched_slice() seems to have issues; there's two possible things that could be improved: - the 'nr_running' used for __sched_period() is daft when cgroups are considered. Using the RQ wide h_nr_running seems like a much more consistent number. - (esp) cgroups can slice it real fine, which makes for easy over-scheduling, ensure min_gran is what the name says. Signed-off-by: Peter Zijlstra (Intel) Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210412102001.611897312@infradead.org Signed-off-by: Chengming Zhou --- kernel/sched/fair.c | 12 +++++++++++- kernel/sched/features.h | 3 +++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 73d07d4a10d6c..c30f1ebe1e83b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -724,7 +724,13 @@ static u64 __sched_period(unsigned long nr_running) */ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq); + unsigned int nr_running = cfs_rq->nr_running; + u64 slice; + + if (sched_feat(ALT_PERIOD)) + nr_running = rq_of(cfs_rq)->cfs.h_nr_running; + + slice = __sched_period(nr_running + !se->on_rq); for_each_sched_entity(se) { struct load_weight *load; @@ -741,6 +747,10 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) } slice = __calc_delta(slice, se->load.weight, load); } + + if (sched_feat(BASE_SLICE)) + slice = max(slice, (u64)sysctl_sched_min_granularity); + return slice; } diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 2410db5e9a353..debfcc11a3052 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -89,3 +89,6 @@ SCHED_FEAT(WA_BIAS, true) * UtilEstimation. Use estimated CPU utilization. */ SCHED_FEAT(UTIL_EST, true) + +SCHED_FEAT(ALT_PERIOD, true) +SCHED_FEAT(BASE_SLICE, true) From cc4429fd040f86cef7e0297e200a743ad0b2cfee Mon Sep 17 00:00:00 2001 From: Josh Don Date: Thu, 19 Aug 2021 18:04:02 -0700 Subject: [PATCH 0315/5344] sched: reduce sched slice for SCHED_IDLE entities commit 51ce83ed523b00d58f2937ec014b12daaad55185 upstream. Use a small, non-scaled min granularity for SCHED_IDLE entities, when competing with normal entities. This reduces the latency of getting a normal entity back on cpu, at the expense of increased context switch frequency of SCHED_IDLE entities. The benefit of this change is to reduce the round-robin latency for normal entities when competing with a SCHED_IDLE entity. Example: on a machine with HZ=1000, spawned two threads, one of which is SCHED_IDLE, and affined to one cpu. Without this patch, the SCHED_IDLE thread runs for 4ms then waits for 1.4s. With this patch, it runs for 1ms and waits 340ms (as it round-robins with the other thread). Signed-off-by: Josh Don Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20210820010403.946838-4-joshdon@google.com Signed-off-by: Chengming Zhou --- include/linux/sched/sysctl.h | 1 + kernel/sched/debug.c | 1 + kernel/sched/fair.c | 29 ++++++++++++++++++++++++----- 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index d4f6215ee03f7..a83ac55138e13 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -22,6 +22,7 @@ enum { sysctl_hung_task_timeout_secs = 0 }; extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; +extern unsigned int sysctl_sched_idle_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 6c12c913a497f..883ea8f01cb52 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -740,6 +740,7 @@ static void sched_debug_header(struct seq_file *m) SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) PN(sysctl_sched_latency); PN(sysctl_sched_min_granularity); + PN(sysctl_sched_idle_min_granularity); PN(sysctl_sched_wakeup_granularity); P(sysctl_sched_child_runs_first); P(sysctl_sched_features); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c30f1ebe1e83b..c26adfbd5b114 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -59,6 +59,14 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_L unsigned int sysctl_sched_min_granularity = 750000ULL; static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; +/* + * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks. + * Applies only when SCHED_IDLE tasks compete with normal tasks. + * + * (default: 0.75 msec) + */ +unsigned int sysctl_sched_idle_min_granularity = 750000ULL; + /* * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity */ @@ -716,6 +724,8 @@ static u64 __sched_period(unsigned long nr_running) return sysctl_sched_latency; } +static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq); + /* * We calculate the wall-time slice from the period by taking a part * proportional to the weight. @@ -725,6 +735,8 @@ static u64 __sched_period(unsigned long nr_running) static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { unsigned int nr_running = cfs_rq->nr_running; + struct sched_entity *init_se = se; + unsigned int min_gran; u64 slice; if (sched_feat(ALT_PERIOD)) @@ -735,12 +747,13 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) for_each_sched_entity(se) { struct load_weight *load; struct load_weight lw; + struct cfs_rq *qcfs_rq; - cfs_rq = cfs_rq_of(se); - load = &cfs_rq->load; + qcfs_rq = cfs_rq_of(se); + load = &qcfs_rq->load; if (unlikely(!se->on_rq)) { - lw = cfs_rq->load; + lw = qcfs_rq->load; update_load_add(&lw, se->load.weight); load = &lw; @@ -748,8 +761,14 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) slice = __calc_delta(slice, se->load.weight, load); } - if (sched_feat(BASE_SLICE)) - slice = max(slice, (u64)sysctl_sched_min_granularity); + if (sched_feat(BASE_SLICE)) { + if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq)) + min_gran = sysctl_sched_idle_min_granularity; + else + min_gran = sysctl_sched_min_granularity; + + slice = max_t(u64, slice, min_gran); + } return slice; } From bfb6c857075eb09d877d9139e45d0e8c6b01c35a Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Fri, 15 Apr 2022 17:55:04 +0800 Subject: [PATCH 0316/5344] sched/fair: Revise comment about lb decision matrix commit a658353167bf2ea6052cee071dbcc13e0f229dc9 upstream. If busiest group type is group_misfit_task, the local group type must be group_has_spare according to below code in update_sd_pick_busiest(): if (sgs->group_type == group_misfit_task && (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) || sds->local_stat.group_type != group_has_spare)) return false; group type imbalanced and overloaded and fully_busy are filtered in here. misfit and asym are filtered before in update_sg_lb_stats(). So, change the decision matrix to: busiest \ local has_spare fully_busy misfit asym imbalanced overloaded has_spare nr_idle balanced N/A N/A balanced balanced fully_busy nr_idle nr_idle N/A N/A balanced balanced misfit_task force N/A N/A N/A *N/A* *N/A* asym_packing force force N/A N/A force force imbalanced force force N/A N/A force force overloaded force force N/A N/A force avg_load Fixes: 0b0695f2b34a ("sched/fair: Rework load_balance()") Signed-off-by: Tao Zhou Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20220415095505.7765-1-tao.zhou@linux.dev Signed-off-by: Chengming Zhou --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c26adfbd5b114..7b5e3081b68dc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8948,7 +8948,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded * has_spare nr_idle balanced N/A N/A balanced balanced * fully_busy nr_idle nr_idle N/A N/A balanced balanced - * misfit_task force N/A N/A N/A force force + * misfit_task force N/A N/A N/A N/A N/A * asym_packing force force N/A N/A force force * imbalanced force force N/A N/A force force * overloaded force force N/A N/A force avg_load From 95118e5dbbceecc975cad8d63e1e5701084842ee Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Fri, 6 Dec 2019 22:54:22 +0530 Subject: [PATCH 0317/5344] sched/fair: Optimize select_idle_core() commit bec2860a2bd6cd38ea34434d04f4033eb32f0f31 upstream. Currently we loop through all threads of a core to evaluate if the core is idle or not. This is unnecessary. If a thread of a core is not idle, skip evaluating other threads of a core. Also while clearing the cpumask, bits of all CPUs of a core can be cleared in one-shot. Collecting ticks on a Power 9 SMT 8 system around select_idle_core while running schbench shows us (units are in ticks, hence lesser is better) Without patch N Min Max Median Avg Stddev x 130 151 1083 284 322.72308 144.41494 With patch N Min Max Median Avg Stddev Improvement x 164 88 610 201 225.79268 106.78943 30.03% Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Valentin Schneider Reviewed-by: Vincent Guittot Acked-by: Mel Gorman Link: https://lkml.kernel.org/r/20191206172422.6578-1-srikar@linux.vnet.ibm.com Signed-off-by: Hao Jia --- kernel/sched/fair.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7b5e3081b68dc..4642a52285025 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5965,10 +5965,12 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int bool idle = true; for_each_cpu(cpu, cpu_smt_mask(core)) { - __cpumask_clear_cpu(cpu, cpus); - if (!available_idle_cpu(cpu)) + if (!available_idle_cpu(cpu)) { idle = false; + break; + } } + cpumask_andnot(cpus, cpus, cpu_smt_mask(core)); if (idle) return core; From 484999c723d0a9187ef5eff7fa231974915ca738 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 30 Mar 2020 10:01:27 +0100 Subject: [PATCH 0318/5344] sched/fair: Align rq->avg_idle and rq->avg_scan_cost commit d76343c6b2b79f5e89c392bc9ce9dabc4c9e90cb upstream. sched/core.c uses update_avg() for rq->avg_idle and sched/fair.c uses an open-coded version (with the exact same decay factor) for rq->avg_scan_cost. On top of that, select_idle_cpu() expects to be able to compare these two fields. The only difference between the two is that rq->avg_scan_cost is computed using a pure division rather than a shift. Turns out it actually matters, first of all because the shifted value can be negative, and the standard has this to say about it: """ The result of E1 >> E2 is E1 right-shifted E2 bit positions. [...] If E1 has a signed type and a negative value, the resulting value is implementation-defined. """ Not only this, but (arithmetic) right shifting a negative value (using 2's complement) is *not* equivalent to dividing it by the corresponding power of 2. Let's look at a few examples: -4 -> 0xF..FC -4 >> 3 -> 0xF..FF == -1 != -4 / 8 -8 -> 0xF..F8 -8 >> 3 -> 0xF..FF == -1 == -8 / 8 -9 -> 0xF..F7 -9 >> 3 -> 0xF..FE == -2 != -9 / 8 Make update_avg() use a division, and export it to the private scheduler header to reuse it where relevant. Note that this still lets compilers use a shift here, but should prevent any unwanted surprise. The disassembly of select_idle_cpu() remains unchanged on arm64, and ttwu_do_wakeup() gains 2 instructions; the diff sort of looks like this: - sub x1, x1, x0 + subs x1, x1, x0 // set condition codes + add x0, x1, #0x7 + csel x0, x0, x1, mi // x0 = x1 < 0 ? x0 : x1 add x0, x3, x0, asr #3 which does the right thing (i.e. gives us the expected result while still using an arithmetic shift) Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200330090127.16294-1-valentin.schneider@arm.com Signed-off-by: Hao Jia --- kernel/sched/core.c | 6 ------ kernel/sched/fair.c | 7 ++----- kernel/sched/sched.h | 6 ++++++ 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 071fd42898905..99770ff45c773 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2211,12 +2211,6 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) return cpu; } -static void update_avg(u64 *avg, u64 sample) -{ - s64 diff = sample - *avg; - *avg += diff >> 3; -} - void sched_set_stop_task(int cpu, struct task_struct *stop) { struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4642a52285025..c7ff9bb3c6076 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6029,8 +6029,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); struct sched_domain *this_sd; u64 avg_cost, avg_idle; - u64 time, cost; - s64 delta; + u64 time; int this = smp_processor_id(); int cpu, nr = INT_MAX; @@ -6068,9 +6067,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t } time = cpu_clock(this) - time; - cost = this_sd->avg_scan_cost; - delta = (s64)(time - cost) / 8; - this_sd->avg_scan_cost += delta; + update_avg(&this_sd->avg_scan_cost, time); return cpu; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9e05e3870c47f..d3f8e6b4d243a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -199,6 +199,12 @@ static inline int task_has_dl_policy(struct task_struct *p) #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) +static inline void update_avg(u64 *avg, u64 sample) +{ + s64 diff = sample - *avg; + *avg += diff / 8; +} + /* * !! For sched_setattr_nocheck() (kernel) only !! * From 9ce5e11b57051d4b2d8acf2cb9a0d7f5d9a89d53 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 30 Nov 2020 14:40:20 +0000 Subject: [PATCH 0319/5344] sched/fair: Clear SMT siblings after determining the core is not idle commit 13d5a5e9f9b8515da3c04305ae1bb03ab91be7a7 upstream. The clearing of SMT siblings from the SIS mask before checking for an idle core is a small but unnecessary cost. Defer the clearing of the siblings until the scan moves to the next potential target. The cost of this was not measured as it is borderline noise but it should be self-evident. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20201130144020.GS3371@techsingularity.net Signed-off-by: Hao Jia --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c7ff9bb3c6076..6181783d7b7cf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5970,10 +5970,11 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int break; } } - cpumask_andnot(cpus, cpus, cpu_smt_mask(core)); if (idle) return core; + + cpumask_andnot(cpus, cpus, cpu_smt_mask(core)); } /* From 3f96a4ec7d63abc30594ae7c40c67544eb05a567 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 25 Jan 2021 08:59:06 +0000 Subject: [PATCH 0320/5344] sched/fair: Remove SIS_AVG_CPU commit e6e0dc2d5497f7f3ed970052917e2923c6f453f4 upstream. SIS_AVG_CPU was introduced as a means of avoiding a search when the average search cost indicated that the search would likely fail. It was a blunt instrument and disabled by commit 4c77b18cf8b7 ("sched/fair: Make select_idle_cpu() more aggressive") and later replaced with a proportional search depth by commit 1ad3aaf3fcd2 ("sched/core: Implement new approach to scale select_idle_cpu()"). While there are corner cases where SIS_AVG_CPU is better, it has now been disabled for almost three years. As the intent of SIS_PROP is to reduce the time complexity of select_idle_cpu(), lets drop SIS_AVG_CPU and focus on SIS_PROP as a throttling mechanism. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20210125085909.4600-2-mgorman@techsingularity.net Signed-off-by: Hao Jia --- kernel/sched/fair.c | 20 +++++++++----------- kernel/sched/features.h | 1 - 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6181783d7b7cf..7123922abfd73 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6029,7 +6029,6 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); struct sched_domain *this_sd; - u64 avg_cost, avg_idle; u64 time; int this = smp_processor_id(); int cpu, nr = INT_MAX; @@ -6038,18 +6037,17 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t if (!this_sd) return -1; - /* - * Due to large variance we need a large fuzz factor; hackbench in - * particularly is sensitive here. - */ - avg_idle = this_rq()->avg_idle / 512; - avg_cost = this_sd->avg_scan_cost + 1; + if (sched_feat(SIS_PROP)) { + u64 avg_cost, avg_idle, span_avg; - if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost) - return -1; + /* + * Due to large variance we need a large fuzz factor; + * hackbench in particularly is sensitive here. + */ + avg_idle = this_rq()->avg_idle / 512; + avg_cost = this_sd->avg_scan_cost + 1; - if (sched_feat(SIS_PROP)) { - u64 span_avg = sd->span_weight * avg_idle; + span_avg = sd->span_weight * avg_idle; if (span_avg > 4*avg_cost) nr = div_u64(span_avg, avg_cost); else diff --git a/kernel/sched/features.h b/kernel/sched/features.h index debfcc11a3052..a08c0fc518a6e 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -54,7 +54,6 @@ SCHED_FEAT(TTWU_QUEUE, true) /* * When doing wakeups, attempt to limit superfluous scans of the LLC domain. */ -SCHED_FEAT(SIS_AVG_CPU, false) SCHED_FEAT(SIS_PROP, true) /* From b9b52215984e3a4c2fd4e44a4821d2097dd2c6a3 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 25 Jan 2021 08:59:07 +0000 Subject: [PATCH 0321/5344] sched/fair: Move avg_scan_cost calculations under SIS_PROP commit bae4ec13640b0915e7dd86da7e65c5d085160571 upstream. As noted by Vincent Guittot, avg_scan_costs are calculated for SIS_PROP even if SIS_PROP is disabled. Move the time calculations under a SIS_PROP check and while we are at it, exclude the cost of initialising the CPU mask from the average scan cost. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20210125085909.4600-3-mgorman@techsingularity.net Signed-off-by: Hao Jia --- kernel/sched/fair.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7123922abfd73..56517fee7f5e9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6037,6 +6037,8 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t if (!this_sd) return -1; + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + if (sched_feat(SIS_PROP)) { u64 avg_cost, avg_idle, span_avg; @@ -6052,11 +6054,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t nr = div_u64(span_avg, avg_cost); else nr = 4; - } - - time = cpu_clock(this); - cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + time = cpu_clock(this); + } for_each_cpu_wrap(cpu, cpus, target) { if (!--nr) @@ -6065,8 +6065,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t break; } - time = cpu_clock(this) - time; - update_avg(&this_sd->avg_scan_cost, time); + if (sched_feat(SIS_PROP)) { + time = cpu_clock(this) - time; + update_avg(&this_sd->avg_scan_cost, time); + } return cpu; } From 4507e084e5a87d13df3fa6a4ad300f89c8403c5a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 25 Jan 2021 08:59:08 +0000 Subject: [PATCH 0322/5344] sched/fair: Remove select_idle_smt() commit 6cd56ef1df399a004f90ecb682427f9964969fc9 upstream. In order to make the next patch more readable, and to quantify the actual effectiveness of this pass, start by removing it. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20210125085909.4600-4-mgorman@techsingularity.net Signed-off-by: Hao Jia --- kernel/sched/fair.c | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 56517fee7f5e9..e20dfcee570e6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5985,27 +5985,6 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int return -1; } -/* - * Scan the local SMT mask for idle CPUs. - */ -static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) -{ - int cpu; - - if (!static_branch_likely(&sched_smt_present)) - return -1; - - for_each_cpu(cpu, cpu_smt_mask(target)) { - if (!cpumask_test_cpu(cpu, p->cpus_ptr) || - !cpumask_test_cpu(cpu, sched_domain_span(sd))) - continue; - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) - return cpu; - } - - return -1; -} - #else /* CONFIG_SCHED_SMT */ static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) @@ -6013,11 +5992,6 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s return -1; } -static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) -{ - return -1; -} - #endif /* CONFIG_SCHED_SMT */ /* @@ -6118,10 +6092,6 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if ((unsigned)i < nr_cpumask_bits) return i; - i = select_idle_smt(p, sd, target); - if ((unsigned)i < nr_cpumask_bits) - return i; - return target; } From bd495bfa264cc1f62380da45ce56f3cbdd74959b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 27 Jan 2021 13:52:03 +0000 Subject: [PATCH 0323/5344] sched/fair: Merge select_idle_core/cpu() commit 9fe1f127b913318c631d0041ecf71486e38c2c2d upstream. Both select_idle_core() and select_idle_cpu() do a loop over the same cpumask. Observe that by clearing the already visited CPUs, we can fold the iteration and iterate a core at a time. All we need to do is remember any non-idle CPU we encountered while scanning for an idle core. This way we'll only iterate every CPU once. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20210127135203.19633-5-mgorman@techsingularity.net Signed-off-by: Hao Jia --- kernel/sched/fair.c | 99 +++++++++++++++++++++++++++------------------ 1 file changed, 59 insertions(+), 40 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e20dfcee570e6..7dba05e71e68b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5890,6 +5890,14 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p return new_cpu; } +static inline int __select_idle_cpu(int cpu) +{ + if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) + return cpu; + + return -1; +} + #ifdef CONFIG_SCHED_SMT DEFINE_STATIC_KEY_FALSE(sched_smt_present); EXPORT_SYMBOL_GPL(sched_smt_present); @@ -5948,48 +5956,51 @@ void __update_idle_core(struct rq *rq) * there are no idle cores left in the system; tracked through * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above. */ -static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) +static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) { - struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); - int core, cpu; + bool idle = true; + int cpu; if (!static_branch_likely(&sched_smt_present)) - return -1; - - if (!test_idle_cores(target, false)) - return -1; - - cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + return __select_idle_cpu(core); - for_each_cpu_wrap(core, cpus, target) { - bool idle = true; - - for_each_cpu(cpu, cpu_smt_mask(core)) { - if (!available_idle_cpu(cpu)) { - idle = false; - break; + for_each_cpu(cpu, cpu_smt_mask(core)) { + if (!available_idle_cpu(cpu)) { + idle = false; + if (*idle_cpu == -1) { + if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->cpus_ptr)) { + *idle_cpu = cpu; + break; + } + continue; } + break; } - - if (idle) - return core; - - cpumask_andnot(cpus, cpus, cpu_smt_mask(core)); + if (*idle_cpu == -1 && cpumask_test_cpu(cpu, p->cpus_ptr)) + *idle_cpu = cpu; } - /* - * Failed to find an idle core; stop looking for one. - */ - set_idle_cores(target, 0); + if (idle) + return core; + cpumask_andnot(cpus, cpus, cpu_smt_mask(core)); return -1; } #else /* CONFIG_SCHED_SMT */ -static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) +static inline void set_idle_cores(int cpu, int val) { - return -1; +} + +static inline bool test_idle_cores(int cpu, bool def) +{ + return def; +} + +static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) +{ + return __select_idle_cpu(core); } #endif /* CONFIG_SCHED_SMT */ @@ -6002,10 +6013,11 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); + int i, cpu, idle_cpu = -1, nr = INT_MAX; + bool smt = test_idle_cores(target, false); + int this = smp_processor_id(); struct sched_domain *this_sd; u64 time; - int this = smp_processor_id(); - int cpu, nr = INT_MAX; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); if (!this_sd) @@ -6013,7 +6025,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); - if (sched_feat(SIS_PROP)) { + if (sched_feat(SIS_PROP) && !smt) { u64 avg_cost, avg_idle, span_avg; /* @@ -6033,18 +6045,29 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t } for_each_cpu_wrap(cpu, cpus, target) { - if (!--nr) - return -1; - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) - break; + if (smt) { + i = select_idle_core(p, cpu, cpus, &idle_cpu); + if ((unsigned int)i < nr_cpumask_bits) + return i; + + } else { + if (!--nr) + return -1; + idle_cpu = __select_idle_cpu(cpu); + if ((unsigned int)idle_cpu < nr_cpumask_bits) + break; + } } - if (sched_feat(SIS_PROP)) { + if (smt) + set_idle_cores(this, false); + + if (sched_feat(SIS_PROP) && !smt) { time = cpu_clock(this) - time; update_avg(&this_sd->avg_scan_cost, time); } - return cpu; + return idle_cpu; } /* @@ -6084,10 +6107,6 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (!sd) return target; - i = select_idle_core(p, sd, target); - if ((unsigned)i < nr_cpumask_bits) - return i; - i = select_idle_cpu(p, sd, target); if ((unsigned)i < nr_cpumask_bits) return i; From 155a53ef626ea43636ef4302e1f7346e41f07986 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Sun, 21 Mar 2021 11:14:32 +1300 Subject: [PATCH 0324/5344] sched/fair: Optimize test_idle_cores() for !SMT commit c8987ae5af793a73e2c0d6ce804d8ff454ea377c upstream. update_idle_core() is only done for the case of sched_smt_present. but test_idle_cores() is done for all machines even those without SMT. This can contribute to up 8%+ hackbench performance loss on a machine like kunpeng 920 which has no SMT. This patch removes the redundant test_idle_cores() for !SMT machines. Hackbench is ran with -g {2..14}, for each g it is ran 10 times to get an average. $ numactl -N 0 hackbench -p -T -l 20000 -g $1 The below is the result of hackbench w/ and w/o this patch: g= 2 4 6 8 10 12 14 w/o: 1.8151 3.8499 5.5142 7.2491 9.0340 10.7345 12.0929 w/ : 1.8428 3.7436 5.4501 6.9522 8.2882 9.9535 11.3367 +4.1% +8.3% +7.3% +6.3% Signed-off-by: Barry Song Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Acked-by: Mel Gorman Link: https://lkml.kernel.org/r/20210320221432.924-1-song.bao.hua@hisilicon.com Signed-off-by: Hao Jia --- kernel/sched/fair.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7dba05e71e68b..c755a5a4bd4e5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5915,9 +5915,11 @@ static inline bool test_idle_cores(int cpu, bool def) { struct sched_domain_shared *sds; - sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); - if (sds) - return READ_ONCE(sds->has_idle_cores); + if (static_branch_likely(&sched_smt_present)) { + sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + if (sds) + return READ_ONCE(sds->has_idle_cores); + } return def; } From 441810c62370371f7a9551c9a72028511c327354 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 26 Mar 2021 15:19:32 -0400 Subject: [PATCH 0325/5344] sched/fair: Bring back select_idle_smt(), but differently commit c722f35b513f807629603bbf24640b1a48be21b5 upstream. Mel Gorman did some nice work in 9fe1f127b913 ("sched/fair: Merge select_idle_core/cpu()"), resulting in the kernel being more efficient at finding an idle CPU, and in tasks spending less time waiting to be run, both according to the schedstats run_delay numbers, and according to measured application latencies. Yay. The flip side of this is that we see more task migrations (about 30% more), higher cache misses, higher memory bandwidth utilization, and higher CPU use, for the same number of requests/second. This is most pronounced on a memcache type workload, which saw a consistent 1-3% increase in total CPU use on the system, due to those increased task migrations leading to higher L2 cache miss numbers, and higher memory utilization. The exclusive L3 cache on Skylake does us no favors there. On our web serving workload, that effect is usually negligible. It appears that the increased number of CPU migrations is generally a good thing, since it leads to lower cpu_delay numbers, reflecting the fact that tasks get to run faster. However, the reduced locality and the corresponding increase in L2 cache misses hurts a little. The patch below appears to fix the regression, while keeping the benefit of the lower cpu_delay numbers, by reintroducing select_idle_smt with a twist: when a socket has no idle cores, check to see if the sibling of "prev" is idle, before searching all the other CPUs. This fixes both the occasional 9% regression on the web serving workload, and the continuous 2% CPU use regression on the memcache type workload. With Mel's patches and this patch together, task migrations are still high, but L2 cache misses, memory bandwidth, and CPU time used are back down to what they were before. The p95 and p99 response times for the memcache type application improve by about 10% over what they were before Mel's patches got merged. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mel Gorman Acked-by: Vincent Guittot Link: https://lkml.kernel.org/r/20210326151932.2c187840@imladris.surriel.com Signed-off-by: Hao Jia --- kernel/sched/fair.c | 55 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 12 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c755a5a4bd4e5..ea58ecb38ca68 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5915,11 +5915,9 @@ static inline bool test_idle_cores(int cpu, bool def) { struct sched_domain_shared *sds; - if (static_branch_likely(&sched_smt_present)) { - sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); - if (sds) - return READ_ONCE(sds->has_idle_cores); - } + sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + if (sds) + return READ_ONCE(sds->has_idle_cores); return def; } @@ -5989,6 +5987,24 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu return -1; } +/* + * Scan the local SMT mask for idle CPUs. + */ +static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +{ + int cpu; + + for_each_cpu(cpu, cpu_smt_mask(target)) { + if (!cpumask_test_cpu(cpu, p->cpus_ptr) || + !cpumask_test_cpu(cpu, sched_domain_span(sd))) + continue; + if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) + return cpu; + } + + return -1; +} + #else /* CONFIG_SCHED_SMT */ static inline void set_idle_cores(int cpu, int val) @@ -6005,6 +6021,11 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma return __select_idle_cpu(core); } +static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +{ + return -1; +} + #endif /* CONFIG_SCHED_SMT */ /* @@ -6012,11 +6033,10 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma * comparing the average scan cost (tracked in sd->avg_scan_cost) against the * average idle time for this rq (as found in rq->avg_idle). */ -static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) +static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); int i, cpu, idle_cpu = -1, nr = INT_MAX; - bool smt = test_idle_cores(target, false); int this = smp_processor_id(); struct sched_domain *this_sd; u64 time; @@ -6027,7 +6047,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); - if (sched_feat(SIS_PROP) && !smt) { + if (sched_feat(SIS_PROP) && !has_idle_core) { u64 avg_cost, avg_idle, span_avg; /* @@ -6047,7 +6067,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t } for_each_cpu_wrap(cpu, cpus, target) { - if (smt) { + if (has_idle_core) { i = select_idle_core(p, cpu, cpus, &idle_cpu); if ((unsigned int)i < nr_cpumask_bits) return i; @@ -6061,10 +6081,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t } } - if (smt) + if (has_idle_core) set_idle_cores(this, false); - if (sched_feat(SIS_PROP) && !smt) { + if (sched_feat(SIS_PROP) && !has_idle_core) { time = cpu_clock(this) - time; update_avg(&this_sd->avg_scan_cost, time); } @@ -6077,6 +6097,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t */ static int select_idle_sibling(struct task_struct *p, int prev, int target) { + bool has_idle_core = false; struct sched_domain *sd; int i, recent_used_cpu; @@ -6109,7 +6130,17 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (!sd) return target; - i = select_idle_cpu(p, sd, target); + if (sched_smt_active()) { + has_idle_core = test_idle_cores(target, false); + + if (!has_idle_core && cpus_share_cache(prev, target)) { + i = select_idle_smt(p, sd, prev); + if ((unsigned int)i < nr_cpumask_bits) + return i; + } + } + + i = select_idle_cpu(p, sd, has_idle_core, target); if ((unsigned)i < nr_cpumask_bits) return i; From 35fae84ab18db5e757f559c6d22f433073b53185 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Tue, 11 May 2021 20:46:09 +0530 Subject: [PATCH 0326/5344] sched/fair: Fix clearing of has_idle_cores flag in select_idle_cpu() commit 02dbb7246c5bbbbe1607ebdc546ba5c454a664b1 upstream. In commit: 9fe1f127b913 ("sched/fair: Merge select_idle_core/cpu()") in select_idle_cpu(), we check if an idle core is present in the LLC of the target CPU via the flag "has_idle_cores". We look for the idle core in select_idle_cores(). If select_idle_cores() isn't able to find an idle core/CPU, we need to unset the has_idle_cores flag in the LLC of the target to prevent other CPUs from going down this route. However, the current code is unsetting it in the LLC of the current CPU instead of the target CPU. This patch fixes this issue. Fixes: 9fe1f127b913 ("sched/fair: Merge select_idle_core/cpu()") Signed-off-by: Gautham R. Shenoy Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Reviewed-by: Srikar Dronamraju Acked-by: Mel Gorman Link: https://lore.kernel.org/r/1620746169-13996-1-git-send-email-ego@linux.vnet.ibm.com Signed-off-by: Hao Jia --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ea58ecb38ca68..51021667fce8c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6082,7 +6082,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool } if (has_idle_core) - set_idle_cores(this, false); + set_idle_cores(target, false); if (sched_feat(SIS_PROP) && !has_idle_core) { time = cpu_clock(this) - time; From ff9bb4c0abd695b8abcbf6d2697c4b42e3300a41 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Jun 2021 12:16:11 +0100 Subject: [PATCH 0327/5344] sched/fair: Age the average idle time commit 94aafc3ee31dc199d1078ffac9edd976b7f47b3d upstream. This is a partial forward-port of Peter Ziljstra's work first posted at: https://lore.kernel.org/lkml/20180530142236.667774973@infradead.org/ Currently select_idle_cpu()'s proportional scheme uses the average idle time *for when we are idle*, that is temporally challenged. When a CPU is not at all idle, we'll happily continue using whatever value we did see when the CPU goes idle. To fix this, introduce a separate average idle and age it (the existing value still makes sense for things like new-idle balancing, which happens when we do go idle). The overall goal is to not spend more time scanning for idle CPUs than we're idle for. Otherwise we're inhibiting work. This means that we need to consider the cost over all the wake-ups between consecutive idle periods. To track this, the scan cost is subtracted from the estimated average idle time. The impact of this patch is related to workloads that have domains that are fully busy or overloaded. Without the patch, the scan depth may be too high because a CPU is not reaching idle. Due to the nature of the patch, this is a regression magnet. It potentially wins when domains are almost fully busy or overloaded -- at that point searches are likely to fail but idle is not being aged as CPUs are active so search depth is too large and useless. It will potentially show regressions when there are idle CPUs and a deep search is beneficial. This tbench result on a 2-socket broadwell machine partially illustates the problem 5.13.0-rc2 5.13.0-rc2 vanilla sched-avgidle-v1r5 Hmean 1 445.02 ( 0.00%) 451.36 * 1.42%* Hmean 2 830.69 ( 0.00%) 846.03 * 1.85%* Hmean 4 1350.80 ( 0.00%) 1505.56 * 11.46%* Hmean 8 2888.88 ( 0.00%) 2586.40 * -10.47%* Hmean 16 5248.18 ( 0.00%) 5305.26 * 1.09%* Hmean 32 8914.03 ( 0.00%) 9191.35 * 3.11%* Hmean 64 10663.10 ( 0.00%) 10192.65 * -4.41%* Hmean 128 18043.89 ( 0.00%) 18478.92 * 2.41%* Hmean 256 16530.89 ( 0.00%) 17637.16 * 6.69%* Hmean 320 16451.13 ( 0.00%) 17270.97 * 4.98%* Note that 8 was a regression point where a deeper search would have helped but it gains for high thread counts when searches are useless. Hackbench is a more extreme example although not perfect as the tasks idle rapidly hackbench-process-pipes 5.13.0-rc2 5.13.0-rc2 vanilla sched-avgidle-v1r5 Amean 1 0.3950 ( 0.00%) 0.3887 ( 1.60%) Amean 4 0.9450 ( 0.00%) 0.9677 ( -2.40%) Amean 7 1.4737 ( 0.00%) 1.4890 ( -1.04%) Amean 12 2.3507 ( 0.00%) 2.3360 * 0.62%* Amean 21 4.0807 ( 0.00%) 4.0993 * -0.46%* Amean 30 5.6820 ( 0.00%) 5.7510 * -1.21%* Amean 48 8.7913 ( 0.00%) 8.7383 ( 0.60%) Amean 79 14.3880 ( 0.00%) 13.9343 * 3.15%* Amean 110 21.2233 ( 0.00%) 19.4263 * 8.47%* Amean 141 28.2930 ( 0.00%) 25.1003 * 11.28%* Amean 172 34.7570 ( 0.00%) 30.7527 * 11.52%* Amean 203 41.0083 ( 0.00%) 36.4267 * 11.17%* Amean 234 47.7133 ( 0.00%) 42.0623 * 11.84%* Amean 265 53.0353 ( 0.00%) 47.7720 * 9.92%* Amean 296 60.0170 ( 0.00%) 53.4273 * 10.98%* Stddev 1 0.0052 ( 0.00%) 0.0025 ( 51.57%) Stddev 4 0.0357 ( 0.00%) 0.0370 ( -3.75%) Stddev 7 0.0190 ( 0.00%) 0.0298 ( -56.64%) Stddev 12 0.0064 ( 0.00%) 0.0095 ( -48.38%) Stddev 21 0.0065 ( 0.00%) 0.0097 ( -49.28%) Stddev 30 0.0185 ( 0.00%) 0.0295 ( -59.54%) Stddev 48 0.0559 ( 0.00%) 0.0168 ( 69.92%) Stddev 79 0.1559 ( 0.00%) 0.0278 ( 82.17%) Stddev 110 1.1728 ( 0.00%) 0.0532 ( 95.47%) Stddev 141 0.7867 ( 0.00%) 0.0968 ( 87.69%) Stddev 172 1.0255 ( 0.00%) 0.0420 ( 95.91%) Stddev 203 0.8106 ( 0.00%) 0.1384 ( 82.92%) Stddev 234 1.1949 ( 0.00%) 0.1328 ( 88.89%) Stddev 265 0.9231 ( 0.00%) 0.0820 ( 91.11%) Stddev 296 1.0456 ( 0.00%) 0.1327 ( 87.31%) Again, higher thread counts benefit and the standard deviation shows that results are also a lot more stable when the idle time is aged. The patch potentially matters when a socket was multiple LLCs as the maximum search depth is lower. However, some of the test results were suspiciously good (e.g. specjbb2005 gaining 50% on a Zen1 machine) and other results were not dramatically different to other mcahines. Given the nature of the patch, Peter's full series is not being forward ported as each part should stand on its own. Preferably they would be merged at different times to reduce the risk of false bisections. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210615111611.GH30378@techsingularity.net Signed-off-by: Hao Jia --- kernel/sched/core.c | 5 +++++ kernel/sched/fair.c | 25 +++++++++++++++++++++---- kernel/sched/sched.h | 3 +++ 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 99770ff45c773..332da08bc792a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2320,6 +2320,9 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, if (rq->avg_idle > max) rq->avg_idle = max; + rq->wake_stamp = jiffies; + rq->wake_avg_idle = rq->avg_idle / 2; + rq->idle_stamp = 0; } #endif @@ -6810,6 +6813,8 @@ void __init sched_init(void) rq->online = 0; rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; + rq->wake_stamp = jiffies; + rq->wake_avg_idle = rq->avg_idle; rq->max_idle_balance_cost = sysctl_sched_migration_cost; INIT_LIST_HEAD(&rq->cfs_tasks); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 51021667fce8c..ff7222bdecc3a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6037,9 +6037,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); int i, cpu, idle_cpu = -1, nr = INT_MAX; + struct rq *this_rq = this_rq(); int this = smp_processor_id(); struct sched_domain *this_sd; - u64 time; + u64 time = 0; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); if (!this_sd) @@ -6049,12 +6050,21 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool if (sched_feat(SIS_PROP) && !has_idle_core) { u64 avg_cost, avg_idle, span_avg; + unsigned long now = jiffies; /* - * Due to large variance we need a large fuzz factor; - * hackbench in particularly is sensitive here. + * If we're busy, the assumption that the last idle period + * predicts the future is flawed; age away the remaining + * predicted idle time. */ - avg_idle = this_rq()->avg_idle / 512; + if (unlikely(this_rq->wake_stamp < now)) { + while (this_rq->wake_stamp < now && this_rq->wake_avg_idle) { + this_rq->wake_stamp++; + this_rq->wake_avg_idle >>= 1; + } + } + + avg_idle = this_rq->wake_avg_idle; avg_cost = this_sd->avg_scan_cost + 1; span_avg = sd->span_weight * avg_idle; @@ -6086,6 +6096,13 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool if (sched_feat(SIS_PROP) && !has_idle_core) { time = cpu_clock(this) - time; + + /* + * Account for the scan cost of wakeups against the average + * idle time. + */ + this_rq->wake_avg_idle -= min(this_rq->wake_avg_idle, time); + update_avg(&this_sd->avg_scan_cost, time); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d3f8e6b4d243a..a52232dd13f2c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -985,6 +985,9 @@ struct rq { u64 idle_stamp; u64 avg_idle; + unsigned long wake_stamp; + u64 wake_avg_idle; + /* This is used to determine avg_idle's max value */ u64 max_idle_balance_cost; #endif From 63703040b124b02a530f93e424186237de7cf341 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Aug 2021 12:58:56 +0100 Subject: [PATCH 0328/5344] sched/fair: Use prev instead of new target as recent_used_cpu commit 89aafd67f28c9e3b725aa30b44b7f61ad3e348ce upstream. After select_idle_sibling, p->recent_used_cpu is set to the new target. However on the next wakeup, prev will be the same as recent_used_cpu unless the load balancer has moved the task since the last wakeup. It still works, but is less efficient than it could be. This patch preserves recent_used_cpu for longer. The impact on SIS efficiency is tiny so the SIS statistic patches were used to track the hit rate for using recent_used_cpu. With perf bench pipe on a 2-socket Cascadelake machine, the hit rate went from 57.14% to 85.32%. For more intensive wakeup loads like hackbench, the hit rate is almost negligible but rose from 0.21% to 6.64%. For scaling loads like tbench, the hit rate goes from almost 0% to 25.42% overall. Broadly speaking, on tbench, the success rate is much higher for lower thread counts and drops to almost 0 as the workload scales to towards saturation. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210804115857.6253-2-mgorman@techsingularity.net Signed-off-by: Hao Jia --- kernel/sched/fair.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ff7222bdecc3a..68b061b260246 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6130,6 +6130,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) /* Check a recently used CPU as a potential idle candidate: */ recent_used_cpu = p->recent_used_cpu; + p->recent_used_cpu = prev; if (recent_used_cpu != prev && recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && @@ -6623,9 +6624,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f /* Fast path */ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); - - if (want_affine) - current->recent_used_cpu = cpu; } rcu_read_unlock(); From 464c8005548271c79b9d88df11eba182085d0409 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Aug 2021 12:58:57 +0100 Subject: [PATCH 0329/5344] sched/fair: Avoid a second scan of target in select_idle_cpu commit 56498cfb045d7147cdcba33795d19429afcd1d00 upstream. When select_idle_cpu starts scanning for an idle CPU, it starts with a target CPU that has already been checked by select_idle_sibling. This patch starts with the next CPU instead. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210804115857.6253-3-mgorman@techsingularity.net Signed-off-by: Hao Jia --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 68b061b260246..fdf45fa8d4471 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6076,7 +6076,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool time = cpu_clock(this); } - for_each_cpu_wrap(cpu, cpus, target) { + for_each_cpu_wrap(cpu, cpus, target + 1) { if (has_idle_core) { i = select_idle_core(p, cpu, cpus, &idle_cpu); if ((unsigned int)i < nr_cpumask_bits) From 8d2dc74d47c2d1f9f153585aadb5dd16e60ac3bd Mon Sep 17 00:00:00 2001 From: Aubrey Li Date: Wed, 24 Feb 2021 16:15:49 +0800 Subject: [PATCH 0330/5344] sched/fair: Reduce long-tail newly idle balance cost commit acb4decc1e900468d51b33c5f1ee445278e716a7 upstream. A long-tail load balance cost is observed on the newly idle path, this is caused by a race window between the first nr_running check of the busiest runqueue and its nr_running recheck in detach_tasks. Before the busiest runqueue is locked, the tasks on the busiest runqueue could be pulled by other CPUs and nr_running of the busiest runqueu becomes 1 or even 0 if the running task becomes idle, this causes detach_tasks breaks with LBF_ALL_PINNED flag set, and triggers load_balance redo at the same sched_domain level. In order to find the new busiest sched_group and CPU, load balance will recompute and update the various load statistics, which eventually leads to the long-tail load balance cost. This patch clears LBF_ALL_PINNED flag for this race condition, and hence reduces the long-tail cost of newly idle balance. Signed-off-by: Aubrey Li Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/1614154549-116078-1-git-send-email-aubrey.li@intel.com Signed-off-by: Hao Jia --- kernel/sched/fair.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fdf45fa8d4471..61b39a84d0375 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7512,6 +7512,15 @@ static int detach_tasks(struct lb_env *env) lockdep_assert_held(&env->src_rq->lock); + /* + * Source run queue has been emptied by another CPU, clear + * LBF_ALL_PINNED flag as we will not test any task. + */ + if (env->src_rq->nr_running <= 1) { + env->flags &= ~LBF_ALL_PINNED; + return 0; + } + if (env->imbalance <= 0) return 0; From 3de78105ff1e714d66c04b38f2c9181627f42b79 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 28 Sep 2021 12:35:44 +0200 Subject: [PATCH 0331/5344] sched/fair: Removed useless update of p->recent_used_cpu commit a7ba894821b6ade7bb420455f87020b2838d6180 upstream. Since commit 89aafd67f28c ("sched/fair: Use prev instead of new target as recent_used_cpu"), p->recent_used_cpu is unconditionnaly set with prev. Fixes: 89aafd67f28c ("sched/fair: Use prev instead of new target as recent_used_cpu") Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Link: https://lkml.kernel.org/r/20210928103544.27489-1-vincent.guittot@linaro.org Signed-off-by: Hao Jia --- kernel/sched/fair.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 61b39a84d0375..d39ed2e45023f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6136,11 +6136,6 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) cpus_share_cache(recent_used_cpu, target) && (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) { - /* - * Replace recent_used_cpu with prev as it is a potential - * candidate for the next wake: - */ - p->recent_used_cpu = prev; return recent_used_cpu; } From 18c8fe946288a26681edb26b26220f2d0358e140 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 13 Apr 2020 12:27:55 -0400 Subject: [PATCH 0332/5344] blk-iocost: switch to fixed non-auto-decaying use_delay commit 54c52e10dc9b939084a7e6e3d32ce8fd8dee7898 upstream. Note: There are a few conflicts need to be resolved when origin patch applied. The use_delay mechanism was introduced by blk-iolatency to hold memory allocators accountable for the reclaim and other shared IOs they cause. The duration of the delay is dynamically balanced between iolatency increasing the value on each target miss and it auto-decaying as time passes and threads get delayed on it. While this works well for iolatency, iocost's control model isn't compatible with it. There is no repeated "violation" events which can be balanced against auto-decaying. iocost instead knows how much a given cgroup is over budget and wants to prevent that cgroup from issuing IOs while over budget. Until now, iocost has been adding the cost of force-issued IOs. However, this doesn't reflect the amount which is already over budget and is simply not enough to counter the auto-decaying allowing anon-memory leaking low priority cgroup to go over its alloted share of IOs. As auto-decaying doesn't make much sense for iocost, this patch introduces a different mode of operation for use_delay - when blkcg_set_delay() are used insted of blkcg_add/use_delay(), the delay duration is not auto-decayed until it is explicitly cleared with blkcg_clear_delay(). iocost is updated to keep the delay duration synchronized to the budget overage amount. With this change, iocost can effectively police cgroups which generate significant amount of force-issued IOs. Signed-off-by: Tejun Heo Cc: Josef Bacik Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- block/blk-cgroup.c | 6 ++++++ block/blk-iocost.c | 25 +++++++++------------- include/linux/blk-cgroup.h | 43 +++++++++++++++++++++++++++++--------- 3 files changed, 49 insertions(+), 25 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 381047f68c720..bf0438f41a0e9 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1639,6 +1639,10 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now) { u64 old = atomic64_read(&blkg->delay_start); + /* negative use_delay means no scaling, see blkcg_set_delay() */ + if (atomic_read(&blkg->use_delay) < 0) + return; + /* * We only want to scale down every second. The idea here is that we * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain @@ -1826,6 +1830,8 @@ void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) */ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta) { + if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) + return; blkcg_scale_delay(blkg, now); atomic64_add(delta, &blkg->delay_nsec); } diff --git a/block/blk-iocost.c b/block/blk-iocost.c index ef287c33d6d97..d11e8535448e6 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1209,14 +1209,14 @@ static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } -static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost) +static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct blkcg_gq *blkg = iocg_to_blkg(iocg); u64 vtime = atomic64_read(&iocg->vtime); u64 vmargin = ioc->margin_us * now->vrate; u64 margin_ns = ioc->margin_us * NSEC_PER_USEC; - u64 expires, oexpires; + u64 delta_ns, expires, oexpires; u32 hw_inuse; lockdep_assert_held(&iocg->waitq.lock); @@ -1239,15 +1239,10 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost) return false; /* use delay */ - if (cost) { - u64 cost_ns = DIV64_U64_ROUND_UP(cost * NSEC_PER_USEC, - now->vrate); - blkcg_add_delay(blkg, now->now_ns, cost_ns); - } - blkcg_use_delay(blkg); - - expires = now->now_ns + DIV64_U64_ROUND_UP(vtime - now->vnow, - now->vrate) * NSEC_PER_USEC; + delta_ns = DIV64_U64_ROUND_UP(vtime - now->vnow, + now->vrate) * NSEC_PER_USEC; + blkcg_set_delay(blkg, delta_ns); + expires = now->now_ns + delta_ns; /* if already active and close enough, don't bother */ oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer)); @@ -1268,7 +1263,7 @@ static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer) spin_lock_irqsave(&iocg->waitq.lock, flags); ioc_now(iocg->ioc, &now); - iocg_kick_delay(iocg, &now, 0); + iocg_kick_delay(iocg, &now); spin_unlock_irqrestore(&iocg->waitq.lock, flags); return HRTIMER_NORESTART; @@ -1386,7 +1381,7 @@ static void ioc_timer_fn(struct timer_list *timer) if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) { /* might be oversleeping vtime / hweight changes, kick */ iocg_kick_waitq(iocg, &now); - iocg_kick_delay(iocg, &now, 0); + iocg_kick_delay(iocg, &now); } else if (iocg_is_idle(iocg)) { /* no waiter and idle, deactivate */ iocg->last_inuse = iocg->inuse; @@ -1785,7 +1780,7 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) */ if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) { iocg->abs_vdebt += abs_cost; - if (iocg_kick_delay(iocg, &now, cost)) + if (iocg_kick_delay(iocg, &now)) blkcg_schedule_throttle(rqos->q, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); spin_unlock_irq(&iocg->waitq.lock); @@ -1873,7 +1868,7 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, spin_lock_irqsave(&iocg->waitq.lock, flags); if (likely(!list_empty(&iocg->active_list))) { iocg->abs_vdebt += abs_cost; - iocg_kick_delay(iocg, &now, cost); + iocg_kick_delay(iocg, &now); } else { iocg_commit_bio(iocg, bio, cost); } diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 868b8a05c7e35..2121adf3cdb29 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -630,6 +630,8 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, static inline void blkcg_use_delay(struct blkcg_gq *blkg) { + if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) + return; if (atomic_add_return(1, &blkg->use_delay) == 1) atomic_inc(&blkg->blkcg->css.cgroup->congestion_count); } @@ -638,6 +640,8 @@ static inline int blkcg_unuse_delay(struct blkcg_gq *blkg) { int old = atomic_read(&blkg->use_delay); + if (WARN_ON_ONCE(old < 0)) + return 0; if (old == 0) return 0; @@ -662,20 +666,39 @@ static inline int blkcg_unuse_delay(struct blkcg_gq *blkg) return 1; } +/** + * blkcg_set_delay - Enable allocator delay mechanism with the specified delay amount + * @blkg: target blkg + * @delay: delay duration in nsecs + * + * When enabled with this function, the delay is not decayed and must be + * explicitly cleared with blkcg_clear_delay(). Must not be mixed with + * blkcg_[un]use_delay() and blkcg_add_delay() usages. + */ +static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay) +{ + int old = atomic_read(&blkg->use_delay); + + /* We only want 1 person setting the congestion count for this blkg. */ + if (!old && atomic_cmpxchg(&blkg->use_delay, old, -1) == old) + atomic_inc(&blkg->blkcg->css.cgroup->congestion_count); + + atomic64_set(&blkg->delay_nsec, delay); +} + +/** + * blkcg_clear_delay - Disable allocator delay mechanism + * @blkg: target blkg + * + * Disable use_delay mechanism. See blkcg_set_delay(). + */ static inline void blkcg_clear_delay(struct blkcg_gq *blkg) { int old = atomic_read(&blkg->use_delay); - if (!old) - return; + /* We only want 1 person clearing the congestion count for this blkg. */ - while (old) { - int cur = atomic_cmpxchg(&blkg->use_delay, old, 0); - if (cur == old) { - atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); - break; - } - old = cur; - } + if (old && atomic_cmpxchg(&blkg->use_delay, old, 0) == old) + atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); } void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta); From a4979a2f2d4a8697fd58c443d0539f10041e30a5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 13 Apr 2020 12:27:56 -0400 Subject: [PATCH 0333/5344] blk-iocost: account for IO size when testing latencies commit cd006509b0a93cb7ee9d9fd50ae274098997a460 upstream. On each IO completion, iocost decides whether the IO met or missed its latency target. Currently, the targets are fixed numbers per IO type. While this can be good enough for loose latency targets way higher than typical completion latencies, the effect of IO size makes it difficult to tighten the latency target - a target adequate for 4k IOs might be too tight for 512k IOs and vice-versa. iocost already has all the necessary information to account for different IO sizes when testing whether the latency target is met as iocost can calculate the size vtime cost of a given IO. This patch updates the completion path to calculate the size vtime cost of the IO, deduct the nsec equivalent from the observed latency and use the adjusted value to decide whether the target is met. This makes latency targets independent from IO size and enables determining adequate latency targets with fixed size fio runs. Signed-off-by: Tejun Heo Cc: Andy Newell Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- block/Kconfig | 1 + block/blk-iocost.c | 33 +++++++++++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/block/Kconfig b/block/Kconfig index c23094a14a2be..86ff6ecd7c275 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -142,6 +142,7 @@ config BLK_CGROUP_IOLATENCY config BLK_CGROUP_IOCOST bool "Enable support for cost model based cgroup IO controller" depends on BLK_CGROUP=y + select BLK_RQ_IO_DATA_LEN select BLK_RQ_ALLOC_TIME ---help--- Enabling this option enables the .weight interface for cost diff --git a/block/blk-iocost.c b/block/blk-iocost.c index d11e8535448e6..cc61b896637b3 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -263,6 +263,7 @@ enum { VTIME_PER_SEC_SHIFT = 37, VTIME_PER_SEC = 1LLU << VTIME_PER_SEC_SHIFT, VTIME_PER_USEC = VTIME_PER_SEC / USEC_PER_SEC, + VTIME_PER_NSEC = VTIME_PER_SEC / NSEC_PER_SEC, /* bound vrate adjustments within two orders of magnitude */ VRATE_MIN_PPM = 10000, /* 1% */ @@ -1696,6 +1697,31 @@ static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge) return cost; } +static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc, + u64 *costp) +{ + unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT; + + switch (req_op(rq)) { + case REQ_OP_READ: + *costp = pages * ioc->params.lcoefs[LCOEF_RPAGE]; + break; + case REQ_OP_WRITE: + *costp = pages * ioc->params.lcoefs[LCOEF_WPAGE]; + break; + default: + *costp = 0; + } +} + +static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc) +{ + u64 cost; + + calc_size_vtime_cost_builtin(rq, ioc, &cost); + return cost; +} + static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) { struct blkcg_gq *blkg = bio->bi_blkg; @@ -1886,7 +1912,7 @@ static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio) static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq) { struct ioc *ioc = rqos_to_ioc(rqos); - u64 on_q_ns, rq_wait_ns; + u64 on_q_ns, rq_wait_ns, size_nsec; int pidx, rw; if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns) @@ -1907,8 +1933,10 @@ static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq) on_q_ns = ktime_get_ns() - rq->alloc_time_ns; rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns; + size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC); - if (on_q_ns <= ioc->params.qos[pidx] * NSEC_PER_USEC) + if (on_q_ns <= size_nsec || + on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC) this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_met); else this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_missed); @@ -2316,6 +2344,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, spin_lock_irq(&ioc->lock); if (enable) { + blk_stat_enable_accounting(ioc->rqos.q); blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q); ioc->enabled = true; } else { From b1ba97d141a380bbdffd6a0444c4332fc34c25ef Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 13 Apr 2020 12:27:57 -0400 Subject: [PATCH 0334/5344] iocost_monitor: exit successfully if interval is zero commit f4fe3ea636385a51f1dfbb27c387a04b12b919e9 upstream. This is to help external tools to decide whether iocost_monitor has all its requirements met or not based on the exit status of an -i0 run. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- tools/cgroup/iocost_monitor.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/cgroup/iocost_monitor.py b/tools/cgroup/iocost_monitor.py index 103605f5be8c0..3c21de88af9e5 100644 --- a/tools/cgroup/iocost_monitor.py +++ b/tools/cgroup/iocost_monitor.py @@ -28,7 +28,8 @@ parser.add_argument('--cgroup', action='append', metavar='REGEX', help='Regex for target cgroups, ') parser.add_argument('--interval', '-i', metavar='SECONDS', type=float, default=1, - help='Monitoring interval in seconds') + help='Monitoring interval in seconds (0 exits immediately ' + 'after checking requirements)') parser.add_argument('--json', action='store_true', help='Output in json') args = parser.parse_args() @@ -248,6 +249,9 @@ def table_row_str(self, path): if ioc is None: err(f'Could not find ioc for {devname}'); +if interval == 0: + sys.exit(0) + # Keep printing while True: now = time.time() From 48f2d6798715162f0fc43552b27dae4cce887107 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 19 Jun 2020 18:08:30 -0500 Subject: [PATCH 0335/5344] blk-iocost: Use struct_size() in kzalloc_node() commit f61d6e259c7ebb9a134dee5cd0b32c192d726984 upstream. Make use of the struct_size() helper instead of an open-coded version in order to avoid any potential type mistakes. This code was detected with the help of Coccinelle and, audited and fixed manually. Signed-off-by: Gustavo A. R. Silva Addresses-KSPP-ID: https://github.com/KSPP/linux/issues/83 Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- block/blk-iocost.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index cc61b896637b3..b59a62545905d 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2048,8 +2048,7 @@ static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q, int levels = blkcg->css.cgroup->level + 1; struct ioc_gq *iocg; - iocg = kzalloc_node(sizeof(*iocg) + levels * sizeof(iocg->ancestors[0]), - gfp, q->node); + iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, q->node); if (!iocg) return NULL; From 87755236870c79a762c834970a1a9fca89404069 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Thu, 30 Jul 2020 20:31:04 +0800 Subject: [PATCH 0336/5344] iocost_monitor: start from the oldest usage index commit 1bf6ece5731db14f1556637494d2048a8bb15a41 upstream. iocg usage_idx is the latest usage index, we should start from the oldest usage index to show the consecutive NR_USAGE_SLOTS usages. Signed-off-by: Chengming Zhou Acked-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- tools/cgroup/iocost_monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cgroup/iocost_monitor.py b/tools/cgroup/iocost_monitor.py index 3c21de88af9e5..f4699f9b46ba7 100644 --- a/tools/cgroup/iocost_monitor.py +++ b/tools/cgroup/iocost_monitor.py @@ -173,7 +173,7 @@ def __init__(self, iocg): self.usages = [] self.usage = 0 for i in range(NR_USAGE_SLOTS): - usage = iocg.usages[(usage_idx + i) % NR_USAGE_SLOTS].value_() + usage = iocg.usages[(usage_idx + 1 + i) % NR_USAGE_SLOTS].value_() upct = usage * 100 / HWEIGHT_WHOLE self.usages.append(upct) self.usage = max(self.usage, upct) From c1423d452ed09d96d3fdde110b6adaa18c4970c5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:32 -0400 Subject: [PATCH 0337/5344] blk-stat: make q->stats->lock irqsafe commit e11d80a849e010f78243bb6f6af7dccef3a71a90 upstream. blk-iocost calls blk_stat_enable_accounting() while holding an irqsafe lock which triggers a lockdep splat because q->stats->lock isn't irqsafe. Let's make it irqsafe. Signed-off-by: Tejun Heo Fixes: cd006509b0a9 ("blk-iocost: account for IO size when testing latencies") Cc: stable@vger.kernel.org # v5.8+ Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- block/blk-stat.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/block/blk-stat.c b/block/blk-stat.c index 940f15d600f8a..a381a52149098 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -136,6 +136,7 @@ void blk_stat_add_callback(struct request_queue *q, struct blk_stat_callback *cb) { unsigned int bucket; + unsigned long flags; int cpu; for_each_possible_cpu(cpu) { @@ -146,20 +147,22 @@ void blk_stat_add_callback(struct request_queue *q, blk_rq_stat_init(&cpu_stat[bucket]); } - spin_lock(&q->stats->lock); + spin_lock_irqsave(&q->stats->lock, flags); list_add_tail_rcu(&cb->list, &q->stats->callbacks); blk_queue_flag_set(QUEUE_FLAG_STATS, q); - spin_unlock(&q->stats->lock); + spin_unlock_irqrestore(&q->stats->lock, flags); } void blk_stat_remove_callback(struct request_queue *q, struct blk_stat_callback *cb) { - spin_lock(&q->stats->lock); + unsigned long flags; + + spin_lock_irqsave(&q->stats->lock, flags); list_del_rcu(&cb->list); if (list_empty(&q->stats->callbacks) && !q->stats->enable_accounting) blk_queue_flag_clear(QUEUE_FLAG_STATS, q); - spin_unlock(&q->stats->lock); + spin_unlock_irqrestore(&q->stats->lock, flags); del_timer_sync(&cb->timer); } @@ -182,10 +185,12 @@ void blk_stat_free_callback(struct blk_stat_callback *cb) void blk_stat_enable_accounting(struct request_queue *q) { - spin_lock(&q->stats->lock); + unsigned long flags; + + spin_lock_irqsave(&q->stats->lock, flags); q->stats->enable_accounting = true; blk_queue_flag_set(QUEUE_FLAG_STATS, q); - spin_unlock(&q->stats->lock); + spin_unlock_irqrestore(&q->stats->lock, flags); } EXPORT_SYMBOL_GPL(blk_stat_enable_accounting); From 12ffd58def139fb607fe9e2a18ad73ab6ed19b79 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:33 -0400 Subject: [PATCH 0338/5344] blk-iocost: use local[64]_t for percpu stat commit 5e124f74325d8181b3275a704e87e44246c484f6 upstream. blk-iocost has been reading percpu stat counters from remote cpus which on some archs can lead to torn reads in really rare occassions. Use local[64]_t for those counters. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- block/blk-iocost.c | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index b59a62545905d..10a6235723a90 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -182,6 +182,8 @@ #include #include #include +#include +#include #include "blk-rq-qos.h" #include "blk-stat.h" #include "blk-wbt.h" @@ -376,8 +378,8 @@ struct ioc_params { }; struct ioc_missed { - u32 nr_met; - u32 nr_missed; + local_t nr_met; + local_t nr_missed; u32 last_met; u32 last_missed; }; @@ -385,7 +387,7 @@ struct ioc_missed { struct ioc_pcpu_stat { struct ioc_missed missed[2]; - u64 rq_wait_ns; + local64_t rq_wait_ns; u64 last_rq_wait_ns; }; @@ -1282,8 +1284,8 @@ static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p u64 this_rq_wait_ns; for (rw = READ; rw <= WRITE; rw++) { - u32 this_met = READ_ONCE(stat->missed[rw].nr_met); - u32 this_missed = READ_ONCE(stat->missed[rw].nr_missed); + u32 this_met = local_read(&stat->missed[rw].nr_met); + u32 this_missed = local_read(&stat->missed[rw].nr_missed); nr_met[rw] += this_met - stat->missed[rw].last_met; nr_missed[rw] += this_missed - stat->missed[rw].last_missed; @@ -1291,7 +1293,7 @@ static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p stat->missed[rw].last_missed = this_missed; } - this_rq_wait_ns = READ_ONCE(stat->rq_wait_ns); + this_rq_wait_ns = local64_read(&stat->rq_wait_ns); rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns; stat->last_rq_wait_ns = this_rq_wait_ns; } @@ -1912,6 +1914,7 @@ static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio) static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq) { struct ioc *ioc = rqos_to_ioc(rqos); + struct ioc_pcpu_stat *ccs; u64 on_q_ns, rq_wait_ns, size_nsec; int pidx, rw; @@ -1935,13 +1938,17 @@ static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq) rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns; size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC); + ccs = get_cpu_ptr(ioc->pcpu_stat); + if (on_q_ns <= size_nsec || on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC) - this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_met); + local_inc(&ccs->missed[rw].nr_met); else - this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_missed); + local_inc(&ccs->missed[rw].nr_missed); + + local64_add(rq_wait_ns, &ccs->rq_wait_ns); - this_cpu_add(ioc->pcpu_stat->rq_wait_ns, rq_wait_ns); + put_cpu_ptr(ccs); } static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos) @@ -1981,7 +1988,7 @@ static int blk_iocost_init(struct request_queue *q) { struct ioc *ioc; struct rq_qos *rqos; - int ret; + int i, cpu, ret; ioc = kzalloc(sizeof(*ioc), GFP_KERNEL); if (!ioc) @@ -1993,6 +2000,16 @@ static int blk_iocost_init(struct request_queue *q) return -ENOMEM; } + for_each_possible_cpu(cpu) { + struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu); + + for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) { + local_set(&ccs->missed[i].nr_met, 0); + local_set(&ccs->missed[i].nr_missed, 0); + } + local64_set(&ccs->rq_wait_ns, 0); + } + rqos = &ioc->rqos; rqos->id = RQ_QOS_COST; rqos->ops = &ioc_rqos_ops; From dfd57f00e0fdcc4c55be784c7726530af7622252 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:34 -0400 Subject: [PATCH 0339/5344] blk-iocost: rename propagate_active_weights() to propagate_weights() commit 00410f1b09fe7c9a12bde07f0bb4b978a3367f3a upstream. It already propagates two weights - active and inuse - and there will be another soon. Let's drop the confusing misnomers. Rename [__]propagate_active_weights() to [__]propagate_weights() and commit_active_weights() to commit_weights(). This is pure rename. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- block/blk-iocost.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 10a6235723a90..1474d8b5b43f3 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -482,7 +482,7 @@ struct ioc_gq { atomic64_t active_period; struct list_head active_list; - /* see __propagate_active_weight() and current_hweight() for details */ + /* see __propagate_weights() and current_hweight() for details */ u64 child_active_sum; u64 child_inuse_sum; int hweight_gen; @@ -894,7 +894,7 @@ static void ioc_start_period(struct ioc *ioc, struct ioc_now *now) * Update @iocg's `active` and `inuse` to @active and @inuse, update level * weight sums and propagate upwards accordingly. */ -static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse) +static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse) { struct ioc *ioc = iocg->ioc; int lvl; @@ -939,7 +939,7 @@ static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse ioc->weights_updated = true; } -static void commit_active_weights(struct ioc *ioc) +static void commit_weights(struct ioc *ioc) { lockdep_assert_held(&ioc->lock); @@ -951,10 +951,10 @@ static void commit_active_weights(struct ioc *ioc) } } -static void propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse) +static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse) { - __propagate_active_weight(iocg, active, inuse); - commit_active_weights(iocg->ioc); + __propagate_weights(iocg, active, inuse); + commit_weights(iocg->ioc); } static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep) @@ -970,9 +970,9 @@ static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep goto out; /* - * Paired with wmb in commit_active_weights(). If we saw the - * updated hweight_gen, all the weight updates from - * __propagate_active_weight() are visible too. + * Paired with wmb in commit_weights(). If we saw the updated + * hweight_gen, all the weight updates from __propagate_weights() are + * visible too. * * We can race with weight updates during calculation and get it * wrong. However, hweight_gen would have changed and a future @@ -1022,7 +1022,7 @@ static void weight_updated(struct ioc_gq *iocg) weight = iocg->cfg_weight ?: iocc->dfl_weight; if (weight != iocg->weight && iocg->active) - propagate_active_weight(iocg, weight, + propagate_weights(iocg, weight, DIV64_U64_ROUND_UP(iocg->inuse * weight, iocg->weight)); iocg->weight = weight; } @@ -1094,8 +1094,8 @@ static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) */ iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1; list_add(&iocg->active_list, &ioc->active_iocgs); - propagate_active_weight(iocg, iocg->weight, - iocg->last_inuse ?: iocg->weight); + propagate_weights(iocg, iocg->weight, + iocg->last_inuse ?: iocg->weight); TRACE_IOCG_PATH(iocg_activate, iocg, now, last_period, cur_period, vtime); @@ -1388,13 +1388,13 @@ static void ioc_timer_fn(struct timer_list *timer) } else if (iocg_is_idle(iocg)) { /* no waiter and idle, deactivate */ iocg->last_inuse = iocg->inuse; - __propagate_active_weight(iocg, 0, 0); + __propagate_weights(iocg, 0, 0); list_del_init(&iocg->active_list); } spin_unlock(&iocg->waitq.lock); } - commit_active_weights(ioc); + commit_weights(ioc); /* calc usages and see whether some weights need to be moved around */ list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { @@ -1487,8 +1487,8 @@ static void ioc_timer_fn(struct timer_list *timer) TRACE_IOCG_PATH(inuse_takeback, iocg, &now, iocg->inuse, new_inuse, hw_inuse, new_hwi); - __propagate_active_weight(iocg, iocg->weight, - new_inuse); + __propagate_weights(iocg, iocg->weight, + new_inuse); } } else { /* genuninely out of vtime */ @@ -1528,11 +1528,11 @@ static void ioc_timer_fn(struct timer_list *timer) TRACE_IOCG_PATH(inuse_giveaway, iocg, &now, iocg->inuse, new_inuse, hw_inuse, new_hwi); - __propagate_active_weight(iocg, iocg->weight, new_inuse); + __propagate_weights(iocg, iocg->weight, new_inuse); } } skip_surplus_transfers: - commit_active_weights(ioc); + commit_weights(ioc); /* * If q is getting clogged or we're missing too much, we're issuing @@ -1757,7 +1757,7 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) TRACE_IOCG_PATH(inuse_reset, iocg, &now, iocg->inuse, iocg->weight, hw_inuse, hw_active); spin_lock_irq(&ioc->lock); - propagate_active_weight(iocg, iocg->weight, iocg->weight); + propagate_weights(iocg, iocg->weight, iocg->weight); spin_unlock_irq(&ioc->lock); current_hweight(iocg, &hw_active, &hw_inuse); } @@ -2118,7 +2118,7 @@ static void ioc_pd_free(struct blkg_policy_data *pd) if (ioc) { spin_lock_irqsave(&ioc->lock, flags); if (!list_empty(&iocg->active_list)) { - propagate_active_weight(iocg, 0, 0); + propagate_weights(iocg, 0, 0); list_del_init(&iocg->active_list); } spin_unlock_irqrestore(&ioc->lock, flags); From 7499d1c58ca653606bcef81afa1f924f76396df2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:35 -0400 Subject: [PATCH 0340/5344] blk-iocost: clamp inuse and skip noops in __propagate_weights() commit db84a72af6be422abf2089a5896293590dda5066 upstream. __propagate_weights() currently expects the callers to clamp inuse within [1, active], which is needlessly fragile. The inuse adjustment logic is going to be revamped, in preparation, let's make __propagate_weights() clamp inuse on entry. Also, make it avoid weight updates altogether if neither active or inuse is changed. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- block/blk-iocost.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 1474d8b5b43f3..d960f48d0aa62 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -901,7 +901,10 @@ static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse) lockdep_assert_held(&ioc->lock); - inuse = min(active, inuse); + inuse = clamp_t(u32, inuse, 1, active); + + if (active == iocg->active && inuse == iocg->inuse) + return; for (lvl = iocg->level - 1; lvl >= 0; lvl--) { struct ioc_gq *parent = iocg->ancestors[lvl]; From d9309bb0c5b87400faeddec7b989b0f1146ec9e6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:36 -0400 Subject: [PATCH 0341/5344] blk-iocost: move iocg_kick_delay() above iocg_kick_waitq() commit 6ef20f787b0aa337dc901d42efa44854e09c87e1 upstream. We'll make iocg_kick_waitq() call iocg_kick_delay(). Reorder them in preparation. This is pure code reorganization. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- block/blk-iocost.c | 120 ++++++++++++++++++++++----------------------- 1 file changed, 60 insertions(+), 60 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index d960f48d0aa62..679d8d875ee94 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1119,6 +1119,66 @@ static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) return false; } +static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) +{ + struct ioc *ioc = iocg->ioc; + struct blkcg_gq *blkg = iocg_to_blkg(iocg); + u64 vtime = atomic64_read(&iocg->vtime); + u64 vmargin = ioc->margin_us * now->vrate; + u64 margin_ns = ioc->margin_us * NSEC_PER_USEC; + u64 delta_ns, expires, oexpires; + u32 hw_inuse; + + lockdep_assert_held(&iocg->waitq.lock); + + /* debt-adjust vtime */ + current_hweight(iocg, NULL, &hw_inuse); + vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse); + + /* + * Clear or maintain depending on the overage. Non-zero vdebt is what + * guarantees that @iocg is online and future iocg_kick_delay() will + * clear use_delay. Don't leave it on when there's no vdebt. + */ + if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) { + blkcg_clear_delay(blkg); + return false; + } + if (!atomic_read(&blkg->use_delay) && + time_before_eq64(vtime, now->vnow + vmargin)) + return false; + + /* use delay */ + delta_ns = DIV64_U64_ROUND_UP(vtime - now->vnow, + now->vrate) * NSEC_PER_USEC; + blkcg_set_delay(blkg, delta_ns); + expires = now->now_ns + delta_ns; + + /* if already active and close enough, don't bother */ + oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer)); + if (hrtimer_is_queued(&iocg->delay_timer) && + abs(oexpires - expires) <= margin_ns / 4) + return true; + + hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires), + margin_ns / 4, HRTIMER_MODE_ABS); + return true; +} + +static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer) +{ + struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer); + struct ioc_now now; + unsigned long flags; + + spin_lock_irqsave(&iocg->waitq.lock, flags); + ioc_now(iocg->ioc, &now); + iocg_kick_delay(iocg, &now); + spin_unlock_irqrestore(&iocg->waitq.lock, flags); + + return HRTIMER_NORESTART; +} + static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key) { @@ -1215,66 +1275,6 @@ static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } -static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) -{ - struct ioc *ioc = iocg->ioc; - struct blkcg_gq *blkg = iocg_to_blkg(iocg); - u64 vtime = atomic64_read(&iocg->vtime); - u64 vmargin = ioc->margin_us * now->vrate; - u64 margin_ns = ioc->margin_us * NSEC_PER_USEC; - u64 delta_ns, expires, oexpires; - u32 hw_inuse; - - lockdep_assert_held(&iocg->waitq.lock); - - /* debt-adjust vtime */ - current_hweight(iocg, NULL, &hw_inuse); - vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse); - - /* - * Clear or maintain depending on the overage. Non-zero vdebt is what - * guarantees that @iocg is online and future iocg_kick_delay() will - * clear use_delay. Don't leave it on when there's no vdebt. - */ - if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) { - blkcg_clear_delay(blkg); - return false; - } - if (!atomic_read(&blkg->use_delay) && - time_before_eq64(vtime, now->vnow + vmargin)) - return false; - - /* use delay */ - delta_ns = DIV64_U64_ROUND_UP(vtime - now->vnow, - now->vrate) * NSEC_PER_USEC; - blkcg_set_delay(blkg, delta_ns); - expires = now->now_ns + delta_ns; - - /* if already active and close enough, don't bother */ - oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer)); - if (hrtimer_is_queued(&iocg->delay_timer) && - abs(oexpires - expires) <= margin_ns / 4) - return true; - - hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires), - margin_ns / 4, HRTIMER_MODE_ABS); - return true; -} - -static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer) -{ - struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer); - struct ioc_now now; - unsigned long flags; - - spin_lock_irqsave(&iocg->waitq.lock, flags); - ioc_now(iocg->ioc, &now); - iocg_kick_delay(iocg, &now); - spin_unlock_irqrestore(&iocg->waitq.lock, flags); - - return HRTIMER_NORESTART; -} - static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p) { u32 nr_met[2] = { }; From be7c2124d5ecd1fc831674471e86685c0d222a28 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:37 -0400 Subject: [PATCH 0342/5344] blk-iocost: make iocg_kick_waitq() call iocg_kick_delay() after paying debt commit 7b84b49e381a8538626f200785dd918a402c62d7 upstream. iocg_kick_waitq() is the function which pays debt and iocg_kick_delay() updates the actual delay status accordingly. If iocg_kick_delay() is not called after iocg_kick_delay() updated debt, unnecessarily large delays can be applied temporarily. Let's make sure such conditions don't occur by making iocg_kick_waitq() always call iocg_kick_delay() after paying debt. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- block/blk-iocost.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 679d8d875ee94..830203f40a524 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1230,6 +1230,8 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now) atomic64_add(delta, &iocg->vtime); atomic64_add(delta, &iocg->done_vtime); iocg->abs_vdebt -= abs_delta; + + iocg_kick_delay(iocg, now); } /* @@ -1387,7 +1389,6 @@ static void ioc_timer_fn(struct timer_list *timer) if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) { /* might be oversleeping vtime / hweight changes, kick */ iocg_kick_waitq(iocg, &now); - iocg_kick_delay(iocg, &now); } else if (iocg_is_idle(iocg)) { /* no waiter and idle, deactivate */ iocg->last_inuse = iocg->inuse; From 58a945b9b9bfeb6ed14ce21aa9580f72a326f81d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:38 -0400 Subject: [PATCH 0343/5344] blk-iocost: s/HWEIGHT_WHOLE/WEIGHT_ONE/g commit fe20cdb516377e6ac6300254d5ca0dd7d8621ea8 upstream. We're gonna use HWEIGHT_WHOLE for regular weights too. Let's rename it to WEIGHT_ONE. Pure rename. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- block/blk-iocost.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 830203f40a524..5d9af506c5ba6 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -71,7 +71,7 @@ * gets 300/(100+300) or 75% share, and A0 and A1 equally splits the rest, * 12.5% each. The distribution mechanism only cares about these flattened * shares. They're called hweights (hierarchical weights) and always add - * upto 1 (HWEIGHT_WHOLE). + * upto 1 (WEIGHT_ONE). * * A given cgroup's vtime runs slower in inverse proportion to its hweight. * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5) @@ -249,7 +249,7 @@ enum { MIN_VALID_USAGES = 2, /* 1/64k is granular enough and can easily be handled w/ u32 */ - HWEIGHT_WHOLE = 1 << 16, + WEIGHT_ONE = 1 << 16, /* * As vtime is used to calculate the cost of each IO, it needs to @@ -288,8 +288,8 @@ enum { * donate the surplus. */ SURPLUS_SCALE_PCT = 125, /* * 125% */ - SURPLUS_SCALE_ABS = HWEIGHT_WHOLE / 50, /* + 2% */ - SURPLUS_MIN_ADJ_DELTA = HWEIGHT_WHOLE / 33, /* 3% */ + SURPLUS_SCALE_ABS = WEIGHT_ONE / 50, /* + 2% */ + SURPLUS_MIN_ADJ_DELTA = WEIGHT_ONE / 33, /* 3% */ /* switch iff the conditions are met for longer than this */ AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC, @@ -494,7 +494,7 @@ struct ioc_gq { struct hrtimer waitq_timer; struct hrtimer delay_timer; - /* usage is recorded as fractions of HWEIGHT_WHOLE */ + /* usage is recorded as fractions of WEIGHT_ONE */ int usage_idx; u32 usages[NR_USAGE_SLOTS]; @@ -661,7 +661,7 @@ static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg) */ static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse) { - return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse); + return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse); } /* @@ -669,7 +669,7 @@ static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse) */ static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse) { - return DIV64_U64_ROUND_UP(cost * hw_inuse, HWEIGHT_WHOLE); + return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE); } static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost) @@ -984,7 +984,7 @@ static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep */ smp_rmb(); - hwa = hwi = HWEIGHT_WHOLE; + hwa = hwi = WEIGHT_ONE; for (lvl = 0; lvl <= iocg->level - 1; lvl++) { struct ioc_gq *parent = iocg->ancestors[lvl]; struct ioc_gq *child = iocg->ancestors[lvl + 1]; @@ -2092,8 +2092,8 @@ static void ioc_pd_init(struct blkg_policy_data *pd) atomic64_set(&iocg->done_vtime, now.vnow); atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period)); INIT_LIST_HEAD(&iocg->active_list); - iocg->hweight_active = HWEIGHT_WHOLE; - iocg->hweight_inuse = HWEIGHT_WHOLE; + iocg->hweight_active = WEIGHT_ONE; + iocg->hweight_inuse = WEIGHT_ONE; init_waitqueue_head(&iocg->waitq); hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); From 560d083fab041891d4721a0255727ccaeae18525 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:39 -0400 Subject: [PATCH 0344/5344] blk-iocost: use WEIGHT_ONE based fixed point number for weights commit bd0adb91a68b2fbf384ace5287bb46f135e8d889 upstream. To improve weight donations, we want to able to scale inuse with a greater accuracy and down below 1. Let's make non-hierarchical weights to use WEIGHT_ONE based fixed point numbers too like hierarchical ones. This doesn't cause any behavior changes yet. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- block/blk-iocost.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 5d9af506c5ba6..c3c27fb8af0f6 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -988,8 +988,8 @@ static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep for (lvl = 0; lvl <= iocg->level - 1; lvl++) { struct ioc_gq *parent = iocg->ancestors[lvl]; struct ioc_gq *child = iocg->ancestors[lvl + 1]; - u32 active_sum = READ_ONCE(parent->child_active_sum); - u32 inuse_sum = READ_ONCE(parent->child_inuse_sum); + u64 active_sum = READ_ONCE(parent->child_active_sum); + u64 inuse_sum = READ_ONCE(parent->child_inuse_sum); u32 active = READ_ONCE(child->active); u32 inuse = READ_ONCE(child->inuse); @@ -997,11 +997,11 @@ static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep if (!active_sum || !inuse_sum) continue; - active_sum = max(active, active_sum); - hwa = hwa * active / active_sum; /* max 16bits * 10000 */ + active_sum = max_t(u64, active, active_sum); + hwa = div64_u64((u64)hwa * active, active_sum); - inuse_sum = max(inuse, inuse_sum); - hwi = hwi * inuse / inuse_sum; /* max 16bits * 10000 */ + inuse_sum = max_t(u64, inuse, inuse_sum); + hwi = div64_u64((u64)hwi * inuse, inuse_sum); } iocg->hweight_active = max_t(u32, hwa, 1); @@ -1026,7 +1026,8 @@ static void weight_updated(struct ioc_gq *iocg) weight = iocg->cfg_weight ?: iocc->dfl_weight; if (weight != iocg->weight && iocg->active) propagate_weights(iocg, weight, - DIV64_U64_ROUND_UP(iocg->inuse * weight, iocg->weight)); + DIV64_U64_ROUND_UP((u64)iocg->inuse * weight, + iocg->weight)); iocg->weight = weight; } @@ -2054,7 +2055,7 @@ static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp) if (!iocc) return NULL; - iocc->dfl_weight = CGROUP_WEIGHT_DFL; + iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE; return &iocc->cpd; } @@ -2140,7 +2141,7 @@ static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd, struct ioc_gq *iocg = pd_to_iocg(pd); if (dname && iocg->cfg_weight) - seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight); + seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE); return 0; } @@ -2150,7 +2151,7 @@ static int ioc_weight_show(struct seq_file *sf, void *v) struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg); - seq_printf(sf, "default %u\n", iocc->dfl_weight); + seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE); blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill, &blkcg_policy_iocost, seq_cft(sf)->private, false); return 0; @@ -2176,7 +2177,7 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, return -EINVAL; spin_lock(&blkcg->lock); - iocc->dfl_weight = v; + iocc->dfl_weight = v * WEIGHT_ONE; hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { struct ioc_gq *iocg = blkg_to_iocg(blkg); @@ -2207,7 +2208,7 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, } spin_lock(&iocg->ioc->lock); - iocg->cfg_weight = v; + iocg->cfg_weight = v * WEIGHT_ONE; weight_updated(iocg); spin_unlock(&iocg->ioc->lock); From 5e00a97ce2b264962bdde91a04c0beafc9f6780f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:40 -0400 Subject: [PATCH 0345/5344] blk-iocost: make ioc_now->now and ioc->period_at 64bit commit ce95570acf741ea306baddcb43aba0b59b920a21 upstream. Note: there are a few conflics need to resolved when the origin patch applied. They are in microseconds and wrap in around 1.2 hours with u32. While unlikely, confusions from wraparounds are still possible. We aren't saving anything meaningful by keeping these u32. Let's make them u64. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- block/blk-iocost.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index c3c27fb8af0f6..df18d6d55f5de 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -412,7 +412,7 @@ struct ioc { atomic64_t vtime_rate; seqcount_t period_seqcount; - u32 period_at; /* wallclock starttime */ + u64 period_at; /* wallclock starttime */ u64 period_at_vtime; /* vtime starttime */ atomic64_t cur_period; /* inc'd each period */ @@ -511,7 +511,7 @@ struct ioc_cgrp { struct ioc_now { u64 now_ns; - u32 now; + u64 now; u64 vnow; u64 vrate; }; From 320ecead3de27fe69cbdc827ee4089a83effed6c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:41 -0400 Subject: [PATCH 0346/5344] blk-iocost: streamline vtime margin and timer slack handling commit 7ca5b2e60bfa4aa5b8d52e9c3e2a757c581bec1d upstream. The margin handling was pretty inconsistent. * ioc->margin_us and ioc->inuse_margin_vtime were used as vtime margin thresholds. However, the two are in different units with the former requiring conversion to vtime on use. * iocg_kick_waitq() was using a quarter of WAITQ_TIMER_MARGIN_PCT of period_us as the timer slack - ~1.2%. While iocg_kick_delay() was using a quarter of ioc->margin_us - ~12.5%. There aren't strong reasons to use different values for the two. This patch cleans up margin and timer slack handling: * vtime margins are now recorded in ioc->margins.{min, max} on period duration changes and used consistently. * Timer slack is now 1% of period_us and recorded in ioc->timer_slack_ns and used consistently for iocg_kick_waitq() and iocg_kick_delay(). The only functional change is shortening of timer slack. No meaningful visible change is expected. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- block/blk-iocost.c | 67 ++++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index df18d6d55f5de..79b04bcc0e1d9 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -224,11 +224,11 @@ enum { * serves as its IO credit buffer. Surplus weight adjustment is * immediately canceled if the vtime margin runs below 10%. */ - MARGIN_PCT = 50, - INUSE_MARGIN_PCT = 10, + MARGIN_MIN_PCT = 10, + MARGIN_MAX_PCT = 50, - /* Have some play in waitq timer operations */ - WAITQ_TIMER_MARGIN_PCT = 5, + /* Have some play in timer operations */ + TIMER_SLACK_PCT = 1, /* * vtime can wrap well within a reasonable uptime when vrate is @@ -377,6 +377,11 @@ struct ioc_params { u32 too_slow_vrate_pct; }; +struct ioc_margins { + s64 min; + s64 max; +}; + struct ioc_missed { local_t nr_met; local_t nr_missed; @@ -398,8 +403,9 @@ struct ioc { bool enabled; struct ioc_params params; + struct ioc_margins margins; u32 period_us; - u32 margin_us; + u32 timer_slack_ns; u64 vrate_min; u64 vrate_max; @@ -418,7 +424,6 @@ struct ioc { atomic64_t cur_period; /* inc'd each period */ int busy_level; /* saturation history */ - u64 inuse_margin_vtime; bool weights_updated; atomic_t hweight_gen; /* for lazy hweights */ @@ -681,6 +686,16 @@ static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost) #define CREATE_TRACE_POINTS #include +static void ioc_refresh_margins(struct ioc *ioc) +{ + struct ioc_margins *margins = &ioc->margins; + u32 period_us = ioc->period_us; + u64 vrate = atomic64_read(&ioc->vtime_rate); + + margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate; + margins->max = (period_us * MARGIN_MAX_PCT / 100) * vrate; +} + /* latency Qos params changed, update period_us and all the dependent params */ static void ioc_refresh_period_us(struct ioc *ioc) { @@ -714,9 +729,10 @@ static void ioc_refresh_period_us(struct ioc *ioc) /* calculate dependent params */ ioc->period_us = period_us; - ioc->margin_us = period_us * MARGIN_PCT / 100; - ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP( - period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100); + ioc->timer_slack_ns = div64_u64( + (u64)period_us * NSEC_PER_USEC * TIMER_SLACK_PCT, + 100); + ioc_refresh_margins(ioc); } static int ioc_autop_idx(struct ioc *ioc) @@ -1035,7 +1051,7 @@ static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; u64 last_period, cur_period, max_period_delta; - u64 vtime, vmargin, vmin; + u64 vtime, vmin; int i; /* @@ -1081,8 +1097,7 @@ static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) */ max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us); vtime = atomic64_read(&iocg->vtime); - vmargin = ioc->margin_us * now->vrate; - vmin = now->vnow - vmargin; + vmin = now->vnow - ioc->margins.max; if (last_period + max_period_delta < cur_period || time_before64(vtime, vmin)) { @@ -1125,8 +1140,6 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) struct ioc *ioc = iocg->ioc; struct blkcg_gq *blkg = iocg_to_blkg(iocg); u64 vtime = atomic64_read(&iocg->vtime); - u64 vmargin = ioc->margin_us * now->vrate; - u64 margin_ns = ioc->margin_us * NSEC_PER_USEC; u64 delta_ns, expires, oexpires; u32 hw_inuse; @@ -1146,7 +1159,7 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) return false; } if (!atomic_read(&blkg->use_delay) && - time_before_eq64(vtime, now->vnow + vmargin)) + time_before_eq64(vtime, now->vnow + ioc->margins.max)) return false; /* use delay */ @@ -1158,11 +1171,11 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) /* if already active and close enough, don't bother */ oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer)); if (hrtimer_is_queued(&iocg->delay_timer) && - abs(oexpires - expires) <= margin_ns / 4) + abs(oexpires - expires) <= ioc->timer_slack_ns) return true; hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires), - margin_ns / 4, HRTIMER_MODE_ABS); + ioc->timer_slack_ns, HRTIMER_MODE_ABS); return true; } @@ -1210,8 +1223,6 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct iocg_wake_ctx ctx = { .iocg = iocg }; - u64 margin_ns = (u64)(ioc->period_us * - WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC; u64 vdebt, vshortage, expires, oexpires; s64 vbudget; u32 hw_inuse; @@ -1247,20 +1258,20 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now) if (WARN_ON_ONCE(ctx.vbudget >= 0)) return; - /* determine next wakeup, add a quarter margin to guarantee chunking */ + /* determine next wakeup, add a timer margin to guarantee chunking */ vshortage = -ctx.vbudget; expires = now->now_ns + DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC; - expires += margin_ns / 4; + expires += ioc->timer_slack_ns; /* if already active and close enough, don't bother */ oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer)); if (hrtimer_is_queued(&iocg->waitq_timer) && - abs(oexpires - expires) <= margin_ns / 4) + abs(oexpires - expires) <= ioc->timer_slack_ns) return; hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires), - margin_ns / 4, HRTIMER_MODE_ABS); + ioc->timer_slack_ns, HRTIMER_MODE_ABS); } static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer) @@ -1403,7 +1414,7 @@ static void ioc_timer_fn(struct timer_list *timer) /* calc usages and see whether some weights need to be moved around */ list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { - u64 vdone, vtime, vusage, vmargin, vmin; + u64 vdone, vtime, vusage, vmin; u32 hw_active, hw_inuse, usage; /* @@ -1454,8 +1465,7 @@ static void ioc_timer_fn(struct timer_list *timer) } /* see whether there's surplus vtime */ - vmargin = ioc->margin_us * now.vrate; - vmin = now.vnow - vmargin; + vmin = now.vnow - ioc->margins.max; iocg->has_surplus = false; @@ -1627,8 +1637,7 @@ static void ioc_timer_fn(struct timer_list *timer) nr_surpluses); atomic64_set(&ioc->vtime_rate, vrate); - ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP( - ioc->period_us * vrate * INUSE_MARGIN_PCT, 100); + ioc_refresh_margins(ioc); } else if (ioc->busy_level != prev_busy_level || nr_lagging) { trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate), missed_ppm, rq_wait_pct, nr_lagging, @@ -1758,7 +1767,7 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) current_hweight(iocg, &hw_active, &hw_inuse); if (hw_inuse < hw_active && - time_after_eq64(vtime + ioc->inuse_margin_vtime, now.vnow)) { + time_after_eq64(vtime + ioc->margins.min, now.vnow)) { TRACE_IOCG_PATH(inuse_reset, iocg, &now, iocg->inuse, iocg->weight, hw_inuse, hw_active); spin_lock_irq(&ioc->lock); From 6fb5901b4cab70be90206e20e84e48ea60f21a97 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:42 -0400 Subject: [PATCH 0347/5344] blk-iocost: grab ioc->lock for debt handling commit da437b95db83106319d337a5859e6dfb464b085a upstream. Currently, debt handling requires only iocg->waitq.lock. In the future, we want to adjust and propagate inuse changes depending on debt status. Let's grab ioc->lock in debt handling paths in preparation. * Because ioc->lock nests outside iocg->waitq.lock, the decision to grab ioc->lock needs to be made before entering the critical sections. * Add and use iocg_[un]lock() which handles the conditional double locking. * Add @pay_debt to iocg_kick_waitq() so that debt payment happens only when the caller grabbed both locks. This patch is prepatory and the comments contain references to future changes. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- block/blk-iocost.c | 92 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 73 insertions(+), 19 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 79b04bcc0e1d9..ec41468b25a76 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -683,6 +683,26 @@ static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost) atomic64_add(cost, &iocg->vtime); } +static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags) +{ + if (lock_ioc) { + spin_lock_irqsave(&iocg->ioc->lock, *flags); + spin_lock(&iocg->waitq.lock); + } else { + spin_lock_irqsave(&iocg->waitq.lock, *flags); + } +} + +static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags) +{ + if (unlock_ioc) { + spin_unlock(&iocg->waitq.lock); + spin_unlock_irqrestore(&iocg->ioc->lock, *flags); + } else { + spin_unlock_irqrestore(&iocg->waitq.lock, *flags); + } +} + #define CREATE_TRACE_POINTS #include @@ -1219,11 +1239,17 @@ static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode, return 0; } -static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now) +/* + * Calculate the accumulated budget, pay debt if @pay_debt and wake up waiters + * accordingly. When @pay_debt is %true, the caller must be holding ioc->lock in + * addition to iocg->waitq.lock. + */ +static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt, + struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct iocg_wake_ctx ctx = { .iocg = iocg }; - u64 vdebt, vshortage, expires, oexpires; + u64 vshortage, expires, oexpires; s64 vbudget; u32 hw_inuse; @@ -1233,25 +1259,39 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now) vbudget = now->vnow - atomic64_read(&iocg->vtime); /* pay off debt */ - vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse); - if (vdebt && vbudget > 0) { + if (pay_debt && iocg->abs_vdebt && vbudget > 0) { + u64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse); u64 delta = min_t(u64, vbudget, vdebt); u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse), iocg->abs_vdebt); + lockdep_assert_held(&ioc->lock); + atomic64_add(delta, &iocg->vtime); atomic64_add(delta, &iocg->done_vtime); iocg->abs_vdebt -= abs_delta; + vbudget -= vdebt; iocg_kick_delay(iocg, now); } + /* + * Debt can still be outstanding if we haven't paid all yet or the + * caller raced and called without @pay_debt. Shouldn't wake up waiters + * under debt. Make sure @vbudget reflects the outstanding amount and is + * not positive. + */ + if (iocg->abs_vdebt) { + s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse); + vbudget = min_t(s64, 0, vbudget - vdebt); + } + /* * Wake up the ones which are due and see how much vtime we'll need * for the next one. */ ctx.hw_inuse = hw_inuse; - ctx.vbudget = vbudget - vdebt; + ctx.vbudget = vbudget; __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx); if (!waitqueue_active(&iocg->waitq)) return; @@ -1277,14 +1317,15 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now) static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer) { struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer); + bool pay_debt = READ_ONCE(iocg->abs_vdebt); struct ioc_now now; unsigned long flags; ioc_now(iocg->ioc, &now); - spin_lock_irqsave(&iocg->waitq.lock, flags); - iocg_kick_waitq(iocg, &now); - spin_unlock_irqrestore(&iocg->waitq.lock, flags); + iocg_lock(iocg, pay_debt, &flags); + iocg_kick_waitq(iocg, pay_debt, &now); + iocg_unlock(iocg, pay_debt, &flags); return HRTIMER_NORESTART; } @@ -1400,7 +1441,7 @@ static void ioc_timer_fn(struct timer_list *timer) if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) { /* might be oversleeping vtime / hweight changes, kick */ - iocg_kick_waitq(iocg, &now); + iocg_kick_waitq(iocg, true, &now); } else if (iocg_is_idle(iocg)) { /* no waiter and idle, deactivate */ iocg->last_inuse = iocg->inuse; @@ -1747,6 +1788,8 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) struct iocg_wait wait; u32 hw_active, hw_inuse; u64 abs_cost, cost, vtime; + bool use_debt, ioc_locked; + unsigned long flags; /* bypass IOs if disabled or for root cgroup */ if (!ioc->enabled || !iocg->level) @@ -1790,15 +1833,26 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) } /* - * We activated above but w/o any synchronization. Deactivation is - * synchronized with waitq.lock and we won't get deactivated as long - * as we're waiting or has debt, so we're good if we're activated - * here. In the unlikely case that we aren't, just issue the IO. + * We're over budget. This can be handled in two ways. IOs which may + * cause priority inversions are punted to @ioc->aux_iocg and charged as + * debt. Otherwise, the issuer is blocked on @iocg->waitq. Debt handling + * requires @ioc->lock, waitq handling @iocg->waitq.lock. Determine + * whether debt handling is needed and acquire locks accordingly. */ - spin_lock_irq(&iocg->waitq.lock); + use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current); + ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt); + iocg_lock(iocg, ioc_locked, &flags); + + /* + * @iocg must stay activated for debt and waitq handling. Deactivation + * is synchronized against both ioc->lock and waitq.lock and we won't + * get deactivated as long as we're waiting or has debt, so we're good + * if we're activated here. In the unlikely cases that we aren't, just + * issue the IO. + */ if (unlikely(list_empty(&iocg->active_list))) { - spin_unlock_irq(&iocg->waitq.lock); + iocg_unlock(iocg, ioc_locked, &flags); iocg_commit_bio(iocg, bio, cost); return; } @@ -1820,12 +1874,12 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) * clear them and leave @iocg inactive w/ dangling use_delay heavily * penalizing the cgroup and its descendants. */ - if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) { + if (use_debt) { iocg->abs_vdebt += abs_cost; if (iocg_kick_delay(iocg, &now)) blkcg_schedule_throttle(rqos->q, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); - spin_unlock_irq(&iocg->waitq.lock); + iocg_unlock(iocg, ioc_locked, &flags); return; } @@ -1849,9 +1903,9 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) wait.committed = false; /* will be set true by waker */ __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait); - iocg_kick_waitq(iocg, &now); + iocg_kick_waitq(iocg, ioc_locked, &now); - spin_unlock_irq(&iocg->waitq.lock); + iocg_unlock(iocg, ioc_locked, &flags); while (true) { set_current_state(TASK_UNINTERRUPTIBLE); From b5e3353cfab4a162216bfd87ade7958ba4ff059b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:43 -0400 Subject: [PATCH 0348/5344] blk-iocost: add absolute usage stat commit 97eb19751f155903489f8d42ea658ace2c59faf7 upstream. Currently, iocost doesn't collect or expose any statistics punting off all monitoring duties to drgn based iocost_monitor.py. While it works for some scenarios, there are some usability and data availability challenges. For example, accurate per-cgroup usage information can't be tracked by vtime progression at all and the number available in iocg->usages[] are really short-term snapshots used for control heuristics with possibly significant errors. This patch implements per-cgroup absolute usage stat counter and exposes it through io.stat along with the current vrate. Usage stat collection and flushing employ the same method as cgroup rstat on the active iocg's and the only hot path overhead is preemption toggling and adding to a percpu counter. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- block/blk-iocost.c | 155 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 149 insertions(+), 6 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index ec41468b25a76..9afebd52a76e8 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -434,6 +434,14 @@ struct ioc { bool user_cost_model:1; }; +struct iocg_pcpu_stat { + local64_t abs_vusage; +}; + +struct iocg_stat { + u64 usage_us; +}; + /* per device-cgroup pair */ struct ioc_gq { struct blkg_policy_data pd; @@ -495,10 +503,19 @@ struct ioc_gq { u32 hweight_inuse; bool has_surplus; + struct list_head walk_list; + struct wait_queue_head waitq; struct hrtimer waitq_timer; struct hrtimer delay_timer; + /* statistics */ + struct iocg_pcpu_stat __percpu *pcpu_stat; + struct iocg_stat local_stat; + struct iocg_stat desc_stat; + struct iocg_stat last_stat; + u64 last_stat_abs_vusage; + /* usage is recorded as fractions of WEIGHT_ONE */ int usage_idx; u32 usages[NR_USAGE_SLOTS]; @@ -677,10 +694,17 @@ static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse) return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE); } -static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost) +static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, + u64 abs_cost, u64 cost) { + struct iocg_pcpu_stat *gcs; + bio->bi_iocost_cost = cost; atomic64_add(cost, &iocg->vtime); + + gcs = get_cpu_ptr(iocg->pcpu_stat); + local64_add(abs_cost, &gcs->abs_vusage); + put_cpu_ptr(gcs); } static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags) @@ -1225,7 +1249,7 @@ static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode, if (ctx->vbudget < 0) return -1; - iocg_commit_bio(ctx->iocg, wait->bio, cost); + iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost); /* * autoremove_wake_function() removes the wait entry only when it @@ -1386,6 +1410,87 @@ static bool iocg_is_idle(struct ioc_gq *iocg) return true; } +/* + * Call this function on the target leaf @iocg's to build pre-order traversal + * list of all the ancestors in @inner_walk. The inner nodes are linked through + * ->walk_list and the caller is responsible for dissolving the list after use. + */ +static void iocg_build_inner_walk(struct ioc_gq *iocg, + struct list_head *inner_walk) +{ + int lvl; + + WARN_ON_ONCE(!list_empty(&iocg->walk_list)); + + /* find the first ancestor which hasn't been visited yet */ + for (lvl = iocg->level - 1; lvl >= 0; lvl--) { + if (!list_empty(&iocg->ancestors[lvl]->walk_list)) + break; + } + + /* walk down and visit the inner nodes to get pre-order traversal */ + while (++lvl <= iocg->level - 1) { + struct ioc_gq *inner = iocg->ancestors[lvl]; + + /* record traversal order */ + list_add_tail(&inner->walk_list, inner_walk); + } +} + +/* collect per-cpu counters and propagate the deltas to the parent */ +static void iocg_flush_stat_one(struct ioc_gq *iocg, struct ioc_now *now) +{ + struct iocg_stat new_stat; + u64 abs_vusage = 0; + u64 vusage_delta; + int cpu; + + lockdep_assert_held(&iocg->ioc->lock); + + /* collect per-cpu counters */ + for_each_possible_cpu(cpu) { + abs_vusage += local64_read( + per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu)); + } + vusage_delta = abs_vusage - iocg->last_stat_abs_vusage; + iocg->last_stat_abs_vusage = abs_vusage; + + iocg->local_stat.usage_us += div64_u64(vusage_delta, now->vrate); + + new_stat.usage_us = + iocg->local_stat.usage_us + iocg->desc_stat.usage_us; + + /* propagate the deltas to the parent */ + if (iocg->level > 0) { + struct iocg_stat *parent_stat = + &iocg->ancestors[iocg->level - 1]->desc_stat; + + parent_stat->usage_us += + new_stat.usage_us - iocg->last_stat.usage_us; + } + + iocg->last_stat = new_stat; +} + +/* get stat counters ready for reading on all active iocgs */ +static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now) +{ + LIST_HEAD(inner_walk); + struct ioc_gq *iocg, *tiocg; + + /* flush leaves and build inner node walk list */ + list_for_each_entry(iocg, target_iocgs, active_list) { + iocg_flush_stat_one(iocg, now); + iocg_build_inner_walk(iocg, &inner_walk); + } + + /* keep flushing upwards by walking the inner list backwards */ + list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) { + iocg_flush_stat_one(iocg, now); + list_del_init(&iocg->walk_list); + } +} + /* returns usage with margin added if surplus is large enough */ static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse) { @@ -1426,6 +1531,8 @@ static void ioc_timer_fn(struct timer_list *timer) return; } + iocg_flush_stat(&ioc->active_iocgs, &now); + /* * Waiters determine the sleep durations based on the vrate they * saw at the time of sleep. If vrate has increased, some waiters @@ -1828,7 +1935,7 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) */ if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && time_before_eq64(vtime + cost, now.vnow)) { - iocg_commit_bio(iocg, bio, cost); + iocg_commit_bio(iocg, bio, abs_cost, cost); return; } @@ -1853,7 +1960,7 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) */ if (unlikely(list_empty(&iocg->active_list))) { iocg_unlock(iocg, ioc_locked, &flags); - iocg_commit_bio(iocg, bio, cost); + iocg_commit_bio(iocg, bio, abs_cost, cost); return; } @@ -1952,7 +2059,7 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, */ if (rq->bio && rq->bio->bi_iocost_cost && time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) { - iocg_commit_bio(iocg, bio, cost); + iocg_commit_bio(iocg, bio, abs_cost, cost); return; } @@ -1966,7 +2073,7 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, iocg->abs_vdebt += abs_cost; iocg_kick_delay(iocg, &now); } else { - iocg_commit_bio(iocg, bio, cost); + iocg_commit_bio(iocg, bio, abs_cost, cost); } spin_unlock_irqrestore(&iocg->waitq.lock, flags); } @@ -2137,6 +2244,12 @@ static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q, if (!iocg) return NULL; + iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp); + if (!iocg->pcpu_stat) { + kfree(iocg); + return NULL; + } + return &iocg->pd; } @@ -2156,6 +2269,7 @@ static void ioc_pd_init(struct blkg_policy_data *pd) atomic64_set(&iocg->done_vtime, now.vnow); atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period)); INIT_LIST_HEAD(&iocg->active_list); + INIT_LIST_HEAD(&iocg->walk_list); iocg->hweight_active = WEIGHT_ONE; iocg->hweight_inuse = WEIGHT_ONE; @@ -2185,18 +2299,46 @@ static void ioc_pd_free(struct blkg_policy_data *pd) if (ioc) { spin_lock_irqsave(&ioc->lock, flags); + if (!list_empty(&iocg->active_list)) { propagate_weights(iocg, 0, 0); list_del_init(&iocg->active_list); } + + WARN_ON_ONCE(!list_empty(&iocg->walk_list)); + spin_unlock_irqrestore(&ioc->lock, flags); hrtimer_cancel(&iocg->waitq_timer); hrtimer_cancel(&iocg->delay_timer); } + free_percpu(iocg->pcpu_stat); kfree(iocg); } +static size_t ioc_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size) +{ + struct ioc_gq *iocg = pd_to_iocg(pd); + struct ioc *ioc = iocg->ioc; + size_t pos = 0; + + if (!ioc->enabled) + return 0; + + if (iocg->level == 0) { + unsigned vp10k = DIV64_U64_ROUND_CLOSEST( + atomic64_read(&ioc->vtime_rate) * 10000, + VTIME_PER_USEC); + pos += scnprintf(buf + pos, size - pos, " cost.vrate=%u.%02u", + vp10k / 100, vp10k % 100); + } + + pos += scnprintf(buf + pos, size - pos, " cost.usage=%llu", + iocg->last_stat.usage_us); + + return pos; +} + static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd, int off) { @@ -2610,6 +2752,7 @@ static struct blkcg_policy blkcg_policy_iocost = { .pd_alloc_fn = ioc_pd_alloc, .pd_init_fn = ioc_pd_init, .pd_free_fn = ioc_pd_free, + .pd_stat_fn = ioc_pd_stat, }; static int __init ioc_init(void) From e553918bf08eb3ccea5443912a8bec6399c01503 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:44 -0400 Subject: [PATCH 0349/5344] blk-iocost: calculate iocg->usages[] from iocg->local_stat.usage_us commit 1aa50d020c7148f5f0bde15ca80fe6f91a8c5a4e upstream. Currently, iocg->usages[] which are used to guide inuse adjustments are calculated from vtime deltas. This, however, assumes that the hierarchical inuse weight at the time of calculation held for the entire period, which often isn't true and can lead to significant errors. Now that we have absolute usage information collected, we can derive iocg->usages[] from iocg->local_stat.usage_us so that inuse adjustment decisions are made based on actual absolute usage. The calculated usage is clamped between 1 and WEIGHT_ONE and WEIGHT_ONE is also used to signal saturation regardless of the current hierarchical inuse weight. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- block/blk-iocost.c | 72 ++++++++++++++++++++++------------- include/trace/events/iocost.h | 7 +--- 2 files changed, 47 insertions(+), 32 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 9afebd52a76e8..23c914f29dd56 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -479,14 +479,10 @@ struct ioc_gq { * `vtime_done` is the same but progressed on completion rather * than issue. The delta behind `vtime` represents the cost of * currently in-flight IOs. - * - * `last_vtime` is used to remember `vtime` at the end of the last - * period to calculate utilization. */ atomic64_t vtime; atomic64_t done_vtime; u64 abs_vdebt; - u64 last_vtime; /* * The period this iocg was last active in. Used for deactivation @@ -509,6 +505,9 @@ struct ioc_gq { struct hrtimer waitq_timer; struct hrtimer delay_timer; + /* timestamp at the latest activation */ + u64 activated_at; + /* statistics */ struct iocg_pcpu_stat __percpu *pcpu_stat; struct iocg_stat local_stat; @@ -517,6 +516,7 @@ struct ioc_gq { u64 last_stat_abs_vusage; /* usage is recorded as fractions of WEIGHT_ONE */ + u32 usage_delta_us; int usage_idx; u32 usages[NR_USAGE_SLOTS]; @@ -1163,7 +1163,7 @@ static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) TRACE_IOCG_PATH(iocg_activate, iocg, now, last_period, cur_period, vtime); - iocg->last_vtime = vtime; + iocg->activated_at = now->now; if (ioc->running == IOC_IDLE) { ioc->running = IOC_RUNNING; @@ -1455,7 +1455,8 @@ static void iocg_flush_stat_one(struct ioc_gq *iocg, struct ioc_now *now) vusage_delta = abs_vusage - iocg->last_stat_abs_vusage; iocg->last_stat_abs_vusage = abs_vusage; - iocg->local_stat.usage_us += div64_u64(vusage_delta, now->vrate); + iocg->usage_delta_us = div64_u64(vusage_delta, now->vrate); + iocg->local_stat.usage_us += iocg->usage_delta_us; new_stat.usage_us = iocg->local_stat.usage_us + iocg->desc_stat.usage_us; @@ -1562,8 +1563,9 @@ static void ioc_timer_fn(struct timer_list *timer) /* calc usages and see whether some weights need to be moved around */ list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { - u64 vdone, vtime, vusage, vmin; + u64 vdone, vtime, usage_us, vmin; u32 hw_active, hw_inuse, usage; + int uidx; /* * Collect unused and wind vtime closer to vnow to prevent @@ -1587,27 +1589,44 @@ static void ioc_timer_fn(struct timer_list *timer) time_before64(vdone, now.vnow - period_vtime)) nr_lagging++; - if (waitqueue_active(&iocg->waitq)) - vusage = now.vnow - iocg->last_vtime; - else if (time_before64(iocg->last_vtime, vtime)) - vusage = vtime - iocg->last_vtime; - else - vusage = 0; - - iocg->last_vtime += vusage; /* - * Factor in in-flight vtime into vusage to avoid - * high-latency completions appearing as idle. This should - * be done after the above ->last_time adjustment. + * Determine absolute usage factoring in pending and in-flight + * IOs to avoid stalls and high-latency completions appearing as + * idle. */ - vusage = max(vusage, vtime - vdone); - - /* calculate hweight based usage ratio and record */ - if (vusage) { - usage = DIV64_U64_ROUND_UP(vusage * hw_inuse, - period_vtime); - iocg->usage_idx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS; - iocg->usages[iocg->usage_idx] = usage; + usage_us = iocg->usage_delta_us; + if (waitqueue_active(&iocg->waitq) && time_before64(vtime, now.vnow)) + usage_us += DIV64_U64_ROUND_UP( + cost_to_abs_cost(now.vnow - vtime, hw_inuse), + now.vrate); + if (vdone != vtime) { + u64 inflight_us = DIV64_U64_ROUND_UP( + cost_to_abs_cost(vtime - vdone, hw_inuse), + now.vrate); + usage_us = max(usage_us, inflight_us); + } + + /* convert to hweight based usage ratio and record */ + uidx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS; + + if (time_after64(vtime, now.vnow - ioc->margins.min)) { + iocg->usage_idx = uidx; + iocg->usages[uidx] = WEIGHT_ONE; + } else if (usage_us) { + u64 started_at, dur; + + if (time_after64(iocg->activated_at, ioc->period_at)) + started_at = iocg->activated_at; + else + started_at = ioc->period_at; + + dur = max_t(u64, now.now - started_at, 1); + usage = clamp_t(u32, + DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, dur), + 1, WEIGHT_ONE); + + iocg->usage_idx = uidx; + iocg->usages[uidx] = usage; } else { usage = 0; } @@ -1624,7 +1643,6 @@ static void ioc_timer_fn(struct timer_list *timer) /* throw away surplus vtime */ atomic64_add(delta, &iocg->vtime); atomic64_add(delta, &iocg->done_vtime); - iocg->last_vtime += delta; /* if usage is sufficiently low, maybe it can donate */ if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) { iocg->has_surplus = true; diff --git a/include/trace/events/iocost.h b/include/trace/events/iocost.h index c2f580fd371b1..a905ecc0342fd 100644 --- a/include/trace/events/iocost.h +++ b/include/trace/events/iocost.h @@ -26,7 +26,6 @@ TRACE_EVENT(iocost_iocg_activate, __field(u64, vrate) __field(u64, last_period) __field(u64, cur_period) - __field(u64, last_vtime) __field(u64, vtime) __field(u32, weight) __field(u32, inuse) @@ -42,7 +41,6 @@ TRACE_EVENT(iocost_iocg_activate, __entry->vrate = now->vrate; __entry->last_period = last_period; __entry->cur_period = cur_period; - __entry->last_vtime = iocg->last_vtime; __entry->vtime = vtime; __entry->weight = iocg->weight; __entry->inuse = iocg->inuse; @@ -51,13 +49,12 @@ TRACE_EVENT(iocost_iocg_activate, ), TP_printk("[%s:%s] now=%llu:%llu vrate=%llu " - "period=%llu->%llu vtime=%llu->%llu " + "period=%llu->%llu vtime=%llu " "weight=%u/%u hweight=%llu/%llu", __get_str(devname), __get_str(cgroup), __entry->now, __entry->vnow, __entry->vrate, __entry->last_period, __entry->cur_period, - __entry->last_vtime, __entry->vtime, - __entry->inuse, __entry->weight, + __entry->vtime, __entry->inuse, __entry->weight, __entry->hweight_inuse, __entry->hweight_active ) ); From dcab0fcf18e2a3cf63bdb033e11bbb1d4eda4bd8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:45 -0400 Subject: [PATCH 0350/5344] blk-iocost: replace iocg->has_surplus with ->surplus_list commit 8692d2db8e01f1a5718f9b67c219b823a64b2088 upstream. Instead of marking iocgs with surplus with a flag and filtering for them while walking all active iocgs, build a surpluses list. This doesn't make much difference now but will help implementing improved donation logic which will iterate iocgs with surplus multiple times. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- block/blk-iocost.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 23c914f29dd56..6b3ebbc5a5d09 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -497,9 +497,9 @@ struct ioc_gq { int hweight_gen; u32 hweight_active; u32 hweight_inuse; - bool has_surplus; struct list_head walk_list; + struct list_head surplus_list; struct wait_queue_head waitq; struct hrtimer waitq_timer; @@ -1511,6 +1511,7 @@ static void ioc_timer_fn(struct timer_list *timer) struct ioc *ioc = container_of(timer, struct ioc, timer); struct ioc_gq *iocg, *tiocg; struct ioc_now now; + LIST_HEAD(surpluses); int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0; u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; @@ -1634,8 +1635,7 @@ static void ioc_timer_fn(struct timer_list *timer) /* see whether there's surplus vtime */ vmin = now.vnow - ioc->margins.max; - iocg->has_surplus = false; - + WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); if (!waitqueue_active(&iocg->waitq) && time_before64(vtime, vmin)) { u64 delta = vmin - vtime; @@ -1645,7 +1645,7 @@ static void ioc_timer_fn(struct timer_list *timer) atomic64_add(delta, &iocg->done_vtime); /* if usage is sufficiently low, maybe it can donate */ if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) { - iocg->has_surplus = true; + list_add(&iocg->surplus_list, &surpluses); nr_surpluses++; } } else if (hw_inuse < hw_active) { @@ -1681,13 +1681,10 @@ static void ioc_timer_fn(struct timer_list *timer) goto skip_surplus_transfers; /* there are both shortages and surpluses, transfer surpluses */ - list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { + list_for_each_entry(iocg, &surpluses, surplus_list) { u32 usage, hw_active, hw_inuse, new_hwi, new_inuse; int nr_valid = 0; - if (!iocg->has_surplus) - continue; - /* base the decision on max historical usage */ for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) { if (iocg->usages[i]) { @@ -1715,6 +1712,10 @@ static void ioc_timer_fn(struct timer_list *timer) skip_surplus_transfers: commit_weights(ioc); + /* surplus list should be dissolved after use */ + list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list) + list_del_init(&iocg->surplus_list); + /* * If q is getting clogged or we're missing too much, we're issuing * too much IO and should lower vtime rate. If we're not missing @@ -2288,6 +2289,7 @@ static void ioc_pd_init(struct blkg_policy_data *pd) atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period)); INIT_LIST_HEAD(&iocg->active_list); INIT_LIST_HEAD(&iocg->walk_list); + INIT_LIST_HEAD(&iocg->surplus_list); iocg->hweight_active = WEIGHT_ONE; iocg->hweight_inuse = WEIGHT_ONE; @@ -2324,6 +2326,7 @@ static void ioc_pd_free(struct blkg_policy_data *pd) } WARN_ON_ONCE(!list_empty(&iocg->walk_list)); + WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); spin_unlock_irqrestore(&ioc->lock, flags); From 0c9c04579a1b3bf0d31cda68c6cd5440debe22bf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:46 -0400 Subject: [PATCH 0351/5344] blk-iocost: decouple vrate adjustment from surplus transfers commit 065655c862fedf4b04e1b28b83ca6f338d81cf0b upstream. Budget donations are inaccurate and could take multiple periods to converge. To prevent triggering vrate adjustments while surplus transfers were catching up, vrate adjustment was suppressed if donations were increasing, which was indicated by non-zero nr_surpluses. This entangling won't be necessary with the scheduled rewrite of donation mechanism which will make it precise and immediate. Let's decouple the two in preparation. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- block/blk-iocost.c | 19 +++++++------------ include/trace/events/iocost.h | 13 ++++--------- 2 files changed, 11 insertions(+), 21 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 6b3ebbc5a5d09..26e821918ec69 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1512,7 +1512,7 @@ static void ioc_timer_fn(struct timer_list *timer) struct ioc_gq *iocg, *tiocg; struct ioc_now now; LIST_HEAD(surpluses); - int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0; + int nr_shortages = 0, nr_lagging = 0; u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; u32 missed_ppm[2], rq_wait_pct; @@ -1644,10 +1644,8 @@ static void ioc_timer_fn(struct timer_list *timer) atomic64_add(delta, &iocg->vtime); atomic64_add(delta, &iocg->done_vtime); /* if usage is sufficiently low, maybe it can donate */ - if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) { + if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) list_add(&iocg->surplus_list, &surpluses); - nr_surpluses++; - } } else if (hw_inuse < hw_active) { u32 new_hwi, new_inuse; @@ -1677,7 +1675,7 @@ static void ioc_timer_fn(struct timer_list *timer) } } - if (!nr_shortages || !nr_surpluses) + if (!nr_shortages || list_empty(&surpluses)) goto skip_surplus_transfers; /* there are both shortages and surpluses, transfer surpluses */ @@ -1742,11 +1740,9 @@ static void ioc_timer_fn(struct timer_list *timer) /* * If there are IOs spanning multiple periods, wait - * them out before pushing the device harder. If - * there are surpluses, let redistribution work it - * out first. + * them out before pushing the device harder. */ - if (!nr_lagging && !nr_surpluses) + if (!nr_lagging) ioc->busy_level--; } else { /* @@ -1800,15 +1796,14 @@ static void ioc_timer_fn(struct timer_list *timer) } trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct, - nr_lagging, nr_shortages, - nr_surpluses); + nr_lagging, nr_shortages); atomic64_set(&ioc->vtime_rate, vrate); ioc_refresh_margins(ioc); } else if (ioc->busy_level != prev_busy_level || nr_lagging) { trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate), missed_ppm, rq_wait_pct, nr_lagging, - nr_shortages, nr_surpluses); + nr_shortages); } ioc_refresh_params(ioc, false); diff --git a/include/trace/events/iocost.h b/include/trace/events/iocost.h index a905ecc0342fd..ee024fe8fef66 100644 --- a/include/trace/events/iocost.h +++ b/include/trace/events/iocost.h @@ -128,11 +128,9 @@ DEFINE_EVENT(iocg_inuse_update, iocost_inuse_reset, TRACE_EVENT(iocost_ioc_vrate_adj, TP_PROTO(struct ioc *ioc, u64 new_vrate, u32 *missed_ppm, - u32 rq_wait_pct, int nr_lagging, int nr_shortages, - int nr_surpluses), + u32 rq_wait_pct, int nr_lagging, int nr_shortages), - TP_ARGS(ioc, new_vrate, missed_ppm, rq_wait_pct, nr_lagging, nr_shortages, - nr_surpluses), + TP_ARGS(ioc, new_vrate, missed_ppm, rq_wait_pct, nr_lagging, nr_shortages), TP_STRUCT__entry ( __string(devname, ioc_name(ioc)) @@ -144,7 +142,6 @@ TRACE_EVENT(iocost_ioc_vrate_adj, __field(u32, rq_wait_pct) __field(int, nr_lagging) __field(int, nr_shortages) - __field(int, nr_surpluses) ), TP_fast_assign( @@ -157,15 +154,13 @@ TRACE_EVENT(iocost_ioc_vrate_adj, __entry->rq_wait_pct = rq_wait_pct; __entry->nr_lagging = nr_lagging; __entry->nr_shortages = nr_shortages; - __entry->nr_surpluses = nr_surpluses; ), - TP_printk("[%s] vrate=%llu->%llu busy=%d missed_ppm=%u:%u rq_wait_pct=%u lagging=%d shortages=%d surpluses=%d", + TP_printk("[%s] vrate=%llu->%llu busy=%d missed_ppm=%u:%u rq_wait_pct=%u lagging=%d shortages=%d", __get_str(devname), __entry->old_vrate, __entry->new_vrate, __entry->busy_level, __entry->read_missed_ppm, __entry->write_missed_ppm, - __entry->rq_wait_pct, __entry->nr_lagging, __entry->nr_shortages, - __entry->nr_surpluses + __entry->rq_wait_pct, __entry->nr_lagging, __entry->nr_shortages ) ); From e05013734c292c26f27fa37642b89bb2b12d1c2c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:47 -0400 Subject: [PATCH 0352/5344] blk-iocost: restructure surplus donation logic commit 93f7d2db80e4aea2731619d7b907a029e0d14259 upstream. The way the surplus donation logic is structured isn't great. There are two separate paths for starting/increasing donations and decreasing them making the logic harder to follow and is prone to unnecessary behavior differences. In preparation for improved donation handling, this patch restructures the code so that * All donors - new, increasing and decreasing - are funneled through the same code path. * The target donation calculation is factored into hweight_after_donation() which is called once from the same spot for all possible donors. * Actual inuse adjustment is factored into trasnfer_surpluses(). This change introduces a few behavior differences - e.g. donation amount reduction now uses the max usage of the recent three periods just like new and increasing donations, and inuse now gets adjusted upwards the same way it gets downwards. These differences are unlikely to have severely negative implications and the whole logic will be revamped soon. This patch also removes two tracepoints. The existing TPs don't quite fit the new implementation. A later patch will update and reinstate them. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- block/blk-iocost.c | 179 ++++++++++++++++++++++++++------------------- 1 file changed, 103 insertions(+), 76 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 26e821918ec69..541fe0f06dcf2 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -497,6 +497,7 @@ struct ioc_gq { int hweight_gen; u32 hweight_active; u32 hweight_inuse; + u32 hweight_after_donation; struct list_head walk_list; struct list_head surplus_list; @@ -1074,6 +1075,32 @@ static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep *hw_inusep = iocg->hweight_inuse; } +/* + * Calculate the hweight_inuse @iocg would get with max @inuse assuming all the + * other weights stay unchanged. + */ +static u32 current_hweight_max(struct ioc_gq *iocg) +{ + u32 hwm = WEIGHT_ONE; + u32 inuse = iocg->active; + u64 child_inuse_sum; + int lvl; + + lockdep_assert_held(&iocg->ioc->lock); + + for (lvl = iocg->level - 1; lvl >= 0; lvl--) { + struct ioc_gq *parent = iocg->ancestors[lvl]; + struct ioc_gq *child = iocg->ancestors[lvl + 1]; + + child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse; + hwm = div64_u64((u64)hwm * inuse, child_inuse_sum); + inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum, + parent->child_active_sum); + } + + return max_t(u32, hwm, 1); +} + static void weight_updated(struct ioc_gq *iocg) { struct ioc *ioc = iocg->ioc; @@ -1492,20 +1519,58 @@ static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now) } } -/* returns usage with margin added if surplus is large enough */ -static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse) +/* + * Determine what @iocg's hweight_inuse should be after donating unused + * capacity. @hwm is the upper bound and used to signal no donation. This + * function also throws away @iocg's excess budget. + */ +static u32 hweight_after_donation(struct ioc_gq *iocg, u32 hwm, u32 usage, + struct ioc_now *now) { + struct ioc *ioc = iocg->ioc; + u64 vtime = atomic64_read(&iocg->vtime); + s64 excess; + + /* see whether minimum margin requirement is met */ + if (waitqueue_active(&iocg->waitq) || + time_after64(vtime, now->vnow - ioc->margins.min)) + return hwm; + + /* throw away excess above max */ + excess = now->vnow - vtime - ioc->margins.max; + if (excess > 0) { + atomic64_add(excess, &iocg->vtime); + atomic64_add(excess, &iocg->done_vtime); + vtime += excess; + } + /* add margin */ usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100); usage += SURPLUS_SCALE_ABS; /* don't bother if the surplus is too small */ - if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse) - return 0; + if (usage + SURPLUS_MIN_ADJ_DELTA > hwm) + return hwm; return usage; } +static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now) +{ + struct ioc_gq *iocg; + + list_for_each_entry(iocg, surpluses, surplus_list) { + u32 old_hwi, new_hwi, new_inuse; + + current_hweight(iocg, NULL, &old_hwi); + new_hwi = iocg->hweight_after_donation; + + new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi, + old_hwi); + __propagate_weights(iocg, iocg->weight, new_inuse); + } +} + static void ioc_timer_fn(struct timer_list *timer) { struct ioc *ioc = container_of(timer, struct ioc, timer); @@ -1564,9 +1629,9 @@ static void ioc_timer_fn(struct timer_list *timer) /* calc usages and see whether some weights need to be moved around */ list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { - u64 vdone, vtime, usage_us, vmin; + u64 vdone, vtime, usage_us; u32 hw_active, hw_inuse, usage; - int uidx; + int uidx, nr_valid; /* * Collect unused and wind vtime closer to vnow to prevent @@ -1622,92 +1687,54 @@ static void ioc_timer_fn(struct timer_list *timer) started_at = ioc->period_at; dur = max_t(u64, now.now - started_at, 1); - usage = clamp_t(u32, + + iocg->usage_idx = uidx; + iocg->usages[uidx] = clamp_t(u32, DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, dur), 1, WEIGHT_ONE); + } - iocg->usage_idx = uidx; - iocg->usages[uidx] = usage; - } else { - usage = 0; + /* base the decision on max historical usage */ + for (i = 0, usage = 0, nr_valid = 0; i < NR_USAGE_SLOTS; i++) { + if (iocg->usages[i]) { + usage = max(usage, iocg->usages[i]); + nr_valid++; + } } + if (nr_valid < MIN_VALID_USAGES) + usage = WEIGHT_ONE; /* see whether there's surplus vtime */ - vmin = now.vnow - ioc->margins.max; - WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); - if (!waitqueue_active(&iocg->waitq) && - time_before64(vtime, vmin)) { - u64 delta = vmin - vtime; - - /* throw away surplus vtime */ - atomic64_add(delta, &iocg->vtime); - atomic64_add(delta, &iocg->done_vtime); - /* if usage is sufficiently low, maybe it can donate */ - if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) - list_add(&iocg->surplus_list, &surpluses); - } else if (hw_inuse < hw_active) { - u32 new_hwi, new_inuse; + if (hw_inuse < hw_active || + (!waitqueue_active(&iocg->waitq) && + time_before64(vtime, now.vnow - ioc->margins.max))) { + u32 hwm, new_hwi; - /* was donating but might need to take back some */ - if (waitqueue_active(&iocg->waitq)) { - new_hwi = hw_active; + /* + * Already donating or accumulated enough to start. + * Determine the donation amount. + */ + hwm = current_hweight_max(iocg); + new_hwi = hweight_after_donation(iocg, hwm, usage, + &now); + if (new_hwi < hwm) { + iocg->hweight_after_donation = new_hwi; + list_add(&iocg->surplus_list, &surpluses); } else { - new_hwi = max(hw_inuse, - usage * SURPLUS_SCALE_PCT / 100 + - SURPLUS_SCALE_ABS); - } - - new_inuse = div64_u64((u64)iocg->inuse * new_hwi, - hw_inuse); - new_inuse = clamp_t(u32, new_inuse, 1, iocg->active); - - if (new_inuse > iocg->inuse) { - TRACE_IOCG_PATH(inuse_takeback, iocg, &now, - iocg->inuse, new_inuse, - hw_inuse, new_hwi); - __propagate_weights(iocg, iocg->weight, - new_inuse); + __propagate_weights(iocg, iocg->active, + iocg->active); + nr_shortages++; } } else { - /* genuninely out of vtime */ + /* genuinely short on vtime */ nr_shortages++; } } - if (!nr_shortages || list_empty(&surpluses)) - goto skip_surplus_transfers; + if (!list_empty(&surpluses) && nr_shortages) + transfer_surpluses(&surpluses, &now); - /* there are both shortages and surpluses, transfer surpluses */ - list_for_each_entry(iocg, &surpluses, surplus_list) { - u32 usage, hw_active, hw_inuse, new_hwi, new_inuse; - int nr_valid = 0; - - /* base the decision on max historical usage */ - for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) { - if (iocg->usages[i]) { - usage = max(usage, iocg->usages[i]); - nr_valid++; - } - } - if (nr_valid < MIN_VALID_USAGES) - continue; - - current_hweight(iocg, &hw_active, &hw_inuse); - new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse); - if (!new_hwi) - continue; - - new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi, - hw_inuse); - if (new_inuse < iocg->inuse) { - TRACE_IOCG_PATH(inuse_giveaway, iocg, &now, - iocg->inuse, new_inuse, - hw_inuse, new_hwi); - __propagate_weights(iocg, iocg->weight, new_inuse); - } - } -skip_surplus_transfers: commit_weights(ioc); /* surplus list should be dissolved after use */ From 4177e2cb19c9017afe2182a18f4c690f30ea6db3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:48 -0400 Subject: [PATCH 0353/5344] blk-iocost: implement Andy's method for donation weight updates commit e08d02aa5fc9978ed42240bd80d3fb5dd28ecac0 upstream. iocost implements work conservation by reducing iocg->inuse and propagating the adjustment upwards proportionally. However, while I knew the target absolute hierarchical proportion - adjusted hweight_inuse, I couldn't figure out how to determine the iocg->inuse adjustment to achieve that and approximated the adjustment by scaling iocg->inuse using the proportion of the needed hweight_inuse changes. When nested, these scalings aren't accurate even when adjusting a single node as the donating node also receives the benefit of the donated portion. When multiple nodes are donating as they often do, they can be wildly wrong. iocost employed various safety nets to combat the inaccuracies. There are ample buffers in determining how much to donate, the adjustments are conservative and gradual. While it can achieve a reasonable level of work conservation in simple scenarios, the inaccuracies can easily add up leading to significant loss of total work. This in turn makes it difficult to closely cap vrate as vrate adjustment is needed to compensate for the loss of work. The combination of inaccurate donation calculations and vrate adjustments can lead to wide fluctuations and clunky overall behaviors. Andy Newell devised a method to calculate the needed ->inuse updates to achieve the target hweight_inuse's. The method is compatible with the proportional inuse adjustment propagation which allows all hot path operations to be local to each iocg. To roughly summarize, Andy's method divides the tree into donating and non-donating parts, calculates global donation rate which is used to determine the target hweight_inuse for each node, and then derives per-level proportions. There's non-trivial amount of math involved. Please refer to the following pdfs for detailed descriptions. https://drive.google.com/file/d/1PsJwxPFtjUnwOY1QJ5AeICCcsL7BM3bo https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE https://drive.google.com/file/d/1WcrltBOSPN0qXVdBgnKm4mdp9FhuEFQN This patch implements Andy's method in transfer_surpluses(). This makes the donation calculations accurate per cycle and enables further improvements in other parts of the donation logic. Signed-off-by: Tejun Heo Cc: Andy Newell Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- block/blk-iocost.c | 252 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 244 insertions(+), 8 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 541fe0f06dcf2..eb7e9ec4bc006 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -494,9 +494,11 @@ struct ioc_gq { /* see __propagate_weights() and current_hweight() for details */ u64 child_active_sum; u64 child_inuse_sum; + u64 child_adjusted_sum; int hweight_gen; u32 hweight_active; u32 hweight_inuse; + u32 hweight_donating; u32 hweight_after_donation; struct list_head walk_list; @@ -1555,20 +1557,252 @@ static u32 hweight_after_donation(struct ioc_gq *iocg, u32 hwm, u32 usage, return usage; } +/* + * For work-conservation, an iocg which isn't using all of its share should + * donate the leftover to other iocgs. There are two ways to achieve this - 1. + * bumping up vrate accordingly 2. lowering the donating iocg's inuse weight. + * + * #1 is mathematically simpler but has the drawback of requiring synchronous + * global hweight_inuse updates when idle iocg's get activated or inuse weights + * change due to donation snapbacks as it has the possibility of grossly + * overshooting what's allowed by the model and vrate. + * + * #2 is inherently safe with local operations. The donating iocg can easily + * snap back to higher weights when needed without worrying about impacts on + * other nodes as the impacts will be inherently correct. This also makes idle + * iocg activations safe. The only effect activations have is decreasing + * hweight_inuse of others, the right solution to which is for those iocgs to + * snap back to higher weights. + * + * So, we go with #2. The challenge is calculating how each donating iocg's + * inuse should be adjusted to achieve the target donation amounts. This is done + * using Andy's method described in the following pdf. + * + * https://drive.google.com/file/d/1PsJwxPFtjUnwOY1QJ5AeICCcsL7BM3bo + * + * Given the weights and target after-donation hweight_inuse values, Andy's + * method determines how the proportional distribution should look like at each + * sibling level to maintain the relative relationship between all non-donating + * pairs. To roughly summarize, it divides the tree into donating and + * non-donating parts, calculates global donation rate which is used to + * determine the target hweight_inuse for each node, and then derives per-level + * proportions. + * + * The following pdf shows that global distribution calculated this way can be + * achieved by scaling inuse weights of donating leaves and propagating the + * adjustments upwards proportionally. + * + * https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE + * + * Combining the above two, we can determine how each leaf iocg's inuse should + * be adjusted to achieve the target donation. + * + * https://drive.google.com/file/d/1WcrltBOSPN0qXVdBgnKm4mdp9FhuEFQN + * + * The inline comments use symbols from the last pdf. + * + * b is the sum of the absolute budgets in the subtree. 1 for the root node. + * f is the sum of the absolute budgets of non-donating nodes in the subtree. + * t is the sum of the absolute budgets of donating nodes in the subtree. + * w is the weight of the node. w = w_f + w_t + * w_f is the non-donating portion of w. w_f = w * f / b + * w_b is the donating portion of w. w_t = w * t / b + * s is the sum of all sibling weights. s = Sum(w) for siblings + * s_f and s_t are the non-donating and donating portions of s. + * + * Subscript p denotes the parent's counterpart and ' the adjusted value - e.g. + * w_pt is the donating portion of the parent's weight and w'_pt the same value + * after adjustments. Subscript r denotes the root node's values. + */ static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now) { - struct ioc_gq *iocg; + LIST_HEAD(over_hwa); + LIST_HEAD(inner_walk); + struct ioc_gq *iocg, *tiocg, *root_iocg; + u32 after_sum, over_sum, over_target, gamma; + + /* + * It's pretty unlikely but possible for the total sum of + * hweight_after_donation's to be higher than WEIGHT_ONE, which will + * confuse the following calculations. If such condition is detected, + * scale down everyone over its full share equally to keep the sum below + * WEIGHT_ONE. + */ + after_sum = 0; + over_sum = 0; + list_for_each_entry(iocg, surpluses, surplus_list) { + u32 hwa; + + current_hweight(iocg, &hwa, NULL); + after_sum += iocg->hweight_after_donation; + + if (iocg->hweight_after_donation > hwa) { + over_sum += iocg->hweight_after_donation; + list_add(&iocg->walk_list, &over_hwa); + } + } + + if (after_sum >= WEIGHT_ONE) { + /* + * The delta should be deducted from the over_sum, calculate + * target over_sum value. + */ + u32 over_delta = after_sum - (WEIGHT_ONE - 1); + WARN_ON_ONCE(over_sum <= over_delta); + over_target = over_sum - over_delta; + } else { + over_target = 0; + } + + list_for_each_entry_safe(iocg, tiocg, &over_hwa, walk_list) { + if (over_target) + iocg->hweight_after_donation = + div_u64((u64)iocg->hweight_after_donation * + over_target, over_sum); + list_del_init(&iocg->walk_list); + } + + /* + * Build pre-order inner node walk list and prepare for donation + * adjustment calculations. + */ + list_for_each_entry(iocg, surpluses, surplus_list) { + iocg_build_inner_walk(iocg, &inner_walk); + } + + root_iocg = list_first_entry(&inner_walk, struct ioc_gq, walk_list); + WARN_ON_ONCE(root_iocg->level > 0); + list_for_each_entry(iocg, &inner_walk, walk_list) { + iocg->child_adjusted_sum = 0; + iocg->hweight_donating = 0; + iocg->hweight_after_donation = 0; + } + + /* + * Propagate the donating budget (b_t) and after donation budget (b'_t) + * up the hierarchy. + */ list_for_each_entry(iocg, surpluses, surplus_list) { - u32 old_hwi, new_hwi, new_inuse; + struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; + + parent->hweight_donating += iocg->hweight_donating; + parent->hweight_after_donation += iocg->hweight_after_donation; + } - current_hweight(iocg, NULL, &old_hwi); - new_hwi = iocg->hweight_after_donation; + list_for_each_entry_reverse(iocg, &inner_walk, walk_list) { + if (iocg->level > 0) { + struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; - new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi, - old_hwi); - __propagate_weights(iocg, iocg->weight, new_inuse); + parent->hweight_donating += iocg->hweight_donating; + parent->hweight_after_donation += iocg->hweight_after_donation; + } + } + + /* + * Calculate inner hwa's (b) and make sure the donation values are + * within the accepted ranges as we're doing low res calculations with + * roundups. + */ + list_for_each_entry(iocg, &inner_walk, walk_list) { + if (iocg->level) { + struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; + + iocg->hweight_active = DIV64_U64_ROUND_UP( + (u64)parent->hweight_active * iocg->active, + parent->child_active_sum); + + } + + iocg->hweight_donating = min(iocg->hweight_donating, + iocg->hweight_active); + iocg->hweight_after_donation = min(iocg->hweight_after_donation, + iocg->hweight_donating - 1); + if (WARN_ON_ONCE(iocg->hweight_active <= 1 || + iocg->hweight_donating <= 1 || + iocg->hweight_after_donation == 0)) { + pr_warn("iocg: invalid donation weights in "); + pr_cont_cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup); + pr_cont(": active=%u donating=%u after=%u\n", + iocg->hweight_active, iocg->hweight_donating, + iocg->hweight_after_donation); + } } + + /* + * Calculate the global donation rate (gamma) - the rate to adjust + * non-donating budgets by. No need to use 64bit multiplication here as + * the first operand is guaranteed to be smaller than WEIGHT_ONE + * (1<<16). + * + * gamma = (1 - t_r') / (1 - t_r) + */ + gamma = DIV_ROUND_UP( + (WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE, + WEIGHT_ONE - root_iocg->hweight_donating); + + /* + * Calculate adjusted hwi, child_adjusted_sum and inuse for the inner + * nodes. + */ + list_for_each_entry(iocg, &inner_walk, walk_list) { + struct ioc_gq *parent; + u32 inuse, wpt, wptp; + u64 st, sf; + + if (iocg->level == 0) { + /* adjusted weight sum for 1st level: s' = s * b_pf / b'_pf */ + iocg->child_adjusted_sum = DIV64_U64_ROUND_UP( + iocg->child_active_sum * (WEIGHT_ONE - iocg->hweight_donating), + WEIGHT_ONE - iocg->hweight_after_donation); + continue; + } + + parent = iocg->ancestors[iocg->level - 1]; + + /* b' = gamma * b_f + b_t' */ + iocg->hweight_inuse = DIV64_U64_ROUND_UP( + (u64)gamma * (iocg->hweight_active - iocg->hweight_donating), + WEIGHT_ONE) + iocg->hweight_after_donation; + + /* w' = s' * b' / b'_p */ + inuse = DIV64_U64_ROUND_UP( + (u64)parent->child_adjusted_sum * iocg->hweight_inuse, + parent->hweight_inuse); + + /* adjusted weight sum for children: s' = s_f + s_t * w'_pt / w_pt */ + st = DIV64_U64_ROUND_UP( + iocg->child_active_sum * iocg->hweight_donating, + iocg->hweight_active); + sf = iocg->child_active_sum - st; + wpt = DIV64_U64_ROUND_UP( + (u64)iocg->active * iocg->hweight_donating, + iocg->hweight_active); + wptp = DIV64_U64_ROUND_UP( + (u64)inuse * iocg->hweight_after_donation, + iocg->hweight_inuse); + + iocg->child_adjusted_sum = sf + DIV64_U64_ROUND_UP(st * wptp, wpt); + } + + /* + * All inner nodes now have ->hweight_inuse and ->child_adjusted_sum and + * we can finally determine leaf adjustments. + */ + list_for_each_entry(iocg, surpluses, surplus_list) { + struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; + u32 inuse; + + /* w' = s' * b' / b'_p, note that b' == b'_t for donating leaves */ + inuse = DIV64_U64_ROUND_UP( + parent->child_adjusted_sum * iocg->hweight_after_donation, + parent->hweight_inuse); + __propagate_weights(iocg, iocg->active, inuse); + } + + /* walk list should be dissolved after use */ + list_for_each_entry_safe(iocg, tiocg, &inner_walk, walk_list) + list_del_init(&iocg->walk_list); } static void ioc_timer_fn(struct timer_list *timer) @@ -1709,16 +1943,18 @@ static void ioc_timer_fn(struct timer_list *timer) if (hw_inuse < hw_active || (!waitqueue_active(&iocg->waitq) && time_before64(vtime, now.vnow - ioc->margins.max))) { - u32 hwm, new_hwi; + u32 hwa, hwm, new_hwi; /* * Already donating or accumulated enough to start. * Determine the donation amount. */ + current_hweight(iocg, &hwa, NULL); hwm = current_hweight_max(iocg); new_hwi = hweight_after_donation(iocg, hwm, usage, &now); if (new_hwi < hwm) { + iocg->hweight_donating = hwa; iocg->hweight_after_donation = new_hwi; list_add(&iocg->surplus_list, &surpluses); } else { From 78c17c651f0f57f578cb8d6145bbe822fd5367a2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:49 -0400 Subject: [PATCH 0354/5344] blk-iocost: revamp donation amount determination commit f1de2439ec43b74764f2a26e3a310b24407e3bde upstream. iocost has various safety nets to combat inuse adjustment calculation inaccuracies. With Andy's method implemented in transfer_surpluses(), inuse adjustment calculations are now accurate and we can make donation amount determinations accurate too. * Stop keeping track of past usage history and using the maximum. Act on the immediate usage information. * Remove donation constraints defined by SURPLUS_* constants. Donate whatever isn't used. * Determine the donation amount so that the iocg will end up with MARGIN_TARGET_PCT budget at the end of the coming period assuming the same usage as the previous period. TARGET is set at 50% of period, which is the previous maximum. This provides smooth convergence for most repetitive IO patterns. * Apply donation logic early at 20% budget. There's no risk in doing so as the calculation is based on the delta between the current budget and the target budget at the end of the coming period. * Remove preemptive iocg activation for zero cost IOs. As donation can reach near zero now, the mere activation doesn't provide any protection anymore. In the unlikely case that this becomes a problem, the right solution is assigning appropriate costs for such IOs. This significantly improves the donation determination logic while also simplifying it. Now all donations are immediate, exact and smooth. Signed-off-by: Tejun Heo Cc: Andy Newell Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- block/blk-iocost.c | 133 +++++++++++++++++---------------------------- 1 file changed, 51 insertions(+), 82 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index eb7e9ec4bc006..975cb1812a222 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -220,12 +220,14 @@ enum { MAX_PERIOD = USEC_PER_SEC, /* - * A cgroup's vtime can run 50% behind the device vtime, which + * iocg->vtime is targeted at 50% behind the device vtime, which * serves as its IO credit buffer. Surplus weight adjustment is * immediately canceled if the vtime margin runs below 10%. */ MARGIN_MIN_PCT = 10, - MARGIN_MAX_PCT = 50, + MARGIN_LOW_PCT = 20, + MARGIN_TARGET_PCT = 50, + MARGIN_MAX_PCT = 100, /* Have some play in timer operations */ TIMER_SLACK_PCT = 1, @@ -237,17 +239,6 @@ enum { */ VTIME_VALID_DUR = 300 * USEC_PER_SEC, - /* - * Remember the past three non-zero usages and use the max for - * surplus calculation. Three slots guarantee that we remember one - * full period usage from the last active stretch even after - * partial deactivation and re-activation periods. Don't start - * giving away weight before collecting two data points to prevent - * hweight adjustments based on one partial activation period. - */ - NR_USAGE_SLOTS = 3, - MIN_VALID_USAGES = 2, - /* 1/64k is granular enough and can easily be handled w/ u32 */ WEIGHT_ONE = 1 << 16, @@ -283,14 +274,6 @@ enum { /* don't let cmds which take a very long time pin lagging for too long */ MAX_LAGGING_PERIODS = 10, - /* - * If usage% * 1.25 + 2% is lower than hweight% by more than 3%, - * donate the surplus. - */ - SURPLUS_SCALE_PCT = 125, /* * 125% */ - SURPLUS_SCALE_ABS = WEIGHT_ONE / 50, /* + 2% */ - SURPLUS_MIN_ADJ_DELTA = WEIGHT_ONE / 33, /* 3% */ - /* switch iff the conditions are met for longer than this */ AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC, @@ -379,6 +362,8 @@ struct ioc_params { struct ioc_margins { s64 min; + s64 low; + s64 target; s64 max; }; @@ -517,11 +502,7 @@ struct ioc_gq { struct iocg_stat desc_stat; struct iocg_stat last_stat; u64 last_stat_abs_vusage; - - /* usage is recorded as fractions of WEIGHT_ONE */ - u32 usage_delta_us; - int usage_idx; - u32 usages[NR_USAGE_SLOTS]; + u64 usage_delta_us; /* this iocg's depth in the hierarchy and ancestors including self */ int level; @@ -740,6 +721,8 @@ static void ioc_refresh_margins(struct ioc *ioc) u64 vrate = atomic64_read(&ioc->vtime_rate); margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate; + margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate; + margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate; margins->max = (period_us * MARGIN_MAX_PCT / 100) * vrate; } @@ -1232,7 +1215,7 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) return false; } if (!atomic_read(&blkg->use_delay) && - time_before_eq64(vtime, now->vnow + ioc->margins.max)) + time_before_eq64(vtime, now->vnow + ioc->margins.target)) return false; /* use delay */ @@ -1531,7 +1514,7 @@ static u32 hweight_after_donation(struct ioc_gq *iocg, u32 hwm, u32 usage, { struct ioc *ioc = iocg->ioc; u64 vtime = atomic64_read(&iocg->vtime); - s64 excess; + s64 excess, delta, target, new_hwi; /* see whether minimum margin requirement is met */ if (waitqueue_active(&iocg->waitq) || @@ -1546,15 +1529,28 @@ static u32 hweight_after_donation(struct ioc_gq *iocg, u32 hwm, u32 usage, vtime += excess; } - /* add margin */ - usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100); - usage += SURPLUS_SCALE_ABS; - - /* don't bother if the surplus is too small */ - if (usage + SURPLUS_MIN_ADJ_DELTA > hwm) - return hwm; + /* + * Let's say the distance between iocg's and device's vtimes as a + * fraction of period duration is delta. Assuming that the iocg will + * consume the usage determined above, we want to determine new_hwi so + * that delta equals MARGIN_TARGET at the end of the next period. + * + * We need to execute usage worth of IOs while spending the sum of the + * new budget (1 - MARGIN_TARGET) and the leftover from the last period + * (delta): + * + * usage = (1 - MARGIN_TARGET + delta) * new_hwi + * + * Therefore, the new_hwi is: + * + * new_hwi = usage / (1 - MARGIN_TARGET + delta) + */ + delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime), + now->vnow - ioc->period_at_vtime); + target = WEIGHT_ONE * MARGIN_TARGET_PCT / 100; + new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta); - return usage; + return clamp_t(s64, new_hwi, 1, hwm); } /* @@ -1816,7 +1812,7 @@ static void ioc_timer_fn(struct timer_list *timer) u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; u32 missed_ppm[2], rq_wait_pct; u64 period_vtime; - int prev_busy_level, i; + int prev_busy_level; /* how were the latencies during the period? */ ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct); @@ -1861,11 +1857,10 @@ static void ioc_timer_fn(struct timer_list *timer) } commit_weights(ioc); - /* calc usages and see whether some weights need to be moved around */ + /* calc usage and see whether some weights need to be moved around */ list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { - u64 vdone, vtime, usage_us; - u32 hw_active, hw_inuse, usage; - int uidx, nr_valid; + u64 vdone, vtime, usage_us, usage_dur; + u32 usage, hw_active, hw_inuse; /* * Collect unused and wind vtime closer to vnow to prevent @@ -1890,15 +1885,11 @@ static void ioc_timer_fn(struct timer_list *timer) nr_lagging++; /* - * Determine absolute usage factoring in pending and in-flight - * IOs to avoid stalls and high-latency completions appearing as - * idle. + * Determine absolute usage factoring in in-flight IOs to avoid + * high-latency completions appearing as idle. */ usage_us = iocg->usage_delta_us; - if (waitqueue_active(&iocg->waitq) && time_before64(vtime, now.vnow)) - usage_us += DIV64_U64_ROUND_UP( - cost_to_abs_cost(now.vnow - vtime, hw_inuse), - now.vrate); + if (vdone != vtime) { u64 inflight_us = DIV64_U64_ROUND_UP( cost_to_abs_cost(vtime - vdone, hw_inuse), @@ -1906,43 +1897,22 @@ static void ioc_timer_fn(struct timer_list *timer) usage_us = max(usage_us, inflight_us); } - /* convert to hweight based usage ratio and record */ - uidx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS; - - if (time_after64(vtime, now.vnow - ioc->margins.min)) { - iocg->usage_idx = uidx; - iocg->usages[uidx] = WEIGHT_ONE; - } else if (usage_us) { - u64 started_at, dur; - - if (time_after64(iocg->activated_at, ioc->period_at)) - started_at = iocg->activated_at; - else - started_at = ioc->period_at; - - dur = max_t(u64, now.now - started_at, 1); + /* convert to hweight based usage ratio */ + if (time_after64(iocg->activated_at, ioc->period_at)) + usage_dur = max_t(u64, now.now - iocg->activated_at, 1); + else + usage_dur = max_t(u64, now.now - ioc->period_at, 1); - iocg->usage_idx = uidx; - iocg->usages[uidx] = clamp_t(u32, - DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, dur), + usage = clamp_t(u32, + DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, + usage_dur), 1, WEIGHT_ONE); - } - - /* base the decision on max historical usage */ - for (i = 0, usage = 0, nr_valid = 0; i < NR_USAGE_SLOTS; i++) { - if (iocg->usages[i]) { - usage = max(usage, iocg->usages[i]); - nr_valid++; - } - } - if (nr_valid < MIN_VALID_USAGES) - usage = WEIGHT_ONE; /* see whether there's surplus vtime */ WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); if (hw_inuse < hw_active || (!waitqueue_active(&iocg->waitq) && - time_before64(vtime, now.vnow - ioc->margins.max))) { + time_before64(vtime, now.vnow - ioc->margins.low))) { u32 hwa, hwm, new_hwi; /* @@ -2179,15 +2149,14 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) if (!ioc->enabled || !iocg->level) return; - /* always activate so that even 0 cost IOs get protected to some level */ - if (!iocg_activate(iocg, &now)) - return; - /* calculate the absolute vtime cost */ abs_cost = calc_vtime_cost(bio, iocg, false); if (!abs_cost) return; + if (!iocg_activate(iocg, &now)) + return; + iocg->cursor = bio_end_sector(bio); vtime = atomic64_read(&iocg->vtime); From add8b3c8f0453d973e5699bf51569c438f6a3538 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:50 -0400 Subject: [PATCH 0355/5344] blk-iocost: revamp in-period donation snapbacks commit b0853ab4a238c54b8f97ca7dde1ae156e2bbd5e4 upstream. When the margin drops below the minimum on a donating iocg, donation is immediately canceled in full. There are a couple shortcomings with the current behavior. * It's abrupt. A small temporary budget deficit can lead to a wide swing in weight allocation and a large surplus. * It's open coded in the issue path but not implemented for the merge path. A series of merges at a low inuse can make the iocg incur debts and stall incorrectly. This patch reimplements in-period donation snapbacks so that * inuse adjustment and cost calculations are factored into adjust_inuse_and_calc_cost() which is called from both the issue and merge paths. * Snapbacks are more gradual. It occurs in quarter steps. * A snapback triggers if the margin goes below the low threshold and is lower than the budget at the time of the last adjustment. * For the above, __propagate_weights() stores the margin in iocg->saved_margin. Move iocg->last_inuse storing together into __propagate_weights() for consistency. * Full snapback is guaranteed when there are waiters. * With precise donation and gradual snapbacks, inuse adjustments are now a lot more effective and the value of scaling inuse on weight changes isn't clear. Removed inuse scaling from weight_update(). Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- block/blk-iocost.c | 133 ++++++++++++++++++++++++++++++++------------- 1 file changed, 96 insertions(+), 37 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 975cb1812a222..e76ba983921fe 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -229,6 +229,8 @@ enum { MARGIN_TARGET_PCT = 50, MARGIN_MAX_PCT = 100, + INUSE_ADJ_STEP_PCT = 25, + /* Have some play in timer operations */ TIMER_SLACK_PCT = 1, @@ -446,12 +448,17 @@ struct ioc_gq { * * `last_inuse` remembers `inuse` while an iocg is idle to persist * surplus adjustments. + * + * `inuse` may be adjusted dynamically during period. `saved_*` are used + * to determine and track adjustments. */ u32 cfg_weight; u32 weight; u32 active; u32 inuse; + u32 last_inuse; + s64 saved_margin; sector_t cursor; /* to detect randio */ @@ -938,9 +945,11 @@ static void ioc_start_period(struct ioc *ioc, struct ioc_now *now) /* * Update @iocg's `active` and `inuse` to @active and @inuse, update level - * weight sums and propagate upwards accordingly. + * weight sums and propagate upwards accordingly. If @save, the current margin + * is saved to be used as reference for later inuse in-period adjustments. */ -static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse) +static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse, + bool save, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; int lvl; @@ -949,6 +958,10 @@ static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse) inuse = clamp_t(u32, inuse, 1, active); + iocg->last_inuse = iocg->inuse; + if (save) + iocg->saved_margin = now->vnow - atomic64_read(&iocg->vtime); + if (active == iocg->active && inuse == iocg->inuse) return; @@ -1000,9 +1013,10 @@ static void commit_weights(struct ioc *ioc) } } -static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse) +static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse, + bool save, struct ioc_now *now) { - __propagate_weights(iocg, active, inuse); + __propagate_weights(iocg, active, inuse, save, now); commit_weights(iocg->ioc); } @@ -1086,7 +1100,7 @@ static u32 current_hweight_max(struct ioc_gq *iocg) return max_t(u32, hwm, 1); } -static void weight_updated(struct ioc_gq *iocg) +static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct blkcg_gq *blkg = iocg_to_blkg(iocg); @@ -1097,9 +1111,7 @@ static void weight_updated(struct ioc_gq *iocg) weight = iocg->cfg_weight ?: iocc->dfl_weight; if (weight != iocg->weight && iocg->active) - propagate_weights(iocg, weight, - DIV64_U64_ROUND_UP((u64)iocg->inuse * weight, - iocg->weight)); + propagate_weights(iocg, weight, iocg->inuse, true, now); iocg->weight = weight; } @@ -1169,8 +1181,9 @@ static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) */ iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1; list_add(&iocg->active_list, &ioc->active_iocgs); + propagate_weights(iocg, iocg->weight, - iocg->last_inuse ?: iocg->weight); + iocg->last_inuse ?: iocg->weight, true, now); TRACE_IOCG_PATH(iocg_activate, iocg, now, last_period, cur_period, vtime); @@ -1793,7 +1806,7 @@ static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now) inuse = DIV64_U64_ROUND_UP( parent->child_adjusted_sum * iocg->hweight_after_donation, parent->hweight_inuse); - __propagate_weights(iocg, iocg->active, inuse); + __propagate_weights(iocg, iocg->active, inuse, true, now); } /* walk list should be dissolved after use */ @@ -1848,8 +1861,7 @@ static void ioc_timer_fn(struct timer_list *timer) iocg_kick_waitq(iocg, true, &now); } else if (iocg_is_idle(iocg)) { /* no waiter and idle, deactivate */ - iocg->last_inuse = iocg->inuse; - __propagate_weights(iocg, 0, 0); + __propagate_weights(iocg, 0, 0, false, &now); list_del_init(&iocg->active_list); } @@ -1929,7 +1941,7 @@ static void ioc_timer_fn(struct timer_list *timer) list_add(&iocg->surplus_list, &surpluses); } else { __propagate_weights(iocg, iocg->active, - iocg->active); + iocg->active, true, &now); nr_shortages++; } } else { @@ -2059,6 +2071,50 @@ static void ioc_timer_fn(struct timer_list *timer) spin_unlock_irq(&ioc->lock); } +static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, + u64 abs_cost, struct ioc_now *now) +{ + struct ioc *ioc = iocg->ioc; + struct ioc_margins *margins = &ioc->margins; + u32 adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100); + u32 hwi; + s64 margin; + u64 cost, new_inuse; + + current_hweight(iocg, NULL, &hwi); + cost = abs_cost_to_cost(abs_cost, hwi); + margin = now->vnow - vtime - cost; + + /* + * We only increase inuse during period and do so iff the margin has + * deteriorated since the previous adjustment. + */ + if (margin >= iocg->saved_margin || margin >= margins->low || + iocg->inuse == iocg->active) + return cost; + + spin_lock_irq(&ioc->lock); + + /* we own inuse only when @iocg is in the normal active state */ + if (list_empty(&iocg->active_list)) { + spin_unlock_irq(&ioc->lock); + return cost; + } + + /* bump up inuse till @abs_cost fits in the existing budget */ + new_inuse = iocg->inuse; + do { + new_inuse = new_inuse + adj_step; + propagate_weights(iocg, iocg->active, new_inuse, true, now); + current_hweight(iocg, NULL, &hwi); + cost = abs_cost_to_cost(abs_cost, hwi); + } while (time_after64(vtime + cost, now->vnow) && + iocg->inuse != iocg->active); + + spin_unlock_irq(&ioc->lock); + return cost; +} + static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg, bool is_merge, u64 *costp) { @@ -2140,7 +2196,6 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) struct ioc_gq *iocg = blkg_to_iocg(blkg); struct ioc_now now; struct iocg_wait wait; - u32 hw_active, hw_inuse; u64 abs_cost, cost, vtime; bool use_debt, ioc_locked; unsigned long flags; @@ -2158,21 +2213,8 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) return; iocg->cursor = bio_end_sector(bio); - vtime = atomic64_read(&iocg->vtime); - current_hweight(iocg, &hw_active, &hw_inuse); - - if (hw_inuse < hw_active && - time_after_eq64(vtime + ioc->margins.min, now.vnow)) { - TRACE_IOCG_PATH(inuse_reset, iocg, &now, - iocg->inuse, iocg->weight, hw_inuse, hw_active); - spin_lock_irq(&ioc->lock); - propagate_weights(iocg, iocg->weight, iocg->weight); - spin_unlock_irq(&ioc->lock); - current_hweight(iocg, &hw_active, &hw_inuse); - } - - cost = abs_cost_to_cost(abs_cost, hw_inuse); + cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now); /* * If no one's waiting and within budget, issue right away. The @@ -2194,7 +2236,7 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) */ use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current); ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt); - +retry_lock: iocg_lock(iocg, ioc_locked, &flags); /* @@ -2236,6 +2278,17 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) return; } + /* guarantee that iocgs w/ waiters have maximum inuse */ + if (iocg->inuse != iocg->active) { + if (!ioc_locked) { + iocg_unlock(iocg, false, &flags); + ioc_locked = true; + goto retry_lock; + } + propagate_weights(iocg, iocg->active, iocg->active, true, + &now); + } + /* * Append self to the waitq and schedule the wakeup timer if we're * the first waiter. The timer duration is calculated based on the @@ -2278,8 +2331,7 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, struct ioc *ioc = iocg->ioc; sector_t bio_end = bio_end_sector(bio); struct ioc_now now; - u32 hw_inuse; - u64 abs_cost, cost; + u64 vtime, abs_cost, cost; unsigned long flags; /* bypass if disabled or for root cgroup */ @@ -2291,8 +2343,9 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, return; ioc_now(ioc, &now); - current_hweight(iocg, NULL, &hw_inuse); - cost = abs_cost_to_cost(abs_cost, hw_inuse); + + vtime = atomic64_read(&iocg->vtime); + cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now); /* update cursor if backmerging into the request at the cursor */ if (blk_rq_pos(rq) < bio_end && @@ -2534,7 +2587,7 @@ static void ioc_pd_init(struct blkg_policy_data *pd) } spin_lock_irqsave(&ioc->lock, flags); - weight_updated(iocg); + weight_updated(iocg, &now); spin_unlock_irqrestore(&ioc->lock, flags); } @@ -2548,7 +2601,10 @@ static void ioc_pd_free(struct blkg_policy_data *pd) spin_lock_irqsave(&ioc->lock, flags); if (!list_empty(&iocg->active_list)) { - propagate_weights(iocg, 0, 0); + struct ioc_now now; + + ioc_now(ioc, &now); + propagate_weights(iocg, 0, 0, false, &now); list_del_init(&iocg->active_list); } @@ -2616,6 +2672,7 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg); struct blkg_conf_ctx ctx; + struct ioc_now now; struct ioc_gq *iocg; u32 v; int ret; @@ -2636,7 +2693,8 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, if (iocg) { spin_lock_irq(&iocg->ioc->lock); - weight_updated(iocg); + ioc_now(iocg->ioc, &now); + weight_updated(iocg, &now); spin_unlock_irq(&iocg->ioc->lock); } } @@ -2662,7 +2720,8 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, spin_lock(&iocg->ioc->lock); iocg->cfg_weight = v * WEIGHT_ONE; - weight_updated(iocg); + ioc_now(iocg->ioc, &now); + weight_updated(iocg, &now); spin_unlock(&iocg->ioc->lock); blkg_conf_finish(&ctx); From 92de4bf5b0ae7f59cbbd329f384a1de6cab0daaa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:51 -0400 Subject: [PATCH 0356/5344] blk-iocost: revamp debt handling commit c421a3eb2e27402c14131a14390551ae6cdb15fa upstream. Debt handling had several issues. * How much inuse a debtor carries wasn't clearly defined. inuse would be driven down over time from not issuing IOs but it'd be better to clamp it to minimum immediately once in debt. * How much can be paid off was determined by hweight_inuse. As inuse was driven down, the payment amount would fall together regardless of the debtor's active weight. This means that the debtors were punished harshly. * ioc_rqos_merge() wasn't calling blkcg_schedule_throttle() after iocg_kick_delay(). This patch revamps debt handling so that * Debt handling owns inuse for iocgs in debt and keeps them at zero. * Payment amount is determined by hweight_active. This is more deterministic and safer than hweight_inuse but still far from ideal in that it doesn't factor in possible donations from other iocgs for debt payments. This likely needs further improvements in the future. * iocg_rqos_merge() now calls blkcg_schedule_throttle() as necessary. Signed-off-by: Tejun Heo Cc: Andy Newell Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- block/blk-iocost.c | 117 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 93 insertions(+), 24 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index e76ba983921fe..4382645aeb7cc 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1210,13 +1210,13 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) struct blkcg_gq *blkg = iocg_to_blkg(iocg); u64 vtime = atomic64_read(&iocg->vtime); u64 delta_ns, expires, oexpires; - u32 hw_inuse; + u32 hwa; lockdep_assert_held(&iocg->waitq.lock); /* debt-adjust vtime */ - current_hweight(iocg, NULL, &hw_inuse); - vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse); + current_hweight(iocg, &hwa, NULL); + vtime += abs_cost_to_cost(iocg->abs_vdebt, hwa); /* * Clear or maintain depending on the overage. Non-zero vdebt is what @@ -1262,6 +1262,47 @@ static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } +static void iocg_incur_debt(struct ioc_gq *iocg, u64 abs_cost, + struct ioc_now *now) +{ + struct iocg_pcpu_stat *gcs; + + lockdep_assert_held(&iocg->ioc->lock); + lockdep_assert_held(&iocg->waitq.lock); + WARN_ON_ONCE(list_empty(&iocg->active_list)); + + /* + * Once in debt, debt handling owns inuse. @iocg stays at the minimum + * inuse donating all of it share to others until its debt is paid off. + */ + if (!iocg->abs_vdebt && abs_cost) + propagate_weights(iocg, iocg->active, 0, false, now); + + iocg->abs_vdebt += abs_cost; + + gcs = get_cpu_ptr(iocg->pcpu_stat); + local64_add(abs_cost, &gcs->abs_vusage); + put_cpu_ptr(gcs); +} + +static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay, + struct ioc_now *now) +{ + lockdep_assert_held(&iocg->ioc->lock); + lockdep_assert_held(&iocg->waitq.lock); + + /* make sure that nobody messed with @iocg */ + WARN_ON_ONCE(list_empty(&iocg->active_list)); + WARN_ON_ONCE(iocg->inuse > 1); + + iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt); + + /* if debt is paid in full, restore inuse */ + if (!iocg->abs_vdebt) + propagate_weights(iocg, iocg->active, iocg->last_inuse, + false, now); +} + static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key) { @@ -1300,26 +1341,25 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt, struct iocg_wake_ctx ctx = { .iocg = iocg }; u64 vshortage, expires, oexpires; s64 vbudget; - u32 hw_inuse; + u32 hwa; lockdep_assert_held(&iocg->waitq.lock); - current_hweight(iocg, NULL, &hw_inuse); + current_hweight(iocg, &hwa, NULL); vbudget = now->vnow - atomic64_read(&iocg->vtime); /* pay off debt */ if (pay_debt && iocg->abs_vdebt && vbudget > 0) { - u64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse); - u64 delta = min_t(u64, vbudget, vdebt); - u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse), - iocg->abs_vdebt); + u64 abs_vbudget = cost_to_abs_cost(vbudget, hwa); + u64 abs_vpay = min_t(u64, abs_vbudget, iocg->abs_vdebt); + u64 vpay = abs_cost_to_cost(abs_vpay, hwa); lockdep_assert_held(&ioc->lock); - atomic64_add(delta, &iocg->vtime); - atomic64_add(delta, &iocg->done_vtime); - iocg->abs_vdebt -= abs_delta; - vbudget -= vdebt; + atomic64_add(vpay, &iocg->vtime); + atomic64_add(vpay, &iocg->done_vtime); + iocg_pay_debt(iocg, abs_vpay, now); + vbudget -= vpay; iocg_kick_delay(iocg, now); } @@ -1331,17 +1371,20 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt, * not positive. */ if (iocg->abs_vdebt) { - s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse); + s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hwa); vbudget = min_t(s64, 0, vbudget - vdebt); } /* - * Wake up the ones which are due and see how much vtime we'll need - * for the next one. + * Wake up the ones which are due and see how much vtime we'll need for + * the next one. As paying off debt restores hw_inuse, it must be read + * after the above debt payment. */ - ctx.hw_inuse = hw_inuse; ctx.vbudget = vbudget; + current_hweight(iocg, NULL, &ctx.hw_inuse); + __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx); + if (!waitqueue_active(&iocg->waitq)) return; if (WARN_ON_ONCE(ctx.vbudget >= 0)) @@ -1529,6 +1572,10 @@ static u32 hweight_after_donation(struct ioc_gq *iocg, u32 hwm, u32 usage, u64 vtime = atomic64_read(&iocg->vtime); s64 excess, delta, target, new_hwi; + /* debt handling owns inuse for debtors */ + if (iocg->abs_vdebt) + return 1; + /* see whether minimum margin requirement is met */ if (waitqueue_active(&iocg->waitq) || time_after64(vtime, now->vnow - ioc->margins.min)) @@ -1802,6 +1849,18 @@ static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now) struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; u32 inuse; + /* + * In-debt iocgs participated in the donation calculation with + * the minimum target hweight_inuse. Configuring inuse + * accordingly would work fine but debt handling expects + * @iocg->inuse stay at the minimum and we don't wanna + * interfere. + */ + if (iocg->abs_vdebt) { + WARN_ON_ONCE(iocg->inuse > 1); + continue; + } + /* w' = s' * b' / b'_p, note that b' == b'_t for donating leaves */ inuse = DIV64_U64_ROUND_UP( parent->child_adjusted_sum * iocg->hweight_after_donation, @@ -2085,6 +2144,10 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, cost = abs_cost_to_cost(abs_cost, hwi); margin = now->vnow - vtime - cost; + /* debt handling owns inuse for debtors */ + if (iocg->abs_vdebt) + return cost; + /* * We only increase inuse during period and do so iff the margin has * deteriorated since the previous adjustment. @@ -2096,7 +2159,7 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, spin_lock_irq(&ioc->lock); /* we own inuse only when @iocg is in the normal active state */ - if (list_empty(&iocg->active_list)) { + if (iocg->abs_vdebt || list_empty(&iocg->active_list)) { spin_unlock_irq(&ioc->lock); return cost; } @@ -2270,7 +2333,7 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) * penalizing the cgroup and its descendants. */ if (use_debt) { - iocg->abs_vdebt += abs_cost; + iocg_incur_debt(iocg, abs_cost, &now); if (iocg_kick_delay(iocg, &now)) blkcg_schedule_throttle(rqos->q, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); @@ -2279,7 +2342,7 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) } /* guarantee that iocgs w/ waiters have maximum inuse */ - if (iocg->inuse != iocg->active) { + if (!iocg->abs_vdebt && iocg->inuse != iocg->active) { if (!ioc_locked) { iocg_unlock(iocg, false, &flags); ioc_locked = true; @@ -2367,14 +2430,20 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, * be for the vast majority of cases. See debt handling in * ioc_rqos_throttle() for details. */ - spin_lock_irqsave(&iocg->waitq.lock, flags); + spin_lock_irqsave(&ioc->lock, flags); + spin_lock(&iocg->waitq.lock); + if (likely(!list_empty(&iocg->active_list))) { - iocg->abs_vdebt += abs_cost; - iocg_kick_delay(iocg, &now); + iocg_incur_debt(iocg, abs_cost, &now); + if (iocg_kick_delay(iocg, &now)) + blkcg_schedule_throttle(rqos->q, + (bio->bi_opf & REQ_SWAP) == REQ_SWAP); } else { iocg_commit_bio(iocg, bio, abs_cost, cost); } - spin_unlock_irqrestore(&iocg->waitq.lock, flags); + + spin_unlock(&iocg->waitq.lock); + spin_unlock_irqrestore(&ioc->lock, flags); } static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio) From 98ac453aeb657b49ed5c4726ee1c8b605ab382bb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:52 -0400 Subject: [PATCH 0357/5344] blk-iocost: implement delay adjustment hysteresis commit 5160a5a53c0c4ae3708959d9465ea43ad5d90542 upstream. Curently, iocost syncs the delay duration to the outstanding debt amount, which seemed enough to protect the system from anon memory hogs. However, that was mostly because the delay calcuation was using hweight_inuse which quickly converges towards zero under debt for delay duration calculation, often pusnishing debtors overly harshly for longer than deserved. The previous patch fixed the delay calcuation and now the protection against anonymous memory hogs isn't enough because the effect of delay is indirect and non-linear and a huge amount of future debt can accumulate abruptly while unthrottled. This patch implements delay hysteresis so that delay is decayed exponentially over time instead of getting cleared immediately as debt is paid off. While the overall behavior is similar to the blk-cgroup implementation used by blk-iolatency, a lot of the details are different and due to the empirical nature of the mechanism, it's challenging to adapt the mechanism for one controller without negatively impacting the other. As the delay is gradually decayed now, there's no point in running it from its own hrtimer. Periodic updates are now performed from ioc_timer_fn() and the dedicated hrtimer is removed. Signed-off-by: Tejun Heo Cc: Josef Bacik Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- block/blk-cgroup.c | 23 ++++++--- block/blk-iocost.c | 119 ++++++++++++++++++++++++++------------------- 2 files changed, 86 insertions(+), 56 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index bf0438f41a0e9..57ecfbbd0926e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1695,16 +1695,24 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now) static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) { unsigned long pflags; + bool clamp; u64 now = ktime_to_ns(ktime_get()); u64 exp; u64 delay_nsec = 0; int tok; while (blkg->parent) { - if (atomic_read(&blkg->use_delay)) { + int use_delay = atomic_read(&blkg->use_delay); + + if (use_delay) { + u64 this_delay; + blkcg_scale_delay(blkg, now); - delay_nsec = max_t(u64, delay_nsec, - atomic64_read(&blkg->delay_nsec)); + this_delay = atomic64_read(&blkg->delay_nsec); + if (this_delay > delay_nsec) { + delay_nsec = this_delay; + clamp = use_delay > 0; + } } blkg = blkg->parent; } @@ -1716,10 +1724,13 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) * Let's not sleep for all eternity if we've amassed a huge delay. * Swapping or metadata IO can accumulate 10's of seconds worth of * delay, and we want userspace to be able to do _something_ so cap the - * delays at 1 second. If there's 10's of seconds worth of delay then - * the tasks will be delayed for 1 second for every syscall. + * delays at 0.25s. If there's 10's of seconds worth of delay then the + * tasks will be delayed for 0.25 second for every syscall. If + * blkcg_set_delay() was used as indicated by negative use_delay, the + * caller is responsible for regulating the range. */ - delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC); + if (clamp) + delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC); if (use_memdelay) psi_memstall_enter(&pflags); diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 4382645aeb7cc..6c1dcd426397b 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -273,6 +273,31 @@ enum { /* unbusy hysterisis */ UNBUSY_THR_PCT = 75, + /* + * The effect of delay is indirect and non-linear and a huge amount of + * future debt can accumulate abruptly while unthrottled. Linearly scale + * up delay as debt is going up and then let it decay exponentially. + * This gives us quick ramp ups while delay is accumulating and long + * tails which can help reducing the frequency of debt explosions on + * unthrottle. The parameters are experimentally determined. + * + * The delay mechanism provides adequate protection and behavior in many + * cases. However, this is far from ideal and falls shorts on both + * fronts. The debtors are often throttled too harshly costing a + * significant level of fairness and possibly total work while the + * protection against their impacts on the system can be choppy and + * unreliable. + * + * The shortcoming primarily stems from the fact that, unlike for page + * cache, the kernel doesn't have well-defined back-pressure propagation + * mechanism and policies for anonymous memory. Fully addressing this + * issue will likely require substantial improvements in the area. + */ + MIN_DELAY_THR_PCT = 500, + MAX_DELAY_THR_PCT = 25000, + MIN_DELAY = 250, + MAX_DELAY = 250 * USEC_PER_MSEC, + /* don't let cmds which take a very long time pin lagging for too long */ MAX_LAGGING_PERIODS = 10, @@ -476,6 +501,10 @@ struct ioc_gq { atomic64_t done_vtime; u64 abs_vdebt; + /* current delay in effect and when it started */ + u64 delay; + u64 delay_at; + /* * The period this iocg was last active in. Used for deactivation * and invalidating `vtime`. @@ -498,7 +527,6 @@ struct ioc_gq { struct wait_queue_head waitq; struct hrtimer waitq_timer; - struct hrtimer delay_timer; /* timestamp at the latest activation */ u64 activated_at; @@ -1208,58 +1236,50 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct blkcg_gq *blkg = iocg_to_blkg(iocg); - u64 vtime = atomic64_read(&iocg->vtime); - u64 delta_ns, expires, oexpires; + u64 tdelta, delay, new_delay; + s64 vover, vover_pct; u32 hwa; lockdep_assert_held(&iocg->waitq.lock); - /* debt-adjust vtime */ + /* calculate the current delay in effect - 1/2 every second */ + tdelta = now->now - iocg->delay_at; + if (iocg->delay) + delay = iocg->delay >> div64_u64(tdelta, USEC_PER_SEC); + else + delay = 0; + + /* calculate the new delay from the debt amount */ current_hweight(iocg, &hwa, NULL); - vtime += abs_cost_to_cost(iocg->abs_vdebt, hwa); + vover = atomic64_read(&iocg->vtime) + + abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow; + vover_pct = div64_s64(100 * vover, ioc->period_us * now->vrate); + + if (vover_pct <= MIN_DELAY_THR_PCT) + new_delay = 0; + else if (vover_pct >= MAX_DELAY_THR_PCT) + new_delay = MAX_DELAY; + else + new_delay = MIN_DELAY + + div_u64((MAX_DELAY - MIN_DELAY) * + (vover_pct - MIN_DELAY_THR_PCT), + MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT); - /* - * Clear or maintain depending on the overage. Non-zero vdebt is what - * guarantees that @iocg is online and future iocg_kick_delay() will - * clear use_delay. Don't leave it on when there's no vdebt. - */ - if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) { - blkcg_clear_delay(blkg); - return false; + /* pick the higher one and apply */ + if (new_delay > delay) { + iocg->delay = new_delay; + iocg->delay_at = now->now; + delay = new_delay; } - if (!atomic_read(&blkg->use_delay) && - time_before_eq64(vtime, now->vnow + ioc->margins.target)) - return false; - - /* use delay */ - delta_ns = DIV64_U64_ROUND_UP(vtime - now->vnow, - now->vrate) * NSEC_PER_USEC; - blkcg_set_delay(blkg, delta_ns); - expires = now->now_ns + delta_ns; - /* if already active and close enough, don't bother */ - oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer)); - if (hrtimer_is_queued(&iocg->delay_timer) && - abs(oexpires - expires) <= ioc->timer_slack_ns) + if (delay >= MIN_DELAY) { + blkcg_set_delay(blkg, delay * NSEC_PER_USEC); return true; - - hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires), - ioc->timer_slack_ns, HRTIMER_MODE_ABS); - return true; -} - -static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer) -{ - struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer); - struct ioc_now now; - unsigned long flags; - - spin_lock_irqsave(&iocg->waitq.lock, flags); - ioc_now(iocg->ioc, &now); - iocg_kick_delay(iocg, &now); - spin_unlock_irqrestore(&iocg->waitq.lock, flags); - - return HRTIMER_NORESTART; + } else { + iocg->delay = 0; + blkcg_clear_delay(blkg); + return false; + } } static void iocg_incur_debt(struct ioc_gq *iocg, u64 abs_cost, @@ -1360,9 +1380,10 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt, atomic64_add(vpay, &iocg->done_vtime); iocg_pay_debt(iocg, abs_vpay, now); vbudget -= vpay; + } + if (iocg->abs_vdebt || iocg->delay) iocg_kick_delay(iocg, now); - } /* * Debt can still be outstanding if we haven't paid all yet or the @@ -1910,12 +1931,13 @@ static void ioc_timer_fn(struct timer_list *timer) */ list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && - !iocg_is_idle(iocg)) + !iocg->delay && !iocg_is_idle(iocg)) continue; spin_lock(&iocg->waitq.lock); - if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) { + if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt || + iocg->delay) { /* might be oversleeping vtime / hweight changes, kick */ iocg_kick_waitq(iocg, true, &now); } else if (iocg_is_idle(iocg)) { @@ -2645,8 +2667,6 @@ static void ioc_pd_init(struct blkg_policy_data *pd) init_waitqueue_head(&iocg->waitq); hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); iocg->waitq_timer.function = iocg_waitq_timer_fn; - hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - iocg->delay_timer.function = iocg_delay_timer_fn; iocg->level = blkg->blkcg->css.cgroup->level; @@ -2683,7 +2703,6 @@ static void ioc_pd_free(struct blkg_policy_data *pd) spin_unlock_irqrestore(&ioc->lock, flags); hrtimer_cancel(&iocg->waitq_timer); - hrtimer_cancel(&iocg->delay_timer); } free_percpu(iocg->pcpu_stat); kfree(iocg); From e042d3875b2e6ae78fae143a1208fed711ada0ec Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:53 -0400 Subject: [PATCH 0358/5344] blk-iocost: halve debts if device stays idle commit dda1315f18536cb94e293a3ae0ccb5c4df53e5e4 upstream. A low weight iocg can amass a large amount of debt, for example, when anonymous memory gets reclaimed aggressively. If the system has a lot of memory paired with a slow IO device, the debt can span multiple seconds or more. If there are no other subsequent IO issuers, the in-debt iocg may end up blocked paying its debt while the IO device is idle. This patch implements a mechanism to protect against such pathological cases. If the device has been sufficiently idle for a substantial amount of time, the debts are halved. The criteria are on the conservative side as we want to resolve the rare extreme cases without impacting regular operation by forgiving debts too readily. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- block/blk-iocost.c | 49 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 6c1dcd426397b..b7c0abd8ed6ab 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -298,6 +298,13 @@ enum { MIN_DELAY = 250, MAX_DELAY = 250 * USEC_PER_MSEC, + /* + * Halve debts if total usage keeps staying under 25% w/o any shortages + * for over 100ms. + */ + DEBT_BUSY_USAGE_PCT = 25, + DEBT_REDUCTION_IDLE_DUR = 100 * USEC_PER_MSEC, + /* don't let cmds which take a very long time pin lagging for too long */ MAX_LAGGING_PERIODS = 10, @@ -439,6 +446,9 @@ struct ioc { bool weights_updated; atomic_t hweight_gen; /* for lazy hweights */ + /* the last time debt cancel condition wasn't met */ + u64 debt_busy_at; + u64 autop_too_fast_at; u64 autop_too_slow_at; int autop_idx; @@ -1220,6 +1230,7 @@ static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) if (ioc->running == IOC_IDLE) { ioc->running = IOC_RUNNING; + ioc->debt_busy_at = now->now; ioc_start_period(ioc, now); } @@ -1900,7 +1911,8 @@ static void ioc_timer_fn(struct timer_list *timer) struct ioc_gq *iocg, *tiocg; struct ioc_now now; LIST_HEAD(surpluses); - int nr_shortages = 0, nr_lagging = 0; + int nr_debtors = 0, nr_shortages = 0, nr_lagging = 0; + u64 usage_us_sum = 0; u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; u32 missed_ppm[2], rq_wait_pct; @@ -1940,6 +1952,8 @@ static void ioc_timer_fn(struct timer_list *timer) iocg->delay) { /* might be oversleeping vtime / hweight changes, kick */ iocg_kick_waitq(iocg, true, &now); + if (iocg->abs_vdebt) + nr_debtors++; } else if (iocg_is_idle(iocg)) { /* no waiter and idle, deactivate */ __propagate_weights(iocg, 0, 0, false, &now); @@ -1982,6 +1996,7 @@ static void ioc_timer_fn(struct timer_list *timer) * high-latency completions appearing as idle. */ usage_us = iocg->usage_delta_us; + usage_us_sum += usage_us; if (vdone != vtime) { u64 inflight_us = DIV64_U64_ROUND_UP( @@ -2040,6 +2055,38 @@ static void ioc_timer_fn(struct timer_list *timer) list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list) list_del_init(&iocg->surplus_list); + /* + * A low weight iocg can amass a large amount of debt, for example, when + * anonymous memory gets reclaimed aggressively. If the system has a lot + * of memory paired with a slow IO device, the debt can span multiple + * seconds or more. If there are no other subsequent IO issuers, the + * in-debt iocg may end up blocked paying its debt while the IO device + * is idle. + * + * The following protects against such pathological cases. If the device + * has been sufficiently idle for a substantial amount of time, the + * debts are halved. The criteria are on the conservative side as we + * want to resolve the rare extreme cases without impacting regular + * operation by forgiving debts too readily. + */ + if (nr_shortages || + div64_u64(100 * usage_us_sum, now.now - ioc->period_at) >= + DEBT_BUSY_USAGE_PCT) + ioc->debt_busy_at = now.now; + + if (nr_debtors && + now.now - ioc->debt_busy_at >= DEBT_REDUCTION_IDLE_DUR) { + list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { + if (iocg->abs_vdebt) { + spin_lock(&iocg->waitq.lock); + iocg->abs_vdebt /= 2; + iocg_kick_waitq(iocg, true, &now); + spin_unlock(&iocg->waitq.lock); + } + } + ioc->debt_busy_at = now.now; + } + /* * If q is getting clogged or we're missing too much, we're issuing * too much IO and should lower vtime rate. If we're not missing From 97f68752b915917c8d18d3dbc1b8bc0dd8c00028 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:54 -0400 Subject: [PATCH 0359/5344] blk-iocost: implement vtime loss compensation commit ac33e91e2daca40fcad66c68712276da7b40f169 upstream. Note: there are a few conflicts need to be resolved when the origin patch applied. When an iocg accumulates too much vtime or gets deactivated, we throw away some vtime, which lowers the overall device utilization. As the exact amount which is being thrown away is known, we can compensate by accelerating the vrate accordingly so that the extra vtime generated in the current period matches what got lost. This significantly improves work conservation when involving high weight cgroups with intermittent and bursty IO patterns. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- block/blk-iocost.c | 132 ++++++++++++++++++++++++++++++--------------- 1 file changed, 90 insertions(+), 42 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index b7c0abd8ed6ab..46e9324e05a50 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -227,20 +227,12 @@ enum { MARGIN_MIN_PCT = 10, MARGIN_LOW_PCT = 20, MARGIN_TARGET_PCT = 50, - MARGIN_MAX_PCT = 100, INUSE_ADJ_STEP_PCT = 25, /* Have some play in timer operations */ TIMER_SLACK_PCT = 1, - /* - * vtime can wrap well within a reasonable uptime when vrate is - * consistently raised. Don't trust recorded cgroup vtime if the - * period counter indicates that it's older than 5mins. - */ - VTIME_VALID_DUR = 300 * USEC_PER_SEC, - /* 1/64k is granular enough and can easily be handled w/ u32 */ WEIGHT_ONE = 1 << 16, @@ -398,7 +390,6 @@ struct ioc_margins { s64 min; s64 low; s64 target; - s64 max; }; struct ioc_missed { @@ -435,6 +426,8 @@ struct ioc { enum ioc_running running; atomic64_t vtime_rate; + u64 vtime_base_rate; + s64 vtime_err; seqcount_t period_seqcount; u64 period_at; /* wallclock starttime */ @@ -763,12 +756,11 @@ static void ioc_refresh_margins(struct ioc *ioc) { struct ioc_margins *margins = &ioc->margins; u32 period_us = ioc->period_us; - u64 vrate = atomic64_read(&ioc->vtime_rate); + u64 vrate = ioc->vtime_base_rate; margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate; margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate; margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate; - margins->max = (period_us * MARGIN_MAX_PCT / 100) * vrate; } /* latency Qos params changed, update period_us and all the dependent params */ @@ -834,8 +826,7 @@ static int ioc_autop_idx(struct ioc *ioc) return idx; /* step up/down based on the vrate */ - vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100, - VTIME_PER_USEC); + vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC); now_ns = ktime_get_ns(); if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) { @@ -943,6 +934,43 @@ static bool ioc_refresh_params(struct ioc *ioc, bool force) return true; } +/* + * When an iocg accumulates too much vtime or gets deactivated, we throw away + * some vtime, which lowers the overall device utilization. As the exact amount + * which is being thrown away is known, we can compensate by accelerating the + * vrate accordingly so that the extra vtime generated in the current period + * matches what got lost. + */ +static void ioc_refresh_vrate(struct ioc *ioc, struct ioc_now *now) +{ + s64 pleft = ioc->period_at + ioc->period_us - now->now; + s64 vperiod = ioc->period_us * ioc->vtime_base_rate; + s64 vcomp, vcomp_min, vcomp_max; + + lockdep_assert_held(&ioc->lock); + + /* we need some time left in this period */ + if (pleft <= 0) + goto done; + + /* + * Calculate how much vrate should be adjusted to offset the error. + * Limit the amount of adjustment and deduct the adjusted amount from + * the error. + */ + vcomp = -div64_s64(ioc->vtime_err, pleft); + vcomp_min = -(ioc->vtime_base_rate >> 1); + vcomp_max = ioc->vtime_base_rate; + vcomp = clamp(vcomp, vcomp_min, vcomp_max); + + ioc->vtime_err += vcomp * pleft; + + atomic64_set(&ioc->vtime_rate, ioc->vtime_base_rate + vcomp); +done: + /* bound how much error can accumulate */ + ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod); +} + /* take a snapshot of the current [v]time and vrate */ static void ioc_now(struct ioc *ioc, struct ioc_now *now) { @@ -1156,8 +1184,8 @@ static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now) static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; - u64 last_period, cur_period, max_period_delta; - u64 vtime, vmin; + u64 last_period, cur_period; + u64 vtime, vtarget; int i; /* @@ -1196,21 +1224,15 @@ static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) goto fail_unlock; /* - * vtime may wrap when vrate is raised substantially due to - * underestimated IO costs. Look at the period and ignore its - * vtime if the iocg has been idle for too long. Also, cap the - * budget it can start with to the margin. + * Always start with the target budget. On deactivation, we throw away + * anything above it. */ - max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us); + vtarget = now->vnow - ioc->margins.target; vtime = atomic64_read(&iocg->vtime); - vmin = now->vnow - ioc->margins.max; - if (last_period + max_period_delta < cur_period || - time_before64(vtime, vmin)) { - atomic64_add(vmin - vtime, &iocg->vtime); - atomic64_add(vmin - vtime, &iocg->done_vtime); - vtime = vmin; - } + atomic64_add(vtarget - vtime, &iocg->vtime); + atomic64_add(vtarget - vtime, &iocg->done_vtime); + vtime = vtarget; /* * Activate, propagate weight and start period timer if not @@ -1264,7 +1286,8 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) current_hweight(iocg, &hwa, NULL); vover = atomic64_read(&iocg->vtime) + abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow; - vover_pct = div64_s64(100 * vover, ioc->period_us * now->vrate); + vover_pct = div64_s64(100 * vover, + ioc->period_us * ioc->vtime_base_rate); if (vover_pct <= MIN_DELAY_THR_PCT) new_delay = 0; @@ -1425,7 +1448,8 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt, /* determine next wakeup, add a timer margin to guarantee chunking */ vshortage = -ctx.vbudget; expires = now->now_ns + - DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC; + DIV64_U64_ROUND_UP(vshortage, ioc->vtime_base_rate) * + NSEC_PER_USEC; expires += ioc->timer_slack_ns; /* if already active and close enough, don't bother */ @@ -1540,6 +1564,7 @@ static void iocg_build_inner_walk(struct ioc_gq *iocg, /* collect per-cpu counters and propagate the deltas to the parent */ static void iocg_flush_stat_one(struct ioc_gq *iocg, struct ioc_now *now) { + struct ioc *ioc = iocg->ioc; struct iocg_stat new_stat; u64 abs_vusage = 0; u64 vusage_delta; @@ -1555,7 +1580,7 @@ static void iocg_flush_stat_one(struct ioc_gq *iocg, struct ioc_now *now) vusage_delta = abs_vusage - iocg->last_stat_abs_vusage; iocg->last_stat_abs_vusage = abs_vusage; - iocg->usage_delta_us = div64_u64(vusage_delta, now->vrate); + iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate); iocg->local_stat.usage_us += iocg->usage_delta_us; new_stat.usage_us = @@ -1597,8 +1622,8 @@ static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now) * capacity. @hwm is the upper bound and used to signal no donation. This * function also throws away @iocg's excess budget. */ -static u32 hweight_after_donation(struct ioc_gq *iocg, u32 hwm, u32 usage, - struct ioc_now *now) +static u32 hweight_after_donation(struct ioc_gq *iocg, u32 old_hwi, u32 hwm, + u32 usage, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; u64 vtime = atomic64_read(&iocg->vtime); @@ -1613,12 +1638,13 @@ static u32 hweight_after_donation(struct ioc_gq *iocg, u32 hwm, u32 usage, time_after64(vtime, now->vnow - ioc->margins.min)) return hwm; - /* throw away excess above max */ - excess = now->vnow - vtime - ioc->margins.max; + /* throw away excess above target */ + excess = now->vnow - vtime - ioc->margins.target; if (excess > 0) { atomic64_add(excess, &iocg->vtime); atomic64_add(excess, &iocg->done_vtime); vtime += excess; + ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE); } /* @@ -1956,6 +1982,24 @@ static void ioc_timer_fn(struct timer_list *timer) nr_debtors++; } else if (iocg_is_idle(iocg)) { /* no waiter and idle, deactivate */ + u64 vtime = atomic64_read(&iocg->vtime); + s64 excess; + + /* + * @iocg has been inactive for a full duration and will + * have a high budget. Account anything above target as + * error and throw away. On reactivation, it'll start + * with the target budget. + */ + excess = now.vnow - vtime - ioc->margins.target; + if (excess > 0) { + u32 old_hwi; + + current_hweight(iocg, NULL, &old_hwi); + ioc->vtime_err -= div64_u64(excess * old_hwi, + WEIGHT_ONE); + } + __propagate_weights(iocg, 0, 0, false, &now); list_del_init(&iocg->active_list); } @@ -2001,7 +2045,7 @@ static void ioc_timer_fn(struct timer_list *timer) if (vdone != vtime) { u64 inflight_us = DIV64_U64_ROUND_UP( cost_to_abs_cost(vtime - vdone, hw_inuse), - now.vrate); + ioc->vtime_base_rate); usage_us = max(usage_us, inflight_us); } @@ -2021,16 +2065,16 @@ static void ioc_timer_fn(struct timer_list *timer) if (hw_inuse < hw_active || (!waitqueue_active(&iocg->waitq) && time_before64(vtime, now.vnow - ioc->margins.low))) { - u32 hwa, hwm, new_hwi; + u32 hwa, old_hwi, hwm, new_hwi; /* * Already donating or accumulated enough to start. * Determine the donation amount. */ - current_hweight(iocg, &hwa, NULL); + current_hweight(iocg, &hwa, &old_hwi); hwm = current_hweight_max(iocg); - new_hwi = hweight_after_donation(iocg, hwm, usage, - &now); + new_hwi = hweight_after_donation(iocg, old_hwi, hwm, + usage, &now); if (new_hwi < hwm) { iocg->hweight_donating = hwa; iocg->hweight_after_donation = new_hwi; @@ -2134,7 +2178,7 @@ static void ioc_timer_fn(struct timer_list *timer) ioc->busy_level = clamp(ioc->busy_level, -1000, 1000); if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) { - u64 vrate = atomic64_read(&ioc->vtime_rate); + u64 vrate = ioc->vtime_base_rate; u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max; /* rq_wait signal is always reliable, ignore user vrate_min */ @@ -2171,7 +2215,7 @@ static void ioc_timer_fn(struct timer_list *timer) trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct, nr_lagging, nr_shortages); - atomic64_set(&ioc->vtime_rate, vrate); + ioc->vtime_base_rate = vrate; ioc_refresh_margins(ioc); } else if (ioc->busy_level != prev_busy_level || nr_lagging) { trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate), @@ -2192,8 +2236,11 @@ static void ioc_timer_fn(struct timer_list *timer) ioc_start_period(ioc, &now); } else { ioc->busy_level = 0; + ioc->vtime_err = 0; ioc->running = IOC_IDLE; } + + ioc_refresh_vrate(ioc, &now); } spin_unlock_irq(&ioc->lock); @@ -2632,6 +2679,7 @@ static int blk_iocost_init(struct request_queue *q) INIT_LIST_HEAD(&ioc->active_iocgs); ioc->running = IOC_IDLE; + ioc->vtime_base_rate = VTIME_PER_USEC; atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); seqcount_init(&ioc->period_seqcount); ioc->period_at = ktime_to_us(ktime_get()); @@ -2766,7 +2814,7 @@ static size_t ioc_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size) if (iocg->level == 0) { unsigned vp10k = DIV64_U64_ROUND_CLOSEST( - atomic64_read(&ioc->vtime_rate) * 10000, + ioc->vtime_base_rate * 10000, VTIME_PER_USEC); pos += scnprintf(buf + pos, size - pos, " cost.vrate=%u.%02u", vp10k / 100, vp10k % 100); From 021c2e430b14190a29b318468d09b1f3c1cf8bda Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:55 -0400 Subject: [PATCH 0360/5344] blk-iocost: restore inuse update tracepoints commit 046037551721e8831f6718ac2149887f6bb1f802 upstream. Update and restore the inuse update tracepoints. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- block/blk-iocost.c | 16 ++++++++++++++++ include/trace/events/iocost.h | 6 +++--- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 46e9324e05a50..92fdc9a792451 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1923,6 +1923,12 @@ static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now) inuse = DIV64_U64_ROUND_UP( parent->child_adjusted_sum * iocg->hweight_after_donation, parent->hweight_inuse); + + TRACE_IOCG_PATH(inuse_transfer, iocg, now, + iocg->inuse, inuse, + iocg->hweight_inuse, + iocg->hweight_after_donation); + __propagate_weights(iocg, iocg->active, inuse, true, now); } @@ -2080,6 +2086,10 @@ static void ioc_timer_fn(struct timer_list *timer) iocg->hweight_after_donation = new_hwi; list_add(&iocg->surplus_list, &surpluses); } else { + TRACE_IOCG_PATH(inuse_shortage, iocg, &now, + iocg->inuse, iocg->active, + iocg->hweight_inuse, new_hwi); + __propagate_weights(iocg, iocg->active, iocg->active, true, &now); nr_shortages++; @@ -2252,11 +2262,13 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, struct ioc *ioc = iocg->ioc; struct ioc_margins *margins = &ioc->margins; u32 adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100); + u32 __maybe_unused old_inuse = iocg->inuse, __maybe_unused old_hwi; u32 hwi; s64 margin; u64 cost, new_inuse; current_hweight(iocg, NULL, &hwi); + old_hwi = hwi; cost = abs_cost_to_cost(abs_cost, hwi); margin = now->vnow - vtime - cost; @@ -2291,6 +2303,10 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, iocg->inuse != iocg->active); spin_unlock_irq(&ioc->lock); + + TRACE_IOCG_PATH(inuse_adjust, iocg, now, + old_inuse, iocg->inuse, old_hwi, hwi); + return cost; } diff --git a/include/trace/events/iocost.h b/include/trace/events/iocost.h index ee024fe8fef66..b350860d2e711 100644 --- a/include/trace/events/iocost.h +++ b/include/trace/events/iocost.h @@ -95,7 +95,7 @@ DECLARE_EVENT_CLASS(iocg_inuse_update, ) ); -DEFINE_EVENT(iocg_inuse_update, iocost_inuse_takeback, +DEFINE_EVENT(iocg_inuse_update, iocost_inuse_shortage, TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now, u32 old_inuse, u32 new_inuse, @@ -105,7 +105,7 @@ DEFINE_EVENT(iocg_inuse_update, iocost_inuse_takeback, old_hw_inuse, new_hw_inuse) ); -DEFINE_EVENT(iocg_inuse_update, iocost_inuse_giveaway, +DEFINE_EVENT(iocg_inuse_update, iocost_inuse_transfer, TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now, u32 old_inuse, u32 new_inuse, @@ -115,7 +115,7 @@ DEFINE_EVENT(iocg_inuse_update, iocost_inuse_giveaway, old_hw_inuse, new_hw_inuse) ); -DEFINE_EVENT(iocg_inuse_update, iocost_inuse_reset, +DEFINE_EVENT(iocg_inuse_update, iocost_inuse_adjust, TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now, u32 old_inuse, u32 new_inuse, From 85f286cc66b4cbebb6dd8f8ee68763c160b7a6fe Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:56 -0400 Subject: [PATCH 0361/5344] blk-iocost: add three debug stat - cost.wait, indebt and indelay commit f0bf84a5dffa0800c4f8ae52a75c579978086901 upstream. These are really cheap to collect and can be useful in debugging iocost behavior. Add them as debug stats for now. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- block/blk-iocost.c | 77 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 72 insertions(+), 5 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 92fdc9a792451..47b88661ddec1 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -455,6 +455,9 @@ struct iocg_pcpu_stat { struct iocg_stat { u64 usage_us; + u64 wait_us; + u64 indebt_us; + u64 indelay_us; }; /* per device-cgroup pair */ @@ -541,6 +544,9 @@ struct ioc_gq { struct iocg_stat last_stat; u64 last_stat_abs_vusage; u64 usage_delta_us; + u64 wait_since; + u64 indebt_since; + u64 indelay_since; /* this iocg's depth in the hierarchy and ancestors including self */ int level; @@ -1307,9 +1313,15 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) } if (delay >= MIN_DELAY) { + if (!iocg->indelay_since) + iocg->indelay_since = now->now; blkcg_set_delay(blkg, delay * NSEC_PER_USEC); return true; } else { + if (iocg->indelay_since) { + iocg->local_stat.indelay_us += now->now - iocg->indelay_since; + iocg->indelay_since = 0; + } iocg->delay = 0; blkcg_clear_delay(blkg); return false; @@ -1329,8 +1341,10 @@ static void iocg_incur_debt(struct ioc_gq *iocg, u64 abs_cost, * Once in debt, debt handling owns inuse. @iocg stays at the minimum * inuse donating all of it share to others until its debt is paid off. */ - if (!iocg->abs_vdebt && abs_cost) + if (!iocg->abs_vdebt && abs_cost) { + iocg->indebt_since = now->now; propagate_weights(iocg, iocg->active, 0, false, now); + } iocg->abs_vdebt += abs_cost; @@ -1352,9 +1366,13 @@ static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay, iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt); /* if debt is paid in full, restore inuse */ - if (!iocg->abs_vdebt) + if (!iocg->abs_vdebt) { + iocg->local_stat.indebt_us += now->now - iocg->indebt_since; + iocg->indebt_since = 0; + propagate_weights(iocg, iocg->active, iocg->last_inuse, false, now); + } } static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode, @@ -1440,8 +1458,17 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt, __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx); - if (!waitqueue_active(&iocg->waitq)) + if (!waitqueue_active(&iocg->waitq)) { + if (iocg->wait_since) { + iocg->local_stat.wait_us += now->now - iocg->wait_since; + iocg->wait_since = 0; + } return; + } + + if (!iocg->wait_since) + iocg->wait_since = now->now; + if (WARN_ON_ONCE(ctx.vbudget >= 0)) return; @@ -1583,8 +1610,15 @@ static void iocg_flush_stat_one(struct ioc_gq *iocg, struct ioc_now *now) iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate); iocg->local_stat.usage_us += iocg->usage_delta_us; + /* propagate upwards */ new_stat.usage_us = iocg->local_stat.usage_us + iocg->desc_stat.usage_us; + new_stat.wait_us = + iocg->local_stat.wait_us + iocg->desc_stat.wait_us; + new_stat.indebt_us = + iocg->local_stat.indebt_us + iocg->desc_stat.indebt_us; + new_stat.indelay_us = + iocg->local_stat.indelay_us + iocg->desc_stat.indelay_us; /* propagate the deltas to the parent */ if (iocg->level > 0) { @@ -1593,6 +1627,12 @@ static void iocg_flush_stat_one(struct ioc_gq *iocg, struct ioc_now *now) parent_stat->usage_us += new_stat.usage_us - iocg->last_stat.usage_us; + parent_stat->wait_us += + new_stat.wait_us - iocg->last_stat.wait_us; + parent_stat->indebt_us += + new_stat.indebt_us - iocg->last_stat.indebt_us; + parent_stat->indelay_us += + new_stat.indelay_us - iocg->last_stat.indelay_us; } iocg->last_stat = new_stat; @@ -1965,8 +2005,6 @@ static void ioc_timer_fn(struct timer_list *timer) return; } - iocg_flush_stat(&ioc->active_iocgs, &now); - /* * Waiters determine the sleep durations based on the vrate they * saw at the time of sleep. If vrate has increased, some waiters @@ -1980,6 +2018,22 @@ static void ioc_timer_fn(struct timer_list *timer) spin_lock(&iocg->waitq.lock); + /* flush wait and indebt stat deltas */ + if (iocg->wait_since) { + iocg->local_stat.wait_us += now.now - iocg->wait_since; + iocg->wait_since = now.now; + } + if (iocg->indebt_since) { + iocg->local_stat.indebt_us += + now.now - iocg->indebt_since; + iocg->indebt_since = now.now; + } + if (iocg->indelay_since) { + iocg->local_stat.indelay_us += + now.now - iocg->indelay_since; + iocg->indelay_since = now.now; + } + if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt || iocg->delay) { /* might be oversleeping vtime / hweight changes, kick */ @@ -2014,6 +2068,12 @@ static void ioc_timer_fn(struct timer_list *timer) } commit_weights(ioc); + /* + * Wait and indebt stat are flushed above and the donation calculation + * below needs updated usage stat. Let's bring stat up-to-date. + */ + iocg_flush_stat(&ioc->active_iocgs, &now); + /* calc usage and see whether some weights need to be moved around */ list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { u64 vdone, vtime, usage_us, usage_dur; @@ -2839,6 +2899,13 @@ static size_t ioc_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size) pos += scnprintf(buf + pos, size - pos, " cost.usage=%llu", iocg->last_stat.usage_us); + if (blkcg_debug_stats) + pos += scnprintf(buf + pos, size - pos, + " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu", + iocg->last_stat.wait_us, + iocg->last_stat.indebt_us, + iocg->last_stat.indelay_us); + return pos; } From 815332f15c007f7fcf486277b0fd5ec773cfb787 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Sep 2020 14:52:57 -0400 Subject: [PATCH 0362/5344] blk-iocost: update iocost_monitor.py commit a7863b3423fd5d1ab82161654ba83973764b570b upstream. iocost went through significant internal changes. Update iocost_monitor.py accordingly. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- tools/cgroup/iocost_monitor.py | 54 ++++++++++++---------------------- 1 file changed, 19 insertions(+), 35 deletions(-) diff --git a/tools/cgroup/iocost_monitor.py b/tools/cgroup/iocost_monitor.py index f4699f9b46ba7..c4ff907c078b5 100644 --- a/tools/cgroup/iocost_monitor.py +++ b/tools/cgroup/iocost_monitor.py @@ -45,8 +45,7 @@ def err(s): err('The kernel does not have iocost enabled') IOC_RUNNING = prog['IOC_RUNNING'].value_() -NR_USAGE_SLOTS = prog['NR_USAGE_SLOTS'].value_() -HWEIGHT_WHOLE = prog['HWEIGHT_WHOLE'].value_() +WEIGHT_ONE = prog['WEIGHT_ONE'].value_() VTIME_PER_SEC = prog['VTIME_PER_SEC'].value_() VTIME_PER_USEC = prog['VTIME_PER_USEC'].value_() AUTOP_SSD_FAST = prog['AUTOP_SSD_FAST'].value_() @@ -100,7 +99,7 @@ def __init__(self, ioc): self.period_ms = ioc.period_us.value_() / 1_000 self.period_at = ioc.period_at.value_() / 1_000_000 self.vperiod_at = ioc.period_at_vtime.value_() / VTIME_PER_SEC - self.vrate_pct = ioc.vtime_rate.counter.value_() * 100 / VTIME_PER_USEC + self.vrate_pct = ioc.vtime_base_rate.value_() * 100 / VTIME_PER_USEC self.busy_level = ioc.busy_level.value_() self.autop_idx = ioc.autop_idx.value_() self.user_cost_model = ioc.user_cost_model.value_() @@ -136,7 +135,7 @@ def table_preamble_str(self): def table_header_str(self): return f'{"":25} active {"weight":>9} {"hweight%":>13} {"inflt%":>6} ' \ - f'{"dbt":>3} {"delay":>6} {"usages%"}' + f'{"debt":>7} {"delay":>7} {"usage%"}' class IocgStat: def __init__(self, iocg): @@ -144,11 +143,11 @@ def __init__(self, iocg): blkg = iocg.pd.blkg self.is_active = not list_empty(iocg.active_list.address_of_()) - self.weight = iocg.weight.value_() - self.active = iocg.active.value_() - self.inuse = iocg.inuse.value_() - self.hwa_pct = iocg.hweight_active.value_() * 100 / HWEIGHT_WHOLE - self.hwi_pct = iocg.hweight_inuse.value_() * 100 / HWEIGHT_WHOLE + self.weight = iocg.weight.value_() / WEIGHT_ONE + self.active = iocg.active.value_() / WEIGHT_ONE + self.inuse = iocg.inuse.value_() / WEIGHT_ONE + self.hwa_pct = iocg.hweight_active.value_() * 100 / WEIGHT_ONE + self.hwi_pct = iocg.hweight_inuse.value_() * 100 / WEIGHT_ONE self.address = iocg.value_() vdone = iocg.done_vtime.counter.value_() @@ -160,23 +159,13 @@ def __init__(self, iocg): else: self.inflight_pct = 0 - # vdebt used to be an atomic64_t and is now u64, support both - try: - self.debt_ms = iocg.abs_vdebt.counter.value_() / VTIME_PER_USEC / 1000 - except: - self.debt_ms = iocg.abs_vdebt.value_() / VTIME_PER_USEC / 1000 - - self.use_delay = blkg.use_delay.counter.value_() - self.delay_ms = blkg.delay_nsec.counter.value_() / 1_000_000 - - usage_idx = iocg.usage_idx.value_() - self.usages = [] - self.usage = 0 - for i in range(NR_USAGE_SLOTS): - usage = iocg.usages[(usage_idx + 1 + i) % NR_USAGE_SLOTS].value_() - upct = usage * 100 / HWEIGHT_WHOLE - self.usages.append(upct) - self.usage = max(self.usage, upct) + self.usage = (100 * iocg.usage_delta_us.value_() / + ioc.period_us.value_()) if self.active else 0 + self.debt_ms = iocg.abs_vdebt.value_() / VTIME_PER_USEC / 1000 + if blkg.use_delay.counter.value_() != 0: + self.delay_ms = blkg.delay_nsec.counter.value_() / 1_000_000 + else: + self.delay_ms = 0 def dict(self, now, path): out = { 'cgroup' : path, @@ -189,25 +178,20 @@ def dict(self, now, path): 'hweight_inuse_pct' : self.hwi_pct, 'inflight_pct' : self.inflight_pct, 'debt_ms' : self.debt_ms, - 'use_delay' : self.use_delay, 'delay_ms' : self.delay_ms, 'usage_pct' : self.usage, 'address' : self.address } - for i in range(len(self.usages)): - out[f'usage_pct_{i}'] = str(self.usages[i]) return out def table_row_str(self, path): out = f'{path[-28:]:28} ' \ f'{"*" if self.is_active else " "} ' \ - f'{self.inuse:5}/{self.active:5} ' \ + f'{round(self.inuse):5}/{round(self.active):5} ' \ f'{self.hwi_pct:6.2f}/{self.hwa_pct:6.2f} ' \ f'{self.inflight_pct:6.2f} ' \ - f'{min(math.ceil(self.debt_ms), 999):3} ' \ - f'{min(self.use_delay, 99):2}*'\ - f'{min(math.ceil(self.delay_ms), 999):03} ' - for u in self.usages: - out += f'{min(round(u), 999):03d}:' + f'{self.debt_ms:7.2f} ' \ + f'{self.delay_ms:7.2f} '\ + f'{min(self.usage, 999):6.2f}' out = out.rstrip(':') return out From 9d819ec77fa9e259972cce6f2fc09727c566f33f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 11 Sep 2020 18:40:49 -0400 Subject: [PATCH 0363/5344] blk-iocost: fix divide-by-zero in transfer_surpluses() commit 769b628de0b9a7ab0b81e6eecca5ee7bf66e978d upstream. Conceptually, root_iocg->hweight_donating must be less than WEIGHT_ONE but all hweight calculations round up and thus it may end up >= WEIGHT_ONE triggering divide-by-zero and other issues. Bound the value to avoid surprises. Fixes: e08d02aa5fc9 ("blk-iocost: implement Andy's method for donation weight updates") Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- block/blk-iocost.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 47b88661ddec1..aa63d3d3cf7d1 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1885,15 +1885,21 @@ static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now) /* * Calculate the global donation rate (gamma) - the rate to adjust - * non-donating budgets by. No need to use 64bit multiplication here as - * the first operand is guaranteed to be smaller than WEIGHT_ONE - * (1<<16). + * non-donating budgets by. + * + * No need to use 64bit multiplication here as the first operand is + * guaranteed to be smaller than WEIGHT_ONE (1<<16). + * + * We know that there are beneficiary nodes and the sum of the donating + * hweights can't be whole; however, due to the round-ups during hweight + * calculations, root_iocg->hweight_donating might still end up equal to + * or greater than whole. Limit the range when calculating the divider. * * gamma = (1 - t_r') / (1 - t_r) */ gamma = DIV_ROUND_UP( (WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE, - WEIGHT_ONE - root_iocg->hweight_donating); + WEIGHT_ONE - min_t(u32, root_iocg->hweight_donating, WEIGHT_ONE - 1)); /* * Calculate adjusted hwi, child_adjusted_sum and inuse for the inner From b5daf1a732a3fdade2316465bdb87266c104d91d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 14 Sep 2020 11:05:13 -0400 Subject: [PATCH 0364/5344] iocost: fix infinite loop bug in adjust_inuse_and_calc_cost() commit aa67db24b6761277d731cc3434420283479927ca upstream. adjust_inuse_and_calc_cost() is responsible for reducing the amount of donated weights dynamically in period as the budget runs low. Because we don't want to do full donation calculation in period, we keep latching up inuse by INUSE_ADJ_STEP_PCT of the active weight of the cgroup until the resulting hweight_inuse is satisfactory. Unfortunately, the adj_step calculation was reading the active weight before acquiring ioc->lock. Because the current thread could have lost race to activate the iocg to another thread before entering this function, it may read the active weight as zero before acquiring ioc->lock. When this happens, the adj_step is calculated as zero and the incremental adjustment loop becomes an infinite one. Fix it by fetching the active weight after acquiring ioc->lock. Fixes: b0853ab4a238 ("blk-iocost: revamp in-period donation snapbacks") Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- block/blk-iocost.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index aa63d3d3cf7d1..a429793c70121 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2327,9 +2327,8 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, { struct ioc *ioc = iocg->ioc; struct ioc_margins *margins = &ioc->margins; - u32 adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100); u32 __maybe_unused old_inuse = iocg->inuse, __maybe_unused old_hwi; - u32 hwi; + u32 hwi, adj_step; s64 margin; u64 cost, new_inuse; @@ -2358,8 +2357,15 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, return cost; } - /* bump up inuse till @abs_cost fits in the existing budget */ + /* + * Bump up inuse till @abs_cost fits in the existing budget. + * adj_step must be determined after acquiring ioc->lock - we might + * have raced and lost to another thread for activation and could + * be reading 0 iocg->active before ioc->lock which will lead to + * infinite loop. + */ new_inuse = iocg->inuse; + adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100); do { new_inuse = new_inuse + adj_step; propagate_weights(iocg, iocg->active, new_inuse, true, now); From b18808abc4bfca00b88f6bba4ad122edc03878a1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Jan 2021 12:37:23 -0500 Subject: [PATCH 0365/5344] blk-iocost: fix NULL iocg deref from racing against initialization commit d16baa3f1453c14d680c5fee01cd122a22d0e0ce upstream. When initializing iocost for a queue, its rqos should be registered before the blkcg policy is activated to allow policy data initiailization to lookup the associated ioc. This unfortunately means that the rqos methods can be called on bios before iocgs are attached to all existing blkgs. While the race is theoretically possible on ioc_rqos_throttle(), it mostly happened in ioc_rqos_merge() due to the difference in how they lookup ioc. The former determines it from the passed in @rqos and then bails before dereferencing iocg if the looked up ioc is disabled, which most likely is the case if initialization is still in progress. The latter looked up ioc by dereferencing the possibly NULL iocg making it a lot more prone to actually triggering the bug. * Make ioc_rqos_merge() use the same method as ioc_rqos_throttle() to look up ioc for consistency. * Make ioc_rqos_throttle() and ioc_rqos_merge() test for NULL iocg before dereferencing it. * Explain the danger of NULL iocgs in blk_iocost_init(). Signed-off-by: Tejun Heo Reported-by: Jonathan Lemon Cc: stable@vger.kernel.org # v5.4+ Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- block/blk-iocost.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index a429793c70121..bfe6ea3e41e95 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2467,8 +2467,8 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) bool use_debt, ioc_locked; unsigned long flags; - /* bypass IOs if disabled or for root cgroup */ - if (!ioc->enabled || !iocg->level) + /* bypass IOs if disabled, still initializing, or for root cgroup */ + if (!ioc->enabled || !iocg || !iocg->level) return; /* calculate the absolute vtime cost */ @@ -2595,14 +2595,14 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio) { struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg); - struct ioc *ioc = iocg->ioc; + struct ioc *ioc = rqos_to_ioc(rqos); sector_t bio_end = bio_end_sector(bio); struct ioc_now now; u64 vtime, abs_cost, cost; unsigned long flags; - /* bypass if disabled or for root cgroup */ - if (!ioc->enabled || !iocg->level) + /* bypass if disabled, still initializing, or for root cgroup */ + if (!ioc->enabled || !iocg || !iocg->level) return; abs_cost = calc_vtime_cost(bio, iocg, true); @@ -2779,6 +2779,12 @@ static int blk_iocost_init(struct request_queue *q) ioc_refresh_params(ioc, true); spin_unlock_irq(&ioc->lock); + /* + * rqos must be added before activation to allow iocg_pd_init() to + * lookup the ioc from q. This means that the rqos methods may get + * called before policy activation completion, can't assume that the + * target bio has an iocg associated and need to test for NULL iocg. + */ rq_qos_add(q, rqos); ret = blkcg_activate_policy(q, &blkcg_policy_iocost); if (ret) { From fb2490a592873314f052207721ea55b15b1b515f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 13 Dec 2021 14:14:43 -1000 Subject: [PATCH 0366/5344] iocost: Fix divide-by-zero on donation from low hweight cgroup commit edaa26334c117a584add6053f48d63a988d25a6e upstream. The donation calculation logic assumes that the donor has non-zero after-donation hweight, so the lowest active hweight a donating cgroup can have is 2 so that it can donate 1 while keeping the other 1 for itself. Earlier, we only donated from cgroups with sizable surpluses so this condition was always true. However, with the precise donation algorithm implemented, commit f1de2439ec43 ("blk-iocost: revamp donation amount determination") made the donation amount calculation exact enabling even low hweight cgroups to donate. This means that in rare occasions, a cgroup with active hweight of 1 can enter donation calculation triggering the following warning and then a divide-by-zero oops. WARNING: CPU: 4 PID: 0 at block/blk-iocost.c:1928 transfer_surpluses.cold+0x0/0x53 [884/94867] ... RIP: 0010:transfer_surpluses.cold+0x0/0x53 Code: 92 ff 48 c7 c7 28 d1 ab b5 65 48 8b 34 25 00 ae 01 00 48 81 c6 90 06 00 00 e8 8b 3f fe ff 48 c7 c0 ea ff ff ff e9 95 ff 92 ff <0f> 0b 48 c7 c7 30 da ab b5 e8 71 3f fe ff 4c 89 e8 4d 85 ed 74 0 4 ... Call Trace: ioc_timer_fn+0x1043/0x1390 call_timer_fn+0xa1/0x2c0 __run_timers.part.0+0x1ec/0x2e0 run_timer_softirq+0x35/0x70 ... iocg: invalid donation weights in /a/b: active=1 donating=1 after=0 Fix it by excluding cgroups w/ active hweight < 2 from donating. Excluding these extreme low hweight donations shouldn't affect work conservation in any meaningful way. Signed-off-by: Tejun Heo Fixes: f1de2439ec43 ("blk-iocost: revamp donation amount determination") Cc: stable@vger.kernel.org # v5.10+ Link: https://lore.kernel.org/r/Ybfh86iSvpWKxhVM@slm.duckdns.org Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- block/blk-iocost.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index bfe6ea3e41e95..079c6c5fc2968 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2147,7 +2147,14 @@ static void ioc_timer_fn(struct timer_list *timer) hwm = current_hweight_max(iocg); new_hwi = hweight_after_donation(iocg, old_hwi, hwm, usage, &now); - if (new_hwi < hwm) { + /* + * Donation calculation assumes hweight_after_donation + * to be positive, a condition that a donor w/ hwa < 2 + * can't meet. Don't bother with donation if hwa is + * below 2. It's not gonna make a meaningful difference + * anyway. + */ + if (new_hwi < hwm && hwa >= 2) { iocg->hweight_donating = hwa; iocg->hweight_after_donation = new_hwi; list_add(&iocg->surplus_list, &surpluses); From 064c1540b5fee7c25830f3638200f4aab8c10f41 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 26 Apr 2022 19:01:01 -1000 Subject: [PATCH 0367/5344] iocost: don't reset the inuse weight of under-weighted debtors commit 8c936f9ea11ec4e35e288810a7503b5c841a355f upstream. When an iocg is in debt, its inuse weight is owned by debt handling and should stay at 1. This invariant was broken when determining the amount of surpluses at the beginning of donation calculation - when an iocg's hierarchical weight is too low, the iocg is excluded from donation calculation and its inuse is reset to its active regardless of its indebtedness, triggering warnings like the following: WARNING: CPU: 6 PID: 0 at block/blk-iocost.c:1416 iocg_kick_waitq+0x392/0x3a0 ... RIP: 0010:iocg_kick_waitq+0x392/0x3a0 Code: 00 00 be ff ff ff ff 48 89 4d a8 e8 98 b2 70 00 48 8b 4d a8 85 c0 0f 85 4a fe ff ff 0f 0b e9 43 fe ff ff 0f 0b e9 4d fe ff ff <0f> 0b e9 50 fe ff ff e8 a2 ae 70 00 66 90 0f 1f 44 00 00 55 48 89 RSP: 0018:ffffc90000200d08 EFLAGS: 00010016 ... ioc_timer_fn+0x2e0/0x1470 call_timer_fn+0xa1/0x2c0 ... As this happens only when an iocg's hierarchical weight is negligible, its impact likely is limited to triggering the warnings. Fix it by skipping resetting inuse of under-weighted debtors. Signed-off-by: Tejun Heo Reported-by: Rik van Riel Fixes: c421a3eb2e27 ("blk-iocost: revamp debt handling") Cc: stable@vger.kernel.org # v5.10+ Link: https://lore.kernel.org/r/YmjODd4aif9BzFuO@slm.duckdns.org Signed-off-by: Jens Axboe Signed-off-by: hanjinke --- block/blk-iocost.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 079c6c5fc2968..900be48f6b5a3 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2158,7 +2158,17 @@ static void ioc_timer_fn(struct timer_list *timer) iocg->hweight_donating = hwa; iocg->hweight_after_donation = new_hwi; list_add(&iocg->surplus_list, &surpluses); - } else { + } else if (!iocg->abs_vdebt) { + /* + * @iocg doesn't have enough to donate. Reset + * its inuse to active. + * + * Don't reset debtors as their inuse's are + * owned by debt handling. This shouldn't affect + * donation calculuation in any meaningful way + * as @iocg doesn't have a meaningful amount of + * share anyway. + */ TRACE_IOCG_PATH(inuse_shortage, iocg, &now, iocg->inuse, iocg->active, iocg->hweight_inuse, new_hwi); From a78f5fe87bfc3651691867da89ab1d65d360b6b4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 17 Sep 2020 20:44:52 -0400 Subject: [PATCH 0368/5344] iocost: factor out ioc_forgive_debts() commit ab8df828b5f6eab6377fa0c349c92a3678498256 upstream. Debt reduction logic is going to be improved and expanded. Factor it out into ioc_forgive_debts() and generalize the comment a bit. No functional change. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: Chengming Zhou --- block/blk-iocost.c | 66 ++++++++++++++++++++++++---------------------- 1 file changed, 35 insertions(+), 31 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 900be48f6b5a3..5c40ea8095d0b 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1983,6 +1983,40 @@ static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now) list_del_init(&iocg->walk_list); } +/* + * A low weight iocg can amass a large amount of debt, for example, when + * anonymous memory gets reclaimed aggressively. If the system has a lot of + * memory paired with a slow IO device, the debt can span multiple seconds or + * more. If there are no other subsequent IO issuers, the in-debt iocg may end + * up blocked paying its debt while the IO device is idle. + * + * The following protects against such cases. If the device has been + * sufficiently idle for a while, the debts are halved. + */ +static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors, + int nr_shortages, struct ioc_now *now) +{ + if (nr_shortages || + div64_u64(100 * usage_us_sum, now->now - ioc->period_at) >= + DEBT_BUSY_USAGE_PCT) + ioc->debt_busy_at = now->now; + + if (nr_debtors && + now->now - ioc->debt_busy_at >= DEBT_REDUCTION_IDLE_DUR) { + struct ioc_gq *iocg; + + list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { + if (iocg->abs_vdebt) { + spin_lock(&iocg->waitq.lock); + iocg->abs_vdebt /= 2; + iocg_kick_waitq(iocg, true, now); + spin_unlock(&iocg->waitq.lock); + } + } + ioc->debt_busy_at = now->now; + } +} + static void ioc_timer_fn(struct timer_list *timer) { struct ioc *ioc = container_of(timer, struct ioc, timer); @@ -2192,37 +2226,7 @@ static void ioc_timer_fn(struct timer_list *timer) list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list) list_del_init(&iocg->surplus_list); - /* - * A low weight iocg can amass a large amount of debt, for example, when - * anonymous memory gets reclaimed aggressively. If the system has a lot - * of memory paired with a slow IO device, the debt can span multiple - * seconds or more. If there are no other subsequent IO issuers, the - * in-debt iocg may end up blocked paying its debt while the IO device - * is idle. - * - * The following protects against such pathological cases. If the device - * has been sufficiently idle for a substantial amount of time, the - * debts are halved. The criteria are on the conservative side as we - * want to resolve the rare extreme cases without impacting regular - * operation by forgiving debts too readily. - */ - if (nr_shortages || - div64_u64(100 * usage_us_sum, now.now - ioc->period_at) >= - DEBT_BUSY_USAGE_PCT) - ioc->debt_busy_at = now.now; - - if (nr_debtors && - now.now - ioc->debt_busy_at >= DEBT_REDUCTION_IDLE_DUR) { - list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { - if (iocg->abs_vdebt) { - spin_lock(&iocg->waitq.lock); - iocg->abs_vdebt /= 2; - iocg_kick_waitq(iocg, true, &now); - spin_unlock(&iocg->waitq.lock); - } - } - ioc->debt_busy_at = now.now; - } + ioc_forgive_debts(ioc, usage_us_sum, nr_debtors, nr_shortages, &now); /* * If q is getting clogged or we're missing too much, we're issuing From 5c830eb73002b9adaa2e96c85d2afee09f8fa2c3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 17 Sep 2020 20:44:53 -0400 Subject: [PATCH 0369/5344] iocost: replace nr_shortages cond in ioc_forgive_debts() with busy_level one commit 33a1fe6d822b3552fcad9bbb059b2fb93566f4c9 upstream. Debt reduction was blocked if any iocg was short on budget in the past period to avoid reducing debts while some iocgs are saturated. However, this ends up unnecessarily blocking debt reduction due to temporary local imbalances when the device is generally being underutilized, while also failing to block when the underlying device is overwhelmed and the usage becomes low from high latency. Given that debt accumulation mostly happens with swapout bursts which can significantly deteriorate the underlying device's latency response, the current logic is not great. Let's replace it with ioc->busy_level based condition so that we block debt reduction when the underlying device is being saturated. ioc_forgive_debts() call is moved after busy_level determination. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: Chengming Zhou --- block/blk-iocost.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 5c40ea8095d0b..6bf763ac3edfc 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -290,10 +290,7 @@ enum { MIN_DELAY = 250, MAX_DELAY = 250 * USEC_PER_MSEC, - /* - * Halve debts if total usage keeps staying under 25% w/o any shortages - * for over 100ms. - */ + /* halve debts if total usage keeps staying under 25% for over 100ms */ DEBT_BUSY_USAGE_PCT = 25, DEBT_REDUCTION_IDLE_DUR = 100 * USEC_PER_MSEC, @@ -1994,9 +1991,9 @@ static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now) * sufficiently idle for a while, the debts are halved. */ static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors, - int nr_shortages, struct ioc_now *now) + struct ioc_now *now) { - if (nr_shortages || + if (ioc->busy_level < 0 || div64_u64(100 * usage_us_sum, now->now - ioc->period_at) >= DEBT_BUSY_USAGE_PCT) ioc->debt_busy_at = now->now; @@ -2226,8 +2223,6 @@ static void ioc_timer_fn(struct timer_list *timer) list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list) list_del_init(&iocg->surplus_list); - ioc_forgive_debts(ioc, usage_us_sum, nr_debtors, nr_shortages, &now); - /* * If q is getting clogged or we're missing too much, we're issuing * too much IO and should lower vtime rate. If we're not missing @@ -2322,6 +2317,8 @@ static void ioc_timer_fn(struct timer_list *timer) ioc_refresh_params(ioc, false); + ioc_forgive_debts(ioc, usage_us_sum, nr_debtors, &now); + /* * This period is done. Move onto the next one. If nothing's * going on with the device, stop the timer. From f0714f3753205d15e4109179d027e13c378bb8ee Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 17 Sep 2020 20:44:54 -0400 Subject: [PATCH 0370/5344] iocost: recalculate delay after debt reduction commit d95178410b7706844491efb77fb0a37a69f7a5e6 upstream. Debt sets the initial delay duration which is decayed over time. The current debt reduction halved the debt but didn't change the delay. It prevented future debts from increasing delay but didn't do anything to lower the existing delay, limiting the mechanism's ability to reduce unnecessary idling. Reset iocg->delay to 0 after debt reduction so that iocg_kick_waitq() recalculates new delay value based on the reduced debt amount. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: Chengming Zhou --- block/blk-iocost.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 6bf763ac3edfc..a6b41503f8821 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1988,7 +1988,8 @@ static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now) * up blocked paying its debt while the IO device is idle. * * The following protects against such cases. If the device has been - * sufficiently idle for a while, the debts are halved. + * sufficiently idle for a while, the debts are halved and delays are + * recalculated. */ static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors, struct ioc_now *now) @@ -2006,6 +2007,7 @@ static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors, if (iocg->abs_vdebt) { spin_lock(&iocg->waitq.lock); iocg->abs_vdebt /= 2; + iocg->delay = 0; /* kick_waitq will recalc */ iocg_kick_waitq(iocg, true, now); spin_unlock(&iocg->waitq.lock); } From 22b142343456fc1e1be597336741e6ad4f61b601 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 17 Sep 2020 20:44:55 -0400 Subject: [PATCH 0371/5344] iocost: reimplement debt forgiveness using average usage commit c7af2a003a411853fc9c805efe96d10dcc8adb5f upstream. Debt forgiveness logic was counting the number of consecutive !busy periods as the trigger condition. While this usually works, it can easily be thrown off by temporary fluctuations especially on configurations w/ short periods. This patch reimplements debt forgiveness so that: * Use the average usage over the forgiveness period instead of counting consecutive periods. * Debt is reduced at around the target rate (1/2 every 100ms) regardless of ioc period duration. * Usage threshold is raised to 50%. Combined with the preceding changes and the switch to average usage, this makes debt forgivness a lot more effective at reducing the amount of unnecessary idleness. * Constants are renamed with DFGV_ prefix. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: Chengming Zhou --- block/blk-iocost.c | 94 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 69 insertions(+), 25 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index a6b41503f8821..655daf7ed26b5 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -290,9 +290,9 @@ enum { MIN_DELAY = 250, MAX_DELAY = 250 * USEC_PER_MSEC, - /* halve debts if total usage keeps staying under 25% for over 100ms */ - DEBT_BUSY_USAGE_PCT = 25, - DEBT_REDUCTION_IDLE_DUR = 100 * USEC_PER_MSEC, + /* halve debts if avg usage over 100ms is under 50% */ + DFGV_USAGE_PCT = 50, + DFGV_PERIOD = 100 * USEC_PER_MSEC, /* don't let cmds which take a very long time pin lagging for too long */ MAX_LAGGING_PERIODS = 10, @@ -436,8 +436,10 @@ struct ioc { bool weights_updated; atomic_t hweight_gen; /* for lazy hweights */ - /* the last time debt cancel condition wasn't met */ - u64 debt_busy_at; + /* debt forgivness */ + u64 dfgv_period_at; + u64 dfgv_period_rem; + u64 dfgv_usage_us_sum; u64 autop_too_fast_at; u64 autop_too_slow_at; @@ -1255,7 +1257,8 @@ static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) if (ioc->running == IOC_IDLE) { ioc->running = IOC_RUNNING; - ioc->debt_busy_at = now->now; + ioc->dfgv_period_at = now->now; + ioc->dfgv_period_rem = 0; ioc_start_period(ioc, now); } @@ -1994,25 +1997,66 @@ static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now) static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors, struct ioc_now *now) { - if (ioc->busy_level < 0 || - div64_u64(100 * usage_us_sum, now->now - ioc->period_at) >= - DEBT_BUSY_USAGE_PCT) - ioc->debt_busy_at = now->now; - - if (nr_debtors && - now->now - ioc->debt_busy_at >= DEBT_REDUCTION_IDLE_DUR) { - struct ioc_gq *iocg; - - list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { - if (iocg->abs_vdebt) { - spin_lock(&iocg->waitq.lock); - iocg->abs_vdebt /= 2; - iocg->delay = 0; /* kick_waitq will recalc */ - iocg_kick_waitq(iocg, true, now); - spin_unlock(&iocg->waitq.lock); - } - } - ioc->debt_busy_at = now->now; + struct ioc_gq *iocg; + u64 dur, usage_pct, nr_cycles; + + /* if no debtor, reset the cycle */ + if (!nr_debtors) { + ioc->dfgv_period_at = now->now; + ioc->dfgv_period_rem = 0; + ioc->dfgv_usage_us_sum = 0; + return; + } + + /* + * Debtors can pass through a lot of writes choking the device and we + * don't want to be forgiving debts while the device is struggling from + * write bursts. If we're missing latency targets, consider the device + * fully utilized. + */ + if (ioc->busy_level > 0) + usage_us_sum = max_t(u64, usage_us_sum, ioc->period_us); + + ioc->dfgv_usage_us_sum += usage_us_sum; + if (time_before64(now->now, ioc->dfgv_period_at + DFGV_PERIOD)) + return; + + /* + * At least DFGV_PERIOD has passed since the last period. Calculate the + * average usage and reset the period counters. + */ + dur = now->now - ioc->dfgv_period_at; + usage_pct = div64_u64(100 * ioc->dfgv_usage_us_sum, dur); + + ioc->dfgv_period_at = now->now; + ioc->dfgv_usage_us_sum = 0; + + /* if was too busy, reset everything */ + if (usage_pct > DFGV_USAGE_PCT) { + ioc->dfgv_period_rem = 0; + return; + } + + /* + * Usage is lower than threshold. Let's forgive some debts. Debt + * forgiveness runs off of the usual ioc timer but its period usually + * doesn't match ioc's. Compensate the difference by performing the + * reduction as many times as would fit in the duration since the last + * run and carrying over the left-over duration in @ioc->dfgv_period_rem + * - if ioc period is 75% of DFGV_PERIOD, one out of three consecutive + * reductions is doubled. + */ + nr_cycles = dur + ioc->dfgv_period_rem; + ioc->dfgv_period_rem = do_div(nr_cycles, DFGV_PERIOD); + + list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { + if (!iocg->abs_vdebt) + continue; + spin_lock(&iocg->waitq.lock); + iocg->abs_vdebt >>= nr_cycles; + iocg->delay = 0; /* kick_waitq will recalc */ + iocg_kick_waitq(iocg, true, now); + spin_unlock(&iocg->waitq.lock); } } From 6f219517b32b21bd49a1c9e1fb1d432ca0c73f56 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 17 Sep 2020 20:44:56 -0400 Subject: [PATCH 0372/5344] iocost: add iocg_forgive_debt tracepoint commit c5a6561b8d99ea7d8df21308249ce05bce3dd466 upstream. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: Chengming Zhou --- block/blk-iocost.c | 12 ++++++++++ include/trace/events/iocost.h | 41 +++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 655daf7ed26b5..6b77c71a4ff0f 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2050,12 +2050,24 @@ static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors, ioc->dfgv_period_rem = do_div(nr_cycles, DFGV_PERIOD); list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { + u64 __maybe_unused old_debt, __maybe_unused old_delay; + if (!iocg->abs_vdebt) continue; + spin_lock(&iocg->waitq.lock); + + old_debt = iocg->abs_vdebt; + old_delay = iocg->delay; + iocg->abs_vdebt >>= nr_cycles; iocg->delay = 0; /* kick_waitq will recalc */ iocg_kick_waitq(iocg, true, now); + + TRACE_IOCG_PATH(iocg_forgive_debt, iocg, now, usage_pct, + old_debt, iocg->abs_vdebt, + old_delay, iocg->delay); + spin_unlock(&iocg->waitq.lock); } } diff --git a/include/trace/events/iocost.h b/include/trace/events/iocost.h index b350860d2e711..0b6869980ba22 100644 --- a/include/trace/events/iocost.h +++ b/include/trace/events/iocost.h @@ -164,6 +164,47 @@ TRACE_EVENT(iocost_ioc_vrate_adj, ) ); +TRACE_EVENT(iocost_iocg_forgive_debt, + + TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now, + u32 usage_pct, u64 old_debt, u64 new_debt, + u64 old_delay, u64 new_delay), + + TP_ARGS(iocg, path, now, usage_pct, + old_debt, new_debt, old_delay, new_delay), + + TP_STRUCT__entry ( + __string(devname, ioc_name(iocg->ioc)) + __string(cgroup, path) + __field(u64, now) + __field(u64, vnow) + __field(u32, usage_pct) + __field(u64, old_debt) + __field(u64, new_debt) + __field(u64, old_delay) + __field(u64, new_delay) + ), + + TP_fast_assign( + __assign_str(devname, ioc_name(iocg->ioc)); + __assign_str(cgroup, path); + __entry->now = now->now; + __entry->vnow = now->vnow; + __entry->usage_pct = usage_pct; + __entry->old_debt = old_debt; + __entry->new_debt = new_debt; + __entry->old_delay = old_delay; + __entry->new_delay = new_delay; + ), + + TP_printk("[%s:%s] now=%llu:%llu usage=%u debt=%llu->%llu delay=%llu->%llu", + __get_str(devname), __get_str(cgroup), + __entry->now, __entry->vnow, __entry->usage_pct, + __entry->old_debt, __entry->new_debt, + __entry->old_delay, __entry->new_delay + ) +); + #endif /* _TRACE_BLK_IOCOST_H */ /* This part must be outside protection */ From c895405a6d94d2584d026b3bd4ca5b34028abb29 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 18 Sep 2020 14:41:27 -0400 Subject: [PATCH 0373/5344] iocost: consider iocgs with active delays for debt forgiveness commit bec02dbbafad534674309f8b948094900f456797 upstream. An iocg may have 0 debt but non-zero delay. The current debt forgiveness logic doesn't act on such iocgs. This can lead to unexpected behaviors - an iocg with a little bit of debt will have its delay canceled through debt forgiveness but one w/o any debt but active delay will have to wait out until its delay decays out. This patch updates the debt handling logic so that it treats delays the same as debts. If either debt or delay is active, debt forgiveness logic kicks in and acts on both the same way. Also, avoid turning the debt and delay directly to zero as that can confuse state transitions. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: Chengming Zhou --- block/blk-iocost.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 6b77c71a4ff0f..02e099228b036 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2052,7 +2052,7 @@ static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors, list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { u64 __maybe_unused old_debt, __maybe_unused old_delay; - if (!iocg->abs_vdebt) + if (!iocg->abs_vdebt && !iocg->delay) continue; spin_lock(&iocg->waitq.lock); @@ -2060,8 +2060,11 @@ static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors, old_debt = iocg->abs_vdebt; old_delay = iocg->delay; - iocg->abs_vdebt >>= nr_cycles; - iocg->delay = 0; /* kick_waitq will recalc */ + if (iocg->abs_vdebt) + iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles ?: 1; + if (iocg->delay) + iocg->delay = iocg->delay >> nr_cycles ?: 1; + iocg_kick_waitq(iocg, true, now); TRACE_IOCG_PATH(iocg_forgive_debt, iocg, now, usage_pct, @@ -2133,7 +2136,7 @@ static void ioc_timer_fn(struct timer_list *timer) iocg->delay) { /* might be oversleeping vtime / hweight changes, kick */ iocg_kick_waitq(iocg, true, &now); - if (iocg->abs_vdebt) + if (iocg->abs_vdebt || iocg->delay) nr_debtors++; } else if (iocg_is_idle(iocg)) { /* no waiter and idle, deactivate */ From 15870c90dc347c46bb7ac27175ba155b010b363f Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 26 Nov 2020 16:16:11 +0800 Subject: [PATCH 0374/5344] blk-iocost: Fix some typos in comments commit 5ba1add216fe82289769045627d97f233bbcc645 upstream. Fix some typos in comments. Signed-off-by: Baolin Wang Acked-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: Chengming Zhou --- block/blk-iocost.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 02e099228b036..9f70323c08bc7 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -39,7 +39,7 @@ * On top of that, a size cost proportional to the length of the IO is * added. While simple, this model captures the operational * characteristics of a wide varienty of devices well enough. Default - * paramters for several different classes of devices are provided and the + * parameters for several different classes of devices are provided and the * parameters can be configured from userspace via * /sys/fs/cgroup/io.cost.model. * @@ -80,7 +80,7 @@ * * This constitutes the basis of IO capacity distribution. Each cgroup's * vtime is running at a rate determined by its hweight. A cgroup tracks - * the vtime consumed by past IOs and can issue a new IO iff doing so + * the vtime consumed by past IOs and can issue a new IO if doing so * wouldn't outrun the current device vtime. Otherwise, the IO is * suspended until the vtime has progressed enough to cover it. * @@ -158,7 +158,7 @@ * Instead of debugfs or other clumsy monitoring mechanisms, this * controller uses a drgn based monitoring script - * tools/cgroup/iocost_monitor.py. For details on drgn, please see - * https://github.com/osandov/drgn. The ouput looks like the following. + * https://github.com/osandov/drgn. The output looks like the following. * * sdb RUN per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12% * active weight hweight% inflt% dbt delay usages% @@ -495,7 +495,7 @@ struct ioc_gq { /* * `vtime` is this iocg's vtime cursor which progresses as IOs are * issued. If lagging behind device vtime, the delta represents - * the currently available IO budget. If runnning ahead, the + * the currently available IO budget. If running ahead, the * overage. * * `vtime_done` is the same but progressed on completion rather @@ -1050,7 +1050,7 @@ static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse, /* * The delta between inuse and active sums indicates that - * that much of weight is being given away. Parent's inuse + * much of weight is being given away. Parent's inuse * and active should reflect the ratio. */ if (parent->child_active_sum) { @@ -2421,7 +2421,7 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, return cost; /* - * We only increase inuse during period and do so iff the margin has + * We only increase inuse during period and do so if the margin has * deteriorated since the previous adjustment. */ if (margin >= iocg->saved_margin || margin >= margins->low || From bf60210da6152a74d73aef3bc9bb366e215ba634 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 26 Nov 2020 16:16:12 +0800 Subject: [PATCH 0375/5344] blk-iocost: Remove unnecessary advance declaration commit 647c9f03b2b66cf1f505208c313998fc833ed28b upstream. Remove unnecessary advance declaration of struct ioc_gq. Signed-off-by: Baolin Wang Acked-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: Chengming Zhou --- block/blk-iocost.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 9f70323c08bc7..813e0be6476b2 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -373,8 +373,6 @@ enum { AUTOP_SSD_FAST, }; -struct ioc_gq; - struct ioc_params { u32 qos[NR_QOS_PARAMS]; u64 i_lcoefs[NR_I_LCOEFS]; From 3120c1db63fdf8303b24b2cf6d3403001aa91c1c Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 26 Nov 2020 16:16:13 +0800 Subject: [PATCH 0376/5344] blk-iocost: Move the usage ratio calculation to the correct place commit c09245f61c6ac4ef253a5fcf97e5bcfc0ce25fc7 upstream. We only use the hweight based usage ratio to calculate the new hweight_inuse of the iocg to decide if this iocg can donate some surplus vtime. Thus move the usage ratio calculation to the correct place to avoid unnecessary calculation for some vtime shortage iocgs. Signed-off-by: Baolin Wang Acked-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: Chengming Zhou --- block/blk-iocost.c | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 813e0be6476b2..6f8905e07e85b 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2172,8 +2172,8 @@ static void ioc_timer_fn(struct timer_list *timer) /* calc usage and see whether some weights need to be moved around */ list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { - u64 vdone, vtime, usage_us, usage_dur; - u32 usage, hw_active, hw_inuse; + u64 vdone, vtime, usage_us; + u32 hw_active, hw_inuse; /* * Collect unused and wind vtime closer to vnow to prevent @@ -2204,30 +2204,32 @@ static void ioc_timer_fn(struct timer_list *timer) usage_us = iocg->usage_delta_us; usage_us_sum += usage_us; - if (vdone != vtime) { - u64 inflight_us = DIV64_U64_ROUND_UP( - cost_to_abs_cost(vtime - vdone, hw_inuse), - ioc->vtime_base_rate); - usage_us = max(usage_us, inflight_us); - } - - /* convert to hweight based usage ratio */ - if (time_after64(iocg->activated_at, ioc->period_at)) - usage_dur = max_t(u64, now.now - iocg->activated_at, 1); - else - usage_dur = max_t(u64, now.now - ioc->period_at, 1); - - usage = clamp_t(u32, - DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, - usage_dur), - 1, WEIGHT_ONE); - /* see whether there's surplus vtime */ WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); if (hw_inuse < hw_active || (!waitqueue_active(&iocg->waitq) && time_before64(vtime, now.vnow - ioc->margins.low))) { - u32 hwa, old_hwi, hwm, new_hwi; + u32 hwa, old_hwi, hwm, new_hwi, usage; + u64 usage_dur; + + if (vdone != vtime) { + u64 inflight_us = DIV64_U64_ROUND_UP( + cost_to_abs_cost(vtime - vdone, hw_inuse), + ioc->vtime_base_rate); + + usage_us = max(usage_us, inflight_us); + } + + /* convert to hweight based usage ratio */ + if (time_after64(iocg->activated_at, ioc->period_at)) + usage_dur = max_t(u64, now.now - iocg->activated_at, 1); + else + usage_dur = max_t(u64, now.now - ioc->period_at, 1); + + usage = clamp_t(u32, + DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, + usage_dur), + 1, WEIGHT_ONE); /* * Already donating or accumulated enough to start. From 1a889083b5439881a254f2d23a376fdb7528313f Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 26 Nov 2020 16:16:14 +0800 Subject: [PATCH 0377/5344] blk-iocost: Factor out the active iocgs' state check into a separate function commit 2474787a75b4f358e81f367653c73edecd67aa2d upstream. Factor out the iocgs' state check into a separate function to simplify the ioc_timer_fn(). No functional change. Signed-off-by: Baolin Wang Acked-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: Chengming Zhou --- block/blk-iocost.c | 94 ++++++++++++++++++++++++++-------------------- 1 file changed, 54 insertions(+), 40 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 6f8905e07e85b..168c4f1567f53 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2073,40 +2073,21 @@ static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors, } } -static void ioc_timer_fn(struct timer_list *timer) +/* + * Check the active iocgs' state to avoid oversleeping and deactive + * idle iocgs. + * + * Since waiters determine the sleep durations based on the vrate + * they saw at the time of sleep, if vrate has increased, some + * waiters could be sleeping for too long. Wake up tardy waiters + * which should have woken up in the last period and expire idle + * iocgs. + */ +static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now) { - struct ioc *ioc = container_of(timer, struct ioc, timer); + int nr_debtors = 0; struct ioc_gq *iocg, *tiocg; - struct ioc_now now; - LIST_HEAD(surpluses); - int nr_debtors = 0, nr_shortages = 0, nr_lagging = 0; - u64 usage_us_sum = 0; - u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; - u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; - u32 missed_ppm[2], rq_wait_pct; - u64 period_vtime; - int prev_busy_level; - - /* how were the latencies during the period? */ - ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct); - /* take care of active iocgs */ - spin_lock_irq(&ioc->lock); - - ioc_now(ioc, &now); - - period_vtime = now.vnow - ioc->period_at_vtime; - if (WARN_ON_ONCE(!period_vtime)) { - spin_unlock_irq(&ioc->lock); - return; - } - - /* - * Waiters determine the sleep durations based on the vrate they - * saw at the time of sleep. If vrate has increased, some waiters - * could be sleeping for too long. Wake up tardy waiters which - * should have woken up in the last period and expire idle iocgs. - */ list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && !iocg->delay && !iocg_is_idle(iocg)) @@ -2116,24 +2097,24 @@ static void ioc_timer_fn(struct timer_list *timer) /* flush wait and indebt stat deltas */ if (iocg->wait_since) { - iocg->local_stat.wait_us += now.now - iocg->wait_since; - iocg->wait_since = now.now; + iocg->local_stat.wait_us += now->now - iocg->wait_since; + iocg->wait_since = now->now; } if (iocg->indebt_since) { iocg->local_stat.indebt_us += - now.now - iocg->indebt_since; - iocg->indebt_since = now.now; + now->now - iocg->indebt_since; + iocg->indebt_since = now->now; } if (iocg->indelay_since) { iocg->local_stat.indelay_us += - now.now - iocg->indelay_since; - iocg->indelay_since = now.now; + now->now - iocg->indelay_since; + iocg->indelay_since = now->now; } if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt || iocg->delay) { /* might be oversleeping vtime / hweight changes, kick */ - iocg_kick_waitq(iocg, true, &now); + iocg_kick_waitq(iocg, true, now); if (iocg->abs_vdebt || iocg->delay) nr_debtors++; } else if (iocg_is_idle(iocg)) { @@ -2147,7 +2128,7 @@ static void ioc_timer_fn(struct timer_list *timer) * error and throw away. On reactivation, it'll start * with the target budget. */ - excess = now.vnow - vtime - ioc->margins.target; + excess = now->vnow - vtime - ioc->margins.target; if (excess > 0) { u32 old_hwi; @@ -2156,13 +2137,46 @@ static void ioc_timer_fn(struct timer_list *timer) WEIGHT_ONE); } - __propagate_weights(iocg, 0, 0, false, &now); + __propagate_weights(iocg, 0, 0, false, now); list_del_init(&iocg->active_list); } spin_unlock(&iocg->waitq.lock); } + commit_weights(ioc); + return nr_debtors; +} + +static void ioc_timer_fn(struct timer_list *timer) +{ + struct ioc *ioc = container_of(timer, struct ioc, timer); + struct ioc_gq *iocg, *tiocg; + struct ioc_now now; + LIST_HEAD(surpluses); + int nr_debtors, nr_shortages = 0, nr_lagging = 0; + u64 usage_us_sum = 0; + u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; + u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; + u32 missed_ppm[2], rq_wait_pct; + u64 period_vtime; + int prev_busy_level; + + /* how were the latencies during the period? */ + ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct); + + /* take care of active iocgs */ + spin_lock_irq(&ioc->lock); + + ioc_now(ioc, &now); + + period_vtime = now.vnow - ioc->period_at_vtime; + if (WARN_ON_ONCE(!period_vtime)) { + spin_unlock_irq(&ioc->lock); + return; + } + + nr_debtors = ioc_check_iocgs(ioc, &now); /* * Wait and indebt stat are flushed above and the donation calculation From 2ec60cfb9f9c7d506307852d96030c502ae00b22 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 26 Nov 2020 16:16:15 +0800 Subject: [PATCH 0378/5344] blk-iocost: Factor out the base vrate change into a separate function commit 926f75f6a9ef503d45dced061e304d0324beeba1 upstream. Factor out the base vrate change code into a separate function to fimplify the ioc_timer_fn(). No functional change. Signed-off-by: Baolin Wang Acked-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: Chengming Zhou --- block/blk-iocost.c | 99 +++++++++++++++++++++++++--------------------- 1 file changed, 54 insertions(+), 45 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 168c4f1567f53..18bb593adcdeb 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -974,6 +974,58 @@ static void ioc_refresh_vrate(struct ioc *ioc, struct ioc_now *now) ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod); } +static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct, + int nr_lagging, int nr_shortages, + int prev_busy_level, u32 *missed_ppm) +{ + u64 vrate = ioc->vtime_base_rate; + u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max; + + if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) { + if (ioc->busy_level != prev_busy_level || nr_lagging) + trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate), + missed_ppm, rq_wait_pct, + nr_lagging, nr_shortages); + + return; + } + + /* rq_wait signal is always reliable, ignore user vrate_min */ + if (rq_wait_pct > RQ_WAIT_BUSY_PCT) + vrate_min = VRATE_MIN; + + /* + * If vrate is out of bounds, apply clamp gradually as the + * bounds can change abruptly. Otherwise, apply busy_level + * based adjustment. + */ + if (vrate < vrate_min) { + vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), 100); + vrate = min(vrate, vrate_min); + } else if (vrate > vrate_max) { + vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), 100); + vrate = max(vrate, vrate_max); + } else { + int idx = min_t(int, abs(ioc->busy_level), + ARRAY_SIZE(vrate_adj_pct) - 1); + u32 adj_pct = vrate_adj_pct[idx]; + + if (ioc->busy_level > 0) + adj_pct = 100 - adj_pct; + else + adj_pct = 100 + adj_pct; + + vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100), + vrate_min, vrate_max); + } + + trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct, + nr_lagging, nr_shortages); + + ioc->vtime_base_rate = vrate; + ioc_refresh_margins(ioc); +} + /* take a snapshot of the current [v]time and vrate */ static void ioc_now(struct ioc *ioc, struct ioc_now *now) { @@ -2344,51 +2396,8 @@ static void ioc_timer_fn(struct timer_list *timer) ioc->busy_level = clamp(ioc->busy_level, -1000, 1000); - if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) { - u64 vrate = ioc->vtime_base_rate; - u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max; - - /* rq_wait signal is always reliable, ignore user vrate_min */ - if (rq_wait_pct > RQ_WAIT_BUSY_PCT) - vrate_min = VRATE_MIN; - - /* - * If vrate is out of bounds, apply clamp gradually as the - * bounds can change abruptly. Otherwise, apply busy_level - * based adjustment. - */ - if (vrate < vrate_min) { - vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), - 100); - vrate = min(vrate, vrate_min); - } else if (vrate > vrate_max) { - vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), - 100); - vrate = max(vrate, vrate_max); - } else { - int idx = min_t(int, abs(ioc->busy_level), - ARRAY_SIZE(vrate_adj_pct) - 1); - u32 adj_pct = vrate_adj_pct[idx]; - - if (ioc->busy_level > 0) - adj_pct = 100 - adj_pct; - else - adj_pct = 100 + adj_pct; - - vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100), - vrate_min, vrate_max); - } - - trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct, - nr_lagging, nr_shortages); - - ioc->vtime_base_rate = vrate; - ioc_refresh_margins(ioc); - } else if (ioc->busy_level != prev_busy_level || nr_lagging) { - trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate), - missed_ppm, rq_wait_pct, nr_lagging, - nr_shortages); - } + ioc_adjust_base_vrate(ioc, rq_wait_pct, nr_lagging, nr_shortages, + prev_busy_level, missed_ppm); ioc_refresh_params(ioc, false); From 78d72c8c3ef443c86ba678d2c7d738bee37a155d Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 10 Dec 2020 18:56:44 +0800 Subject: [PATCH 0379/5344] blk-iocost: Add iocg idle state tracepoint commit 76efc1c770968d6c786e5340029f8005ed29b2a5 upstream. It will be helpful to trace the iocg's whole state, including active and idle state. And we can easily expand the original iocost_iocg_activate trace event to support a state trace class, including active and idle state tracing. Acked-by: Tejun Heo Signed-off-by: Baolin Wang Signed-off-by: Jens Axboe Signed-off-by: Chengming Zhou --- block/blk-iocost.c | 3 +++ include/trace/events/iocost.h | 16 +++++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 18bb593adcdeb..6ddca77e3f35b 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2189,6 +2189,9 @@ static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now) WEIGHT_ONE); } + TRACE_IOCG_PATH(iocg_idle, iocg, now, + atomic64_read(&iocg->active_period), + atomic64_read(&ioc->cur_period), vtime); __propagate_weights(iocg, 0, 0, false, now); list_del_init(&iocg->active_list); } diff --git a/include/trace/events/iocost.h b/include/trace/events/iocost.h index 0b6869980ba22..e282ce02fa2d6 100644 --- a/include/trace/events/iocost.h +++ b/include/trace/events/iocost.h @@ -11,7 +11,7 @@ struct ioc_gq; #include -TRACE_EVENT(iocost_iocg_activate, +DECLARE_EVENT_CLASS(iocost_iocg_state, TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now, u64 last_period, u64 cur_period, u64 vtime), @@ -59,6 +59,20 @@ TRACE_EVENT(iocost_iocg_activate, ) ); +DEFINE_EVENT(iocost_iocg_state, iocost_iocg_activate, + TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now, + u64 last_period, u64 cur_period, u64 vtime), + + TP_ARGS(iocg, path, now, last_period, cur_period, vtime) +); + +DEFINE_EVENT(iocost_iocg_state, iocost_iocg_idle, + TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now, + u64 last_period, u64 cur_period, u64 vtime), + + TP_ARGS(iocg, path, now, last_period, cur_period, vtime) +); + DECLARE_EVENT_CLASS(iocg_inuse_update, TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now, From 0dd1254bd2779a884ff6d55d81a910c43c7429f8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 22 Apr 2021 21:54:28 -0400 Subject: [PATCH 0380/5344] blk-iocost: don't ignore vrate_min on QD contention commit f46ec84b5acbf8d7067d71a6bbdde213d4b86036 upstream. ioc_adjust_base_vrate() ignored vrate_min when rq_wait_pct indicates that there is QD contention. The reasoning was that QD depletion always reliably indicates device saturation and thus it's safe to override user specified vrate_min. However, this sometimes leads to unnecessary throttling, especially on really fast devices, because vrate adjustments have delays and inertia. It also confuses users because the behavior violates the explicitly specified configuration. This patch drops the special case handling so that vrate_min is always applied. Signed-off-by: Tejun Heo Link: https://lore.kernel.org/r/YIIo1HuyNmhDeiNx@slm.duckdns.org Signed-off-by: Jens Axboe Signed-off-by: Chengming Zhou --- block/blk-iocost.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 6ddca77e3f35b..af5d75cad2bee 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -990,10 +990,6 @@ static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct, return; } - /* rq_wait signal is always reliable, ignore user vrate_min */ - if (rq_wait_pct > RQ_WAIT_BUSY_PCT) - vrate_min = VRATE_MIN; - /* * If vrate is out of bounds, apply clamp gradually as the * bounds can change abruptly. Otherwise, apply busy_level From 5a2c05900e843f324d09e7b5397e8ccece345b86 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 May 2021 21:38:36 -0400 Subject: [PATCH 0381/5344] blk-iocost: fix weight updates of inner active iocgs commit e9f4eee9a0023ba22db9560d4cc6ee63f933dae8 upstream. When the weight of an active iocg is updated, weight_updated() is called which in turn calls __propagate_weights() to update the active and inuse weights so that the effective hierarchical weights are update accordingly. The current implementation is incorrect for inner active nodes. For an active leaf iocg, inuse can be any value between 1 and active and the difference represents how much the iocg is donating. When weight is updated, as long as inuse is clamped between 1 and the new weight, we're alright and this is what __propagate_weights() currently implements. However, that's not how an active inner node's inuse is set. An inner node's inuse is solely determined by the ratio between the sums of inuse's and active's of its children - ie. they're results of propagating the leaves' active and inuse weights upwards. __propagate_weights() incorrectly applies the same clamping as for a leaf when an active inner node's weight is updated. Consider a hierarchy which looks like the following with saturating workloads in AA and BB. R / \ A B | | AA BB 1. For both A and B, active=100, inuse=100, hwa=0.5, hwi=0.5. 2. echo 200 > A/io.weight 3. __propagate_weights() update A's active to 200 and leave inuse at 100 as it's already between 1 and the new active, making A:active=200, A:inuse=100. As R's active_sum is updated along with A's active, A:hwa=2/3, B:hwa=1/3. However, because the inuses didn't change, the hwi's remain unchanged at 0.5. 4. The weight of A is now twice that of B but AA and BB still have the same hwi of 0.5 and thus are doing the same amount of IOs. Fix it by making __propgate_weights() always calculate the inuse of an active inner iocg based on the ratio of child_inuse_sum to child_active_sum. Signed-off-by: Tejun Heo Reported-by: Dan Schatzberg Fixes: 7caa47151ab2 ("blkcg: implement blk-iocost") Cc: stable@vger.kernel.org # v5.4+ Link: https://lore.kernel.org/r/YJsxnLZV1MnBcqjj@slm.duckdns.org Signed-off-by: Jens Axboe Signed-off-by: Chengming Zhou --- block/blk-iocost.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index af5d75cad2bee..1bf19194defe2 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1073,7 +1073,17 @@ static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse, lockdep_assert_held(&ioc->lock); - inuse = clamp_t(u32, inuse, 1, active); + /* + * For an active leaf node, its inuse shouldn't be zero or exceed + * @active. An active internal node's inuse is solely determined by the + * inuse to active ratio of its children regardless of @inuse. + */ + if (list_empty(&iocg->active_list) && iocg->child_active_sum) { + inuse = DIV64_U64_ROUND_UP(active * iocg->child_inuse_sum, + iocg->child_active_sum); + } else { + inuse = clamp_t(u32, inuse, 1, active); + } iocg->last_inuse = iocg->inuse; if (save) @@ -1090,7 +1100,7 @@ static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse, /* update the level sums */ parent->child_active_sum += (s32)(active - child->active); parent->child_inuse_sum += (s32)(inuse - child->inuse); - /* apply the udpates */ + /* apply the updates */ child->active = active; child->inuse = inuse; From bff898c24a8a239a2c92d28d462c76d74178504a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 23 Jul 2020 12:33:41 -0700 Subject: [PATCH 0382/5344] list: add "list_del_init_careful()" to go with "list_empty_careful()" commit c6fe44d96fc1536af5b11cd859686453d1b7bfd1 upstream. That gives us ordering guarantees around the pair. Signed-off-by: Linus Torvalds Signed-off-by: Chengming Zhou --- include/linux/list.h | 20 +++++++++++++++++++- kernel/sched/wait.c | 2 +- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/include/linux/list.h b/include/linux/list.h index 85c92555e31f8..f471c6d56c58e 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -268,6 +268,24 @@ static inline int list_empty(const struct list_head *head) return READ_ONCE(head->next) == head; } +/** + * list_del_init_careful - deletes entry from list and reinitialize it. + * @entry: the element to delete from the list. + * + * This is the same as list_del_init(), except designed to be used + * together with list_empty_careful() in a way to guarantee ordering + * of other memory operations. + * + * Any memory operations done before a list_del_init_careful() are + * guaranteed to be visible after a list_empty_careful() test. + */ +static inline void list_del_init_careful(struct list_head *entry) +{ + __list_del_entry(entry); + entry->prev = entry; + smp_store_release(&entry->next, entry); +} + /** * list_empty_careful - tests whether a list is empty and not being modified * @head: the list to test @@ -283,7 +301,7 @@ static inline int list_empty(const struct list_head *head) */ static inline int list_empty_careful(const struct list_head *head) { - struct list_head *next = head->next; + struct list_head *next = smp_load_acquire(&head->next); return (next == head) && (next == head->prev); } diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 84bd05117dc22..5015ef5ad4327 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -377,7 +377,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i int ret = default_wake_function(wq_entry, mode, sync, key); if (ret) - list_del_init(&wq_entry->entry); + list_del_init_careful(&wq_entry->entry); return ret; } From aafa96ba4eee219d4cf91dbc51ca66634e24880c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Jul 2021 14:38:09 -1000 Subject: [PATCH 0383/5344] blk-iocost: fix operation ordering in iocg_wake_fn() commit 5ab189cf3abbc9994bae3be524c5b88589ed56e2 upstream. iocg_wake_fn() open-codes wait_queue_entry removal and wakeup because it wants the wq_entry to be always removed whether it ended up waking the task or not. finish_wait() tests whether wq_entry needs removal without grabbing the wait_queue lock and expects the waker to use list_del_init_careful() after all waking operations are complete, which iocg_wake_fn() didn't do. The operation order was wrong and the regular list_del_init() was used. The result is that if a waiter wakes up racing the waker, it can free pop the wq_entry off stack before the waker is still looking at it, which can lead to a backtrace like the following. [7312084.588951] general protection fault, probably for non-canonical address 0x586bf4005b2b88: 0000 [#1] SMP ... [7312084.647079] RIP: 0010:queued_spin_lock_slowpath+0x171/0x1b0 ... [7312084.858314] Call Trace: [7312084.863548] _raw_spin_lock_irqsave+0x22/0x30 [7312084.872605] try_to_wake_up+0x4c/0x4f0 [7312084.880444] iocg_wake_fn+0x71/0x80 [7312084.887763] __wake_up_common+0x71/0x140 [7312084.895951] iocg_kick_waitq+0xe8/0x2b0 [7312084.903964] ioc_rqos_throttle+0x275/0x650 [7312084.922423] __rq_qos_throttle+0x20/0x30 [7312084.930608] blk_mq_make_request+0x120/0x650 [7312084.939490] generic_make_request+0xca/0x310 [7312084.957600] submit_bio+0x173/0x200 [7312084.981806] swap_readpage+0x15c/0x240 [7312084.989646] read_swap_cache_async+0x58/0x60 [7312084.998527] swap_cluster_readahead+0x201/0x320 [7312085.023432] swapin_readahead+0x2df/0x450 [7312085.040672] do_swap_page+0x52f/0x820 [7312085.058259] handle_mm_fault+0xa16/0x1420 [7312085.066620] do_page_fault+0x2c6/0x5c0 [7312085.074459] page_fault+0x2f/0x40 Fix it by switching to list_del_init_careful() and putting it at the end. Signed-off-by: Tejun Heo Reported-by: Rik van Riel Fixes: 7caa47151ab2 ("blkcg: implement blk-iocost") Cc: stable@vger.kernel.org # v5.4+ Signed-off-by: Jens Axboe Signed-off-by: Chengming Zhou --- block/blk-iocost.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 1bf19194defe2..a27822e4ae2bb 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1444,16 +1444,17 @@ static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode, return -1; iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost); + wait->committed = true; /* * autoremove_wake_function() removes the wait entry only when it - * actually changed the task state. We want the wait always - * removed. Remove explicitly and use default_wake_function(). + * actually changed the task state. We want the wait always removed. + * Remove explicitly and use default_wake_function(). Note that the + * order of operations is important as finish_wait() tests whether + * @wq_entry is removed without grabbing the lock. */ - list_del_init(&wq_entry->entry); - wait->committed = true; - default_wake_function(wq_entry, mode, flags, key); + list_del_init_careful(&wq_entry->entry); return 0; } From 916c5e78ad509384b9cf8c7267f0c743f109e36c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 3 Aug 2021 15:06:08 +0800 Subject: [PATCH 0384/5344] blk-iocost: fix lockdep warning on blkcg->lock commit 11431e26c9c43fa26f6b33ee1a90989f57b86024 upstream. blkcg->lock depends on q->queue_lock which may depend on another driver lock required in irq context, one example is dm-thin: Chain exists of: &pool->lock#3 --> &q->queue_lock --> &blkcg->lock Possible interrupt unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&blkcg->lock); local_irq_disable(); lock(&pool->lock#3); lock(&q->queue_lock); lock(&pool->lock#3); Fix the issue by using spin_lock_irq(&blkcg->lock) in ioc_weight_write(). Cc: Tejun Heo Reported-by: Bruno Goncalves Link: https://lore.kernel.org/linux-block/CA+QYu4rzz6079ighEanS3Qq_Dmnczcf45ZoJoHKVLVATTo1e4Q@mail.gmail.com/T/#u Signed-off-by: Ming Lei Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20210803070608.1766400-1-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Chengming Zhou --- block/blk-iocost.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index a27822e4ae2bb..7e359449ce54c 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -3082,19 +3082,19 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX) return -EINVAL; - spin_lock(&blkcg->lock); + spin_lock_irq(&blkcg->lock); iocc->dfl_weight = v * WEIGHT_ONE; hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { struct ioc_gq *iocg = blkg_to_iocg(blkg); if (iocg) { - spin_lock_irq(&iocg->ioc->lock); + spin_lock(&iocg->ioc->lock); ioc_now(iocg->ioc, &now); weight_updated(iocg, &now); - spin_unlock_irq(&iocg->ioc->lock); + spin_unlock(&iocg->ioc->lock); } } - spin_unlock(&blkcg->lock); + spin_unlock_irq(&blkcg->lock); return nbytes; } From 7bc8afecfe67b50910d817f7799b80d9ce552cd2 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Tue, 10 May 2022 11:47:57 +0800 Subject: [PATCH 0385/5344] blk-iocost: combine local_stat and desc_stat to stat commit 2a371f7d5fa575010b915e325c5d20b9ad0d5d5a upstream. When we flush usage, wait, indebt stat in iocg_flush_stat(), we use local_stat and desc_stat, which has no point since the leaf iocg only has local_stat and the inner iocg only has desc_stat. Also we don't need to flush percpu abs_vusage for these inner iocgs. This patch combine local_stat and desc_stat to stat, only flush percpu abs_vusage for active leaf iocgs, then build inner walk list to propagate. Signed-off-by: Chengming Zhou Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220510034757.21761-1-zhouchengming@bytedance.com Signed-off-by: Jens Axboe --- block/blk-iocost.c | 71 +++++++++++++++++++++------------------------- 1 file changed, 32 insertions(+), 39 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 7e359449ce54c..682041cf03911 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -536,8 +536,7 @@ struct ioc_gq { /* statistics */ struct iocg_pcpu_stat __percpu *pcpu_stat; - struct iocg_stat local_stat; - struct iocg_stat desc_stat; + struct iocg_stat stat; struct iocg_stat last_stat; u64 last_stat_abs_vusage; u64 usage_delta_us; @@ -1375,7 +1374,7 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) return true; } else { if (iocg->indelay_since) { - iocg->local_stat.indelay_us += now->now - iocg->indelay_since; + iocg->stat.indelay_us += now->now - iocg->indelay_since; iocg->indelay_since = 0; } iocg->delay = 0; @@ -1423,7 +1422,7 @@ static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay, /* if debt is paid in full, restore inuse */ if (!iocg->abs_vdebt) { - iocg->local_stat.indebt_us += now->now - iocg->indebt_since; + iocg->stat.indebt_us += now->now - iocg->indebt_since; iocg->indebt_since = 0; propagate_weights(iocg, iocg->active, iocg->last_inuse, @@ -1517,7 +1516,7 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt, if (!waitqueue_active(&iocg->waitq)) { if (iocg->wait_since) { - iocg->local_stat.wait_us += now->now - iocg->wait_since; + iocg->stat.wait_us += now->now - iocg->wait_since; iocg->wait_since = 0; } return; @@ -1645,11 +1644,30 @@ static void iocg_build_inner_walk(struct ioc_gq *iocg, } } +/* propagate the deltas to the parent */ +static void iocg_flush_stat_upward(struct ioc_gq *iocg) +{ + if (iocg->level > 0) { + struct iocg_stat *parent_stat = + &iocg->ancestors[iocg->level - 1]->stat; + + parent_stat->usage_us += + iocg->stat.usage_us - iocg->last_stat.usage_us; + parent_stat->wait_us += + iocg->stat.wait_us - iocg->last_stat.wait_us; + parent_stat->indebt_us += + iocg->stat.indebt_us - iocg->last_stat.indebt_us; + parent_stat->indelay_us += + iocg->stat.indelay_us - iocg->last_stat.indelay_us; + } + + iocg->last_stat = iocg->stat; +} + /* collect per-cpu counters and propagate the deltas to the parent */ -static void iocg_flush_stat_one(struct ioc_gq *iocg, struct ioc_now *now) +static void iocg_flush_stat_leaf(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; - struct iocg_stat new_stat; u64 abs_vusage = 0; u64 vusage_delta; int cpu; @@ -1665,34 +1683,9 @@ static void iocg_flush_stat_one(struct ioc_gq *iocg, struct ioc_now *now) iocg->last_stat_abs_vusage = abs_vusage; iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate); - iocg->local_stat.usage_us += iocg->usage_delta_us; - - /* propagate upwards */ - new_stat.usage_us = - iocg->local_stat.usage_us + iocg->desc_stat.usage_us; - new_stat.wait_us = - iocg->local_stat.wait_us + iocg->desc_stat.wait_us; - new_stat.indebt_us = - iocg->local_stat.indebt_us + iocg->desc_stat.indebt_us; - new_stat.indelay_us = - iocg->local_stat.indelay_us + iocg->desc_stat.indelay_us; - - /* propagate the deltas to the parent */ - if (iocg->level > 0) { - struct iocg_stat *parent_stat = - &iocg->ancestors[iocg->level - 1]->desc_stat; - - parent_stat->usage_us += - new_stat.usage_us - iocg->last_stat.usage_us; - parent_stat->wait_us += - new_stat.wait_us - iocg->last_stat.wait_us; - parent_stat->indebt_us += - new_stat.indebt_us - iocg->last_stat.indebt_us; - parent_stat->indelay_us += - new_stat.indelay_us - iocg->last_stat.indelay_us; - } + iocg->stat.usage_us += iocg->usage_delta_us; - iocg->last_stat = new_stat; + iocg_flush_stat_upward(iocg); } /* get stat counters ready for reading on all active iocgs */ @@ -1703,13 +1696,13 @@ static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now) /* flush leaves and build inner node walk list */ list_for_each_entry(iocg, target_iocgs, active_list) { - iocg_flush_stat_one(iocg, now); + iocg_flush_stat_leaf(iocg, now); iocg_build_inner_walk(iocg, &inner_walk); } /* keep flushing upwards by walking the inner list backwards */ list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) { - iocg_flush_stat_one(iocg, now); + iocg_flush_stat_upward(iocg); list_del_init(&iocg->walk_list); } } @@ -2156,16 +2149,16 @@ static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now) /* flush wait and indebt stat deltas */ if (iocg->wait_since) { - iocg->local_stat.wait_us += now->now - iocg->wait_since; + iocg->stat.wait_us += now->now - iocg->wait_since; iocg->wait_since = now->now; } if (iocg->indebt_since) { - iocg->local_stat.indebt_us += + iocg->stat.indebt_us += now->now - iocg->indebt_since; iocg->indebt_since = now->now; } if (iocg->indelay_since) { - iocg->local_stat.indelay_us += + iocg->stat.indelay_us += now->now - iocg->indelay_since; iocg->indelay_since = now->now; } From b216dce90c628ec82cc5ad10452c472de240f88e Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 8 Apr 2022 19:53:08 +0800 Subject: [PATCH 0386/5344] sched/fair: Fix cfs_rq_clock_pelt() for throttled cfs_rq commit 64eaf50731ac0a8c76ce2fedd50ef6652aabc5ff upstream. Since commit 23127296889f ("sched/fair: Update scale invariance of PELT") change to use rq_clock_pelt() instead of rq_clock_task(), we should also use rq_clock_pelt() for throttled_clock_task_time and throttled_clock_task accounting to get correct cfs_rq_clock_pelt() of throttled cfs_rq. And rename throttled_clock_task(_time) to be clock_pelt rather than clock_task. Fixes: 23127296889f ("sched/fair: Update scale invariance of PELT") Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ben Segall Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20220408115309.81603-1-zhouchengming@bytedance.com --- kernel/sched/fair.c | 8 ++++---- kernel/sched/pelt.h | 4 ++-- kernel/sched/sched.h | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d39ed2e45023f..6f7a6871a5a4a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4567,8 +4567,8 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->throttle_count--; if (!cfs_rq->throttle_count) { - cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - - cfs_rq->throttled_clock_task; + cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - + cfs_rq->throttled_clock_pelt; /* Add cfs_rq with already running entity in the list */ if (cfs_rq->nr_running >= 1) @@ -4585,7 +4585,7 @@ static int tg_throttle_down(struct task_group *tg, void *data) /* group is entering throttled state, stop time */ if (!cfs_rq->throttle_count) { - cfs_rq->throttled_clock_task = rq_clock_task(rq); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); list_del_leaf_cfs_rq(cfs_rq); } cfs_rq->throttle_count++; @@ -5025,7 +5025,7 @@ static void sync_throttle(struct task_group *tg, int cpu) pcfs_rq = tg->parent->cfs_rq[cpu]; cfs_rq->throttle_count = pcfs_rq->throttle_count; - cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu)); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu)); } /* conditionally throttle active cfs_rq's from put_prev_entity() */ diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index afff644da0650..43e2a47489fae 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -127,9 +127,9 @@ static inline u64 rq_clock_pelt(struct rq *rq) static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { if (unlikely(cfs_rq->throttle_count)) - return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; + return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time; - return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; + return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time; } #else static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a52232dd13f2c..8d484647fe96d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -586,8 +586,8 @@ struct cfs_rq { s64 runtime_remaining; u64 throttled_clock; - u64 throttled_clock_task; - u64 throttled_clock_task_time; + u64 throttled_clock_pelt; + u64 throttled_clock_pelt_time; int throttled; int throttle_count; struct list_head throttled_list; From 51cb4655f2c90f701718184f0f70b98e2ce9cd66 Mon Sep 17 00:00:00 2001 From: hanjinke Date: Thu, 16 Jun 2022 19:05:47 +0800 Subject: [PATCH 0387/5344] bytedance: block: Add iotrace for cgroup io controller MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The iotrace traces iops、bps and latency for per cgroup. All metrics are collected to the io.iotrace.stat files in cgroup directory. We can enable and disable it by writing the io.iotrace.enable file in root cgroup directory. Signed-off-by: hanjinke --- block/Kconfig | 11 + block/Makefile | 1 + block/bio.c | 3 + block/blk-cgroup.c | 4 + block/blk-iotrace.c | 557 ++++++++++++++++++++++++++++++++++++++ block/blk-rq-qos.h | 3 + block/blk.h | 6 + include/linux/blk_types.h | 18 ++ 8 files changed, 603 insertions(+) create mode 100644 block/blk-iotrace.c diff --git a/block/Kconfig b/block/Kconfig index 86ff6ecd7c275..7e1cff1cfc1a9 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -150,6 +150,17 @@ config BLK_CGROUP_IOCOST distributes IO capacity between different groups based on their share of the overall weight distribution. +config BYTEDANCE_BLK_CGROUP_IOTRACE + default n + bool "Enable support for io stat trace for cgroup IO controller" + depends on BLK_CGROUP=y + help + Enabling this option enable iotrace interface for iotrace. + The iotrace collects io metrics for io cgroup which can + be a complement to current io stat traces. + + The code of iotrace is in block/blk-iotrace.c. + config BLK_WBT_MQ bool "Multiqueue writeback throttling" default y diff --git a/block/Makefile b/block/Makefile index 205a5f2fef17f..f2165c766aaec 100644 --- a/block/Makefile +++ b/block/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_BLK_CGROUP_RWSTAT) += blk-cgroup-rwstat.o obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o +obj-$(CONFIG_BYTEDANCE_BLK_CGROUP_IOTRACE) += blk-iotrace.o obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o diff --git a/block/bio.c b/block/bio.c index 4f28cff50afd7..8fb78a9435dce 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1907,6 +1907,9 @@ struct bio *bio_split(struct bio *bio, int sectors, bio_integrity_trim(split); bio_advance(bio, split->bi_iter.bi_size); +#ifdef CONFIG_BYTEDANCE_BLK_CGROUP_IOTRACE + bio_issue_init_sector(&split->bi_issue, bio_sectors(split)); +#endif if (bio_flagged(bio, BIO_TRACE_COMPLETION)) bio_set_flag(split, BIO_TRACE_COMPLETION); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 57ecfbbd0926e..1611fdc8b900b 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1206,6 +1206,10 @@ int blkcg_init_queue(struct request_queue *q) if (preloaded) radix_tree_preload_end(); + ret = blk_iotrace_init(q); + if (ret) + goto err_destroy_all; + ret = blk_throtl_init(q); if (ret) goto err_destroy_all; diff --git a/block/blk-iotrace.c b/block/blk-iotrace.c new file mode 100644 index 0000000000000..4de4adf958b5a --- /dev/null +++ b/block/blk-iotrace.c @@ -0,0 +1,557 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Block iotrace code + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "blk-rq-qos.h" +#include "blk-stat.h" + +static struct blkcg_policy blkcg_policy_iotrace; + +struct blk_iotrace { + struct rq_qos rqos; +}; + +#define LAT_BUCKET_NR ARRAY_SIZE(def_latb_thresh) +/* default latency bucket(ns) */ +static uint64_t def_latb_thresh[] = { + 5000000, /* 5 ms */ + 10000000, /* 10 ms */ + 30000000, /* 30 ms */ + 50000000, /* 50 ms */ + 100000000, /* 100 ms */ +}; + +enum { + IOT_READ, + IOT_WRITE, + IOT_OTHER, + IOT_NR, +}; + +struct iotrace_stat { + struct blk_rq_stat rqs; + uint64_t ios[IOT_NR]; + uint64_t sts[IOT_NR]; + uint64_t tms[IOT_NR]; + uint64_t dtms[IOT_NR]; + uint64_t hit[IOT_NR][LAT_BUCKET_NR + 1]; + uint64_t dhit[IOT_NR][LAT_BUCKET_NR + 1]; +}; + +struct iotrace_grp { + struct blkg_policy_data pd; + struct iotrace_stat __percpu *stat_pcpu; + uint64_t thresh_ns[LAT_BUCKET_NR]; + struct iotrace_stat stat; +}; + +/* iotrace_mode */ +#define IOTRACE_DISABLE 0 /* iotrace disable */ +#define IOTRACE_ENABLE 1 /* iotrace enable without q2c */ +#define IOTRACE_ENABLE_ALL 2 /* iotrace enable all */ + +static int __read_mostly iotrace_mode; + +static inline struct blk_iotrace *BLKIOTRACE(struct rq_qos *rqos) +{ + return container_of(rqos, struct blk_iotrace, rqos); +} + +static inline struct iotrace_grp *pd_to_iot(struct blkg_policy_data *pd) +{ + return pd ? container_of(pd, struct iotrace_grp, pd) : NULL; +} + +static inline struct iotrace_grp *blkg_to_iot(struct blkcg_gq *blkg) +{ + return pd_to_iot(blkg_to_pd(blkg, &blkcg_policy_iotrace)); +} + +static inline struct blkcg_gq *iot_to_blkg(struct iotrace_grp *iot) +{ + return pd_to_blkg(&iot->pd); +} + +static struct blkg_policy_data *iotrace_pd_alloc(gfp_t gfp, struct request_queue *q, + struct blkcg *blkcg) +{ + struct iotrace_grp *iot; + + iot = kzalloc_node(sizeof(*iot), gfp, q->node); + if (!iot) + return NULL; + + iot->stat_pcpu = alloc_percpu_gfp(struct iotrace_stat, gfp); + if (!iot->stat_pcpu) { + kfree(iot); + return NULL; + } + + return &iot->pd; +} + +static void iotrace_pd_init(struct blkg_policy_data *pd) +{ + struct iotrace_grp *iot = pd_to_iot(pd); + int i, j, cpu; + + for_each_possible_cpu(cpu) { + struct iotrace_stat *stat; + + stat = per_cpu_ptr(iot->stat_pcpu, cpu); + blk_rq_stat_init(&stat->rqs); + for (i = 0; i < IOT_NR; i++) { + stat->ios[i] = stat->sts[i] = 0; + stat->tms[i] = stat->dtms[i] = 0; + for (j = 0; j < LAT_BUCKET_NR + 1; j++) + stat->hit[i][j] = 0; + } + } + + blk_rq_stat_init(&iot->stat.rqs); + for (i = 0; i < IOT_NR; i++) { + iot->stat.ios[i] = iot->stat.sts[i] = 0; + iot->stat.tms[i] = iot->stat.dtms[i] = 0; + for (j = 0; j < LAT_BUCKET_NR + 1; j++) + iot->stat.hit[i][j] = 0; + } + + for (i = 0; i < LAT_BUCKET_NR; i++) + iot->thresh_ns[i] = def_latb_thresh[i]; +} + +static void iotrace_pd_free(struct blkg_policy_data *pd) +{ + struct iotrace_grp *iot = pd_to_iot(pd); + + free_percpu(iot->stat_pcpu); + kfree(iot); +} + +static inline u64 iotrace_prfill_formal(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + struct iotrace_grp *iot = pd_to_iot(pd); + struct iotrace_stat *stat = &iot->stat; + const char *dname = blkg_dev_name(pd->blkg); + int cpu, i; + + if (!dname) + return 0; + memset(stat, 0, sizeof(struct iotrace_stat)); + + /* collect per cpu data */ + for_each_online_cpu(cpu) { + struct iotrace_stat *s; + + s = per_cpu_ptr(iot->stat_pcpu, cpu); + for (i = 0; i < IOT_NR; i++) { + int j; + + stat->ios[i] += s->ios[i]; + stat->sts[i] += s->sts[i]; + stat->tms[i] += s->tms[i]; + for (j = 0; j < LAT_BUCKET_NR + 1; j++) + stat->hit[i][j] += s->hit[i][j]; + } + } + + seq_printf(sf, "%s rios: %llu wios: %llu oios: %llu rsts: %llu wsts: %llu osts: %llu rtms: %llu wtms: %llu otms: %llu ", + dname, + stat->ios[IOT_READ], stat->ios[IOT_WRITE], stat->ios[IOT_OTHER], + stat->sts[IOT_READ], stat->sts[IOT_WRITE], stat->sts[IOT_OTHER], + stat->tms[IOT_READ], stat->tms[IOT_WRITE], stat->tms[IOT_OTHER]); + + /* read hit */ + seq_puts(sf, " rhit:"); + for (i = 0; i < LAT_BUCKET_NR + 1; i++) + seq_printf(sf, " %llu", stat->hit[IOT_READ][i]); + + /* write hit */ + seq_puts(sf, " whit:"); + for (i = 0; i < LAT_BUCKET_NR + 1; i++) + seq_printf(sf, " %llu", stat->hit[IOT_WRITE][i]); + + /* other hit */ + seq_puts(sf, " ohit:"); + for (i = 0; i < LAT_BUCKET_NR + 1; i++) + seq_printf(sf, " %llu", stat->hit[IOT_OTHER][i]); + + seq_putc(sf, '\n'); + + return 0; +} + +static inline u64 iotrace_prfill_extend(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + struct iotrace_grp *iot = pd_to_iot(pd); + struct iotrace_stat *stat = &iot->stat; + const char *dname = blkg_dev_name(pd->blkg); + int cpu, i, j; + + if (!dname) + return 0; + + memset(stat, 0, sizeof(struct iotrace_stat)); + + /* collect per cpu data */ + for_each_online_cpu(cpu) { + struct iotrace_stat *s; + + s = per_cpu_ptr(iot->stat_pcpu, cpu); + for (i = 0; i < IOT_NR; i++) { + stat->ios[i] += s->ios[i]; + stat->sts[i] += s->sts[i]; + stat->tms[i] += s->tms[i]; + stat->dtms[i] += s->dtms[i]; + for (j = 0; j < LAT_BUCKET_NR + 1; j++) { + stat->hit[i][j] += s->hit[i][j]; + stat->dhit[i][j] += s->dhit[i][j]; + } + } + } + + seq_printf(sf, "%s rios: %llu wios: %llu oios: %llu rsts: %llu wsts: %llu osts: %llu rtms: %llu wtms: %llu otms: %llu rdtms: %llu wdtms: %llu odtms: %llu", + dname, + stat->ios[IOT_READ], stat->ios[IOT_WRITE], stat->ios[IOT_OTHER], + stat->sts[IOT_READ], stat->sts[IOT_WRITE], stat->sts[IOT_OTHER], + stat->tms[IOT_READ], stat->tms[IOT_WRITE], stat->tms[IOT_OTHER], + stat->dtms[IOT_READ], stat->dtms[IOT_WRITE], stat->dtms[IOT_OTHER]); + + /* read hit */ + seq_puts(sf, " rhit:"); + for (i = 0; i < LAT_BUCKET_NR + 1; i++) + seq_printf(sf, " %llu", stat->hit[IOT_READ][i]); + + /* write hit */ + seq_puts(sf, " whit:"); + for (i = 0; i < LAT_BUCKET_NR + 1; i++) + seq_printf(sf, " %llu", stat->hit[IOT_WRITE][i]); + + /* other hit */ + seq_puts(sf, " ohit:"); + for (i = 0; i < LAT_BUCKET_NR + 1; i++) + seq_printf(sf, " %llu", stat->hit[IOT_OTHER][i]); + + /* read dhit */ + seq_puts(sf, " rdhit:"); + for (i = 0; i < LAT_BUCKET_NR + 1; i++) + seq_printf(sf, " %llu", stat->dhit[IOT_READ][i]); + + /* write dhit */ + seq_puts(sf, " wdhit:"); + for (i = 0; i < LAT_BUCKET_NR + 1; i++) + seq_printf(sf, " %llu", stat->dhit[IOT_WRITE][i]); + + /* other dhit */ + seq_puts(sf, " odhit:"); + for (i = 0; i < LAT_BUCKET_NR + 1; i++) + seq_printf(sf, " %llu", stat->dhit[IOT_OTHER][i]); + + seq_putc(sf, '\n'); + + return 0; +} + +static u64 iotrace_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + if (iotrace_mode == IOTRACE_ENABLE) + return iotrace_prfill_formal(sf, pd, off); + + if (iotrace_mode == IOTRACE_ENABLE_ALL) + return iotrace_prfill_extend(sf, pd, off); + + return 0; +} + + +static int iotrace_print_stat(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), iotrace_prfill_stat, + &blkcg_policy_iotrace, seq_cft(sf)->private, false); + return 0; +} + +static u64 iotrace_prfill_lat_thresh(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct iotrace_grp *iot = pd_to_iot(pd); + const char *dname = blkg_dev_name(pd->blkg); + int i; + + if (!dname) + return 0; + + seq_puts(sf, dname); + for (i = 0; i < LAT_BUCKET_NR; i++) + seq_printf(sf, " %llu", iot->thresh_ns[i]); + + seq_putc(sf, '\n'); + + return 0; +} + +static int iotrace_print_lat_thresh(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + iotrace_prfill_lat_thresh, &blkcg_policy_iotrace, + seq_cft(sf)->private, false); + return 0; +} + +static ssize_t iotrace_set_lat_thresh(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct blkcg *blkcg = css_to_blkcg(of_css(of)); + struct blkg_conf_ctx ctx; + struct iotrace_grp *iot; + uint64_t tmp[LAT_BUCKET_NR]; + int i, ret; + char *p; + + ret = blkg_conf_prep(blkcg, &blkcg_policy_iotrace, buf, &ctx); + if (ret) + return ret; + + iot = blkg_to_iot(ctx.blkg); + p = ctx.body; + + ret = -EINVAL; + if (sscanf(p, "%llu %llu %llu %llu %llu", + &tmp[0], &tmp[1], &tmp[2], &tmp[3], &tmp[4]) != LAT_BUCKET_NR) + goto out; + + /* make sure threshold in order */ + for (i = 0; i < LAT_BUCKET_NR - 1; i++) { + if (tmp[i] >= tmp[i + 1]) + goto out; + } + + /* update threshold for each bucket */ + for (i = 0; i < LAT_BUCKET_NR; i++) + iot->thresh_ns[i] = tmp[i]; + + ret = 0; +out: + blkg_conf_finish(&ctx); + return ret ? : nbytes; +} + +static int iotrace_print_enable(struct seq_file *sf, void *v) +{ + seq_printf(sf, "%d\n", iotrace_mode); + return 0; +} + +static ssize_t iotrace_set_mode(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + int mode; + + if (kstrtoint(buf, 10, &mode)) + return -EINVAL; + if ((mode < IOTRACE_DISABLE) || (mode > IOTRACE_ENABLE_ALL)) + return -EINVAL; + + iotrace_mode = mode; + + return nbytes; +} + +/* cgroup v2: io.iotrace.stat */ +static struct cftype iotrace_def_files[] = { + { + .name = "iotrace.stat", + .seq_show = iotrace_print_stat, + }, + { + .name = "iotrace.lat_thresh", + .seq_show = iotrace_print_lat_thresh, + .write = iotrace_set_lat_thresh, + }, + { + .name = "iotrace.enable", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = iotrace_print_enable, + .write = iotrace_set_mode, + }, + {} +}; + +static struct blkcg_policy blkcg_policy_iotrace = { + .dfl_cftypes = iotrace_def_files, + .pd_alloc_fn = iotrace_pd_alloc, + .pd_init_fn = iotrace_pd_init, + .pd_free_fn = iotrace_pd_free, +}; + +static void iotrace_account_bio(struct iotrace_grp *iot, struct bio *bio, + u64 now) +{ + u64 delta, start = bio_issue_time(&bio->bi_issue); + u64 delta_disk, start_disk = bio_issue_time(&bio->bi_start); + struct iotrace_stat *stat; + int i, t; + + now = __bio_issue_time(now); + + if (now <= start) + return; + + switch (bio_op(bio)) { + case REQ_OP_READ: + t = IOT_READ; + break; + case REQ_OP_WRITE: + t = IOT_WRITE; + break; + default: + t = IOT_OTHER; + break; + } + + if (start) + delta = now - start; + else + delta = 0; + stat = get_cpu_ptr(iot->stat_pcpu); + stat->ios[t]++; + stat->sts[t] += bio_issue_size(&bio->bi_issue); + stat->tms[t] += delta; + if (start_disk && (start_disk > start) && (now > start_disk)) + delta_disk = now - start_disk; + else + delta_disk = 0; + stat->dtms[t] += delta_disk; + if (delta >= iot->thresh_ns[LAT_BUCKET_NR - 1]) { + stat->hit[t][LAT_BUCKET_NR]++; + } else { + for (i = 0; i < LAT_BUCKET_NR; i++) { + if (delta < iot->thresh_ns[i]) { + stat->hit[t][i]++; + break; + } + } + } + + if (iotrace_mode != IOTRACE_ENABLE_ALL) { + put_cpu_ptr(stat); + return; + } + + if (delta_disk >= iot->thresh_ns[LAT_BUCKET_NR - 1]) + stat->dhit[t][LAT_BUCKET_NR]++; + else { + for (i = 0; i < LAT_BUCKET_NR; i++) { + if (delta_disk < iot->thresh_ns[i]) { + stat->dhit[t][i]++; + break; + } + } + } + + put_cpu_ptr(stat); +} + +static void blkcg_iotrace_done_bio(struct rq_qos *rqos, struct bio *bio) +{ + struct blkcg_gq *blkg; + struct iotrace_grp *iot; + u64 now; + + if (iotrace_mode == IOTRACE_DISABLE) + return; + + now = ktime_to_ns(ktime_get()); + blkg = bio->bi_blkg; + /* account io statistics */ + while (blkg) { + iot = blkg_to_iot(blkg); + if (!iot) { + WARN_ONCE(1, "struct iotrace_grp is not sync with blkg"); + blkg = blkg->parent; + continue; + } + + iotrace_account_bio(iot, bio, now); + blkg = blkg->parent; + } +} + +static void blkcg_iotrace_issue(struct rq_qos *rqos, struct request *rq) +{ + struct bio *bio; + + if (iotrace_mode != IOTRACE_ENABLE_ALL) + return; + + __rq_for_each_bio(bio, rq) + bio_issue_init_time(&bio->bi_start, rq->io_start_time_ns); +} + +static void blkcg_iotrace_exit(struct rq_qos *rqos) +{ + struct blk_iotrace *blkiotrace = BLKIOTRACE(rqos); + + blkcg_deactivate_policy(rqos->q, &blkcg_policy_iotrace); + kfree(blkiotrace); +} + +static struct rq_qos_ops blkcg_iotrace_ops = { + .issue = blkcg_iotrace_issue, + .done_bio = blkcg_iotrace_done_bio, + .exit = blkcg_iotrace_exit, +}; + +int blk_iotrace_init(struct request_queue *q) +{ + struct blk_iotrace *blkiotrace; + struct rq_qos *rqos; + int ret; + + blkiotrace = kzalloc(sizeof(*blkiotrace), GFP_KERNEL); + if (!blkiotrace) + return -ENOMEM; + + rqos = &blkiotrace->rqos; + rqos->id = RQ_QOS_IOTRACE; + rqos->ops = &blkcg_iotrace_ops; + rqos->q = q; + + rq_qos_add(q, rqos); + + ret = blkcg_activate_policy(q, &blkcg_policy_iotrace); + if (ret) { + rq_qos_del(q, rqos); + kfree(blkiotrace); + return ret; + } + + return 0; +} + +static int __init iotrace_init(void) +{ + return blkcg_policy_register(&blkcg_policy_iotrace); +} + +static void __exit iotrace_exit(void) +{ + return blkcg_policy_unregister(&blkcg_policy_iotrace); +} + +module_init(iotrace_init); +module_exit(iotrace_exit); +MODULE_LICENSE("GPL"); diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 2bcb3495e376b..caecf5a99e11f 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -17,6 +17,7 @@ enum rq_qos_id { RQ_QOS_WBT, RQ_QOS_LATENCY, RQ_QOS_COST, + RQ_QOS_IOTRACE, }; struct rq_wait { @@ -88,6 +89,8 @@ static inline const char *rq_qos_id_to_name(enum rq_qos_id id) return "latency"; case RQ_QOS_COST: return "cost"; + case RQ_QOS_IOTRACE: + return "iotrace"; } return "unknown"; } diff --git a/block/blk.h b/block/blk.h index 0b8884353f6bf..363d9449979ba 100644 --- a/block/blk.h +++ b/block/blk.h @@ -173,6 +173,12 @@ static inline void bio_integrity_free(struct bio *bio) } #endif /* CONFIG_BLK_DEV_INTEGRITY */ +#ifdef CONFIG_BYTEDANCE_BLK_CGROUP_IOTRACE +int blk_iotrace_init(struct request_queue *q); +#else +static inline int blk_iotrace_init(struct request_queue *q) { return 0; } +#endif + unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d688b96d1d633..8fdb38830f331 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -136,7 +136,22 @@ static inline void bio_issue_init(struct bio_issue *issue, (ktime_get_ns() & BIO_ISSUE_TIME_MASK) | ((u64)size << BIO_ISSUE_SIZE_SHIFT)); } +#ifdef CONFIG_BYTEDANCE_BLK_CGROUP_IOTRACE +static inline void bio_issue_init_sector(struct bio_issue *issue, + sector_t size) +{ + size &= (1UL << BIO_ISSUE_SIZE_BITS) - 1; + issue->value = ((issue->value & BIO_ISSUE_RES_MASK) | + ((u64)size << BIO_ISSUE_SIZE_SHIFT) | + (issue->value & BIO_ISSUE_TIME_MASK)); +} +static inline void bio_issue_init_time(struct bio_issue *issue, + u64 time) +{ + issue->value = time & BIO_ISSUE_TIME_MASK; +} +#endif /* * main unit of I/O for the block layer and lower layers (ie drivers and * stacking drivers) @@ -169,6 +184,9 @@ struct bio { */ struct blkcg_gq *bi_blkg; struct bio_issue bi_issue; +#ifdef CONFIG_BYTEDANCE_BLK_CGROUP_IOTRACE + struct bio_issue bi_start; +#endif #ifdef CONFIG_BLK_CGROUP_IOCOST u64 bi_iocost_cost; #endif From 1ef9682ed816d9df2a8f806b5622a0b4d579a7e0 Mon Sep 17 00:00:00 2001 From: hanjinke Date: Thu, 16 Jun 2022 14:39:14 +0800 Subject: [PATCH 0388/5344] bytedance: iotrace: Enable config for iotrace add CONFIG_BYTEDANCE_BLK_CGROUP_IOTRACE to config files Signed-off-by: hanjinke --- config.aarch64 | 1 + config.x86_64 | 1 + 2 files changed, 2 insertions(+) diff --git a/config.aarch64 b/config.aarch64 index 497d4b9f794c3..d639a771f0520 100644 --- a/config.aarch64 +++ b/config.aarch64 @@ -804,6 +804,7 @@ CONFIG_BLK_DEV_THROTTLING=y # CONFIG_BLK_CMDLINE_PARSER is not set CONFIG_BLK_WBT=y # CONFIG_BLK_CGROUP_IOLATENCY is not set +CONFIG_BYTEDANCE_BLK_CGROUP_IOTRACE=y CONFIG_BLK_CGROUP_IOCOST=y CONFIG_BLK_WBT_MQ=y CONFIG_BLK_DEBUG_FS=y diff --git a/config.x86_64 b/config.x86_64 index b31fc2bebd54f..cc8745c835ecc 100644 --- a/config.x86_64 +++ b/config.x86_64 @@ -852,6 +852,7 @@ CONFIG_BLK_DEV_THROTTLING=y # CONFIG_BLK_CMDLINE_PARSER is not set CONFIG_BLK_WBT=y CONFIG_BLK_CGROUP_IOLATENCY=y +CONFIG_BYTEDANCE_BLK_CGROUP_IOTRACE=y CONFIG_BLK_CGROUP_IOCOST=y CONFIG_BLK_WBT_MQ=y CONFIG_BLK_DEBUG_FS=y From 6c393f48b6b5dc31909400e47d3d708873b252e3 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 15 Jul 2022 10:58:53 +0800 Subject: [PATCH 0389/5344] Revert "mm: hugetlb: add hugetlb_free_vmemmap sysctl" This reverts commit 1c0d93dea28b77805e18f20b506107ba13b659df. Signed-off-by: Muchun Song --- Documentation/admin-guide/sysctl/vm.rst | 13 ------------- include/linux/hugetlb.h | 5 ----- kernel/sysctl.c | 12 ------------ mm/hugetlb_vmemmap.c | 22 ++++++---------------- mm/hugetlb_vmemmap.h | 4 +--- 5 files changed, 7 insertions(+), 49 deletions(-) diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index c4b6f1a03b826..64aeee1009cab 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -548,19 +548,6 @@ Change the minimum size of the hugepage pool. See Documentation/admin-guide/mm/hugetlbpage.rst -hugetlb_free_vmemmap -==================== - -A toggle value indicating if vmemmap pages are allowed to be optimized. -If it is off (0), then it can be set true (1). Once true, the vmemmap -pages associated with each HugeTLB page will be optimized, and the toggle -cannot be set back to false. It only optimizes the subsequent allocation -of HugeTLB pages from buddy system, while already allocated HugeTLB pages -will not be optimized. - -See Documentation/admin-guide/mm/hugetlbpage.rst - - nr_hugepages_mempolicy ====================== diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 2759fd61e8558..c4c2746c808d0 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -773,11 +773,6 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr } #endif /* CONFIG_HUGETLB_PAGE */ -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP -int hugetlb_vmemmap_sysctl_handler(struct ctl_table *table, int write, - void *buffer, size_t *length, loff_t *ppos); -#endif - static inline spinlock_t *huge_pte_lock(struct hstate *h, struct mm_struct *mm, pte_t *pte) { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 620faea6fd87c..67def5e774fd3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1462,18 +1462,6 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = hugetlb_sysctl_handler, }, -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP - { - .procname = "hugetlb_free_vmemmap", - .data = &hugetlb_free_vmemmap_enabled_key.key, - .maxlen = sizeof(hugetlb_free_vmemmap_enabled_key.key.enabled), - .mode = 0644, - /* only handle a transition from default "0" to "1" */ - .proc_handler = hugetlb_vmemmap_sysctl_handler, - .extra1 = SYSCTL_ONE, - .extra2 = SYSCTL_ONE, - }, -#endif #ifdef CONFIG_NUMA { .procname = "nr_hugepages_mempolicy", diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 087b79f36bc43..10f0c74db987c 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -284,14 +284,17 @@ void __init hugetlb_vmemmap_init(struct hstate *h) BUILD_BUG_ON(__NR_USED_SUBPAGE >= RESERVE_VMEMMAP_SIZE / sizeof(struct page)); - if (!is_power_of_2(sizeof(struct page))) { + if (!hugetlb_free_vmemmap_enabled()) + return; + + if (IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON) && + !is_power_of_2(sizeof(struct page))) { /* * The hugetlb_free_vmemmap_enabled_key can be enabled when * CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON. It should * be disabled if "struct page" crosses page boundaries. */ - if (IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON)) - static_branch_disable(&hugetlb_free_vmemmap_enabled_key); + static_branch_disable(&hugetlb_free_vmemmap_enabled_key); return; } @@ -310,16 +313,3 @@ void __init hugetlb_vmemmap_init(struct hstate *h) pr_info("can free %d vmemmap pages for %s\n", h->nr_free_vmemmap_pages, h->name); } - -int hugetlb_vmemmap_sysctl_handler(struct ctl_table *table, int write, - void *buffer, size_t *length, loff_t *ppos) -{ - /* - * The vmemmap pages cannot be optimized if a "struct page" crosses page - * boundaries. - */ - if (write && !is_power_of_2(sizeof(struct page))) - return -EPERM; - - return proc_do_static_key(table, write, buffer, length, ppos); -} diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index b67a159027f4a..cb2bef8f9e736 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -21,9 +21,7 @@ void hugetlb_vmemmap_init(struct hstate *h); */ static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h) { - if (hugetlb_free_vmemmap_enabled()) - return h->nr_free_vmemmap_pages; - return 0; + return h->nr_free_vmemmap_pages; } #else static inline int alloc_huge_page_vmemmap(struct hstate *h, struct page *head) From 55317b6df9eaec87a01f9abfc46c6efad271430f Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 15 Jul 2022 10:59:05 +0800 Subject: [PATCH 0390/5344] Revert "mm: hugetlb: disable freeing vmemmap pages when struct page crosses page boundaries" This reverts commit 3efcecc968c461f676cc779ddb7d8f50915f17e9. Signed-off-by: Muchun Song --- mm/hugetlb_vmemmap.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 10f0c74db987c..791626983c2e1 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -287,17 +287,6 @@ void __init hugetlb_vmemmap_init(struct hstate *h) if (!hugetlb_free_vmemmap_enabled()) return; - if (IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON) && - !is_power_of_2(sizeof(struct page))) { - /* - * The hugetlb_free_vmemmap_enabled_key can be enabled when - * CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON. It should - * be disabled if "struct page" crosses page boundaries. - */ - static_branch_disable(&hugetlb_free_vmemmap_enabled_key); - return; - } - vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT; /* * The head page is not to be freed to buddy allocator, the other tail From c2a7053e29fe36fcff27aa0bd890a989cd7eb649 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Apr 2022 23:16:03 -0700 Subject: [PATCH 0391/5344] mm: hugetlb_vmemmap: introduce ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP commit 2e4ec02bbcc05b8905d65c763ebde6bc85508e90 upstream. The feature of minimizing overhead of struct page associated with each HugeTLB page is implemented on x86_64, however, the infrastructure of this feature is already there, we could easily enable it for other architectures. Introduce ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP for other architectures to be easily enabled. Just select this config if they want to enable this feature. Link: https://lkml.kernel.org/r/20220331065640.5777-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Suggested-by: Andrew Morton Reviewed-by: Barry Song Tested-by: Barry Song Reviewed-by: Anshuman Khandual Cc: Bodeddula Balasubramaniam Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Rientjes Cc: Fam Zheng Cc: James Morse Cc: Mark Rutland Cc: Mike Kravetz Cc: Oscar Salvador Cc: Will Deacon Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Muchun Song --- arch/x86/Kconfig | 1 + fs/Kconfig | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 36a28b9e46cbd..0a230bd7fd12e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -96,6 +96,7 @@ config X86 select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANT_HUGE_PMD_SHARE + select ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP if X86_64 select ARCH_WANTS_THP_SWAP if X86_64 select BUILDTIME_EXTABLE_SORT select CLKEVT_I8253 diff --git a/fs/Kconfig b/fs/Kconfig index 00b1af97c119b..001501b3d5ad0 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -213,9 +213,17 @@ config HUGETLBFS config HUGETLB_PAGE def_bool HUGETLBFS +# +# Select this config option from the architecture Kconfig, if it is preferred +# to enable the feature of minimizing overhead of struct page associated with +# each HugeTLB page. +# +config ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP + bool + config HUGETLB_PAGE_FREE_VMEMMAP def_bool HUGETLB_PAGE - depends on X86_64 + depends on ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP depends on SPARSEMEM_VMEMMAP config HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON From e23644d23070509831bc6abbf8b0e79bca390448 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 2 Mar 2022 16:46:23 +0800 Subject: [PATCH 0392/5344] arm64: avoid flushing icache multiple times on contiguous HugeTLB commit cf5a501d985ba1b6ace9b18c64346441819bffea upstream. When a contiguous HugeTLB page is mapped, set_pte_at() will be called CONT_PTES/CONT_PMDS times. Therefore, __sync_icache_dcache() will flush cache multiple times if the page is executable (to ensure the I-D cache coherency). However, the first flushing cache already covers subsequent cache flush operations. So only flusing cache for the head page if it is a HugeTLB page to avoid redundant cache flushing. In the next patch, it is also depends on this change since the tail vmemmap pages of HugeTLB is mapped with read-only meanning only head page struct can be modified. Signed-off-by: Muchun Song Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20220302084624.33340-1-songmuchun@bytedance.com Signed-off-by: Will Deacon Signed-off-by: Muchun Song --- arch/arm64/mm/flush.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index ac485163a4a76..1cef12e2f7097 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -55,6 +55,13 @@ void __sync_icache_dcache(pte_t pte) { struct page *page = pte_page(pte); + /* + * HugeTLB pages are always fully mapped, so only setting head page's + * PG_dcache_clean flag is enough. + */ + if (PageHuge(page)) + page = compound_head(page); + if (!test_and_set_bit(PG_dcache_clean, &page->flags)) sync_icache_aliases(page_address(page), page_size(page)); } From 30252e4cda948af1da028ccf97ca3317bbfa5b15 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Apr 2022 23:16:03 -0700 Subject: [PATCH 0393/5344] arm64: mm: hugetlb: enable HUGETLB_PAGE_FREE_VMEMMAP for arm64 commit 1e63ac088f20f7a4425c430c31ecd3cf167fb3f2 upstream. The feature of minimizing overhead of struct page associated with each HugeTLB page aims to free its vmemmap pages (used as struct page) to save memory, where is ~14GB/16GB per 1TB HugeTLB pages (2MB/1GB type). In short, when a HugeTLB page is allocated or freed, the vmemmap array representing the range associated with the page will need to be remapped. When a page is allocated, vmemmap pages are freed after remapping. When a page is freed, previously discarded vmemmap pages must be allocated before remapping. More implementations and details can be found here [1]. The infrastructure of freeing vmemmap pages associated with each HugeTLB page is already there, we can easily enable HUGETLB_PAGE_FREE_VMEMMAP for arm64, the only thing to be fixed is flush_dcache_page() . flush_dcache_page() need to be adapted to operate on the head page's flags since the tail vmemmap pages are mapped with read-only after the feature is enabled (clear operation is not permitted). There was some discussions about this in the thread [2], but there was no conclusion in the end. And I copied the concern proposed by Anshuman to here and explain why those concern is superfluous. It is safe to enable it for x86_64 as well as arm64. 1st concern: ''' But what happens when a hot remove section's vmemmap area (which is being teared down) is nearby another vmemmap area which is either created or being destroyed for HugeTLB alloc/free purpose. As you mentioned HugeTLB pages inside the hot remove section might be safe. But what about other HugeTLB areas whose vmemmap area shares page table entries with vmemmap entries for a section being hot removed ? Massive HugeTLB alloc /use/free test cycle using memory just adjacent to a memory hotplug area, which is always added and removed periodically, should be able to expose this problem. ''' Answer: At the time memory is removed, all HugeTLB pages either have been migrated away or dissolved. So there is no race between memory hot remove and free_huge_page_vmemmap(). Therefore, HugeTLB pages inside the hot remove section is safe. Let's talk your question "what about other HugeTLB areas whose vmemmap area shares page table entries with vmemmap entries for a section being hot removed ?", the question is not established. The minimal granularity size of hotplug memory 128MB (on arm64, 4k base page), any HugeTLB smaller than 128MB is within a section, then, there is no share PTE page tables between HugeTLB in this section and ones in other sections and a HugeTLB page could not cross two sections. In this case, the section cannot be freed. Any HugeTLB bigger than 128MB (section size) whose vmemmap pages is an integer multiple of 2MB (PMD-mapped). As long as: 1) HugeTLBs are naturally aligned, power-of-two sizes 2) The HugeTLB size >= the section size 3) The HugeTLB size >= the vmemmap leaf mapping size Then a HugeTLB will not share any leaf page table entries with *anything else*, but will share intermediate entries. In this case, at the time memory is removed, all HugeTLB pages either have been migrated away or dissolved. So there is also no race between memory hot remove and free_huge_page_vmemmap(). 2nd concern: ''' differently, not sure if ptdump would require any synchronization. Dumping an wrong value is probably okay but crashing because a page table entry is being freed after ptdump acquired the pointer is bad. On arm64, ptdump() is protected against hotremove via [get|put]_online_mems(). ''' Answer: The ptdump should be fine since vmemmap_remap_free() only exchanges PTEs or splits the PMD entry (which means allocating a PTE page table). Both operations do not free any page tables (PTE), so ptdump cannot run into a UAF on any page tables. The worst case is just dumping an wrong value. [1] https://lore.kernel.org/all/20210510030027.56044-1-songmuchun@bytedance.com/ [2] https://lore.kernel.org/all/20210518091826.36937-1-songmuchun@bytedance.com/ [songmuchun@bytedance.com: restructure the code comment inside flush_dcache_page()] Link: https://lkml.kernel.org/r/20220414072646.21910-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220331065640.5777-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Barry Song Tested-by: Barry Song Cc: Will Deacon Cc: David Hildenbrand Cc: Bodeddula Balasubramaniam Cc: Oscar Salvador Cc: Mike Kravetz Cc: David Rientjes Cc: Mark Rutland Cc: Catalin Marinas Cc: James Morse Cc: Xiongchun Duan Cc: Fam Zheng Signed-off-by: Andrew Morton Signed-off-by: Muchun Song --- arch/arm64/Kconfig | 1 + arch/arm64/mm/flush.c | 14 ++++++++++++++ 2 files changed, 15 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 7d6be6144b090..77defc140bcf3 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -73,6 +73,7 @@ config ARM64 select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36) + select ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP select ARCH_HAS_UBSAN_SANITIZE_ALL select ARM_AMBA select ARM_ARCH_TIMER diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 1cef12e2f7097..629dec90d2465 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -74,6 +74,20 @@ EXPORT_SYMBOL_GPL(__sync_icache_dcache); */ void flush_dcache_page(struct page *page) { + /* + * Only the head page's flags of HugeTLB can be cleared since the tail + * vmemmap pages associated with each HugeTLB page are mapped with + * read-only when CONFIG_HUGETLB_PAGE_FREE_VMEMMAP is enabled (more + * details can refer to vmemmap_remap_pte()). Although + * __sync_icache_dcache() only set PG_dcache_clean flag on the head + * page struct, there is more than one page struct with PG_dcache_clean + * associated with the HugeTLB page since the head vmemmap page frame + * is reused (more details can refer to the comments above + * page_fixed_fake_head()). + */ + if (hugetlb_free_vmemmap_enabled() && PageHuge(page)) + page = compound_head(page); + if (test_bit(PG_dcache_clean, &page->flags)) clear_bit(PG_dcache_clean, &page->flags); } From 1f10e2eff071a6a1b4053f857385ec2aaf8ee1ca Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Apr 2022 23:16:14 -0700 Subject: [PATCH 0394/5344] mm: hugetlb_vmemmap: cleanup hugetlb_vmemmap related functions commit 5981611d0a006472d367d7a8e6ead8afaecf17c7 upstream. Patch series "cleanup hugetlb_vmemmap". The word of "free" is not expressive enough to express the feature of optimizing vmemmap pages associated with each HugeTLB, rename this keywork to "optimize" is more clear. In this series, cheanup related codes to make it more clear and expressive. This is suggested by David. This patch (of 3): The word of "free" is not expressive enough to express the feature of optimizing vmemmap pages associated with each HugeTLB, rename this keywork to "optimize". And some function names are prefixed with "huge_page" instead of "hugetlb", it is easily to be confused with THP. In this patch, cheanup related functions to make code more clear and expressive. Link: https://lkml.kernel.org/r/20220404074652.68024-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220404074652.68024-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: David Hildenbrand Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Muchun Song --- include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 8 ++++---- mm/hugetlb_vmemmap.c | 42 ++++++++++++++++++++--------------------- mm/hugetlb_vmemmap.h | 20 ++++++++++---------- 4 files changed, 35 insertions(+), 37 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index c4c2746c808d0..071daed842be4 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -382,7 +382,7 @@ struct hstate { unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP - unsigned int nr_free_vmemmap_pages; + unsigned int optimize_vmemmap_pages; #endif #ifdef CONFIG_CGROUP_HUGETLB /* cgroup control files */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 386b151c24d61..30845de8db7fb 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1391,7 +1391,7 @@ static void hpage_free_vmemmap_workfn(struct work_struct *work) */ h = size_to_hstate(page_size(page)); - if (alloc_huge_page_vmemmap(h, page)) { + if (hugetlb_vmemmap_alloc(h, page)) { spin_lock(&hugetlb_lock); add_hugetlb_page(h, page); spin_unlock(&hugetlb_lock); @@ -1409,7 +1409,7 @@ static DECLARE_WORK(hpage_free_vmemmap_work, hpage_free_vmemmap_workfn); static inline void flush_free_hpage_work(struct hstate *h) { - if (free_vmemmap_pages_per_hpage(h)) + if (hugetlb_optimize_vmemmap_pages(h)) flush_work(&hpage_free_vmemmap_work); } @@ -1603,7 +1603,7 @@ void free_huge_page(struct page *page) static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { ClearHPageVmemmapOptimized(page); - free_huge_page_vmemmap(h, page); + hugetlb_vmemmap_free(h, page); INIT_LIST_HEAD(&page->lru); set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); spin_lock(&hugetlb_lock); @@ -1902,7 +1902,7 @@ int dissolve_free_huge_page(struct page *page) * Attempt to allocate vmemmmap here so that we can take * appropriate action on failure. */ - rc = alloc_huge_page_vmemmap(h, head); + rc = hugetlb_vmemmap_alloc(h, head); spin_lock(&hugetlb_lock); if (!rc) { /* diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 791626983c2e1..91b79b9d9e25d 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Free some vmemmap pages of HugeTLB + * Optimize vmemmap pages associated with HugeTLB * * Copyright (c) 2020, Bytedance. All rights reserved. * @@ -192,7 +192,7 @@ DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, hugetlb_free_vmemmap_enabled_key); EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled_key); -static int __init early_hugetlb_free_vmemmap_param(char *buf) +static int __init hugetlb_vmemmap_early_param(char *buf) { /* We cannot optimize if a "struct page" crosses page boundaries. */ if (!is_power_of_2(sizeof(struct page))) { @@ -212,29 +212,26 @@ static int __init early_hugetlb_free_vmemmap_param(char *buf) return 0; } -early_param("hugetlb_free_vmemmap", early_hugetlb_free_vmemmap_param); - -static inline unsigned long free_vmemmap_pages_size_per_hpage(struct hstate *h) -{ - return (unsigned long)free_vmemmap_pages_per_hpage(h) << PAGE_SHIFT; -} +early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_early_param); /* * Previously discarded vmemmap pages will be allocated and remapping * after this function returns zero. */ -int alloc_huge_page_vmemmap(struct hstate *h, struct page *head) +int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) { int ret; unsigned long vmemmap_addr = (unsigned long)head; - unsigned long vmemmap_end, vmemmap_reuse; + unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages; if (!HPageVmemmapOptimized(head)) return 0; - vmemmap_addr += RESERVE_VMEMMAP_SIZE; - vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h); - vmemmap_reuse = vmemmap_addr - PAGE_SIZE; + vmemmap_addr += RESERVE_VMEMMAP_SIZE; + vmemmap_pages = hugetlb_optimize_vmemmap_pages(h); + vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT); + vmemmap_reuse = vmemmap_addr - PAGE_SIZE; + /* * The pages which the vmemmap virtual address range [@vmemmap_addr, * @vmemmap_end) are mapped to are freed to the buddy allocator, and @@ -250,17 +247,18 @@ int alloc_huge_page_vmemmap(struct hstate *h, struct page *head) return ret; } -void free_huge_page_vmemmap(struct hstate *h, struct page *head) +void hugetlb_vmemmap_free(struct hstate *h, struct page *head) { unsigned long vmemmap_addr = (unsigned long)head; - unsigned long vmemmap_end, vmemmap_reuse; + unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages; - if (!free_vmemmap_pages_per_hpage(h)) + vmemmap_pages = hugetlb_optimize_vmemmap_pages(h); + if (!vmemmap_pages) return; - vmemmap_addr += RESERVE_VMEMMAP_SIZE; - vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h); - vmemmap_reuse = vmemmap_addr - PAGE_SIZE; + vmemmap_addr += RESERVE_VMEMMAP_SIZE; + vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT); + vmemmap_reuse = vmemmap_addr - PAGE_SIZE; /* * Remap the vmemmap virtual address range [@vmemmap_addr, @vmemmap_end) @@ -297,8 +295,8 @@ void __init hugetlb_vmemmap_init(struct hstate *h) * hugetlbpage.rst for more details. */ if (likely(vmemmap_pages > RESERVE_VMEMMAP_NR)) - h->nr_free_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR; + h->optimize_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR; - pr_info("can free %d vmemmap pages for %s\n", h->nr_free_vmemmap_pages, - h->name); + pr_info("can optimize %d vmemmap pages for %s\n", + h->optimize_vmemmap_pages, h->name); } diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index cb2bef8f9e736..9760537849b57 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Free some vmemmap pages of HugeTLB + * Optimize vmemmap pages associated with HugeTLB * * Copyright (c) 2020, Bytedance. All rights reserved. * @@ -11,25 +11,25 @@ #include #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP -int alloc_huge_page_vmemmap(struct hstate *h, struct page *head); -void free_huge_page_vmemmap(struct hstate *h, struct page *head); +int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head); +void hugetlb_vmemmap_free(struct hstate *h, struct page *head); void hugetlb_vmemmap_init(struct hstate *h); /* - * How many vmemmap pages associated with a HugeTLB page that can be freed - * to the buddy allocator. + * How many vmemmap pages associated with a HugeTLB page that can be + * optimized and freed to the buddy allocator. */ -static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h) +static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h) { - return h->nr_free_vmemmap_pages; + return h->optimize_vmemmap_pages; } #else -static inline int alloc_huge_page_vmemmap(struct hstate *h, struct page *head) +static inline int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) { return 0; } -static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head) +static inline void hugetlb_vmemmap_free(struct hstate *h, struct page *head) { } @@ -37,7 +37,7 @@ static inline void hugetlb_vmemmap_init(struct hstate *h) { } -static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h) +static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h) { return 0; } From 96ba6d94b37664e8b221fc49a40ce25600f65a51 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Apr 2022 23:16:15 -0700 Subject: [PATCH 0395/5344] mm: hugetlb_vmemmap: cleanup hugetlb_free_vmemmap_enabled* commit f10f1442c309ccef7a80ba3dc4abde0978e86fb4 upstream. The word of "free" is not expressive enough to express the feature of optimizing vmemmap pages associated with each HugeTLB, rename this keywork to "optimize". In this patch , cheanup the static key and hugetlb_free_vmemmap_enabled() to make code more expressive. Link: https://lkml.kernel.org/r/20220404074652.68024-3-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: David Hildenbrand Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Muchun Song --- arch/arm64/mm/flush.c | 2 +- include/linux/page-flags.h | 12 ++++++------ mm/hugetlb_vmemmap.c | 10 +++++----- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 629dec90d2465..6118a25d84ec4 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -85,7 +85,7 @@ void flush_dcache_page(struct page *page) * is reused (more details can refer to the comments above * page_fixed_fake_head()). */ - if (hugetlb_free_vmemmap_enabled() && PageHuge(page)) + if (hugetlb_optimize_vmemmap_enabled() && PageHuge(page)) page = compound_head(page); if (test_bit(PG_dcache_clean, &page->flags)) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 87bb899189021..e0bbc93b459dd 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -171,16 +171,16 @@ struct page; /* forward declaration */ #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, - hugetlb_free_vmemmap_enabled_key); + hugetlb_optimize_vmemmap_key); -static __always_inline bool hugetlb_free_vmemmap_enabled(void) +static __always_inline bool hugetlb_optimize_vmemmap_enabled(void) { return static_branch_maybe(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, - &hugetlb_free_vmemmap_enabled_key); + &hugetlb_optimize_vmemmap_key); } /* - * If the feature of freeing some vmemmap pages associated with each HugeTLB + * If the feature of optimizing vmemmap pages associated with each HugeTLB * page is enabled, the head vmemmap page frame is reused and all of the tail * vmemmap addresses map to the head vmemmap page frame (furture details can * refer to the figure at the head of the mm/hugetlb_vmemmap.c). In other @@ -197,7 +197,7 @@ static __always_inline bool hugetlb_free_vmemmap_enabled(void) */ static __always_inline struct page *page_fixed_fake_head(struct page *page) { - if (!hugetlb_free_vmemmap_enabled()) + if (!hugetlb_optimize_vmemmap_enabled()) return page; /* @@ -226,7 +226,7 @@ static inline struct page *page_fixed_fake_head(struct page *page) return page; } -static inline bool hugetlb_free_vmemmap_enabled(void) +static inline bool hugetlb_optimize_vmemmap_enabled(void) { return false; } diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 91b79b9d9e25d..f252949733981 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -189,8 +189,8 @@ #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, - hugetlb_free_vmemmap_enabled_key); -EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled_key); + hugetlb_optimize_vmemmap_key); +EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); static int __init hugetlb_vmemmap_early_param(char *buf) { @@ -204,9 +204,9 @@ static int __init hugetlb_vmemmap_early_param(char *buf) return -EINVAL; if (!strcmp(buf, "on")) - static_branch_enable(&hugetlb_free_vmemmap_enabled_key); + static_branch_enable(&hugetlb_optimize_vmemmap_key); else if (!strcmp(buf, "off")) - static_branch_disable(&hugetlb_free_vmemmap_enabled_key); + static_branch_disable(&hugetlb_optimize_vmemmap_key); else return -EINVAL; @@ -282,7 +282,7 @@ void __init hugetlb_vmemmap_init(struct hstate *h) BUILD_BUG_ON(__NR_USED_SUBPAGE >= RESERVE_VMEMMAP_SIZE / sizeof(struct page)); - if (!hugetlb_free_vmemmap_enabled()) + if (!hugetlb_optimize_vmemmap_enabled()) return; vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT; From f9464caff846301ea0f70038c3cd728671d09a6b Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Apr 2022 23:16:15 -0700 Subject: [PATCH 0396/5344] mm: hugetlb_vmemmap: cleanup CONFIG_HUGETLB_PAGE_FREE_VMEMMAP* commit 47010c040dec8af6347ec6259104fc13f7e7e30a upstream. The word of "free" is not expressive enough to express the feature of optimizing vmemmap pages associated with each HugeTLB, rename this keywork to "optimize". In this patch , cheanup configs to make code more expressive. Link: https://lkml.kernel.org/r/20220404074652.68024-4-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Mike Kravetz Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Muchun Song --- Documentation/admin-guide/kernel-parameters.txt | 4 ++-- arch/arm64/Kconfig | 2 +- arch/arm64/mm/flush.c | 2 +- arch/x86/Kconfig | 2 +- arch/x86/mm/init_64.c | 2 +- config.x86_64 | 4 ++-- fs/Kconfig | 16 ++++++++-------- include/linux/hugetlb.h | 2 +- include/linux/mm.h | 2 +- include/linux/page-flags.h | 6 +++--- mm/Makefile | 2 +- mm/hugetlb_vmemmap.c | 4 ++-- mm/hugetlb_vmemmap.h | 4 ++-- mm/sparse-vmemmap.c | 4 ++-- 14 files changed, 28 insertions(+), 28 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index ddbc95bf4917a..ad4742acb2e5c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1428,7 +1428,7 @@ (when the CPU supports the "pdpe1gb" cpuinfo flag). hugetlb_free_vmemmap= - [KNL] Reguires CONFIG_HUGETLB_PAGE_FREE_VMEMMAP + [KNL] Reguires CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP enabled. Allows heavy hugetlb users to free up some more memory (7 * PAGE_SIZE for each 2MB hugetlb page). @@ -1437,7 +1437,7 @@ on: enable the feature off: disable the feature - Built with CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON=y, + Built with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y, the default is on. hung_task_panic= diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 77defc140bcf3..a7ab8b52a7425 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -73,7 +73,7 @@ config ARM64 select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36) - select ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP + select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP select ARCH_HAS_UBSAN_SANITIZE_ALL select ARM_AMBA select ARM_ARCH_TIMER diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 6118a25d84ec4..d087a509a20ef 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -77,7 +77,7 @@ void flush_dcache_page(struct page *page) /* * Only the head page's flags of HugeTLB can be cleared since the tail * vmemmap pages associated with each HugeTLB page are mapped with - * read-only when CONFIG_HUGETLB_PAGE_FREE_VMEMMAP is enabled (more + * read-only when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is enabled (more * details can refer to vmemmap_remap_pte()). Although * __sync_icache_dcache() only set PG_dcache_clean flag on the head * page struct, there is more than one page struct with PG_dcache_clean diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0a230bd7fd12e..cd5a6c7ff30aa 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -96,7 +96,7 @@ config X86 select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANT_HUGE_PMD_SHARE - select ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP if X86_64 + select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP if X86_64 select ARCH_WANTS_THP_SWAP if X86_64 select BUILDTIME_EXTABLE_SORT select CLKEVT_I8253 diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index b7e8bf4a2ce96..02cde3026f846 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1223,7 +1223,7 @@ static struct kcore_list kcore_vsyscall; static void __init register_page_bootmem_info(void) { -#if defined(CONFIG_NUMA) || defined(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP) +#if defined(CONFIG_NUMA) || defined(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) int i; for_each_online_node(i) diff --git a/config.x86_64 b/config.x86_64 index cc8745c835ecc..e66f2bb094127 100644 --- a/config.x86_64 +++ b/config.x86_64 @@ -8277,8 +8277,8 @@ CONFIG_TMPFS_POSIX_ACL=y CONFIG_TMPFS_XATTR=y CONFIG_HUGETLBFS=y CONFIG_HUGETLB_PAGE=y -CONFIG_HUGETLB_PAGE_FREE_VMEMMAP=y -# CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON is not set +CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP=y +# CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON is not set CONFIG_MEMFD_CREATE=y CONFIG_ARCH_HAS_GIGANTIC_PAGE=y CONFIG_CONFIGFS_FS=m diff --git a/fs/Kconfig b/fs/Kconfig index 001501b3d5ad0..597de17a36250 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -218,22 +218,22 @@ config HUGETLB_PAGE # to enable the feature of minimizing overhead of struct page associated with # each HugeTLB page. # -config ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP +config ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP bool -config HUGETLB_PAGE_FREE_VMEMMAP +config HUGETLB_PAGE_OPTIMIZE_VMEMMAP def_bool HUGETLB_PAGE - depends on ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP + depends on ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP depends on SPARSEMEM_VMEMMAP -config HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON - bool "Default freeing vmemmap pages of HugeTLB to on" +config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON + bool "Default optimizing vmemmap pages of HugeTLB to on" default n - depends on HUGETLB_PAGE_FREE_VMEMMAP + depends on HUGETLB_PAGE_OPTIMIZE_VMEMMAP help - When using HUGETLB_PAGE_FREE_VMEMMAP, the freeing unused vmemmap + When using HUGETLB_PAGE_OPTIMIZE_VMEMMAP, the optimizing unused vmemmap pages associated with each HugeTLB page is default off. Say Y here - to enable freeing vmemmap pages of HugeTLB by default. It can then + to enable optimizing vmemmap pages of HugeTLB by default. It can then be disabled on the command line via hugetlb_free_vmemmap=off. config MEMFD_CREATE diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 071daed842be4..87f3d3293a9db 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -381,7 +381,7 @@ struct hstate { unsigned int nr_huge_pages_node[MAX_NUMNODES]; unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP unsigned int optimize_vmemmap_pages; #endif #ifdef CONFIG_CGROUP_HUGETLB diff --git a/include/linux/mm.h b/include/linux/mm.h index e4b8ce39eb7e9..0ab176c66d949 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2774,7 +2774,7 @@ static inline void print_vma_addr(char *prefix, unsigned long rip) } #endif -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP int vmemmap_remap_free(unsigned long start, unsigned long end, unsigned long reuse); int vmemmap_remap_alloc(unsigned long start, unsigned long end, diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e0bbc93b459dd..71c4a3ad754c0 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -169,13 +169,13 @@ enum pageflags { struct page; /* forward declaration */ -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP -DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP +DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, hugetlb_optimize_vmemmap_key); static __always_inline bool hugetlb_optimize_vmemmap_enabled(void) { - return static_branch_maybe(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, + return static_branch_maybe(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, &hugetlb_optimize_vmemmap_key); } diff --git a/mm/Makefile b/mm/Makefile index e4e7d4f68d5f5..7e4ac69191ead 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -61,7 +61,7 @@ obj-$(CONFIG_FRONTSWAP) += frontswap.o obj-$(CONFIG_ZSWAP) += zswap.o obj-$(CONFIG_HAS_DMA) += dmapool.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o -obj-$(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP) += hugetlb_vmemmap.o +obj-$(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) += hugetlb_vmemmap.o obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index f252949733981..2655434a946bd 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -188,7 +188,7 @@ #define RESERVE_VMEMMAP_NR 1U #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) -DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, +DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, hugetlb_optimize_vmemmap_key); EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); @@ -276,7 +276,7 @@ void __init hugetlb_vmemmap_init(struct hstate *h) /* * There are only (RESERVE_VMEMMAP_SIZE / sizeof(struct page)) struct - * page structs that can be used when CONFIG_HUGETLB_PAGE_FREE_VMEMMAP, + * page structs that can be used when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP, * so add a BUILD_BUG_ON to catch invalid usage of the tail struct page. */ BUILD_BUG_ON(__NR_USED_SUBPAGE >= diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index 9760537849b57..109b0a53b6fe9 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -10,7 +10,7 @@ #define _LINUX_HUGETLB_VMEMMAP_H #include -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head); void hugetlb_vmemmap_free(struct hstate *h, struct page *head); void hugetlb_vmemmap_init(struct hstate *h); @@ -41,5 +41,5 @@ static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h) { return 0; } -#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */ +#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */ #endif /* _LINUX_HUGETLB_VMEMMAP_H */ diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 3d80bcf5c403f..8de45bbcf334b 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -34,7 +34,7 @@ #include #include -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP /** * struct vmemmap_remap_walk - walk vmemmap page table * @@ -420,7 +420,7 @@ int vmemmap_remap_alloc(unsigned long start, unsigned long end, return 0; } -#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */ +#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */ /* * Allocate a block of memory to be used to back the virtual memory map From 7b7333900cbdb6747a8840f8bc19464e825f6957 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 13 May 2022 16:48:56 -0700 Subject: [PATCH 0397/5344] mm: hugetlb_vmemmap: disable hugetlb_optimize_vmemmap when struct page crosses page boundaries commit 0effdf461c5789be02d40c1868c70cc02ea24627 upstream. Patch series "add hugetlb_optimize_vmemmap sysctl", v11. This series aims to add hugetlb_optimize_vmemmap sysctl to enable or disable the feature of optimizing vmemmap pages associated with HugeTLB pages. This patch (of 4): If the size of "struct page" is not the power of two but with the feature of minimizing overhead of struct page associated with each HugeTLB is enabled, then the vmemmap pages of HugeTLB will be corrupted after remapping (panic is about to happen in theory). But this only exists when !CONFIG_MEMCG && !CONFIG_SLUB on x86_64. However, it is not a conventional configuration nowadays. So it is not a real word issue, just the result of a code review. But we cannot prevent anyone from configuring that combined configure. This hugetlb_optimize_vmemmap should be disable in this case to fix this issue. Link: https://lkml.kernel.org/r/20220512041142.39501-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220512041142.39501-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Mike Kravetz Cc: David Hildenbrand Cc: Iurii Zaikin Cc: Jonathan Corbet Cc: Kees Cook Cc: Luis Chamberlain Cc: Masahiro Yamada Cc: Oscar Salvador Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Muchun Song --- mm/hugetlb_vmemmap.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 2655434a946bd..4b8e59eeb971c 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -194,12 +194,6 @@ EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); static int __init hugetlb_vmemmap_early_param(char *buf) { - /* We cannot optimize if a "struct page" crosses page boundaries. */ - if (!is_power_of_2(sizeof(struct page))) { - pr_warn("cannot free vmemmap pages because \"struct page\" crosses page boundaries\n"); - return 0; - } - if (!buf) return -EINVAL; @@ -285,6 +279,12 @@ void __init hugetlb_vmemmap_init(struct hstate *h) if (!hugetlb_optimize_vmemmap_enabled()) return; + if (!is_power_of_2(sizeof(struct page))) { + pr_warn_once("cannot optimize vmemmap pages because \"struct page\" crosses page boundaries\n"); + static_branch_disable(&hugetlb_optimize_vmemmap_key); + return; + } + vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT; /* * The head page is not to be freed to buddy allocator, the other tail From 0e1e7cefb9b52b412c6ab48d0f62bca8ac04aaf7 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 13 May 2022 16:48:56 -0700 Subject: [PATCH 0398/5344] mm: hugetlb_vmemmap: use kstrtobool for hugetlb_vmemmap param parsing commit 9c54c522bb76cbef480722bd44059e2ba8304bd2 upstream. Use kstrtobool rather than open coding "on" and "off" parsing in mm/hugetlb_vmemmap.c, which is more powerful to handle all kinds of parameters like 'Yy1Nn0' or [oO][NnFf] for "on" and "off". Link: https://lkml.kernel.org/r/20220512041142.39501-4-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Mike Kravetz Cc: David Hildenbrand Cc: Iurii Zaikin Cc: Jonathan Corbet Cc: Kees Cook Cc: Luis Chamberlain Cc: Masahiro Yamada Cc: Oscar Salvador Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Muchun Song --- Documentation/admin-guide/kernel-parameters.txt | 6 +++--- mm/hugetlb_vmemmap.c | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index ad4742acb2e5c..c961795bff3c1 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1432,10 +1432,10 @@ enabled. Allows heavy hugetlb users to free up some more memory (7 * PAGE_SIZE for each 2MB hugetlb page). - Format: { on | off (default) } + Format: { [oO][Nn]/Y/y/1 | [oO][Ff]/N/n/0 (default) } - on: enable the feature - off: disable the feature + [oO][Nn]/Y/y/1: enable the feature + [oO][Ff]/N/n/0: disable the feature Built with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y, the default is on. diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 4b8e59eeb971c..112e74504905c 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -194,15 +194,15 @@ EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); static int __init hugetlb_vmemmap_early_param(char *buf) { - if (!buf) + bool enable; + + if (kstrtobool(buf, &enable)) return -EINVAL; - if (!strcmp(buf, "on")) + if (enable) static_branch_enable(&hugetlb_optimize_vmemmap_key); - else if (!strcmp(buf, "off")) - static_branch_disable(&hugetlb_optimize_vmemmap_key); else - return -EINVAL; + static_branch_disable(&hugetlb_optimize_vmemmap_key); return 0; } From 0143a85029f495118a4a439da37a011e9e3310ba Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:10:50 -0800 Subject: [PATCH 0399/5344] sysctl: add a new register_sysctl_init() interface commit 3ddd9a808cee7284931312f2f3e854c9617f44b2 upstream. Patch series "sysctl: first set of kernel/sysctl cleanups", v2. Finally had time to respin the series of the work we had started last year on cleaning up the kernel/sysct.c kitchen sink. People keeps stuffing their sysctls in that file and this creates a maintenance burden. So this effort is aimed at placing sysctls where they actually belong. I'm going to split patches up into series as there is quite a bit of work. This first set adds register_sysctl_init() for uses of registerting a sysctl on the init path, adds const where missing to a few places, generalizes common values so to be more easy to share, and starts the move of a few kernel/sysctl.c out where they belong. The majority of rework on v2 in this first patch set is 0-day fixes. Eric Biederman's feedback is later addressed in subsequent patch sets. I'll only post the first two patch sets for now. We can address the rest once the first two patch sets get completely reviewed / Acked. This patch (of 9): The kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. Today though folks heavily rely on tables on kernel/sysctl.c so they can easily just extend this table with their needed sysctls. In order to help users move their sysctls out we need to provide a helper which can be used during code initialization. We special-case the initialization use of register_sysctl() since it *is* safe to fail, given all that sysctls do is provide a dynamic interface to query or modify at runtime an existing variable. So the use case of register_sysctl() on init should *not* stop if the sysctls don't end up getting registered. It would be counter productive to stop boot if a simple sysctl registration failed. Provide a helper for init then, and document the recommended init levels to use for callers of this routine. We will later use this in subsequent patches to start slimming down kernel/sysctl.c tables and moving sysctl registration to the code which actually needs these sysctls. [mcgrof@kernel.org: major commit log and documentation rephrasing also moved to fs/proc/proc_sysctl.c ] Link: https://lkml.kernel.org/r/20211123202347.818157-1-mcgrof@kernel.org Link: https://lkml.kernel.org/r/20211123202347.818157-2-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Reviewed-by: Kees Cook Cc: Iurii Zaikin Cc: "Eric W. Biederman" Cc: Peter Zijlstra Cc: Greg Kroah-Hartman Cc: Paul Turner Cc: Andy Shevchenko Cc: Sebastian Reichel Cc: Tetsuo Handa Cc: Petr Mladek Cc: Sergey Senozhatsky Cc: Qing Wang Cc: Benjamin LaHaise Cc: Al Viro Cc: Jan Kara Cc: Amir Goldstein Cc: Stephen Kitt Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Muchun Song --- fs/proc/proc_sysctl.c | 33 +++++++++++++++++++++++++++++++++ include/linux/sysctl.h | 3 +++ 2 files changed, 36 insertions(+) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index d80989b6c3448..f4264dd4ea31b 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "internal.h" static const struct dentry_operations proc_sys_dentry_operations; @@ -1397,6 +1398,38 @@ struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *tab } EXPORT_SYMBOL(register_sysctl); +/** + * __register_sysctl_init() - register sysctl table to path + * @path: path name for sysctl base + * @table: This is the sysctl table that needs to be registered to the path + * @table_name: The name of sysctl table, only used for log printing when + * registration fails + * + * The sysctl interface is used by userspace to query or modify at runtime + * a predefined value set on a variable. These variables however have default + * values pre-set. Code which depends on these variables will always work even + * if register_sysctl() fails. If register_sysctl() fails you'd just loose the + * ability to query or modify the sysctls dynamically at run time. Chances of + * register_sysctl() failing on init are extremely low, and so for both reasons + * this function does not return any error as it is used by initialization code. + * + * Context: Can only be called after your respective sysctl base path has been + * registered. So for instance, most base directories are registered early on + * init before init levels are processed through proc_sys_init() and + * sysctl_init(). + */ +void __init __register_sysctl_init(const char *path, struct ctl_table *table, + const char *table_name) +{ + struct ctl_table_header *hdr = register_sysctl(path, table); + + if (unlikely(!hdr)) { + pr_err("failed when register_sysctl %s to %s\n", table_name, path); + return; + } + kmemleak_not_leak(hdr); +} + static char *append_path(const char *path, char *pos, const char *name) { int namelen; diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index e995a035a109f..7a102099702f1 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -210,6 +210,9 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, void unregister_sysctl_table(struct ctl_table_header * table); extern int sysctl_init(void); +extern void __register_sysctl_init(const char *path, struct ctl_table *table, + const char *table_name); +#define register_sysctl_init(path, table) __register_sysctl_init(path, table, #table) extern struct ctl_table sysctl_mount_point[]; From b5d9f74671db5adedcb0aa8da1a75d8614415011 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 13 May 2022 16:48:56 -0700 Subject: [PATCH 0400/5344] mm: hugetlb_vmemmap: add hugetlb_optimize_vmemmap sysctl commit 78f39084b41d287aedb2ea55f2c1895cfa11d61a upstream. We must add hugetlb_free_vmemmap=on (or "off") to the boot cmdline and reboot the server to enable or disable the feature of optimizing vmemmap pages associated with HugeTLB pages. However, rebooting usually takes a long time. So add a sysctl to enable or disable the feature at runtime without rebooting. Why we need this? There are 3 use cases. 1) The feature of minimizing overhead of struct page associated with each HugeTLB is disabled by default without passing "hugetlb_free_vmemmap=on" to the boot cmdline. When we (ByteDance) deliver the servers to the users who want to enable this feature, they have to configure the grub (change boot cmdline) and reboot the servers, whereas rebooting usually takes a long time (we have thousands of servers). It's a very bad experience for the users. So we need a approach to enable this feature after rebooting. This is a use case in our practical environment. 2) Some use cases are that HugeTLB pages are allocated 'on the fly' instead of being pulled from the HugeTLB pool, those workloads would be affected with this feature enabled. Those workloads could be identified by the characteristics of they never explicitly allocating huge pages with 'nr_hugepages' but only set 'nr_overcommit_hugepages' and then let the pages be allocated from the buddy allocator at fault time. We can confirm it is a real use case from the commit 099730d67417. For those workloads, the page fault time could be ~2x slower than before. We suspect those users want to disable this feature if the system has enabled this before and they don't think the memory savings benefit is enough to make up for the performance drop. 3) If the workload which wants vmemmap pages to be optimized and the workload which wants to set 'nr_overcommit_hugepages' and does not want the extera overhead at fault time when the overcommitted pages be allocated from the buddy allocator are deployed in the same server. The user could enable this feature and set 'nr_hugepages' and 'nr_overcommit_hugepages', then disable the feature. In this case, the overcommited HugeTLB pages will not encounter the extra overhead at fault time. Link: https://lkml.kernel.org/r/20220512041142.39501-5-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Mike Kravetz Cc: Jonathan Corbet Cc: Luis Chamberlain Cc: Kees Cook Cc: Iurii Zaikin Cc: Oscar Salvador Cc: David Hildenbrand Cc: Masahiro Yamada Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Muchun Song --- Documentation/admin-guide/sysctl/vm.rst | 39 +++++++++++ mm/hugetlb_vmemmap.c | 93 ++++++++++++++++++++++--- 2 files changed, 123 insertions(+), 9 deletions(-) diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 64aeee1009cab..47985671f6931 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -548,6 +548,45 @@ Change the minimum size of the hugepage pool. See Documentation/admin-guide/mm/hugetlbpage.rst +hugetlb_optimize_vmemmap +======================== + +This knob is not available when memory_hotplug.memmap_on_memory (kernel parameter) +is configured or the size of 'struct page' (a structure defined in +include/linux/mm_types.h) is not power of two (an unusual system config could +result in this). + +Enable (set to 1) or disable (set to 0) the feature of optimizing vmemmap pages +associated with each HugeTLB page. + +Once enabled, the vmemmap pages of subsequent allocation of HugeTLB pages from +buddy allocator will be optimized (7 pages per 2MB HugeTLB page and 4095 pages +per 1GB HugeTLB page), whereas already allocated HugeTLB pages will not be +optimized. When those optimized HugeTLB pages are freed from the HugeTLB pool +to the buddy allocator, the vmemmap pages representing that range needs to be +remapped again and the vmemmap pages discarded earlier need to be rellocated +again. If your use case is that HugeTLB pages are allocated 'on the fly' (e.g. +never explicitly allocating HugeTLB pages with 'nr_hugepages' but only set +'nr_overcommit_hugepages', those overcommitted HugeTLB pages are allocated 'on +the fly') instead of being pulled from the HugeTLB pool, you should weigh the +benefits of memory savings against the more overhead (~2x slower than before) +of allocation or freeing HugeTLB pages between the HugeTLB pool and the buddy +allocator. Another behavior to note is that if the system is under heavy memory +pressure, it could prevent the user from freeing HugeTLB pages from the HugeTLB +pool to the buddy allocator since the allocation of vmemmap pages could be +failed, you have to retry later if your system encounter this situation. + +Once disabled, the vmemmap pages of subsequent allocation of HugeTLB pages from +buddy allocator will not be optimized meaning the extra overhead at allocation +time from buddy allocator disappears, whereas already optimized HugeTLB pages +will not be affected. If you want to make sure there are no optimized HugeTLB +pages, you can set "nr_hugepages" to 0 first and then disable this. Note that +writing 0 to nr_hugepages will make any "in use" HugeTLB pages become surplus +pages. So, those surplus pages are still optimized until they are no longer +in use. You would need to wait for those surplus pages to be released before +there are no optimized pages in the system. + + nr_hugepages_mempolicy ====================== diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 112e74504905c..6a0d2d43359a7 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -176,6 +176,7 @@ */ #define pr_fmt(fmt) "HugeTLB: " fmt +#include #include "hugetlb_vmemmap.h" /* @@ -188,21 +189,40 @@ #define RESERVE_VMEMMAP_NR 1U #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) +enum vmemmap_optimize_mode { + VMEMMAP_OPTIMIZE_OFF, + VMEMMAP_OPTIMIZE_ON, +}; + DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, hugetlb_optimize_vmemmap_key); EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); +static enum vmemmap_optimize_mode vmemmap_optimize_mode = + IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON); + +static void vmemmap_optimize_mode_switch(enum vmemmap_optimize_mode to) +{ + if (vmemmap_optimize_mode == to) + return; + + if (to == VMEMMAP_OPTIMIZE_OFF) + static_branch_dec(&hugetlb_optimize_vmemmap_key); + else + static_branch_inc(&hugetlb_optimize_vmemmap_key); + WRITE_ONCE(vmemmap_optimize_mode, to); +} + static int __init hugetlb_vmemmap_early_param(char *buf) { bool enable; + enum vmemmap_optimize_mode mode; if (kstrtobool(buf, &enable)) return -EINVAL; - if (enable) - static_branch_enable(&hugetlb_optimize_vmemmap_key); - else - static_branch_disable(&hugetlb_optimize_vmemmap_key); + mode = enable ? VMEMMAP_OPTIMIZE_ON : VMEMMAP_OPTIMIZE_OFF; + vmemmap_optimize_mode_switch(mode); return 0; } @@ -235,8 +255,10 @@ int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) */ ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse, GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE); - if (!ret) + if (!ret) { ClearHPageVmemmapOptimized(head); + static_branch_dec(&hugetlb_optimize_vmemmap_key); + } return ret; } @@ -250,6 +272,11 @@ void hugetlb_vmemmap_free(struct hstate *h, struct page *head) if (!vmemmap_pages) return; + if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF) + return; + + static_branch_inc(&hugetlb_optimize_vmemmap_key); + vmemmap_addr += RESERVE_VMEMMAP_SIZE; vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT); vmemmap_reuse = vmemmap_addr - PAGE_SIZE; @@ -259,7 +286,9 @@ void hugetlb_vmemmap_free(struct hstate *h, struct page *head) * to the page which @vmemmap_reuse is mapped to, then free the pages * which the range [@vmemmap_addr, @vmemmap_end] is mapped to. */ - if (!vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse)) + if (vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse)) + static_branch_dec(&hugetlb_optimize_vmemmap_key); + else SetHPageVmemmapOptimized(head); } @@ -276,9 +305,6 @@ void __init hugetlb_vmemmap_init(struct hstate *h) BUILD_BUG_ON(__NR_USED_SUBPAGE >= RESERVE_VMEMMAP_SIZE / sizeof(struct page)); - if (!hugetlb_optimize_vmemmap_enabled()) - return; - if (!is_power_of_2(sizeof(struct page))) { pr_warn_once("cannot optimize vmemmap pages because \"struct page\" crosses page boundaries\n"); static_branch_disable(&hugetlb_optimize_vmemmap_key); @@ -300,3 +326,52 @@ void __init hugetlb_vmemmap_init(struct hstate *h) pr_info("can optimize %d vmemmap pages for %s\n", h->optimize_vmemmap_pages, h->name); } + +#ifdef CONFIG_PROC_SYSCTL +static int hugetlb_optimize_vmemmap_handler(struct ctl_table *table, int write, + void *buffer, size_t *length, + loff_t *ppos) +{ + int ret; + enum vmemmap_optimize_mode mode; + static DEFINE_MUTEX(sysctl_mutex); + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&sysctl_mutex); + mode = vmemmap_optimize_mode; + table->data = &mode; + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (write && !ret) + vmemmap_optimize_mode_switch(mode); + mutex_unlock(&sysctl_mutex); + + return ret; +} + +static struct ctl_table hugetlb_vmemmap_sysctls[] = { + { + .procname = "hugetlb_optimize_vmemmap", + .maxlen = sizeof(enum vmemmap_optimize_mode), + .mode = 0644, + .proc_handler = hugetlb_optimize_vmemmap_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { } +}; + +static __init int hugetlb_vmemmap_sysctls_init(void) +{ + /* + * If "struct page" crosses page boundaries, the vmemmap pages cannot + * be optimized. + */ + if (is_power_of_2(sizeof(struct page))) + register_sysctl_init("vm", hugetlb_vmemmap_sysctls); + + return 0; +} +late_initcall(hugetlb_vmemmap_sysctls_init); +#endif /* CONFIG_PROC_SYSCTL */ From 25564e865594dba3e606dc76a0ce376fb108f943 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 27 May 2022 16:19:48 +0800 Subject: [PATCH 0401/5344] mm: hugetlb_vmemmap: fix CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON commit 0111def915b280c64c05f73f01b59ca404255aa3 upstream. The following: commit 47010c040dec ("mm: hugetlb_vmemmap: cleanup CONFIG_HUGETLB_PAGE_FREE_VMEMMAP*") forgot to update CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON used in vmemmap_optimize_mode to CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON. The result is we cannot enable hugetlb_optimize_vmemmap at boot time when we configure CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON. Fix it. Link: https://lkml.kernel.org/r/20220527081948.68832-1-songmuchun@bytedance.com Fixes: 47010c040dec ("mm: hugetlb_vmemmap: cleanup CONFIG_HUGETLB_PAGE_FREE_VMEMMAP*") Signed-off-by: Muchun Song Reported-by: Vlastimil Babka Acked-by: Vlastimil Babka Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Muchun Song --- mm/hugetlb_vmemmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 6a0d2d43359a7..6c4ec509be7f6 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -199,7 +199,7 @@ DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); static enum vmemmap_optimize_mode vmemmap_optimize_mode = - IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON); + IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); static void vmemmap_optimize_mode_switch(enum vmemmap_optimize_mode to) { From ff33b8792bf46e29848799d7349bdd5a3617940e Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 20 Jun 2022 10:30:19 +0800 Subject: [PATCH 0402/5344] mm: sparsemem: fix missing higher order allocation splitting commit 39d35edee4537487e5178f258e23518272a66413 upstream. Higher order allocations for vmemmap pages from buddy allocator must be able to be treated as indepdenent small pages as they can be freed individually by the caller. There is no problem for higher order vmemmap pages allocated at boot time since each individual small page will be initialized at boot time. However, it will be an issue for memory hotplug case since those higher order vmemmap pages are allocated from buddy allocator without initializing each individual small page's refcount. The system will panic in put_page_testzero() when CONFIG_DEBUG_VM is enabled if the vmemmap page is freed. Link: https://lkml.kernel.org/r/20220620023019.94257-1-songmuchun@bytedance.com Fixes: d8d55f5616cf ("mm: sparsemem: use page table lock to protect kernel pmd operations") Signed-off-by: Muchun Song Reviewed-by: Mike Kravetz Cc: Xiongchun Duan Cc: Signed-off-by: Andrew Morton Signed-off-by: Muchun Song --- mm/sparse-vmemmap.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 8de45bbcf334b..52e88eddeed40 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -78,6 +78,14 @@ static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) spin_lock(&init_mm.page_table_lock); if (likely(pmd_leaf(*pmd))) { + /* + * Higher order allocations from buddy allocator must be able to + * be treated as indepdenent small pages (as they can be freed + * individually). + */ + if (!PageReserved(page)) + split_page(page, get_order(PMD_SIZE)); + /* Make pte visible before pmd. See comment in __pte_alloc(). */ smp_wmb(); pmd_populate_kernel(&init_mm, pmd, pgtable); From 7c29f4326b7498bbe6e6a71dc3627fe1a4150581 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Thu, 28 Apr 2022 23:16:15 -0700 Subject: [PATCH 0403/5344] mm/hugetlb_vmemmap: move comment block to Documentation/vm commit 60a427db0f80f16b9bb9efe6cc79c93f336e8466 upstream. In preparation for device-dax for using hugetlbfs compound page tail deduplication technique, move the comment block explanation into a common place in Documentation/vm. Link: https://lkml.kernel.org/r/20220420155310.9712-4-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Muchun Song Reviewed-by: Dan Williams Suggested-by: Dan Williams Cc: Muchun Song Cc: Mike Kravetz Cc: Christoph Hellwig Cc: Jane Chu Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Vishal Verma Signed-off-by: Andrew Morton Signed-off-by: Muchun Song --- Documentation/vm/index.rst | 1 + Documentation/vm/vmemmap_dedup.rst | 173 +++++++++++++++++++++++++++++ mm/hugetlb_vmemmap.c | 168 +--------------------------- 3 files changed, 175 insertions(+), 167 deletions(-) create mode 100644 Documentation/vm/vmemmap_dedup.rst diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst index 30813498c74d9..7d3692df39fba 100644 --- a/Documentation/vm/index.rst +++ b/Documentation/vm/index.rst @@ -50,5 +50,6 @@ descriptions of data structures and algorithms. split_page_table_lock transhuge unevictable-lru + vmemmap_dedup z3fold zsmalloc diff --git a/Documentation/vm/vmemmap_dedup.rst b/Documentation/vm/vmemmap_dedup.rst new file mode 100644 index 0000000000000..485ccf4f7b10b --- /dev/null +++ b/Documentation/vm/vmemmap_dedup.rst @@ -0,0 +1,173 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================================== +Free some vmemmap pages of HugeTLB +================================== + +The struct page structures (page structs) are used to describe a physical +page frame. By default, there is a one-to-one mapping from a page frame to +it's corresponding page struct. + +HugeTLB pages consist of multiple base page size pages and is supported by many +architectures. See Documentation/admin-guide/mm/hugetlbpage.rst for more +details. On the x86-64 architecture, HugeTLB pages of size 2MB and 1GB are +currently supported. Since the base page size on x86 is 4KB, a 2MB HugeTLB page +consists of 512 base pages and a 1GB HugeTLB page consists of 4096 base pages. +For each base page, there is a corresponding page struct. + +Within the HugeTLB subsystem, only the first 4 page structs are used to +contain unique information about a HugeTLB page. __NR_USED_SUBPAGE provides +this upper limit. The only 'useful' information in the remaining page structs +is the compound_head field, and this field is the same for all tail pages. + +By removing redundant page structs for HugeTLB pages, memory can be returned +to the buddy allocator for other uses. + +Different architectures support different HugeTLB pages. For example, the +following table is the HugeTLB page size supported by x86 and arm64 +architectures. Because arm64 supports 4k, 16k, and 64k base pages and +supports contiguous entries, so it supports many kinds of sizes of HugeTLB +page. + ++--------------+-----------+-----------------------------------------------+ +| Architecture | Page Size | HugeTLB Page Size | ++--------------+-----------+-----------+-----------+-----------+-----------+ +| x86-64 | 4KB | 2MB | 1GB | | | ++--------------+-----------+-----------+-----------+-----------+-----------+ +| | 4KB | 64KB | 2MB | 32MB | 1GB | +| +-----------+-----------+-----------+-----------+-----------+ +| arm64 | 16KB | 2MB | 32MB | 1GB | | +| +-----------+-----------+-----------+-----------+-----------+ +| | 64KB | 2MB | 512MB | 16GB | | ++--------------+-----------+-----------+-----------+-----------+-----------+ + +When the system boot up, every HugeTLB page has more than one struct page +structs which size is (unit: pages):: + + struct_size = HugeTLB_Size / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE + +Where HugeTLB_Size is the size of the HugeTLB page. We know that the size +of the HugeTLB page is always n times PAGE_SIZE. So we can get the following +relationship:: + + HugeTLB_Size = n * PAGE_SIZE + +Then:: + + struct_size = n * PAGE_SIZE / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE + = n * sizeof(struct page) / PAGE_SIZE + +We can use huge mapping at the pud/pmd level for the HugeTLB page. + +For the HugeTLB page of the pmd level mapping, then:: + + struct_size = n * sizeof(struct page) / PAGE_SIZE + = PAGE_SIZE / sizeof(pte_t) * sizeof(struct page) / PAGE_SIZE + = sizeof(struct page) / sizeof(pte_t) + = 64 / 8 + = 8 (pages) + +Where n is how many pte entries which one page can contains. So the value of +n is (PAGE_SIZE / sizeof(pte_t)). + +This optimization only supports 64-bit system, so the value of sizeof(pte_t) +is 8. And this optimization also applicable only when the size of struct page +is a power of two. In most cases, the size of struct page is 64 bytes (e.g. +x86-64 and arm64). So if we use pmd level mapping for a HugeTLB page, the +size of struct page structs of it is 8 page frames which size depends on the +size of the base page. + +For the HugeTLB page of the pud level mapping, then:: + + struct_size = PAGE_SIZE / sizeof(pmd_t) * struct_size(pmd) + = PAGE_SIZE / 8 * 8 (pages) + = PAGE_SIZE (pages) + +Where the struct_size(pmd) is the size of the struct page structs of a +HugeTLB page of the pmd level mapping. + +E.g.: A 2MB HugeTLB page on x86_64 consists in 8 page frames while 1GB +HugeTLB page consists in 4096. + +Next, we take the pmd level mapping of the HugeTLB page as an example to +show the internal implementation of this optimization. There are 8 pages +struct page structs associated with a HugeTLB page which is pmd mapped. + +Here is how things look before optimization:: + + HugeTLB struct pages(8 pages) page frame(8 pages) + +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ + | | | 0 | -------------> | 0 | + | | +-----------+ +-----------+ + | | | 1 | -------------> | 1 | + | | +-----------+ +-----------+ + | | | 2 | -------------> | 2 | + | | +-----------+ +-----------+ + | | | 3 | -------------> | 3 | + | | +-----------+ +-----------+ + | | | 4 | -------------> | 4 | + | PMD | +-----------+ +-----------+ + | level | | 5 | -------------> | 5 | + | mapping | +-----------+ +-----------+ + | | | 6 | -------------> | 6 | + | | +-----------+ +-----------+ + | | | 7 | -------------> | 7 | + | | +-----------+ +-----------+ + | | + | | + | | + +-----------+ + +The value of page->compound_head is the same for all tail pages. The first +page of page structs (page 0) associated with the HugeTLB page contains the 4 +page structs necessary to describe the HugeTLB. The only use of the remaining +pages of page structs (page 1 to page 7) is to point to page->compound_head. +Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of page structs +will be used for each HugeTLB page. This will allow us to free the remaining +7 pages to the buddy allocator. + +Here is how things look after remapping:: + + HugeTLB struct pages(8 pages) page frame(8 pages) + +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ + | | | 0 | -------------> | 0 | + | | +-----------+ +-----------+ + | | | 1 | ---------------^ ^ ^ ^ ^ ^ ^ + | | +-----------+ | | | | | | + | | | 2 | -----------------+ | | | | | + | | +-----------+ | | | | | + | | | 3 | -------------------+ | | | | + | | +-----------+ | | | | + | | | 4 | ---------------------+ | | | + | PMD | +-----------+ | | | + | level | | 5 | -----------------------+ | | + | mapping | +-----------+ | | + | | | 6 | -------------------------+ | + | | +-----------+ | + | | | 7 | ---------------------------+ + | | +-----------+ + | | + | | + | | + +-----------+ + +When a HugeTLB is freed to the buddy system, we should allocate 7 pages for +vmemmap pages and restore the previous mapping relationship. + +For the HugeTLB page of the pud level mapping. It is similar to the former. +We also can use this approach to free (PAGE_SIZE - 1) vmemmap pages. + +Apart from the HugeTLB page of the pmd/pud level mapping, some architectures +(e.g. aarch64) provides a contiguous bit in the translation table entries +that hints to the MMU to indicate that it is one of a contiguous set of +entries that can be cached in a single TLB entry. + +The contiguous bit is used to increase the mapping size at the pmd and pte +(last) level. So this type of HugeTLB page can be optimized only when its +size of the struct page structs is greater than 1 page. + +Notice: The head vmemmap page is not freed to the buddy allocator and all +tail vmemmap pages are mapped to the head vmemmap page frame. So we can see +more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB page) +associated with each HugeTLB page. The compound_head() can handle this +correctly (more details refer to the comment above compound_head()). diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 6c4ec509be7f6..d6cd5236ad61f 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -6,173 +6,7 @@ * * Author: Muchun Song * - * The struct page structures (page structs) are used to describe a physical - * page frame. By default, there is a one-to-one mapping from a page frame to - * it's corresponding page struct. - * - * HugeTLB pages consist of multiple base page size pages and is supported by - * many architectures. See hugetlbpage.rst in the Documentation directory for - * more details. On the x86-64 architecture, HugeTLB pages of size 2MB and 1GB - * are currently supported. Since the base page size on x86 is 4KB, a 2MB - * HugeTLB page consists of 512 base pages and a 1GB HugeTLB page consists of - * 4096 base pages. For each base page, there is a corresponding page struct. - * - * Within the HugeTLB subsystem, only the first 4 page structs are used to - * contain unique information about a HugeTLB page. __NR_USED_SUBPAGE provides - * this upper limit. The only 'useful' information in the remaining page structs - * is the compound_head field, and this field is the same for all tail pages. - * - * By removing redundant page structs for HugeTLB pages, memory can be returned - * to the buddy allocator for other uses. - * - * Different architectures support different HugeTLB pages. For example, the - * following table is the HugeTLB page size supported by x86 and arm64 - * architectures. Because arm64 supports 4k, 16k, and 64k base pages and - * supports contiguous entries, so it supports many kinds of sizes of HugeTLB - * page. - * - * +--------------+-----------+-----------------------------------------------+ - * | Architecture | Page Size | HugeTLB Page Size | - * +--------------+-----------+-----------+-----------+-----------+-----------+ - * | x86-64 | 4KB | 2MB | 1GB | | | - * +--------------+-----------+-----------+-----------+-----------+-----------+ - * | | 4KB | 64KB | 2MB | 32MB | 1GB | - * | +-----------+-----------+-----------+-----------+-----------+ - * | arm64 | 16KB | 2MB | 32MB | 1GB | | - * | +-----------+-----------+-----------+-----------+-----------+ - * | | 64KB | 2MB | 512MB | 16GB | | - * +--------------+-----------+-----------+-----------+-----------+-----------+ - * - * When the system boot up, every HugeTLB page has more than one struct page - * structs which size is (unit: pages): - * - * struct_size = HugeTLB_Size / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE - * - * Where HugeTLB_Size is the size of the HugeTLB page. We know that the size - * of the HugeTLB page is always n times PAGE_SIZE. So we can get the following - * relationship. - * - * HugeTLB_Size = n * PAGE_SIZE - * - * Then, - * - * struct_size = n * PAGE_SIZE / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE - * = n * sizeof(struct page) / PAGE_SIZE - * - * We can use huge mapping at the pud/pmd level for the HugeTLB page. - * - * For the HugeTLB page of the pmd level mapping, then - * - * struct_size = n * sizeof(struct page) / PAGE_SIZE - * = PAGE_SIZE / sizeof(pte_t) * sizeof(struct page) / PAGE_SIZE - * = sizeof(struct page) / sizeof(pte_t) - * = 64 / 8 - * = 8 (pages) - * - * Where n is how many pte entries which one page can contains. So the value of - * n is (PAGE_SIZE / sizeof(pte_t)). - * - * This optimization only supports 64-bit system, so the value of sizeof(pte_t) - * is 8. And this optimization also applicable only when the size of struct page - * is a power of two. In most cases, the size of struct page is 64 bytes (e.g. - * x86-64 and arm64). So if we use pmd level mapping for a HugeTLB page, the - * size of struct page structs of it is 8 page frames which size depends on the - * size of the base page. - * - * For the HugeTLB page of the pud level mapping, then - * - * struct_size = PAGE_SIZE / sizeof(pmd_t) * struct_size(pmd) - * = PAGE_SIZE / 8 * 8 (pages) - * = PAGE_SIZE (pages) - * - * Where the struct_size(pmd) is the size of the struct page structs of a - * HugeTLB page of the pmd level mapping. - * - * E.g.: A 2MB HugeTLB page on x86_64 consists in 8 page frames while 1GB - * HugeTLB page consists in 4096. - * - * Next, we take the pmd level mapping of the HugeTLB page as an example to - * show the internal implementation of this optimization. There are 8 pages - * struct page structs associated with a HugeTLB page which is pmd mapped. - * - * Here is how things look before optimization. - * - * HugeTLB struct pages(8 pages) page frame(8 pages) - * +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ - * | | | 0 | -------------> | 0 | - * | | +-----------+ +-----------+ - * | | | 1 | -------------> | 1 | - * | | +-----------+ +-----------+ - * | | | 2 | -------------> | 2 | - * | | +-----------+ +-----------+ - * | | | 3 | -------------> | 3 | - * | | +-----------+ +-----------+ - * | | | 4 | -------------> | 4 | - * | PMD | +-----------+ +-----------+ - * | level | | 5 | -------------> | 5 | - * | mapping | +-----------+ +-----------+ - * | | | 6 | -------------> | 6 | - * | | +-----------+ +-----------+ - * | | | 7 | -------------> | 7 | - * | | +-----------+ +-----------+ - * | | - * | | - * | | - * +-----------+ - * - * The value of page->compound_head is the same for all tail pages. The first - * page of page structs (page 0) associated with the HugeTLB page contains the 4 - * page structs necessary to describe the HugeTLB. The only use of the remaining - * pages of page structs (page 1 to page 7) is to point to page->compound_head. - * Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of page structs - * will be used for each HugeTLB page. This will allow us to free the remaining - * 7 pages to the buddy allocator. - * - * Here is how things look after remapping. - * - * HugeTLB struct pages(8 pages) page frame(8 pages) - * +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ - * | | | 0 | -------------> | 0 | - * | | +-----------+ +-----------+ - * | | | 1 | ---------------^ ^ ^ ^ ^ ^ ^ - * | | +-----------+ | | | | | | - * | | | 2 | -----------------+ | | | | | - * | | +-----------+ | | | | | - * | | | 3 | -------------------+ | | | | - * | | +-----------+ | | | | - * | | | 4 | ---------------------+ | | | - * | PMD | +-----------+ | | | - * | level | | 5 | -----------------------+ | | - * | mapping | +-----------+ | | - * | | | 6 | -------------------------+ | - * | | +-----------+ | - * | | | 7 | ---------------------------+ - * | | +-----------+ - * | | - * | | - * | | - * +-----------+ - * - * When a HugeTLB is freed to the buddy system, we should allocate 7 pages for - * vmemmap pages and restore the previous mapping relationship. - * - * For the HugeTLB page of the pud level mapping. It is similar to the former. - * We also can use this approach to free (PAGE_SIZE - 1) vmemmap pages. - * - * Apart from the HugeTLB page of the pmd/pud level mapping, some architectures - * (e.g. aarch64) provides a contiguous bit in the translation table entries - * that hints to the MMU to indicate that it is one of a contiguous set of - * entries that can be cached in a single TLB entry. - * - * The contiguous bit is used to increase the mapping size at the pmd and pte - * (last) level. So this type of HugeTLB page can be optimized only when its - * size of the struct page structs is greater than 1 page. - * - * Notice: The head vmemmap page is not freed to the buddy allocator and all - * tail vmemmap pages are mapped to the head vmemmap page frame. So we can see - * more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB page) - * associated with each HugeTLB page. The compound_head() can handle this - * correctly (more details refer to the comment above compound_head()). + * See Documentation/vm/vmemmap_dedup.rst */ #define pr_fmt(fmt) "HugeTLB: " fmt From 496720092199653b67fad69f5e2216560a972a74 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 8 Jun 2022 20:22:05 +0200 Subject: [PATCH 0404/5344] mm/slub: add missing TID updates on slab deactivation commit eeaa345e128515135ccb864c04482180c08e3259 upstream. The fastpath in slab_alloc_node() assumes that c->slab is stable as long as the TID stays the same. However, two places in __slab_alloc() currently don't update the TID when deactivating the CPU slab. If multiple operations race the right way, this could lead to an object getting lost; or, in an even more unlikely situation, it could even lead to an object being freed onto the wrong slab's freelist, messing up the `inuse` counter and eventually causing a page to be freed to the page allocator while it still contains slab objects. (I haven't actually tested these cases though, this is just based on looking at the code. Writing testcases for this stuff seems like it'd be a pain...) The race leading to state inconsistency is (all operations on the same CPU and kmem_cache): - task A: begin do_slab_free(): - read TID - read pcpu freelist (==NULL) - check `slab == c->slab` (true) - [PREEMPT A->B] - task B: begin slab_alloc_node(): - fastpath fails (`c->freelist` is NULL) - enter __slab_alloc() - slub_get_cpu_ptr() (disables preemption) - enter ___slab_alloc() - take local_lock_irqsave() - read c->freelist as NULL - get_freelist() returns NULL - write `c->slab = NULL` - drop local_unlock_irqrestore() - goto new_slab - slub_percpu_partial() is NULL - get_partial() returns NULL - slub_put_cpu_ptr() (enables preemption) - [PREEMPT B->A] - task A: finish do_slab_free(): - this_cpu_cmpxchg_double() succeeds() - [CORRUPT STATE: c->slab==NULL, c->freelist!=NULL] From there, the object on c->freelist will get lost if task B is allowed to continue from here: It will proceed to the retry_load_slab label, set c->slab, then jump to load_freelist, which clobbers c->freelist. But if we instead continue as follows, we get worse corruption: - task A: run __slab_free() on object from other struct slab: - CPU_PARTIAL_FREE case (slab was on no list, is now on pcpu partial) - task A: run slab_alloc_node() with NUMA node constraint: - fastpath fails (c->slab is NULL) - call __slab_alloc() - slub_get_cpu_ptr() (disables preemption) - enter ___slab_alloc() - c->slab is NULL: goto new_slab - slub_percpu_partial() is non-NULL - set c->slab to slub_percpu_partial(c) - [CORRUPT STATE: c->slab points to slab-1, c->freelist has objects from slab-2] - goto redo - node_match() fails - goto deactivate_slab - existing c->freelist is passed into deactivate_slab() - inuse count of slab-1 is decremented to account for object from slab-2 At this point, the inuse count of slab-1 is 1 lower than it should be. This means that if we free all allocated objects in slab-1 except for one, SLUB will think that slab-1 is completely unused, and may free its page, leading to use-after-free. Fixes: c17dda40a6a4e ("slub: Separate out kmem_cache_cpu processing from deactivate_slab") Fixes: 03e404af26dc2 ("slub: fast release on full slab") Cc: stable@vger.kernel.org Signed-off-by: Jann Horn Acked-by: Christoph Lameter Acked-by: David Rientjes Reviewed-by: Muchun Song Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka Link: https://lore.kernel.org/r/20220608182205.2945720-1-jannh@google.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Qi Zheng --- mm/slub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 73d3f79f04848..bd40445941d18 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2201,6 +2201,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, c->page = NULL; c->freelist = NULL; + c->tid = next_tid(c->tid); } /* @@ -2334,8 +2335,6 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { stat(s, CPUSLAB_FLUSH); deactivate_slab(s, c->page, c->freelist, c); - - c->tid = next_tid(c->tid); } /* @@ -2619,6 +2618,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, if (!freelist) { c->page = NULL; + c->tid = next_tid(c->tid); stat(s, DEACTIVATE_BYPASS); goto new_slab; } From f1d69f0362932b277380ba3237fd236123dc966a Mon Sep 17 00:00:00 2001 From: "huangjie.albert" Date: Wed, 29 Jun 2022 12:10:54 +0800 Subject: [PATCH 0405/5344] bytedance: net: rps: supoort a single flow to use rps In some scenarios(ipsec) or products(VPN gateway), we need to enable a single network stream to support RPS. but In the current rps implementation, there is no support for a single stream to utilize rps. Introduce a hashtable to record flows that need to use rps. For these flows, use round robin to put them on different CPUs for further processing how to use: echo xxx > /sys/class/net/xxx/queues/rx-x/rps_cpus echo 1 > /sys/class/net/xxx/queues/rx-x/rps_single_flow and create a flow node with the function: rps_flow_node_add the flow aging time can be get/set by the interface below: (default:5000 ms) /sys/class/net/xxx/queues/rx-x/rps_flow_aging_time This patch can improve the performance of a single stream, for example: On my virtual machine (4 vcpu + 8g), a single ipsec stream (3des encryption) can improve throughput by 3-4 times: before after 212 Mbits/sec 698 Mbits/sec Signed-off-by: huangjie.albert --- include/linux/netdevice.h | 42 +++++ net/core/dev.c | 314 ++++++++++++++++++++++++++++++++++++++ net/core/net-sysfs.c | 67 ++++++++ 3 files changed, 423 insertions(+) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9b97b4b5cb144..909e96f7a6ef0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -739,6 +739,8 @@ struct netdev_rx_queue { #ifdef CONFIG_RPS struct rps_map __rcu *rps_map; struct rps_dev_flow_table __rcu *rps_flow_table; + unsigned long rps_single_flow_flags; +#define RPS_SINGLE_FLOW_ENABLE (0) #endif struct kobject kobj; struct net_device *dev; @@ -757,6 +759,46 @@ struct rx_queue_attribute { ssize_t (*store)(struct netdev_rx_queue *queue, const char *buf, size_t len); }; +#ifdef CONFIG_RPS +struct rps_flow_info_node { + union { + struct { + __be32 saddr; + __be32 daddr; + __u8 protocol; + } in4_u; + + struct { + struct in6_addr saddr; + struct in6_addr daddr; + __u8 flow_lbl[3]; + } in6_u; + + }; +#define v4_saddr in4_u.saddr +#define v4_daddr in4_u.daddr +#define v4_protocol in4_u.protocol +#define v6_saddr in6_u.saddr.s6_addr32 +#define v6_daddr in6_u.daddr.s6_addr32 +#define v6_flow_lbl in6_u.flow_lbl + u64 jiffies; /* keep the time update entry */ + struct hlist_node node; +}; +DECLARE_STATIC_KEY_FALSE(rps_flow_queue_enable_key); +void rps_single_flow_enable_set(bool enable); +unsigned long rps_flow_node_aging_time_get(void); +void rps_flow_node_aging_time_set(unsigned long aging_time); +bool __rps_flow_node_add(const void *iph, bool ipv6); + +static inline void rps_flow_node_add(const void *iph, bool ipv6) +{ + /* no rps_single_flow_enable */ + if (static_branch_unlikely(&rps_flow_queue_enable_key)) + __rps_flow_node_add(iph, ipv6); +} +#else +static inline void rps_flow_node_add(const void *iph, bool ipv6) { } +#endif #ifdef CONFIG_XPS /* diff --git a/net/core/dev.c b/net/core/dev.c index 5cc958716862b..52ac625c82265 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3943,6 +3943,256 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, * CPU from the RPS map of the receiving queue for a given skb. * rcu_read_lock must be held on entry. */ +#define DEFAULT_HASH_BUCKET_BIT (8) +#define DEFAULT_QUOTA_PER_CPU (128) +static unsigned int quota_per_cpu = DEFAULT_QUOTA_PER_CPU; +DEFINE_STATIC_KEY_FALSE(rps_flow_queue_enable_key); +static atomic_t last_aging_jiffies; +/* default 5000 ms */ +static unsigned long max_aging_time = 5000; +static DEFINE_HASHTABLE(rps_flow_info_htable, DEFAULT_HASH_BUCKET_BIT); +static DEFINE_RWLOCK(rps_flow_rwlock); + +unsigned long rps_flow_node_aging_time_get(void) +{ + return max_aging_time; +} + +void rps_flow_node_aging_time_set(unsigned long aging_time) +{ + max_aging_time = aging_time; +} + +static inline bool ipv4_flow_match(const struct iphdr *iph, struct rps_flow_info_node *flow) +{ + return iph->saddr == flow->v4_saddr && iph->daddr == flow->v4_daddr && + iph->protocol == flow->v4_protocol; +} + +static inline u32 ipv4_hash_key(const struct iphdr *iph) +{ + return iph->saddr ^ iph->daddr ^ iph->protocol; +} + +static inline bool ipv6_flow_match(const struct ipv6hdr *ip6h, struct rps_flow_info_node *p_flow) +{ + /* flow_lbl maximum probability is different, so compare it first */ + if (memcmp(ip6h->flow_lbl, p_flow->v6_flow_lbl, sizeof(p_flow->v6_flow_lbl))) + return false; + + if (memcmp(ip6h->daddr.s6_addr32, p_flow->v6_daddr, sizeof(p_flow->v6_daddr))) + return false; + + if (memcmp(ip6h->saddr.s6_addr32, p_flow->v6_saddr, sizeof(p_flow->v6_saddr))) + return false; + + return true; +} + +static inline u32 ipv6_hash_key(const struct ipv6hdr *ip6h) +{ + /* saddr hash daddr hash and lbl hash */ + u32 hash = 0; + u8 count; + + for (count = 0; count < sizeof(ip6h->saddr.s6_addr32) / sizeof(u32); count++) + hash ^= ip6h->saddr.s6_addr32[count]; + + for (count = 0; count < sizeof(ip6h->daddr.s6_addr32) / sizeof(u32); count++) + hash ^= ip6h->daddr.s6_addr32[count]; + + for (count = 0; count < sizeof(ip6h->flow_lbl); count++) + hash ^= ip6h->flow_lbl[count]; + + return hash; +} + +static bool rps_flow_node_create_v6(const struct ipv6hdr *ip6h) +{ + struct rps_flow_info_node *p_flow; + u32 hash_key; + + hash_key = ipv6_hash_key(ip6h); + + /* find if the node already exist */ + read_lock(&rps_flow_rwlock); + hash_for_each_possible(rps_flow_info_htable, p_flow, node, hash_key) { + if (ipv6_flow_match(ip6h, p_flow)) { + p_flow->jiffies = jiffies; + read_unlock(&rps_flow_rwlock); + return true; + } + } + read_unlock(&rps_flow_rwlock); + + /* if not exist. create a new one */ + p_flow = kzalloc(sizeof(*p_flow), GFP_KERNEL); + if (!p_flow) + return false; + + memcpy(p_flow->v6_saddr, ip6h->saddr.s6_addr, sizeof(ip6h->saddr.s6_addr)); + memcpy(p_flow->v6_daddr, ip6h->daddr.s6_addr, sizeof(ip6h->daddr.s6_addr)); + memcpy(p_flow->v6_flow_lbl, ip6h->flow_lbl, sizeof(ip6h->flow_lbl)); + + p_flow->jiffies = jiffies; + + /* add the hash node */ + write_lock(&rps_flow_rwlock); + hash_add(rps_flow_info_htable, &p_flow->node, hash_key); + write_unlock(&rps_flow_rwlock); + return true; +} + +static bool rps_flow_node_create_v4(const struct iphdr *iph) +{ + struct rps_flow_info_node *p_flow; + u32 hash_key; + + hash_key = ipv4_hash_key(iph); + + /* find if the node already exist */ + read_lock(&rps_flow_rwlock); + hash_for_each_possible(rps_flow_info_htable, p_flow, node, hash_key) { + if (ipv4_flow_match(iph, p_flow)) { + p_flow->jiffies = jiffies; + read_unlock(&rps_flow_rwlock); + return true; + } + } + read_unlock(&rps_flow_rwlock); + + /* if not exist. create a new one */ + p_flow = kzalloc(sizeof(*p_flow), GFP_KERNEL); + if (!p_flow) + return false; + + p_flow->v4_saddr = iph->saddr; + p_flow->v4_daddr = iph->daddr; + p_flow->v4_protocol = iph->protocol; + p_flow->jiffies = jiffies; + + /* add the hash node */ + write_lock(&rps_flow_rwlock); + hash_add(rps_flow_info_htable, &p_flow->node, hash_key); + write_unlock(&rps_flow_rwlock); + return true; +} + +/* create rps flow node if not exist. + * update Timestamp for rps flow node if exist + */ +bool __rps_flow_node_add(const void *iph, bool ipv6) +{ + struct ipv6hdr *ip6h; + struct iphdr *ip4h; + + if (!ipv6) { + ip4h = (struct iphdr *)iph; + return rps_flow_node_create_v4(ip4h); + } + + ip6h = (struct ipv6hdr *)iph; + return rps_flow_node_create_v6(ip6h); +} + +static void rps_flow_node_clear(void) +{ + struct rps_flow_info_node *p_rps_flow_entry; + struct hlist_node *tmp; + u32 bucket; + + write_lock(&rps_flow_rwlock); + hash_for_each_safe(rps_flow_info_htable, bucket, tmp, p_rps_flow_entry, node) { + hash_del(&p_rps_flow_entry->node); + kfree(p_rps_flow_entry); + } + write_unlock(&rps_flow_rwlock); +} + +void rps_single_flow_enable_set(bool enable) +{ + if (enable) { + static_branch_inc(&rps_flow_queue_enable_key); + } else { + static_branch_dec(&rps_flow_queue_enable_key); + if (static_key_false(&rps_flow_queue_enable_key.key)) + rps_flow_node_clear(); + } +} + +/* compute hash */ +static inline u32 rps_flow_hash_update(void) +{ + static u32 packet_count; + static u32 hash_count; + + packet_count++; + if (packet_count % quota_per_cpu) { + packet_count = 0; + hash_count++; + } + return hash_count; +} + +/* delete aging rps_flow */ +static inline bool rps_flow_node_aging_period(void) +{ + struct rps_flow_info_node *p_rps_flow_entry; + struct hlist_node *tmp; + u32 bucket; + + if (jiffies_to_msecs(jiffies - atomic_read(&last_aging_jiffies)) < max_aging_time) + return false; + + atomic_set(&last_aging_jiffies, jiffies); + write_lock(&rps_flow_rwlock); + hash_for_each_safe(rps_flow_info_htable, bucket, tmp, p_rps_flow_entry, node) { + if (jiffies_to_msecs(jiffies - p_rps_flow_entry->jiffies) >= max_aging_time) { + hash_del(&p_rps_flow_entry->node); + kfree(p_rps_flow_entry); + } + } + write_unlock(&rps_flow_rwlock); + return true; +} + +/* find active rps_flow */ +static inline +struct rps_flow_info_node *rps_flow_find_active_node_v4(const struct iphdr *iph) +{ + struct rps_flow_info_node *p_rps_flow; + u32 hash_key = ipv4_hash_key(iph); + + read_lock(&rps_flow_rwlock); + hash_for_each_possible(rps_flow_info_htable, p_rps_flow, node, hash_key) { + if (ipv4_flow_match(iph, p_rps_flow) && + (jiffies_to_msecs(jiffies - p_rps_flow->jiffies) < max_aging_time)) { + read_unlock(&rps_flow_rwlock); + return p_rps_flow; + } + } + read_unlock(&rps_flow_rwlock); + return NULL; +} + +static inline +struct rps_flow_info_node *rps_flow_find_active_node_v6(const struct ipv6hdr *ip6h) +{ + struct rps_flow_info_node *p_rps_flow; + u32 hash_key = ipv6_hash_key(ip6h); + + read_lock(&rps_flow_rwlock); + hash_for_each_possible(rps_flow_info_htable, p_rps_flow, node, hash_key) { + if (ipv6_flow_match(ip6h, p_rps_flow) && + (jiffies_to_msecs(jiffies - p_rps_flow->jiffies) < max_aging_time)) { + read_unlock(&rps_flow_rwlock); + return p_rps_flow; + } + } + read_unlock(&rps_flow_rwlock); + return NULL; +} + static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow **rflowp) { @@ -3975,6 +4225,70 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, goto done; skb_reset_network_header(skb); + /* this is to set rps for single flow */ + if (unlikely(test_bit(RPS_SINGLE_FLOW_ENABLE, &rxqueue->rps_single_flow_flags))) { + u32 hash_single_flow; + + /* clean the old node */ + rps_flow_node_aging_period(); + + /* no rps ,skip it */ + if (!map) + goto origin_rps; + + /* skip vlan first */ + if (skb_vlan_tag_present(skb)) + goto origin_rps; + + switch (skb->protocol) { + /* ipv4 */ + case htons(ETH_P_IP): { + const struct iphdr *iph; + struct rps_flow_info_node *p_flow; + + iph = (struct iphdr *)skb_network_header(skb); + /* hash map to match the src and dest ipaddr */ + p_flow = rps_flow_find_active_node_v4(iph); + /* check if vailed */ + if (p_flow) { + pr_debug("v4 rps, flow info: saddr = %pI4, daddr =%pI4, proto=%d\n", + &p_flow->v4_saddr, + &p_flow->v4_daddr, + p_flow->v4_protocol); + } else { + goto origin_rps; + } + break; + } + case htons(ETH_P_IPV6): { + const struct ipv6hdr *ip6h; + struct rps_flow_info_node *p_flow; + + ip6h = (struct ipv6hdr *)skb_network_header(skb); + p_flow = rps_flow_find_active_node_v6(ip6h); + if (p_flow) { + pr_debug("v6 rps,flow info:saddr = %pI6, daddr = %pI6.\n", + &p_flow->v6_saddr, &p_flow->v6_daddr); + } else { + goto origin_rps; + } + break; + } + default: + goto origin_rps; + } + + /* get the target cpu */ + hash_single_flow = rps_flow_hash_update(); + tcpu = map->cpus[hash_single_flow % map->len]; + if (cpu_online(tcpu)) { + cpu = tcpu; + pr_debug("single flow rps, target cpu id = %d\n", cpu); + return cpu; + } + } + +origin_rps: hash = skb_get_hash(skb); if (!hash) goto done; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 98474d85fb51f..4a80408b4b368 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -844,6 +844,71 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, return len; } +static ssize_t show_rps_flow_aging_time(struct netdev_rx_queue *queue, char *buf) +{ + unsigned long val; + + val = rps_flow_node_aging_time_get(); + return sprintf(buf, "%lu\n", val); +} + +static ssize_t store_rps_flow_aging_time(struct netdev_rx_queue *queue, const char *buf, size_t len) +{ + unsigned long val; + int rc; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + rc = kstrtoul(buf, 0, &val); + if (rc < 0) + return rc; + + /* if changed, store the new one */ + rps_flow_node_aging_time_set(val); + + return len; +} + +static ssize_t show_rps_single_flow(struct netdev_rx_queue *queue, char *buf) +{ + unsigned long val; + + val = test_bit(RPS_SINGLE_FLOW_ENABLE, &queue->rps_single_flow_flags); + return sprintf(buf, "%lu\n", val); +} + +static ssize_t store_rps_single_flow(struct netdev_rx_queue *queue, const char *buf, size_t len) +{ + bool enable; + int rc; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + rc = kstrtobool(buf, &enable); + if (rc < 0) + return rc; + + /* if changed, store the new one */ + if (enable != test_bit(RPS_SINGLE_FLOW_ENABLE, &queue->rps_single_flow_flags)) { + if (enable) + set_bit(RPS_SINGLE_FLOW_ENABLE, &queue->rps_single_flow_flags); + else + clear_bit(RPS_SINGLE_FLOW_ENABLE, &queue->rps_single_flow_flags); + + rps_single_flow_enable_set(enable); + } + + return len; +} + +static struct rx_queue_attribute rps_flow_aging_time_attribute __ro_after_init = + __ATTR(rps_flow_aging_time, 0644, show_rps_flow_aging_time, store_rps_flow_aging_time); + +static struct rx_queue_attribute rps_single_flow_attribute __ro_after_init = + __ATTR(rps_single_flow, 0644, show_rps_single_flow, store_rps_single_flow); + static struct rx_queue_attribute rps_cpus_attribute __ro_after_init = __ATTR(rps_cpus, 0644, show_rps_map, store_rps_map); @@ -854,6 +919,8 @@ static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute __ro_after_ini static struct attribute *rx_queue_default_attrs[] __ro_after_init = { #ifdef CONFIG_RPS + &rps_flow_aging_time_attribute.attr, + &rps_single_flow_attribute.attr, &rps_cpus_attribute.attr, &rps_dev_flow_table_cnt_attribute.attr, #endif From 6bd77060ab3ed3e31666d485e22726f52fc21fd7 Mon Sep 17 00:00:00 2001 From: "huangjie.albert" Date: Tue, 28 Jun 2022 15:16:09 +0800 Subject: [PATCH 0406/5344] bytedance: xfrm: support rps for single ipsec flow add ipsec single flow support for rps to improve throughput. Signed-off-by: huangjie.albert --- net/ipv4/xfrm4_state.c | 3 +++ net/ipv6/xfrm6_state.c | 2 ++ 2 files changed, 5 insertions(+) diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index f8ed3c3bb928f..a53716c7d93b4 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -14,6 +14,7 @@ #include #include #include +#include int xfrm4_extract_header(struct sk_buff *skb) { @@ -28,6 +29,8 @@ int xfrm4_extract_header(struct sk_buff *skb) memset(XFRM_MODE_SKB_CB(skb)->flow_lbl, 0, sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl)); + /* for single flow rps for ipsec */ + rps_flow_node_add(iph, false); return 0; } diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index 78daadecbdef5..07f9a6163ad83 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -20,6 +20,7 @@ #include #include #include +#include int xfrm6_extract_header(struct sk_buff *skb) { @@ -34,6 +35,7 @@ int xfrm6_extract_header(struct sk_buff *skb) memcpy(XFRM_MODE_SKB_CB(skb)->flow_lbl, iph->flow_lbl, sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl)); + rps_flow_node_add(iph, true); return 0; } From 1af951ba7d68d7778bd7dbe4705a043da4cae8f0 Mon Sep 17 00:00:00 2001 From: Charan Teja Reddy Date: Fri, 16 Apr 2021 20:32:16 +0530 Subject: [PATCH 0407/5344] sched,psi: Handle potential task count underflow bugs more gracefully commit 9d10a13d1e4c349b76f1c675a874a7f981d6d3b4 upstream. psi_group_cpu->tasks, represented by the unsigned int, stores the number of tasks that could be stalled on a psi resource(io/mem/cpu). Decrementing these counters at zero leads to wrapping which further leads to the psi_group_cpu->state_mask is being set with the respective pressure state. This could result into the unnecessary time sampling for the pressure state thus cause the spurious psi events. This can further lead to wrong actions being taken at the user land based on these psi events. Though psi_bug is set under these conditions but that just for debug purpose. Fix it by decrementing the ->tasks count only when it is non-zero. Signed-off-by: Charan Teja Reddy Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lkml.kernel.org/r/1618585336-37219-1-git-send-email-charante@codeaurora.org Signed-off-by: Chengming Zhou --- kernel/sched/psi.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index d1104f07daca1..f5cb8d151beb2 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -705,14 +705,15 @@ static void psi_group_change(struct psi_group *group, int cpu, for (t = 0, m = clear; m; m &= ~(1 << t), t++) { if (!(m & (1 << t))) continue; - if (groupc->tasks[t] == 0 && !psi_bug) { + if (groupc->tasks[t]) { + groupc->tasks[t]--; + } else if (!psi_bug) { printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n", cpu, t, groupc->tasks[0], groupc->tasks[1], groupc->tasks[2], groupc->tasks[3], clear, set); psi_bug = 1; } - groupc->tasks[t]--; } for (t = 0; set; set &= ~(1 << t), t++) From bdc1696ac91a969ca7ff74b3b604104caa0d6a62 Mon Sep 17 00:00:00 2001 From: Brian Chen Date: Wed, 10 Nov 2021 21:33:12 +0000 Subject: [PATCH 0408/5344] psi: Fix PSI_MEM_FULL state when tasks are in memstall and doing reclaim commit cb0e52b7748737b2cf6481fdd9b920ce7e1ebbdf upstream. We've noticed cases where tasks in a cgroup are stalled on memory but there is little memory FULL pressure since tasks stay on the runqueue in reclaim. A simple example involves a single threaded program that keeps leaking and touching large amounts of memory. It runs in a cgroup with swap enabled, memory.high set at 10M and cpu.max ratio set at 5%. Though there is significant CPU pressure and memory SOME, there is barely any memory FULL since the task enters reclaim and stays on the runqueue. However, this memory-bound task is effectively stalled on memory and we expect memory FULL to match memory SOME in this scenario. The code is confused about memstall && running, thinking there is a stalled task and a productive task when there's only one task: a reclaimer that's counted as both. To fix this, we redefine the condition for PSI_MEM_FULL to check that all running tasks are in an active memstall instead of checking that there are no running tasks. case PSI_MEM_FULL: - return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]); + return unlikely(tasks[NR_MEMSTALL] && + tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]); This will capture reclaimers. It will also capture tasks that called psi_memstall_enter() and are about to sleep, but this should be negligible noise. Signed-off-by: Brian Chen Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20211110213312.310243-1-brianchen118@gmail.com Signed-off-by: Chengming Zhou --- include/linux/psi_types.h | 13 ++++++++++- kernel/sched/psi.c | 45 ++++++++++++++++++++++++--------------- kernel/sched/stats.h | 5 ++++- 3 files changed, 44 insertions(+), 19 deletions(-) diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 0a23300d49af7..0819c82dba920 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -21,7 +21,17 @@ enum psi_task_count { * don't have to special case any state tracking for it. */ NR_ONCPU, - NR_PSI_TASK_COUNTS = 4, + /* + * For IO and CPU stalls the presence of running/oncpu tasks + * in the domain means a partial rather than a full stall. + * For memory it's not so simple because of page reclaimers: + * they are running/oncpu while representing a stall. To tell + * whether a domain has productivity left or not, we need to + * distinguish between regular running (i.e. productive) + * threads and memstall ones. + */ + NR_MEMSTALL_RUNNING, + NR_PSI_TASK_COUNTS = 5, }; /* Task state bitmasks */ @@ -29,6 +39,7 @@ enum psi_task_count { #define TSK_MEMSTALL (1 << NR_MEMSTALL) #define TSK_RUNNING (1 << NR_RUNNING) #define TSK_ONCPU (1 << NR_ONCPU) +#define TSK_MEMSTALL_RUNNING (1 << NR_MEMSTALL_RUNNING) /* Resources that workloads could be stalled on */ enum psi_res { diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index f5cb8d151beb2..70b761a2d3741 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -34,13 +34,19 @@ * delayed on that resource such that nobody is advancing and the CPU * goes idle. This leaves both workload and CPU unproductive. * - * Naturally, the FULL state doesn't exist for the CPU resource at the - * system level, but exist at the cgroup level, means all non-idle tasks - * in a cgroup are delayed on the CPU resource which used by others outside - * of the cgroup or throttled by the cgroup cpu.max configuration. - * * SOME = nr_delayed_tasks != 0 - * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0 + * FULL = nr_delayed_tasks != 0 && nr_productive_tasks == 0 + * + * What it means for a task to be productive is defined differently + * for each resource. For IO, productive means a running task. For + * memory, productive means a running task that isn't a reclaimer. For + * CPU, productive means an oncpu task. + * + * Naturally, the FULL state doesn't exist for the CPU resource at the + * system level, but exist at the cgroup level. At the cgroup level, + * FULL means all non-idle tasks in the cgroup are delayed on the CPU + * resource which is being used by others outside of the cgroup or + * throttled by the cgroup cpu.max configuration. * * The percentage of wallclock time spent in those compound stall * states gives pressure numbers between 0 and 100 for each resource, @@ -81,13 +87,13 @@ * * threads = min(nr_nonidle_tasks, nr_cpus) * SOME = min(nr_delayed_tasks / threads, 1) - * FULL = (threads - min(nr_running_tasks, threads)) / threads + * FULL = (threads - min(nr_productive_tasks, threads)) / threads * * For the 257 number crunchers on 256 CPUs, this yields: * * threads = min(257, 256) * SOME = min(1 / 256, 1) = 0.4% - * FULL = (256 - min(257, 256)) / 256 = 0% + * FULL = (256 - min(256, 256)) / 256 = 0% * * For the 1 out of 4 memory-delayed tasks, this yields: * @@ -112,7 +118,7 @@ * For each runqueue, we track: * * tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0) - * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu]) + * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_productive_tasks[cpu]) * tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0) * * and then periodically aggregate: @@ -229,7 +235,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state) case PSI_MEM_SOME: return unlikely(tasks[NR_MEMSTALL]); case PSI_MEM_FULL: - return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]); + return unlikely(tasks[NR_MEMSTALL] && + tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]); case PSI_CPU_SOME: return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]); case PSI_CPU_FULL: @@ -708,10 +715,11 @@ static void psi_group_change(struct psi_group *group, int cpu, if (groupc->tasks[t]) { groupc->tasks[t]--; } else if (!psi_bug) { - printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n", + printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n", cpu, t, groupc->tasks[0], groupc->tasks[1], groupc->tasks[2], - groupc->tasks[3], clear, set); + groupc->tasks[3], groupc->tasks[4], + clear, set); psi_bug = 1; } } @@ -849,12 +857,15 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, int clear = TSK_ONCPU, set = 0; /* - * When we're going to sleep, psi_dequeue() lets us handle - * TSK_RUNNING and TSK_IOWAIT here, where we can combine it - * with TSK_ONCPU and save walking common ancestors twice. + * When we're going to sleep, psi_dequeue() lets us + * handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and + * TSK_IOWAIT here, where we can combine it with + * TSK_ONCPU and save walking common ancestors twice. */ if (sleep) { clear |= TSK_RUNNING; + if (prev->in_memstall) + clear |= TSK_MEMSTALL_RUNNING; if (prev->in_iowait) set |= TSK_IOWAIT; } @@ -903,7 +914,7 @@ void psi_memstall_enter(unsigned long *flags) rq = this_rq_lock_irq(&rf); current->in_memstall = 1; - psi_task_change(current, 0, TSK_MEMSTALL); + psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING); rq_unlock_irq(rq, &rf); } @@ -932,7 +943,7 @@ void psi_memstall_leave(unsigned long *flags) rq = this_rq_lock_irq(&rf); current->in_memstall = 0; - psi_task_change(current, TSK_MEMSTALL, 0); + psi_task_change(current, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, 0); rq_unlock_irq(rq, &rf); } diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index dc218e9f45585..fd9cb6b15603f 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -69,6 +69,9 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup) if (static_branch_likely(&psi_disabled)) return; + if (p->in_memstall) + set |= TSK_MEMSTALL_RUNNING; + if (!wakeup || p->sched_psi_wake_requeue) { if (p->in_memstall) set |= TSK_MEMSTALL; @@ -99,7 +102,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep) return; if (p->in_memstall) - clear |= TSK_MEMSTALL; + clear |= (TSK_MEMSTALL | TSK_MEMSTALL_RUNNING); psi_task_change(p, clear, 0); } From 8dddfa7e22d7e718c7682b2b1bc18a5742477127 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Sun, 21 Mar 2021 13:51:56 -0700 Subject: [PATCH 0409/5344] psi: Reduce calls to sched_clock() in psi commit df77430639c9cf73559bac0f25084518bf9a812d upstream. We noticed that the cost of psi increases with the increase in the levels of the cgroups. Particularly the cost of cpu_clock() sticks out as the kernel calls it multiple times as it traverses up the cgroup tree. This patch reduces the calls to cpu_clock(). Performed perf bench on Intel Broadwell with 3 levels of cgroup. Before the patch: $ perf bench sched all # Running sched/messaging benchmark... # 20 sender and receiver processes per group # 10 groups == 400 processes run Total time: 0.747 [sec] # Running sched/pipe benchmark... # Executed 1000000 pipe operations between two processes Total time: 3.516 [sec] 3.516689 usecs/op 284358 ops/sec After the patch: $ perf bench sched all # Running sched/messaging benchmark... # 20 sender and receiver processes per group # 10 groups == 400 processes run Total time: 0.640 [sec] # Running sched/pipe benchmark... # Executed 1000000 pipe operations between two processes Total time: 3.329 [sec] 3.329820 usecs/op 300316 ops/sec Signed-off-by: Shakeel Butt Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lkml.kernel.org/r/20210321205156.4186483-1-shakeelb@google.com Signed-off-by: Hao Jia --- kernel/sched/psi.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 70b761a2d3741..5d56728adc015 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -655,12 +655,10 @@ static void poll_timer_fn(struct timer_list *t) wake_up_interruptible(&group->poll_wait); } -static void record_times(struct psi_group_cpu *groupc, int cpu) +static void record_times(struct psi_group_cpu *groupc, u64 now) { u32 delta; - u64 now; - now = cpu_clock(cpu); delta = now - groupc->state_start; groupc->state_start = now; @@ -687,7 +685,7 @@ static void record_times(struct psi_group_cpu *groupc, int cpu) } static void psi_group_change(struct psi_group *group, int cpu, - unsigned int clear, unsigned int set, + unsigned int clear, unsigned int set, u64 now, bool wake_clock) { struct psi_group_cpu *groupc; @@ -707,7 +705,7 @@ static void psi_group_change(struct psi_group *group, int cpu, */ write_seqcount_begin(&groupc->seq); - record_times(groupc, cpu); + record_times(groupc, now); for (t = 0, m = clear; m; m &= ~(1 << t), t++) { if (!(m & (1 << t))) @@ -801,12 +799,14 @@ void psi_task_change(struct task_struct *task, int clear, int set) struct psi_group *group; bool wake_clock = true; void *iter = NULL; + u64 now; if (!task->pid) return; psi_flags_change(task, clear, set); + now = cpu_clock(cpu); /* * Periodic aggregation shuts off if there is a period of no * task changes, so we wake it back up if necessary. However, @@ -819,7 +819,7 @@ void psi_task_change(struct task_struct *task, int clear, int set) wake_clock = false; while ((group = iterate_groups(task, &iter))) - psi_group_change(group, cpu, clear, set, wake_clock); + psi_group_change(group, cpu, clear, set, now, wake_clock); } void psi_task_switch(struct task_struct *prev, struct task_struct *next, @@ -828,6 +828,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, struct psi_group *group, *common = NULL; int cpu = task_cpu(prev); void *iter; + u64 now = cpu_clock(cpu); if (next->pid) { bool identical_state; @@ -849,7 +850,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, break; } - psi_group_change(group, cpu, 0, TSK_ONCPU, true); + psi_group_change(group, cpu, 0, TSK_ONCPU, now, true); } } @@ -874,7 +875,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, iter = NULL; while ((group = iterate_groups(prev, &iter)) && group != common) - psi_group_change(group, cpu, clear, set, true); + psi_group_change(group, cpu, clear, set, now, true); /* * TSK_ONCPU is handled up to the common ancestor. If we're tasked @@ -883,7 +884,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, if (sleep) { clear &= ~TSK_ONCPU; for (; group; group = iterate_groups(prev, &iter)) - psi_group_change(group, cpu, clear, set, true); + psi_group_change(group, cpu, clear, set, now, true); } } } From bed10a7f7c4695ff7667116aed0e06413161e2ca Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 24 May 2021 12:53:39 -0700 Subject: [PATCH 0410/5344] cgroup: make per-cgroup pressure stall tracking configurable commit 3958e2d0c34e18c41b60dc01832bd670a59ef70f upstream. PSI accounts stalls for each cgroup separately and aggregates it at each level of the hierarchy. This causes additional overhead with psi_avgs_work being called for each cgroup in the hierarchy. psi_avgs_work has been highly optimized, however on systems with large number of cgroups the overhead becomes noticeable. Systems which use PSI only at the system level could avoid this overhead if PSI can be configured to skip per-cgroup stall accounting. Add "cgroup_disable=pressure" kernel command-line option to allow requesting system-wide only pressure stall accounting. When set, it keeps system-wide accounting under /proc/pressure/ but skips accounting for individual cgroups and does not expose PSI nodes in cgroup hierarchy. Signed-off-by: Suren Baghdasaryan Acked-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Signed-off-by: Tejun Heo Signed-off-by: Hao Jia --- .../admin-guide/kernel-parameters.txt | 9 +++- include/linux/cgroup-defs.h | 1 + include/linux/cgroup.h | 7 +++ kernel/cgroup/cgroup.c | 48 +++++++++++++++++++ kernel/sched/psi.c | 30 +++++++----- 5 files changed, 80 insertions(+), 15 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index c961795bff3c1..87443ffe7e5e4 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -484,16 +484,21 @@ ccw_timeout_log [S390] See Documentation/s390/common_io.rst for details. - cgroup_disable= [KNL] Disable a particular controller - Format: {name of the controller(s) to disable} + cgroup_disable= [KNL] Disable a particular controller or optional feature + Format: {name of the controller(s) or feature(s) to disable} The effects of cgroup_disable=foo are: - foo isn't auto-mounted if you mount all cgroups in a single hierarchy - foo isn't visible as an individually mountable subsystem + - if foo is an optional feature then the feature is + disabled and corresponding cgroup files are not + created {Currently only "memory" controller deal with this and cut the overhead, others just disable the usage. So only cgroup_disable=memory is actually worthy} + Specifying "pressure" disables per-cgroup pressure + stall information accounting feature cgroup_no_v1= [KNL] Disable cgroup controllers and named hierarchies in v1 Format: { { controller | "all" | "named" } diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index b22598c5a6061..0627c6f4d8740 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -105,6 +105,7 @@ enum { CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */ CFTYPE_DEBUG = (1 << 5), /* create when cgroup_debug */ + CFTYPE_PRESSURE = (1 << 6), /* only if pressure feature is enabled */ /* internal flags, do not use outside cgroup core proper */ __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 202852383ae95..8a0e1bd77a4f1 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -670,6 +670,8 @@ static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) return &cgrp->psi; } +bool cgroup_psi_enabled(void); + static inline void cgroup_init_kthreadd(void) { /* @@ -735,6 +737,11 @@ static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) return NULL; } +static inline bool cgroup_psi_enabled(void) +{ + return false; +} + static inline bool task_under_cgroup_hierarchy(struct task_struct *task, struct cgroup *ancestor) { diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f82fa1b159af4..d9df6b62ec734 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -212,6 +212,22 @@ struct cgroup_namespace init_cgroup_ns = { static struct cftype cgroup_base_files[]; +/* cgroup optional features */ +enum cgroup_opt_features { +#ifdef CONFIG_PSI + OPT_FEATURE_PRESSURE, +#endif + OPT_FEATURE_COUNT +}; + +static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = { +#ifdef CONFIG_PSI + "pressure", +#endif +}; + +static u16 cgroup_feature_disable_mask __read_mostly; + static int cgroup_apply_control(struct cgroup *cgrp); static void cgroup_finalize_control(struct cgroup *cgrp, int ret); static void css_task_iter_skip(struct css_task_iter *it, @@ -3700,6 +3716,18 @@ static void cgroup_pressure_release(struct kernfs_open_file *of) psi_trigger_replace(&ctx->psi.trigger, NULL); } + +bool cgroup_psi_enabled(void) +{ + return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0; +} + +#else /* CONFIG_PSI */ +bool cgroup_psi_enabled(void) +{ + return false; +} + #endif /* CONFIG_PSI */ static int cgroup_freeze_show(struct seq_file *seq, void *v) @@ -3966,6 +3994,8 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, restart: for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? */ + if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled()) + continue; if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) continue; if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp)) @@ -4043,6 +4073,9 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) WARN_ON(cft->ss || cft->kf_ops); + if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled()) + continue; + if (cft->seq_start) kf_ops = &cgroup_kf_ops; else @@ -4956,6 +4989,7 @@ static struct cftype cgroup_base_files[] = { #ifdef CONFIG_PSI { .name = "io.pressure", + .flags = CFTYPE_PRESSURE, .seq_show = cgroup_io_pressure_show, .write = cgroup_io_pressure_write, .poll = cgroup_pressure_poll, @@ -4963,6 +4997,7 @@ static struct cftype cgroup_base_files[] = { }, { .name = "memory.pressure", + .flags = CFTYPE_PRESSURE, .seq_show = cgroup_memory_pressure_show, .write = cgroup_memory_pressure_write, .poll = cgroup_pressure_poll, @@ -4970,6 +5005,7 @@ static struct cftype cgroup_base_files[] = { }, { .name = "cpu.pressure", + .flags = CFTYPE_PRESSURE, .seq_show = cgroup_cpu_pressure_show, .write = cgroup_cpu_pressure_write, .poll = cgroup_pressure_poll, @@ -6207,6 +6243,15 @@ static int __init cgroup_disable(char *str) pr_info("Disabling %s control group subsystem\n", ss->name); } + + for (i = 0; i < OPT_FEATURE_COUNT; i++) { + if (strcmp(token, cgroup_opt_feature_names[i])) + continue; + cgroup_feature_disable_mask |= 1 << i; + pr_info("Disabling %s control group feature\n", + cgroup_opt_feature_names[i]); + break; + } } return 1; } @@ -6510,6 +6555,9 @@ static ssize_t show_delegatable_files(struct cftype *files, char *buf, if (!(cft->flags & CFTYPE_NS_DELEGATABLE)) continue; + if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled()) + continue; + if (prefix) ret += snprintf(buf + ret, size - ret, "%s.", prefix); diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 5d56728adc015..34c54db7a8df4 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -154,6 +154,7 @@ static int psi_bug __read_mostly; DEFINE_STATIC_KEY_FALSE(psi_disabled); +DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled); #ifdef CONFIG_PSI_DEFAULT_DISABLED static bool psi_enable; @@ -221,6 +222,9 @@ void __init psi_init(void) return; } + if (!cgroup_psi_enabled()) + static_branch_disable(&psi_cgroups_enabled); + psi_period = jiffies_to_nsecs(PSI_FREQ); group_init(&psi_system); } @@ -756,23 +760,23 @@ static void psi_group_change(struct psi_group *group, int cpu, static struct psi_group *iterate_groups(struct task_struct *task, void **iter) { + if (*iter == &psi_system) + return NULL; + #ifdef CONFIG_CGROUPS - struct cgroup *cgroup = NULL; + if (static_branch_likely(&psi_cgroups_enabled)) { + struct cgroup *cgroup = NULL; - if (!*iter) - cgroup = task->cgroups->dfl_cgrp; - else if (*iter == &psi_system) - return NULL; - else - cgroup = cgroup_parent(*iter); + if (!*iter) + cgroup = task->cgroups->dfl_cgrp; + else + cgroup = cgroup_parent(*iter); - if (cgroup && cgroup_parent(cgroup)) { - *iter = cgroup; - return cgroup_psi(cgroup); + if (cgroup && cgroup_parent(cgroup)) { + *iter = cgroup; + return cgroup_psi(cgroup); + } } -#else - if (*iter) - return NULL; #endif *iter = &psi_system; return &psi_system; From 4fe11073a14691ccb831064b97f2e4d705ee6d6b Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 11 Jan 2022 15:23:09 -0800 Subject: [PATCH 0411/5344] psi: Fix uaf issue when psi trigger is destroyed while being polled commit a06247c6804f1a7c86a2e5398a4c1f1db1471848 upstream. With write operation on psi files replacing old trigger with a new one, the lifetime of its waitqueue is totally arbitrary. Overwriting an existing trigger causes its waitqueue to be freed and pending poll() will stumble on trigger->event_wait which was destroyed. Fix this by disallowing to redefine an existing psi trigger. If a write operation is used on a file descriptor with an already existing psi trigger, the operation will fail with EBUSY error. Also bypass a check for psi_disabled in the psi_trigger_destroy as the flag can be flipped after the trigger is created, leading to a memory leak. Fixes: 0e94682b73bf ("psi: introduce psi monitor") Reported-by: syzbot+cdb5dd11c97cc532efad@syzkaller.appspotmail.com Suggested-by: Linus Torvalds Analyzed-by: Eric Biggers Signed-off-by: Suren Baghdasaryan Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Eric Biggers Acked-by: Johannes Weiner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220111232309.1786347-1-surenb@google.com Signed-off-by: Hao Jia --- Documentation/accounting/psi.rst | 3 +- include/linux/psi.h | 2 +- include/linux/psi_types.h | 3 -- kernel/cgroup/cgroup.c | 11 ++++-- kernel/sched/psi.c | 66 ++++++++++++++------------------ 5 files changed, 40 insertions(+), 45 deletions(-) diff --git a/Documentation/accounting/psi.rst b/Documentation/accounting/psi.rst index 4a87e1eb12a6c..ad31ebae0af8a 100644 --- a/Documentation/accounting/psi.rst +++ b/Documentation/accounting/psi.rst @@ -89,7 +89,8 @@ Triggers can be set on more than one psi metric and more than one trigger for the same psi metric can be specified. However for each trigger a separate file descriptor is required to be able to poll it separately from others, therefore for each trigger a separate open() syscall should be made even -when opening the same psi interface file. +when opening the same psi interface file. Write operations to a file descriptor +with an already existing psi trigger will fail with EBUSY. Monitors activate only when system enters stall state for the monitored psi metric and deactivates upon exit from the stall state. While system is diff --git a/include/linux/psi.h b/include/linux/psi.h index 65eb1476ac705..74f7148dfb9f4 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -32,7 +32,7 @@ void cgroup_move_task(struct task_struct *p, struct css_set *to); struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, size_t nbytes, enum psi_res res); -void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *t); +void psi_trigger_destroy(struct psi_trigger *t); __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, poll_table *wait); diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 0819c82dba920..6f190002a2022 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -140,9 +140,6 @@ struct psi_trigger { * events to one per window */ u64 last_event_time; - - /* Refcounting to prevent premature destruction */ - struct kref refcount; }; struct psi_group { diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d9df6b62ec734..007b861b75516 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3667,6 +3667,12 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, cgroup_get(cgrp); cgroup_kn_unlock(of->kn); + /* Allow only one trigger per file descriptor */ + if (ctx->psi.trigger) { + cgroup_put(cgrp); + return -EBUSY; + } + psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; new = psi_trigger_create(psi, buf, nbytes, res); if (IS_ERR(new)) { @@ -3674,8 +3680,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, return PTR_ERR(new); } - psi_trigger_replace(&ctx->psi.trigger, new); - + smp_store_release(&ctx->psi.trigger, new); cgroup_put(cgrp); return nbytes; @@ -3714,7 +3719,7 @@ static void cgroup_pressure_release(struct kernfs_open_file *of) { struct cgroup_file_ctx *ctx = of->priv; - psi_trigger_replace(&ctx->psi.trigger, NULL); + psi_trigger_destroy(ctx->psi.trigger); } bool cgroup_psi_enabled(void) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 34c54db7a8df4..ae9826818c5a8 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1157,7 +1157,6 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, t->event = 0; t->last_event_time = 0; init_waitqueue_head(&t->event_wait); - kref_init(&t->refcount); mutex_lock(&group->trigger_lock); @@ -1186,15 +1185,19 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, return t; } -static void psi_trigger_destroy(struct kref *ref) +void psi_trigger_destroy(struct psi_trigger *t) { - struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount); - struct psi_group *group = t->group; + struct psi_group *group; struct task_struct *task_to_destroy = NULL; - if (static_branch_likely(&psi_disabled)) + /* + * We do not check psi_disabled since it might have been disabled after + * the trigger got created. + */ + if (!t) return; + group = t->group; /* * Wakeup waiters to stop polling. Can happen if cgroup is deleted * from under a polling process. @@ -1230,9 +1233,9 @@ static void psi_trigger_destroy(struct kref *ref) mutex_unlock(&group->trigger_lock); /* - * Wait for both *trigger_ptr from psi_trigger_replace and - * poll_task RCUs to complete their read-side critical sections - * before destroying the trigger and optionally the poll_task + * Wait for psi_schedule_poll_work RCU to complete its read-side + * critical section before destroying the trigger and optionally the + * poll_task. */ synchronize_rcu(); /* @@ -1249,18 +1252,6 @@ static void psi_trigger_destroy(struct kref *ref) kfree(t); } -void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new) -{ - struct psi_trigger *old = *trigger_ptr; - - if (static_branch_likely(&psi_disabled)) - return; - - rcu_assign_pointer(*trigger_ptr, new); - if (old) - kref_put(&old->refcount, psi_trigger_destroy); -} - __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, poll_table *wait) { @@ -1270,24 +1261,15 @@ __poll_t psi_trigger_poll(void **trigger_ptr, if (static_branch_likely(&psi_disabled)) return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; - rcu_read_lock(); - - t = rcu_dereference(*(void __rcu __force **)trigger_ptr); - if (!t) { - rcu_read_unlock(); + t = smp_load_acquire(trigger_ptr); + if (!t) return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; - } - kref_get(&t->refcount); - - rcu_read_unlock(); poll_wait(file, &t->event_wait, wait); if (cmpxchg(&t->event, 1, 0) == 1) ret |= EPOLLPRI; - kref_put(&t->refcount, psi_trigger_destroy); - return ret; } @@ -1311,14 +1293,24 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, buf[buf_size - 1] = '\0'; - new = psi_trigger_create(&psi_system, buf, nbytes, res); - if (IS_ERR(new)) - return PTR_ERR(new); - seq = file->private_data; + /* Take seq->lock to protect seq->private from concurrent writes */ mutex_lock(&seq->lock); - psi_trigger_replace(&seq->private, new); + + /* Allow only one trigger per file descriptor */ + if (seq->private) { + mutex_unlock(&seq->lock); + return -EBUSY; + } + + new = psi_trigger_create(&psi_system, buf, nbytes, res); + if (IS_ERR(new)) { + mutex_unlock(&seq->lock); + return PTR_ERR(new); + } + + smp_store_release(&seq->private, new); mutex_unlock(&seq->lock); return nbytes; @@ -1353,7 +1345,7 @@ static int psi_fop_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; - psi_trigger_replace(&seq->private, NULL); + psi_trigger_destroy(seq->private); return single_release(inode, file); } From cd28d46d7fe476c18f7984b1d1479b5d8e475b6e Mon Sep 17 00:00:00 2001 From: Zhaoyang Huang Date: Tue, 25 Jan 2022 14:56:58 +0800 Subject: [PATCH 0412/5344] psi: fix possible trigger missing in the window commit e6df4ead85d9da1b07dd40bd4c6d2182f3e210c4 upstream. When a new threshold breaching stall happens after a psi event was generated and within the window duration, the new event is not generated because the events are rate-limited to one per window. If after that no new stall is recorded then the event will not be generated even after rate-limiting duration has passed. This is happening because with no new stall, window_update will not be called even though threshold was previously breached. To fix this, record threshold breaching occurrence and generate the event once window duration is passed. Suggested-by: Suren Baghdasaryan Signed-off-by: Zhaoyang Huang Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Acked-by: Suren Baghdasaryan Link: https://lore.kernel.org/r/1643093818-19835-1-git-send-email-huangzhaoyang@gmail.com Signed-off-by: Hao Jia --- include/linux/psi_types.h | 3 +++ kernel/sched/psi.c | 46 +++++++++++++++++++++++++-------------- 2 files changed, 33 insertions(+), 16 deletions(-) diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 6f190002a2022..72a50a05e9576 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -140,6 +140,9 @@ struct psi_trigger { * events to one per window */ u64 last_event_time; + + /* Deferred event(s) from previous ratelimit window */ + bool pending_event; }; struct psi_group { diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index ae9826818c5a8..92547574c6e39 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -522,7 +522,7 @@ static void init_triggers(struct psi_group *group, u64 now) static u64 update_triggers(struct psi_group *group, u64 now) { struct psi_trigger *t; - bool new_stall = false; + bool update_total = false; u64 *total = group->total[PSI_POLL]; /* @@ -531,24 +531,35 @@ static u64 update_triggers(struct psi_group *group, u64 now) */ list_for_each_entry(t, &group->triggers, node) { u64 growth; + bool new_stall; - /* Check for stall activity */ - if (group->polling_total[t->state] == total[t->state]) - continue; + new_stall = group->polling_total[t->state] != total[t->state]; + /* Check for stall activity or a previous threshold breach */ + if (!new_stall && !t->pending_event) + continue; /* - * Multiple triggers might be looking at the same state, - * remember to update group->polling_total[] once we've - * been through all of them. Also remember to extend the - * polling time if we see new stall activity. + * Check for new stall activity, as well as deferred + * events that occurred in the last window after the + * trigger had already fired (we want to ratelimit + * events without dropping any). */ - new_stall = true; - - /* Calculate growth since last update */ - growth = window_update(&t->win, now, total[t->state]); - if (growth < t->threshold) - continue; - + if (new_stall) { + /* + * Multiple triggers might be looking at the same state, + * remember to update group->polling_total[] once we've + * been through all of them. Also remember to extend the + * polling time if we see new stall activity. + */ + update_total = true; + + /* Calculate growth since last update */ + growth = window_update(&t->win, now, total[t->state]); + if (growth < t->threshold) + continue; + + t->pending_event = true; + } /* Limit event signaling to once per window */ if (now < t->last_event_time + t->win.size) continue; @@ -557,9 +568,11 @@ static u64 update_triggers(struct psi_group *group, u64 now) if (cmpxchg(&t->event, 0, 1) == 0) wake_up_interruptible(&t->event_wait); t->last_event_time = now; + /* Reset threshold breach flag once event got generated */ + t->pending_event = false; } - if (new_stall) + if (update_total) memcpy(group->polling_total, total, sizeof(group->polling_total)); @@ -1157,6 +1170,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, t->event = 0; t->last_event_time = 0; init_waitqueue_head(&t->event_wait); + t->pending_event = false; mutex_lock(&group->trigger_lock); From c34696ea10170a8008cbb66cf76396a47022a0be Mon Sep 17 00:00:00 2001 From: Hailong Liu Date: Fri, 1 Apr 2022 13:10:11 +0800 Subject: [PATCH 0413/5344] psi: Fix trigger being fired unexpectedly at initial commit 915a087e4c47334a2f7ba2a4092c4bade0873769 upstream. When a trigger being created, its win.start_value and win.start_time are reset to zero. If group->total[PSI_POLL][t->state] has accumulated before, this trigger will be fired unexpectedly in the next period, even if its growth time does not reach its threshold. So set the window of the new trigger to the current state value. Signed-off-by: Hailong Liu Signed-off-by: Peter Zijlstra (Intel) Acked-by: Suren Baghdasaryan Link: https://lore.kernel.org/r/1648789811-3788971-1-git-send-email-liuhailong@linux.alibaba.com Signed-off-by: Hao Jia --- kernel/sched/psi.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 92547574c6e39..516229c443925 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1165,7 +1165,8 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, t->state = state; t->threshold = threshold_us * NSEC_PER_USEC; t->win.size = window_us * NSEC_PER_USEC; - window_reset(&t->win, 0, 0, 0); + window_reset(&t->win, sched_clock(), + group->total[PSI_POLL][t->state], 0); t->event = 0; t->last_event_time = 0; From 7ed31b7e9f01888f4bf9e2de89f1a349c4abcb1e Mon Sep 17 00:00:00 2001 From: Zhang Tianci Date: Tue, 26 Jul 2022 20:43:21 +0800 Subject: [PATCH 0414/5344] bytedance: config: Add VDUSE related config on aarch64 CONFIG_VIRTIO_PCI_LIB=m CONFIG_VIRTIO_VDPA=m CONFIG_VDPA=m CONFIG_VDPA_SIM=m CONFIG_VDPA_SIM_NET=m CONFIG_VDPA_SIM_BLOCK=m CONFIG_VDPA_USER=m CONFIG_VHOST_IOTLB=m CONFIG_VHOST_RING=m CONFIG_VHOST_VDPA=m CONFIG_VIRTIO_FS=m Signed-off-by: Zhang Tianci --- config.aarch64 | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/config.aarch64 b/config.aarch64 index d639a771f0520..8b6a67527cec5 100644 --- a/config.aarch64 +++ b/config.aarch64 @@ -7165,21 +7165,30 @@ CONFIG_VFIO_PCI_INTX=y # CONFIG_VFIO_MDEV is not set CONFIG_VIRT_DRIVERS=y CONFIG_VIRTIO=m +CONFIG_VIRTIO_PCI_LIB=m CONFIG_VIRTIO_MENU=y CONFIG_VIRTIO_PCI=m CONFIG_VIRTIO_PCI_LEGACY=y +CONFIG_VIRTIO_VDPA=m # CONFIG_VIRTIO_PMEM is not set CONFIG_VIRTIO_BALLOON=m CONFIG_VIRTIO_INPUT=m CONFIG_VIRTIO_MMIO=m # CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES is not set -# CONFIG_VDPA is not set -CONFIG_VHOST_IOTLB=m +CONFIG_VDPA=m +CONFIG_VDPA_SIM=m +CONFIG_VDPA_SIM_NET=m +CONFIG_VDPA_SIM_BLOCK=m +CONFIG_VDPA_USER=m + CONFIG_VHOST=m +CONFIG_VHOST_IOTLB=m +CONFIG_VHOST_RING=m CONFIG_VHOST_MENU=y CONFIG_VHOST_NET=m CONFIG_VHOST_SCSI=m CONFIG_VHOST_VSOCK=m +CONFIG_VHOST_VDPA=m # CONFIG_VHOST_CROSS_ENDIAN_LEGACY is not set # @@ -8386,7 +8395,7 @@ CONFIG_QUOTACTL=y CONFIG_AUTOFS_FS=m CONFIG_FUSE_FS=m CONFIG_CUSE=m -# CONFIG_VIRTIO_FS is not set +CONFIG_VIRTIO_FS=m CONFIG_OVERLAY_FS=m # CONFIG_OVERLAY_FS_REDIRECT_DIR is not set CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW=y From a6ce42049ddb4dd3439045b9a311e760e6d93979 Mon Sep 17 00:00:00 2001 From: Feng Zhou Date: Mon, 1 Aug 2022 14:52:17 +0800 Subject: [PATCH 0415/5344] bytedance: config: enable CONFIG_DEBUG_INFO_BTF for arm64 enable CONFIG_DEBUG_INFO_BTF for arm64. Signed-off-by: Feng Zhou --- config.aarch64 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.aarch64 b/config.aarch64 index 8b6a67527cec5..20f4ef208e794 100644 --- a/config.aarch64 +++ b/config.aarch64 @@ -9105,7 +9105,7 @@ CONFIG_DEBUG_INFO=y # CONFIG_DEBUG_INFO_REDUCED is not set # CONFIG_DEBUG_INFO_SPLIT is not set # CONFIG_DEBUG_INFO_DWARF4 is not set -# CONFIG_DEBUG_INFO_BTF is not set +CONFIG_DEBUG_INFO_BTF=y # CONFIG_GDB_SCRIPTS is not set CONFIG_ENABLE_MUST_CHECK=y CONFIG_FRAME_WARN=2048 From ff0b76c22123034147327f9273c0d18bac95caa0 Mon Sep 17 00:00:00 2001 From: Jia Zhu Date: Thu, 31 Mar 2022 15:59:23 +0800 Subject: [PATCH 0416/5344] bytedance: fuse: Support restore fuse request by ioctl In read-only scenario, as long as the fuse device connection is not released, fuse client can restore the inflight request by moving the requests from processing list to pending list. Userspace can trigger a restore by: ioctl(devfd, FUSE_DEV_IOC_RESTORE_REQ, 0); At this point inflight requests will be recovered and processed. Signed-off-by: Jia Zhu --- fs/fuse/dev.c | 99 +++++++++++++++++++++++++++++---------- include/uapi/linux/fuse.h | 3 +- 2 files changed, 75 insertions(+), 27 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 7205a89fbb5f3..65683087aba36 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2232,42 +2232,89 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new) return 0; } -static long fuse_dev_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) +static int fuse_dev_ioc_clone(struct file *file, unsigned long arg) { - int err = -ENOTTY; + int err = -EFAULT; + int oldfd; - if (cmd == FUSE_DEV_IOC_CLONE) { - int oldfd; + if (!get_user(oldfd, (__u32 __user *) arg)) { + struct file *old = fget(oldfd); - err = -EFAULT; - if (!get_user(oldfd, (__u32 __user *) arg)) { - struct file *old = fget(oldfd); + err = -EINVAL; + if (old) { + struct fuse_dev *fud = NULL; - err = -EINVAL; - if (old) { - struct fuse_dev *fud = NULL; - - /* - * Check against file->f_op because CUSE - * uses the same ioctl handler. - */ - if (old->f_op == file->f_op && - old->f_cred->user_ns == file->f_cred->user_ns) - fud = fuse_get_dev(old); - - if (fud) { - mutex_lock(&fuse_mutex); - err = fuse_device_clone(fud->fc, file); - mutex_unlock(&fuse_mutex); - } - fput(old); + /* + * Check against file->f_op because CUSE + * uses the same ioctl handler. + */ + if (old->f_op == file->f_op && + old->f_cred->user_ns == file->f_cred->user_ns) + fud = fuse_get_dev(old); + + if (fud) { + mutex_lock(&fuse_mutex); + err = fuse_device_clone(fud->fc, file); + mutex_unlock(&fuse_mutex); } + fput(old); } } return err; } +static int fuse_dev_ioc_restore_req(struct file *file) +{ + unsigned int i; + LIST_HEAD(to_restore); + struct fuse_req *req, *next; + struct fuse_dev *fud = fuse_get_dev(file); + struct fuse_pqueue *fpq = &fud->pq; + struct fuse_conn *fc = fud->fc; + struct fuse_iqueue *fiq = &fc->iq; + + if (fc->sb && !sb_rdonly(fc->sb)) + return -EOPNOTSUPP; + + spin_lock(&fpq->lock); + list_for_each_entry_safe(req, next, &fpq->io, list) { + spin_lock(&req->waitq.lock); + if (!test_bit(FR_LOCKED, &req->flags)) { + set_bit(FR_PRIVATE, &req->flags); + __fuse_get_request(req); + list_move_tail(&req->list, &to_restore); + } + spin_unlock(&req->waitq.lock); + } + for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) + list_splice_init(&fpq->processing[i], &to_restore); + spin_unlock(&fpq->lock); + + list_for_each_entry(req, &to_restore, list) { + clear_bit(FR_SENT, &req->flags); + set_bit(FR_PENDING, &req->flags); + } + + spin_lock(&fiq->lock); + list_splice(&to_restore, &fiq->pending); + spin_unlock(&fiq->lock); + + return 0; +} + +static long fuse_dev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + switch (cmd) { + case FUSE_DEV_IOC_CLONE: + return fuse_dev_ioc_clone(file, arg); + case FUSE_DEV_IOC_RESTORE_REQ: + return fuse_dev_ioc_restore_req(file); + default: + return -ENOTTY; + } +} + const struct file_operations fuse_dev_operations = { .owner = THIS_MODULE, .open = fuse_dev_open, diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 6a8050e4abfa1..6fea4c7fdd246 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -878,7 +878,8 @@ struct fuse_notify_retrieve_in { }; /* Device ioctls: */ -#define FUSE_DEV_IOC_CLONE _IOR(229, 0, uint32_t) +#define FUSE_DEV_IOC_CLONE _IOR(229, 0, uint32_t) +#define FUSE_DEV_IOC_RESTORE_REQ _IO(229, 255) struct fuse_lseek_in { uint64_t fh; From b224b3e36725a73d73bad2d118a655ba32eb8a5e Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Sun, 13 Mar 2022 11:35:15 +0800 Subject: [PATCH 0417/5344] Revert "scsi: core: sysfs: Fix setting device state to SDEV_RUNNING" This reverts commit 6291ec834f340e0faebb34289e46abf5c4ff1605. Signed-off-by: Muchun Song --- drivers/scsi/scsi_sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c index 0e3c46e6b8ab7..e51bdea676a73 100644 --- a/drivers/scsi/scsi_sysfs.c +++ b/drivers/scsi/scsi_sysfs.c @@ -787,7 +787,7 @@ store_state_field(struct device *dev, struct device_attribute *attr, mutex_lock(&sdev->state_mutex); if (sdev->sdev_state == SDEV_RUNNING && state == SDEV_RUNNING) { - ret = 0; + ret = count; } else { ret = scsi_device_set_state(sdev, state); if (ret == 0 && state == SDEV_RUNNING) From 712948379302f12a3861226008afd64bd60f9d02 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Sun, 13 Mar 2022 11:36:27 +0800 Subject: [PATCH 0418/5344] Revert "scsi: core: sysfs: Fix hang when device state is set via sysfs" This reverts commit 449892352d13dbadcbcfeb9f417dc938505be01d. Signed-off-by: Muchun Song --- drivers/scsi/scsi_sysfs.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c index e51bdea676a73..6aeb79e744e0b 100644 --- a/drivers/scsi/scsi_sysfs.c +++ b/drivers/scsi/scsi_sysfs.c @@ -767,7 +767,6 @@ store_state_field(struct device *dev, struct device_attribute *attr, int i, ret; struct scsi_device *sdev = to_scsi_device(dev); enum scsi_device_state state = 0; - bool rescan_dev = false; for (i = 0; i < ARRAY_SIZE(sdev_states); i++) { const int len = strlen(sdev_states[i].name); @@ -786,27 +785,20 @@ store_state_field(struct device *dev, struct device_attribute *attr, } mutex_lock(&sdev->state_mutex); - if (sdev->sdev_state == SDEV_RUNNING && state == SDEV_RUNNING) { - ret = count; - } else { - ret = scsi_device_set_state(sdev, state); - if (ret == 0 && state == SDEV_RUNNING) - rescan_dev = true; - } - mutex_unlock(&sdev->state_mutex); - - if (rescan_dev) { - /* - * If the device state changes to SDEV_RUNNING, we need to - * run the queue to avoid I/O hang, and rescan the device - * to revalidate it. Running the queue first is necessary - * because another thread may be waiting inside - * blk_mq_freeze_queue_wait() and because that call may be - * waiting for pending I/O to finish. - */ + ret = scsi_device_set_state(sdev, state); + /* + * If the device state changes to SDEV_RUNNING, we need to + * run the queue to avoid I/O hang, and rescan the device + * to revalidate it. Running the queue first is necessary + * because another thread may be waiting inside + * blk_mq_freeze_queue_wait() and because that call may be + * waiting for pending I/O to finish. + */ + if (ret == 0 && state == SDEV_RUNNING) { blk_mq_run_hw_queues(sdev->request_queue, true); scsi_rescan_device(dev); } + mutex_unlock(&sdev->state_mutex); return ret == 0 ? count : -EINVAL; } From f87cd2607d6a0e61884ed7ec3ce6606c4233ad12 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Sun, 13 Mar 2022 12:00:22 +0800 Subject: [PATCH 0419/5344] Revert "fuse: Pass correct lend value to filemap_write_and_wait_range()" This reverts commit b80e674ea5b92c2d5e687a2386be9eb2a663b194. Signed-off-by: Muchun Song --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 52a29d55a684f..58c0b42d4711b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -3259,7 +3259,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end) { - int err = filemap_write_and_wait_range(inode->i_mapping, start, LLONG_MAX); + int err = filemap_write_and_wait_range(inode->i_mapping, start, -1); if (!err) fuse_sync_writes(inode); From 3ecdd749184ed25da28f90900cb00f5319335fcd Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Sun, 13 Mar 2022 12:04:06 +0800 Subject: [PATCH 0420/5344] Revert "Ampere: drivers/pci: Add ecam quirk for Ampere Altra SOC." This reverts commit 9038dae0a8fa718df7d4cd4ee067a23364d2d2c7. Signed-off-by: Muchun Song --- drivers/acpi/pci_mcfg.c | 20 -------------------- drivers/pci/ecam.c | 9 --------- include/linux/pci-ecam.h | 1 - 3 files changed, 30 deletions(-) diff --git a/drivers/acpi/pci_mcfg.c b/drivers/acpi/pci_mcfg.c index 715a382d92df0..6b347d9920cc2 100644 --- a/drivers/acpi/pci_mcfg.c +++ b/drivers/acpi/pci_mcfg.c @@ -142,26 +142,6 @@ static struct mcfg_fixup mcfg_quirks[] = { XGENE_V2_ECAM_MCFG(4, 0), XGENE_V2_ECAM_MCFG(4, 1), XGENE_V2_ECAM_MCFG(4, 2), - -#define AMPERE_ECAM32(rev, seg) \ - { "Ampere", "Altra ", rev, seg, MCFG_BUS_ANY, &pci_32b_read_ops } - - AMPERE_ECAM32(1, 0), - AMPERE_ECAM32(1, 1), - AMPERE_ECAM32(1, 2), - AMPERE_ECAM32(1, 3), - AMPERE_ECAM32(1, 4), - AMPERE_ECAM32(1, 5), - AMPERE_ECAM32(1, 6), - AMPERE_ECAM32(1, 7), - AMPERE_ECAM32(1, 8), - AMPERE_ECAM32(1, 9), - AMPERE_ECAM32(1, 10), - AMPERE_ECAM32(1, 11), - AMPERE_ECAM32(1, 12), - AMPERE_ECAM32(1, 13), - AMPERE_ECAM32(1, 14), - AMPERE_ECAM32(1, 15), }; static char mcfg_oem_id[ACPI_OEM_ID_SIZE]; diff --git a/drivers/pci/ecam.c b/drivers/pci/ecam.c index fcad9af6df04e..1a81af0ba961a 100644 --- a/drivers/pci/ecam.c +++ b/drivers/pci/ecam.c @@ -164,13 +164,4 @@ struct pci_ecam_ops pci_32b_ops = { .write = pci_generic_config_write32, } }; - -struct pci_ecam_ops pci_32b_read_ops = { - .bus_shift = 20, - .pci_ops = { - .map_bus = pci_ecam_map_bus, - .read = pci_generic_config_read32, - .write = pci_generic_config_write, - } -}; #endif diff --git a/include/linux/pci-ecam.h b/include/linux/pci-ecam.h index d0260824dc5dc..a73164c85e78b 100644 --- a/include/linux/pci-ecam.h +++ b/include/linux/pci-ecam.h @@ -57,7 +57,6 @@ extern struct pci_ecam_ops pci_thunder_ecam_ops; /* Cavium ThunderX 1.x */ extern struct pci_ecam_ops xgene_v1_pcie_ecam_ops; /* APM X-Gene PCIe v1 */ extern struct pci_ecam_ops xgene_v2_pcie_ecam_ops; /* APM X-Gene PCIe v2.x */ extern struct pci_ecam_ops al_pcie_ops; /* Amazon Annapurna Labs PCIe */ -extern struct pci_ecam_ops pci_32b_read_ops; /* 32-bit read accesses only */ #endif #ifdef CONFIG_PCI_HOST_COMMON From bad724eb023149a29826575041c74402348c1550 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Sun, 13 Mar 2022 12:20:05 +0800 Subject: [PATCH 0421/5344] Revert "netfilter: fix regression in looped (broad|multi)cast's MAC handling" This reverts commit c8f0dad0f69e86d5e104d9cdc9bab219cfde824c. Signed-off-by: Muchun Song --- net/netfilter/nfnetlink_log.c | 3 +-- net/netfilter/nfnetlink_queue.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index b36af4741ad3c..7ca2ca4bba055 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -557,8 +557,7 @@ __build_packet_message(struct nfnl_log_net *log, goto nla_put_failure; if (indev && skb->dev && - skb_mac_header_was_set(skb) && - skb_mac_header_len(skb) != 0) { + skb->mac_header != skb->network_header) { struct nfulnl_msg_packet_hw phw; int len; diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index ca21f8f4a47c1..a8cb562da3fea 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -562,8 +562,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, goto nla_put_failure; if (indev && entskb->dev && - skb_mac_header_was_set(entskb) && - skb_mac_header_len(entskb) != 0) { + skb_mac_header_was_set(entskb)) { struct nfqnl_msg_packet_hw phw; int len; From 4c1dad61c64e6e7ca0e665aa7bec20e08a65bc8b Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Sun, 13 Mar 2022 17:14:39 +0800 Subject: [PATCH 0422/5344] Revert "Ampere: NOUPSTEAM: ipmi: ssif: add timeout for I2C client" This reverts commit 4397367b987d551aae676c6ce8f3b4dc364ac2aa. Signed-off-by: Muchun Song --- drivers/char/ipmi/ipmi_ssif.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c index dc655276c0171..8ac390c2b5147 100644 --- a/drivers/char/ipmi/ipmi_ssif.c +++ b/drivers/char/ipmi/ipmi_ssif.c @@ -1725,7 +1725,6 @@ static int ssif_probe(struct i2c_client *client, const struct i2c_device_id *id) client->addr, client->adapter->name, slave_addr); ssif_info->client = client; - ssif_info->client->adapter->timeout = 10*HZ; /* Set to 10 seconds in this case */ i2c_set_clientdata(client, ssif_info); /* Now check for system interface capabilities */ From 9ffd886d7db88f47562f6bf9e35544ca33364fb7 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sat, 10 Jul 2021 07:50:33 -0700 Subject: [PATCH 0423/5344] ARC: Fix CONFIG_STACKDEPOT [ Upstream commit bf79167fd86f3b97390fe2e70231d383526bd9cc ] Enabling CONFIG_STACKDEPOT results in the following build error. arc-elf-ld: lib/stackdepot.o: in function `filter_irq_stacks': stackdepot.c:(.text+0x456): undefined reference to `__irqentry_text_start' arc-elf-ld: stackdepot.c:(.text+0x456): undefined reference to `__irqentry_text_start' arc-elf-ld: stackdepot.c:(.text+0x476): undefined reference to `__irqentry_text_end' arc-elf-ld: stackdepot.c:(.text+0x476): undefined reference to `__irqentry_text_end' arc-elf-ld: stackdepot.c:(.text+0x484): undefined reference to `__softirqentry_text_start' arc-elf-ld: stackdepot.c:(.text+0x484): undefined reference to `__softirqentry_text_start' arc-elf-ld: stackdepot.c:(.text+0x48c): undefined reference to `__softirqentry_text_end' arc-elf-ld: stackdepot.c:(.text+0x48c): undefined reference to `__softirqentry_text_end' Other architectures address this problem by adding IRQENTRY_TEXT and SOFTIRQENTRY_TEXT to the text segment, so do the same here. Signed-off-by: Guenter Roeck Signed-off-by: Vineet Gupta Signed-off-by: Sasha Levin --- arch/arc/kernel/vmlinux.lds.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arc/kernel/vmlinux.lds.S b/arch/arc/kernel/vmlinux.lds.S index 6c693a9d29b6d..0391b8293ad85 100644 --- a/arch/arc/kernel/vmlinux.lds.S +++ b/arch/arc/kernel/vmlinux.lds.S @@ -88,6 +88,8 @@ SECTIONS CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT + IRQENTRY_TEXT + SOFTIRQENTRY_TEXT *(.fixup) *(.gnu.warning) } From ba78d13f3ce99d40708bbd6a48c5a2dad962c203 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 27 Jul 2021 00:29:19 +0200 Subject: [PATCH 0424/5344] netfilter: conntrack: collect all entries in one cycle [ Upstream commit 4608fdfc07e116f9fc0895beb40abad7cdb5ee3d ] Michal Kubecek reports that conntrack gc is responsible for frequent wakeups (every 125ms) on idle systems. On busy systems, timed out entries are evicted during lookup. The gc worker is only needed to remove entries after system becomes idle after a busy period. To resolve this, always scan the entire table. If the scan is taking too long, reschedule so other work_structs can run and resume from next bucket. After a completed scan, wait for 2 minutes before the next cycle. Heuristics for faster re-schedule are removed. GC_SCAN_INTERVAL could be exposed as a sysctl in the future to allow tuning this as-needed or even turn the gc worker off. Reported-by: Michal Kubecek Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/nf_conntrack_core.c | 71 ++++++++++--------------------- 1 file changed, 22 insertions(+), 49 deletions(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 4a988ce4264cb..4bcc36e4b2ef0 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -66,22 +66,17 @@ EXPORT_SYMBOL_GPL(nf_conntrack_hash); struct conntrack_gc_work { struct delayed_work dwork; - u32 last_bucket; + u32 next_bucket; bool exiting; bool early_drop; - long next_gc_run; }; static __read_mostly struct kmem_cache *nf_conntrack_cachep; static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); static __read_mostly bool nf_conntrack_locks_all; -/* every gc cycle scans at most 1/GC_MAX_BUCKETS_DIV part of table */ -#define GC_MAX_BUCKETS_DIV 128u -/* upper bound of full table scan */ -#define GC_MAX_SCAN_JIFFIES (16u * HZ) -/* desired ratio of entries found to be expired */ -#define GC_EVICT_RATIO 50u +#define GC_SCAN_INTERVAL (120u * HZ) +#define GC_SCAN_MAX_DURATION msecs_to_jiffies(10) static struct conntrack_gc_work conntrack_gc_work; @@ -1226,17 +1221,13 @@ static void nf_ct_offload_timeout(struct nf_conn *ct) static void gc_worker(struct work_struct *work) { - unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u); - unsigned int i, goal, buckets = 0, expired_count = 0; - unsigned int nf_conntrack_max95 = 0; + unsigned long end_time = jiffies + GC_SCAN_MAX_DURATION; + unsigned int i, hashsz, nf_conntrack_max95 = 0; + unsigned long next_run = GC_SCAN_INTERVAL; struct conntrack_gc_work *gc_work; - unsigned int ratio, scanned = 0; - unsigned long next_run; - gc_work = container_of(work, struct conntrack_gc_work, dwork.work); - goal = nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV; - i = gc_work->last_bucket; + i = gc_work->next_bucket; if (gc_work->early_drop) nf_conntrack_max95 = nf_conntrack_max / 100u * 95u; @@ -1244,22 +1235,21 @@ static void gc_worker(struct work_struct *work) struct nf_conntrack_tuple_hash *h; struct hlist_nulls_head *ct_hash; struct hlist_nulls_node *n; - unsigned int hashsz; struct nf_conn *tmp; - i++; rcu_read_lock(); nf_conntrack_get_ht(&ct_hash, &hashsz); - if (i >= hashsz) - i = 0; + if (i >= hashsz) { + rcu_read_unlock(); + break; + } hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) { struct net *net; tmp = nf_ct_tuplehash_to_ctrack(h); - scanned++; if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) { nf_ct_offload_timeout(tmp); continue; @@ -1267,7 +1257,6 @@ static void gc_worker(struct work_struct *work) if (nf_ct_is_expired(tmp)) { nf_ct_gc_expired(tmp); - expired_count++; continue; } @@ -1299,7 +1288,14 @@ static void gc_worker(struct work_struct *work) */ rcu_read_unlock(); cond_resched(); - } while (++buckets < goal); + i++; + + if (time_after(jiffies, end_time) && i < hashsz) { + gc_work->next_bucket = i; + next_run = 0; + break; + } + } while (i < hashsz); if (gc_work->exiting) return; @@ -1310,40 +1306,17 @@ static void gc_worker(struct work_struct *work) * * This worker is only here to reap expired entries when system went * idle after a busy period. - * - * The heuristics below are supposed to balance conflicting goals: - * - * 1. Minimize time until we notice a stale entry - * 2. Maximize scan intervals to not waste cycles - * - * Normally, expire ratio will be close to 0. - * - * As soon as a sizeable fraction of the entries have expired - * increase scan frequency. */ - ratio = scanned ? expired_count * 100 / scanned : 0; - if (ratio > GC_EVICT_RATIO) { - gc_work->next_gc_run = min_interval; - } else { - unsigned int max = GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV; - - BUILD_BUG_ON((GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV) == 0); - - gc_work->next_gc_run += min_interval; - if (gc_work->next_gc_run > max) - gc_work->next_gc_run = max; + if (next_run) { + gc_work->early_drop = false; + gc_work->next_bucket = 0; } - - next_run = gc_work->next_gc_run; - gc_work->last_bucket = i; - gc_work->early_drop = false; queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run); } static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work) { INIT_DEFERRABLE_WORK(&gc_work->dwork, gc_worker); - gc_work->next_gc_run = HZ; gc_work->exiting = false; } From a76c742c154d8bd411355c5c67840ebdc4ec08c0 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 6 Aug 2021 16:21:24 +0800 Subject: [PATCH 0425/5344] once: Fix panic when module unload [ Upstream commit 1027b96ec9d34f9abab69bc1a4dc5b1ad8ab1349 ] DO_ONCE DEFINE_STATIC_KEY_TRUE(___once_key); __do_once_done once_disable_jump(once_key); INIT_WORK(&w->work, once_deferred); struct once_work *w; w->key = key; schedule_work(&w->work); module unload //*the key is destroy* process_one_work once_deferred BUG_ON(!static_key_enabled(work->key)); static_key_count((struct static_key *)x) //*access key, crash* When module uses DO_ONCE mechanism, it could crash due to the above concurrency problem, we could reproduce it with link[1]. Fix it by add/put module refcount in the once work process. [1] https://lore.kernel.org/netdev/eaa6c371-465e-57eb-6be9-f4b16b9d7cbf@huawei.com/ Cc: Hannes Frederic Sowa Cc: Daniel Borkmann Cc: David S. Miller Cc: Eric Dumazet Reported-by: Minmin chen Signed-off-by: Kefeng Wang Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/linux/once.h | 4 ++-- lib/once.c | 11 ++++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/include/linux/once.h b/include/linux/once.h index 9225ee6d96c75..ae6f4eb41cbe7 100644 --- a/include/linux/once.h +++ b/include/linux/once.h @@ -7,7 +7,7 @@ bool __do_once_start(bool *done, unsigned long *flags); void __do_once_done(bool *done, struct static_key_true *once_key, - unsigned long *flags); + unsigned long *flags, struct module *mod); /* Call a function exactly once. The idea of DO_ONCE() is to perform * a function call such as initialization of random seeds, etc, only @@ -46,7 +46,7 @@ void __do_once_done(bool *done, struct static_key_true *once_key, if (unlikely(___ret)) { \ func(__VA_ARGS__); \ __do_once_done(&___done, &___once_key, \ - &___flags); \ + &___flags, THIS_MODULE); \ } \ } \ ___ret; \ diff --git a/lib/once.c b/lib/once.c index 8b7d6235217ee..59149bf3bfb4a 100644 --- a/lib/once.c +++ b/lib/once.c @@ -3,10 +3,12 @@ #include #include #include +#include struct once_work { struct work_struct work; struct static_key_true *key; + struct module *module; }; static void once_deferred(struct work_struct *w) @@ -16,10 +18,11 @@ static void once_deferred(struct work_struct *w) work = container_of(w, struct once_work, work); BUG_ON(!static_key_enabled(work->key)); static_branch_disable(work->key); + module_put(work->module); kfree(work); } -static void once_disable_jump(struct static_key_true *key) +static void once_disable_jump(struct static_key_true *key, struct module *mod) { struct once_work *w; @@ -29,6 +32,8 @@ static void once_disable_jump(struct static_key_true *key) INIT_WORK(&w->work, once_deferred); w->key = key; + w->module = mod; + __module_get(mod); schedule_work(&w->work); } @@ -53,11 +58,11 @@ bool __do_once_start(bool *done, unsigned long *flags) EXPORT_SYMBOL(__do_once_start); void __do_once_done(bool *done, struct static_key_true *once_key, - unsigned long *flags) + unsigned long *flags, struct module *mod) __releases(once_lock) { *done = true; spin_unlock_irqrestore(&once_lock, *flags); - once_disable_jump(once_key); + once_disable_jump(once_key, mod); } EXPORT_SYMBOL(__do_once_done); From 6c48b7f95430f7a6379ebf93df3ee47b631b9e67 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 6 Aug 2021 10:03:12 +0200 Subject: [PATCH 0426/5344] ovl: fix uninitialized pointer read in ovl_lookup_real_one() [ Upstream commit 580c610429b3994e8db24418927747cf28443cde ] One error path can result in release_dentry_name_snapshot() being called before "name" was initialized by take_dentry_name_snapshot(). Fix by moving the release_dentry_name_snapshot() to immediately after the only use. Reported-by: Colin Ian King Signed-off-by: Miklos Szeredi Signed-off-by: Sasha Levin --- fs/overlayfs/export.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 11dd8177770df..19574ef174709 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -395,6 +395,7 @@ static struct dentry *ovl_lookup_real_one(struct dentry *connected, */ take_dentry_name_snapshot(&name, real); this = lookup_one_len(name.name.name, connected, name.name.len); + release_dentry_name_snapshot(&name); err = PTR_ERR(this); if (IS_ERR(this)) { goto fail; @@ -409,7 +410,6 @@ static struct dentry *ovl_lookup_real_one(struct dentry *connected, } out: - release_dentry_name_snapshot(&name); dput(parent); inode_unlock(dir); return this; From ea6af930a1ad04bba05de3af7d9f50915cdacb5e Mon Sep 17 00:00:00 2001 From: Shaik Sajida Bhanu Date: Fri, 16 Jul 2021 17:16:14 +0530 Subject: [PATCH 0427/5344] mmc: sdhci-msm: Update the software timeout value for sdhc [ Upstream commit 67b13f3e221ed81b46a657e2b499bf8b20162476 ] Whenever SDHC run at clock rate 50MHZ or below, the hardware data timeout value will be 21.47secs, which is approx. 22secs and we have a current software timeout value as 10secs. We have to set software timeout value more than the hardware data timeout value to avioid seeing the below register dumps. [ 332.953670] mmc2: Timeout waiting for hardware interrupt. [ 332.959608] mmc2: sdhci: ============ SDHCI REGISTER DUMP =========== [ 332.966450] mmc2: sdhci: Sys addr: 0x00000000 | Version: 0x00007202 [ 332.973256] mmc2: sdhci: Blk size: 0x00000200 | Blk cnt: 0x00000001 [ 332.980054] mmc2: sdhci: Argument: 0x00000000 | Trn mode: 0x00000027 [ 332.986864] mmc2: sdhci: Present: 0x01f801f6 | Host ctl: 0x0000001f [ 332.993671] mmc2: sdhci: Power: 0x00000001 | Blk gap: 0x00000000 [ 333.000583] mmc2: sdhci: Wake-up: 0x00000000 | Clock: 0x00000007 [ 333.007386] mmc2: sdhci: Timeout: 0x0000000e | Int stat: 0x00000000 [ 333.014182] mmc2: sdhci: Int enab: 0x03ff100b | Sig enab: 0x03ff100b [ 333.020976] mmc2: sdhci: ACmd stat: 0x00000000 | Slot int: 0x00000000 [ 333.027771] mmc2: sdhci: Caps: 0x322dc8b2 | Caps_1: 0x0000808f [ 333.034561] mmc2: sdhci: Cmd: 0x0000183a | Max curr: 0x00000000 [ 333.041359] mmc2: sdhci: Resp[0]: 0x00000900 | Resp[1]: 0x00000000 [ 333.048157] mmc2: sdhci: Resp[2]: 0x00000000 | Resp[3]: 0x00000000 [ 333.054945] mmc2: sdhci: Host ctl2: 0x00000000 [ 333.059657] mmc2: sdhci: ADMA Err: 0x00000000 | ADMA Ptr: 0x0000000ffffff218 [ 333.067178] mmc2: sdhci_msm: ----------- VENDOR REGISTER DUMP ----------- [ 333.074343] mmc2: sdhci_msm: DLL sts: 0x00000000 | DLL cfg: 0x6000642c | DLL cfg2: 0x0020a000 [ 333.083417] mmc2: sdhci_msm: DLL cfg3: 0x00000000 | DLL usr ctl: 0x00000000 | DDR cfg: 0x80040873 [ 333.092850] mmc2: sdhci_msm: Vndr func: 0x00008a9c | Vndr func2 : 0xf88218a8 Vndr func3: 0x02626040 [ 333.102371] mmc2: sdhci: ============================================ So, set software timeout value more than hardware timeout value. Signed-off-by: Shaik Sajida Bhanu Acked-by: Adrian Hunter Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/1626435974-14462-1-git-send-email-sbhanu@codeaurora.org Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin --- drivers/mmc/host/sdhci-msm.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/mmc/host/sdhci-msm.c b/drivers/mmc/host/sdhci-msm.c index 8bed81cf03adc..8ab963055238a 100644 --- a/drivers/mmc/host/sdhci-msm.c +++ b/drivers/mmc/host/sdhci-msm.c @@ -1589,6 +1589,23 @@ static void sdhci_msm_set_clock(struct sdhci_host *host, unsigned int clock) __sdhci_msm_set_clock(host, clock); } +static void sdhci_msm_set_timeout(struct sdhci_host *host, struct mmc_command *cmd) +{ + u32 count, start = 15; + + __sdhci_set_timeout(host, cmd); + count = sdhci_readb(host, SDHCI_TIMEOUT_CONTROL); + /* + * Update software timeout value if its value is less than hardware data + * timeout value. Qcom SoC hardware data timeout value was calculated + * using 4 * MCLK * 2^(count + 13). where MCLK = 1 / host->clock. + */ + if (cmd && cmd->data && host->clock > 400000 && + host->clock <= 50000000 && + ((1 << (count + start)) > (10 * host->clock))) + host->data_timeout = 22LL * NSEC_PER_SEC; +} + /* * Platform specific register write functions. This is so that, if any * register write needs to be followed up by platform specific actions, @@ -1753,6 +1770,7 @@ static const struct sdhci_ops sdhci_msm_ops = { .set_uhs_signaling = sdhci_msm_set_uhs_signaling, .write_w = sdhci_msm_writew, .write_b = sdhci_msm_writeb, + .set_timeout = sdhci_msm_set_timeout, }; static const struct sdhci_pltfm_data sdhci_msm_pdata = { From d3e015bab033055667d03e7e85e48b8efca6a4f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20M=C3=A4tje?= Date: Wed, 25 Aug 2021 23:52:27 +0200 Subject: [PATCH 0428/5344] can: usb: esd_usb2: esd_usb2_rx_event(): fix the interchange of the CAN RX and TX error counters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 044012b52029204900af9e4230263418427f4ba4 upstream. This patch fixes the interchanged fetch of the CAN RX and TX error counters from the ESD_EV_CAN_ERROR_EXT message. The RX error counter is really in struct rx_msg::data[2] and the TX error counter is in struct rx_msg::data[3]. Fixes: 96d8e90382dc ("can: Add driver for esd CAN-USB/2 device") Link: https://lore.kernel.org/r/20210825215227.4947-2-stefan.maetje@esd.eu Cc: stable@vger.kernel.org Signed-off-by: Stefan Mätje Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- drivers/net/can/usb/esd_usb2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/can/usb/esd_usb2.c b/drivers/net/can/usb/esd_usb2.c index 485e20e0dec2c..8847942a8d97e 100644 --- a/drivers/net/can/usb/esd_usb2.c +++ b/drivers/net/can/usb/esd_usb2.c @@ -224,8 +224,8 @@ static void esd_usb2_rx_event(struct esd_usb2_net_priv *priv, if (id == ESD_EV_CAN_ERROR_EXT) { u8 state = msg->msg.rx.data[0]; u8 ecc = msg->msg.rx.data[1]; - u8 txerr = msg->msg.rx.data[2]; - u8 rxerr = msg->msg.rx.data[3]; + u8 rxerr = msg->msg.rx.data[2]; + u8 txerr = msg->msg.rx.data[3]; skb = alloc_can_err_skb(priv->netdev, &cf); if (skb == NULL) { From 0381b1510c0e28314ff1b1c4c6c56d1d4e7f257f Mon Sep 17 00:00:00 2001 From: Zhengjun Zhang Date: Mon, 9 Aug 2021 21:35:53 +0800 Subject: [PATCH 0429/5344] USB: serial: option: add new VID/PID to support Fibocom FG150 commit 2829a4e3cf3a6ac2fa3cdb681b37574630fb9c1a upstream. Fibocom FG150 is a 5G module based on Qualcomm SDX55 platform, support Sub-6G band. Here are the outputs of lsusb -v and usb-devices: > T: Bus=02 Lev=01 Prnt=01 Port=01 Cnt=01 Dev#= 2 Spd=5000 MxCh= 0 > D: Ver= 3.20 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 9 #Cfgs= 1 > P: Vendor=2cb7 ProdID=010b Rev=04.14 > S: Manufacturer=Fibocom > S: Product=Fibocom Modem_SN:XXXXXXXX > S: SerialNumber=XXXXXXXX > C: #Ifs= 5 Cfg#= 1 Atr=a0 MxPwr=896mA > I: If#=0x0 Alt= 0 #EPs= 1 Cls=ef(misc ) Sub=04 Prot=01 Driver=rndis_host > I: If#=0x1 Alt= 0 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=rndis_host > I: If#=0x2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=(none) > I: If#=0x3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=(none) > I: If#=0x4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=(none) > Bus 002 Device 002: ID 2cb7:010b Fibocom Fibocom Modem_SN:XXXXXXXX > Device Descriptor: > bLength 18 > bDescriptorType 1 > bcdUSB 3.20 > bDeviceClass 0 > bDeviceSubClass 0 > bDeviceProtocol 0 > bMaxPacketSize0 9 > idVendor 0x2cb7 Fibocom > idProduct 0x010b > bcdDevice 4.14 > iManufacturer 1 Fibocom > iProduct 2 Fibocom Modem_SN:XXXXXXXX > iSerial 3 XXXXXXXX > bNumConfigurations 1 > Configuration Descriptor: > bLength 9 > bDescriptorType 2 > wTotalLength 0x00e6 > bNumInterfaces 5 > bConfigurationValue 1 > iConfiguration 4 RNDIS_DUN_DIAG_ADB > bmAttributes 0xa0 > (Bus Powered) > Remote Wakeup > MaxPower 896mA > Interface Association: > bLength 8 > bDescriptorType 11 > bFirstInterface 0 > bInterfaceCount 2 > bFunctionClass 239 Miscellaneous Device > bFunctionSubClass 4 > bFunctionProtocol 1 > iFunction 7 RNDIS > Interface Descriptor: > bLength 9 > bDescriptorType 4 > bInterfaceNumber 0 > bAlternateSetting 0 > bNumEndpoints 1 > bInterfaceClass 239 Miscellaneous Device > bInterfaceSubClass 4 > bInterfaceProtocol 1 > iInterface 0 > ** UNRECOGNIZED: 05 24 00 10 01 > ** UNRECOGNIZED: 05 24 01 00 01 > ** UNRECOGNIZED: 04 24 02 00 > ** UNRECOGNIZED: 05 24 06 00 01 > Endpoint Descriptor: > bLength 7 > bDescriptorType 5 > bEndpointAddress 0x81 EP 1 IN > bmAttributes 3 > Transfer Type Interrupt > Synch Type None > Usage Type Data > wMaxPacketSize 0x0008 1x 8 bytes > bInterval 9 > bMaxBurst 0 > Interface Descriptor: > bLength 9 > bDescriptorType 4 > bInterfaceNumber 1 > bAlternateSetting 0 > bNumEndpoints 2 > bInterfaceClass 10 CDC Data > bInterfaceSubClass 0 > bInterfaceProtocol 0 > iInterface 0 > Endpoint Descriptor: > bLength 7 > bDescriptorType 5 > bEndpointAddress 0x8e EP 14 IN > bmAttributes 2 > Transfer Type Bulk > Synch Type None > Usage Type Data > wMaxPacketSize 0x0400 1x 1024 bytes > bInterval 0 > bMaxBurst 6 > Endpoint Descriptor: > bLength 7 > bDescriptorType 5 > bEndpointAddress 0x0f EP 15 OUT > bmAttributes 2 > Transfer Type Bulk > Synch Type None > Usage Type Data > wMaxPacketSize 0x0400 1x 1024 bytes > bInterval 0 > bMaxBurst 6 > Interface Descriptor: > bLength 9 > bDescriptorType 4 > bInterfaceNumber 2 > bAlternateSetting 0 > bNumEndpoints 3 > bInterfaceClass 255 Vendor Specific Class > bInterfaceSubClass 0 > bInterfaceProtocol 0 > iInterface 0 > ** UNRECOGNIZED: 05 24 00 10 01 > ** UNRECOGNIZED: 05 24 01 00 00 > ** UNRECOGNIZED: 04 24 02 02 > ** UNRECOGNIZED: 05 24 06 00 00 > Endpoint Descriptor: > bLength 7 > bDescriptorType 5 > bEndpointAddress 0x83 EP 3 IN > bmAttributes 3 > Transfer Type Interrupt > Synch Type None > Usage Type Data > wMaxPacketSize 0x000a 1x 10 bytes > bInterval 9 > bMaxBurst 0 > Endpoint Descriptor: > bLength 7 > bDescriptorType 5 > bEndpointAddress 0x82 EP 2 IN > bmAttributes 2 > Transfer Type Bulk > Synch Type None > Usage Type Data > wMaxPacketSize 0x0400 1x 1024 bytes > bInterval 0 > bMaxBurst 0 > Endpoint Descriptor: > bLength 7 > bDescriptorType 5 > bEndpointAddress 0x01 EP 1 OUT > bmAttributes 2 > Transfer Type Bulk > Synch Type None > Usage Type Data > wMaxPacketSize 0x0400 1x 1024 bytes > bInterval 0 > bMaxBurst 0 > Interface Descriptor: > bLength 9 > bDescriptorType 4 > bInterfaceNumber 3 > bAlternateSetting 0 > bNumEndpoints 2 > bInterfaceClass 255 Vendor Specific Class > bInterfaceSubClass 255 Vendor Specific Subclass > bInterfaceProtocol 48 > iInterface 0 > Endpoint Descriptor: > bLength 7 > bDescriptorType 5 > bEndpointAddress 0x84 EP 4 IN > bmAttributes 2 > Transfer Type Bulk > Synch Type None > Usage Type Data > wMaxPacketSize 0x0400 1x 1024 bytes > bInterval 0 > bMaxBurst 0 > Endpoint Descriptor: > bLength 7 > bDescriptorType 5 > bEndpointAddress 0x02 EP 2 OUT > bmAttributes 2 > Transfer Type Bulk > Synch Type None > Usage Type Data > wMaxPacketSize 0x0400 1x 1024 bytes > bInterval 0 > bMaxBurst 0 > Interface Descriptor: > bLength 9 > bDescriptorType 4 > bInterfaceNumber 4 > bAlternateSetting 0 > bNumEndpoints 2 > bInterfaceClass 255 Vendor Specific Class > bInterfaceSubClass 66 > bInterfaceProtocol 1 > iInterface 0 > Endpoint Descriptor: > bLength 7 > bDescriptorType 5 > bEndpointAddress 0x03 EP 3 OUT > bmAttributes 2 > Transfer Type Bulk > Synch Type None > Usage Type Data > wMaxPacketSize 0x0400 1x 1024 bytes > bInterval 0 > bMaxBurst 0 > Endpoint Descriptor: > bLength 7 > bDescriptorType 5 > bEndpointAddress 0x85 EP 5 IN > bmAttributes 2 > Transfer Type Bulk > Synch Type None > Usage Type Data > wMaxPacketSize 0x0400 1x 1024 bytes > bInterval 0 > bMaxBurst 0 > Binary Object Store Descriptor: > bLength 5 > bDescriptorType 15 > wTotalLength 0x0016 > bNumDeviceCaps 2 > USB 2.0 Extension Device Capability: > bLength 7 > bDescriptorType 16 > bDevCapabilityType 2 > bmAttributes 0x00000006 > BESL Link Power Management (LPM) Supported > SuperSpeed USB Device Capability: > bLength 10 > bDescriptorType 16 > bDevCapabilityType 3 > bmAttributes 0x00 > wSpeedsSupported 0x000f > Device can operate at Low Speed (1Mbps) > Device can operate at Full Speed (12Mbps) > Device can operate at High Speed (480Mbps) > Device can operate at SuperSpeed (5Gbps) > bFunctionalitySupport 1 > Lowest fully-functional device speed is Full Speed (12Mbps) > bU1DevExitLat 1 micro seconds > bU2DevExitLat 500 micro seconds > Device Status: 0x0000 > (Bus Powered) Signed-off-by: Zhengjun Zhang Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/option.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 793530f241ceb..d42ca13569965 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -2074,6 +2074,8 @@ static const struct usb_device_id option_ids[] = { .driver_info = RSVD(4) | RSVD(5) }, { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x0105, 0xff), /* Fibocom NL678 series */ .driver_info = RSVD(6) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x2cb7, 0x010b, 0xff, 0xff, 0x30) }, /* Fibocom FG150 Diag */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2cb7, 0x010b, 0xff, 0, 0) }, /* Fibocom FG150 AT */ { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x01a0, 0xff) }, /* Fibocom NL668-AM/NL652-EU (laptop MBIM) */ { USB_DEVICE_INTERFACE_CLASS(0x2df3, 0x9d03, 0xff) }, /* LongSung M5710 */ { USB_DEVICE_INTERFACE_CLASS(0x305a, 0x1404, 0xff) }, /* GosunCn GM500 RNDIS */ From d794995904e30d1c9059e1acadb074e8dbd86ad9 Mon Sep 17 00:00:00 2001 From: Thinh Nguyen Date: Thu, 19 Aug 2021 03:17:03 +0200 Subject: [PATCH 0430/5344] usb: dwc3: gadget: Fix dwc3_calc_trbs_left() commit 51f1954ad853d01ba4dc2b35dee14d8490ee05a1 upstream. We can't depend on the TRB's HWO bit to determine if the TRB ring is "full". A TRB is only available when the driver had processed it, not when the controller consumed and relinquished the TRB's ownership to the driver. Otherwise, the driver may overwrite unprocessed TRBs. This can happen when many transfer events accumulate and the system is slow to process them and/or when there are too many small requests. If a request is in the started_list, that means there is one or more unprocessed TRBs remained. Check this instead of the TRB's HWO bit whether the TRB ring is full. Fixes: c4233573f6ee ("usb: dwc3: gadget: prepare TRBs on update transfers too") Cc: Acked-by: Felipe Balbi Signed-off-by: Thinh Nguyen Link: https://lore.kernel.org/r/e91e975affb0d0d02770686afc3a5b9eb84409f6.1629335416.git.Thinh.Nguyen@synopsys.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/gadget.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 8a3752fcf7b46..d671b9a3b3604 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -894,19 +894,19 @@ static struct dwc3_trb *dwc3_ep_prev_trb(struct dwc3_ep *dep, u8 index) static u32 dwc3_calc_trbs_left(struct dwc3_ep *dep) { - struct dwc3_trb *tmp; u8 trbs_left; /* - * If enqueue & dequeue are equal than it is either full or empty. - * - * One way to know for sure is if the TRB right before us has HWO bit - * set or not. If it has, then we're definitely full and can't fit any - * more transfers in our ring. + * If the enqueue & dequeue are equal then the TRB ring is either full + * or empty. It's considered full when there are DWC3_TRB_NUM-1 of TRBs + * pending to be processed by the driver. */ if (dep->trb_enqueue == dep->trb_dequeue) { - tmp = dwc3_ep_prev_trb(dep, dep->trb_enqueue); - if (tmp->ctrl & DWC3_TRB_CTRL_HWO) + /* + * If there is any request remained in the started_list at + * this point, that means there is no TRB available. + */ + if (!list_empty(&dep->started_list)) return 0; return DWC3_TRB_NUM - 1; From cb02bb35e520ad3fd70196f1037cb3baa8019158 Mon Sep 17 00:00:00 2001 From: Wesley Cheng Date: Tue, 24 Aug 2021 21:28:55 -0700 Subject: [PATCH 0431/5344] usb: dwc3: gadget: Stop EP0 transfers during pullup disable commit 4a1e25c0a029b97ea4a3d423a6392bfacc3b2e39 upstream. During a USB cable disconnect, or soft disconnect scenario, a pending SETUP transaction may not be completed, leading to the following error: dwc3 a600000.dwc3: timed out waiting for SETUP phase If this occurs, then the entire pullup disable routine is skipped and proper cleanup and halting of the controller does not complete. Instead of returning an error (which is ignored from the UDC perspective), allow the pullup disable routine to continue, which will also handle disabling of EP0/1. This will end any active transfers as well. Ensure to clear any delayed_status also, as the timeout could happen within the STATUS stage. Fixes: bb0147364850 ("usb: dwc3: gadget: don't clear RUN/STOP when it's invalid to do so") Cc: Reviewed-by: Thinh Nguyen Acked-by: Felipe Balbi Signed-off-by: Wesley Cheng Link: https://lore.kernel.org/r/20210825042855.7977-1-wcheng@codeaurora.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/gadget.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index d671b9a3b3604..39a9ad12cbbc8 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -2012,10 +2012,8 @@ static int dwc3_gadget_pullup(struct usb_gadget *g, int is_on) ret = wait_for_completion_timeout(&dwc->ep0_in_setup, msecs_to_jiffies(DWC3_PULL_UP_TIMEOUT)); - if (ret == 0) { - dev_err(dwc->dev, "timed out waiting for SETUP phase\n"); - return -ETIMEDOUT; - } + if (ret == 0) + dev_warn(dwc->dev, "timed out waiting for SETUP phase\n"); } /* @@ -2217,6 +2215,7 @@ static int __dwc3_gadget_start(struct dwc3 *dwc) /* begin to receive SETUP packets */ dwc->ep0state = EP0_SETUP_PHASE; dwc->link_state = DWC3_LINK_STATE_SS_DIS; + dwc->delayed_status = false; dwc3_ep0_out_start(dwc); dwc3_gadget_enable_irq(dwc); From 6dd2055ebd816e2b95056e36f780face67498e16 Mon Sep 17 00:00:00 2001 From: Naresh Kumar PBS Date: Wed, 18 Aug 2021 20:25:52 -0700 Subject: [PATCH 0432/5344] RDMA/bnxt_re: Add missing spin lock initialization [ Upstream commit 17f2569dce1848080825b8336e6b7c6900193b44 ] Add the missing initialization of srq lock. Fixes: 37cb11acf1f7 ("RDMA/bnxt_re: Add SRQ support for Broadcom adapters") Link: https://lore.kernel.org/r/1629343553-5843-3-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Naresh Kumar PBS Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe Signed-off-by: Sasha Levin --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 58c021648b7c8..a96f9142fe08e 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -1404,6 +1404,7 @@ int bnxt_re_create_srq(struct ib_srq *ib_srq, if (nq) nq->budget++; atomic_inc(&rdev->srq_count); + spin_lock_init(&srq->lock); return 0; From 303e380be0e2ca64b52f35b7404e43a1e10a220c Mon Sep 17 00:00:00 2001 From: Tuo Li Date: Fri, 6 Aug 2021 06:30:29 -0700 Subject: [PATCH 0433/5344] IB/hfi1: Fix possible null-pointer dereference in _extend_sdma_tx_descs() [ Upstream commit cbe71c61992c38f72c2b625b2ef25916b9f0d060 ] kmalloc_array() is called to allocate memory for tx->descp. If it fails, the function __sdma_txclean() is called: __sdma_txclean(dd, tx); However, in the function __sdma_txclean(), tx-descp is dereferenced if tx->num_desc is not zero: sdma_unmap_desc(dd, &tx->descp[0]); To fix this possible null-pointer dereference, assign the return value of kmalloc_array() to a local variable descp, and then assign it to tx->descp if it is not NULL. Otherwise, go to enomem. Fixes: 7724105686e7 ("IB/hfi1: add driver files") Link: https://lore.kernel.org/r/20210806133029.194964-1-islituo@gmail.com Reported-by: TOTE Robot Signed-off-by: Tuo Li Tested-by: Mike Marciniszyn Acked-by: Mike Marciniszyn Signed-off-by: Jason Gunthorpe Signed-off-by: Sasha Levin --- drivers/infiniband/hw/hfi1/sdma.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index c61b6022575e1..248be21acdbed 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -3056,6 +3056,7 @@ static void __sdma_process_event(struct sdma_engine *sde, static int _extend_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx) { int i; + struct sdma_desc *descp; /* Handle last descriptor */ if (unlikely((tx->num_desc == (MAX_DESC - 1)))) { @@ -3076,12 +3077,10 @@ static int _extend_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx) if (unlikely(tx->num_desc == MAX_DESC)) goto enomem; - tx->descp = kmalloc_array( - MAX_DESC, - sizeof(struct sdma_desc), - GFP_ATOMIC); - if (!tx->descp) + descp = kmalloc_array(MAX_DESC, sizeof(struct sdma_desc), GFP_ATOMIC); + if (!descp) goto enomem; + tx->descp = descp; /* reserve last descriptor for coalescing */ tx->desc_limit = MAX_DESC - 1; From 4d7877717031a9a98854bf1bfbe9434620b0e72c Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Sun, 4 Jul 2021 10:11:41 +0300 Subject: [PATCH 0434/5344] e1000e: Fix the max snoop/no-snoop latency for 10M [ Upstream commit 44a13a5d99c71bf9e1676d9e51679daf4d7b3d73 ] We should decode the latency and the max_latency before directly compare. The latency should be presented as lat_enc = scale x value: lat_enc_d = (lat_enc & 0x0x3ff) x (1U << (5*((max_ltr_enc & 0x1c00) >> 10))) Fixes: cf8fb73c23aa ("e1000e: add support for LTR on I217/I218") Suggested-by: Yee Li Signed-off-by: Sasha Neftin Tested-by: Dvora Fuxbrumer Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/e1000e/ich8lan.c | 14 +++++++++++++- drivers/net/ethernet/intel/e1000e/ich8lan.h | 3 +++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index a1fab77b2096a..58ff747a42ae6 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -995,6 +995,8 @@ static s32 e1000_platform_pm_pch_lpt(struct e1000_hw *hw, bool link) { u32 reg = link << (E1000_LTRV_REQ_SHIFT + E1000_LTRV_NOSNOOP_SHIFT) | link << E1000_LTRV_REQ_SHIFT | E1000_LTRV_SEND; + u16 max_ltr_enc_d = 0; /* maximum LTR decoded by platform */ + u16 lat_enc_d = 0; /* latency decoded */ u16 lat_enc = 0; /* latency encoded */ if (link) { @@ -1048,7 +1050,17 @@ static s32 e1000_platform_pm_pch_lpt(struct e1000_hw *hw, bool link) E1000_PCI_LTR_CAP_LPT + 2, &max_nosnoop); max_ltr_enc = max_t(u16, max_snoop, max_nosnoop); - if (lat_enc > max_ltr_enc) + lat_enc_d = (lat_enc & E1000_LTRV_VALUE_MASK) * + (1U << (E1000_LTRV_SCALE_FACTOR * + ((lat_enc & E1000_LTRV_SCALE_MASK) + >> E1000_LTRV_SCALE_SHIFT))); + + max_ltr_enc_d = (max_ltr_enc & E1000_LTRV_VALUE_MASK) * + (1U << (E1000_LTRV_SCALE_FACTOR * + ((max_ltr_enc & E1000_LTRV_SCALE_MASK) + >> E1000_LTRV_SCALE_SHIFT))); + + if (lat_enc_d > max_ltr_enc_d) lat_enc = max_ltr_enc; } diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.h b/drivers/net/ethernet/intel/e1000e/ich8lan.h index 1502895eb45dd..e757896287eba 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.h +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.h @@ -274,8 +274,11 @@ /* Latency Tolerance Reporting */ #define E1000_LTRV 0x000F8 +#define E1000_LTRV_VALUE_MASK 0x000003FF #define E1000_LTRV_SCALE_MAX 5 #define E1000_LTRV_SCALE_FACTOR 5 +#define E1000_LTRV_SCALE_SHIFT 10 +#define E1000_LTRV_SCALE_MASK 0x00001C00 #define E1000_LTRV_REQ_SHIFT 15 #define E1000_LTRV_NOSNOOP_SHIFT 16 #define E1000_LTRV_SEND (1 << 30) From 136a7bffb2ab7931f53c669b89475550e43e3fec Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 11 Aug 2021 18:11:28 +0300 Subject: [PATCH 0435/5344] RDMA/efa: Free IRQ vectors on error flow [ Upstream commit dbe986bdfd6dfe6ef24b833767fff4151e024357 ] Make sure to free the IRQ vectors in case the allocation doesn't return the expected number of IRQs. Fixes: b7f5e880f377 ("RDMA/efa: Add the efa module") Link: https://lore.kernel.org/r/20210811151131.39138-2-galpress@amazon.com Reviewed-by: Firas JahJah Reviewed-by: Yossi Leybovich Signed-off-by: Gal Pressman Signed-off-by: Jason Gunthorpe Signed-off-by: Sasha Levin --- drivers/infiniband/hw/efa/efa_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c index 83858f7e83d0f..75dfe9d1564c1 100644 --- a/drivers/infiniband/hw/efa/efa_main.c +++ b/drivers/infiniband/hw/efa/efa_main.c @@ -340,6 +340,7 @@ static int efa_enable_msix(struct efa_dev *dev) } if (irq_num != msix_vecs) { + efa_disable_msix(dev); dev_err(&dev->pdev->dev, "Allocated %d MSI-X (out of %d requested)\n", irq_num, msix_vecs); From 7a822655818f805f6fb9a9c29863cb76bc83bb94 Mon Sep 17 00:00:00 2001 From: Shreyansh Chouhan Date: Sat, 21 Aug 2021 12:44:24 +0530 Subject: [PATCH 0436/5344] ip_gre: add validation for csum_start [ Upstream commit 1d011c4803c72f3907eccfc1ec63caefb852fcbf ] Validate csum_start in gre_handle_offloads before we call _gre_xmit so that we do not crash later when the csum_start value is used in the lco_csum function call. This patch deals with ipv4 code. Fixes: c54419321455 ("GRE: Refactor GRE tunneling code.") Reported-by: syzbot+ff8e1b9f2f36481e2efc@syzkaller.appspotmail.com Signed-off-by: Shreyansh Chouhan Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/ipv4/ip_gre.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index fedad3a3e61b8..fd8298b8b1c52 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -446,6 +446,8 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, static int gre_handle_offloads(struct sk_buff *skb, bool csum) { + if (csum && skb_checksum_start(skb) < skb->data) + return -EINVAL; return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); } From 73539143120f0baa7ddb7622462fb25bd129ab76 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 21 Aug 2021 09:35:23 +0200 Subject: [PATCH 0437/5344] xgene-v2: Fix a resource leak in the error handling path of 'xge_probe()' [ Upstream commit 5ed74b03eb4d08f5dd281dcb5f1c9bb92b363a8d ] A successful 'xge_mdio_config()' call should be balanced by a corresponding 'xge_mdio_remove()' call in the error handling path of the probe, as already done in the remove function. Update the error handling path accordingly. Fixes: ea8ab16ab225 ("drivers: net: xgene-v2: Add MDIO support") Signed-off-by: Christophe JAILLET Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/apm/xgene-v2/main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/apm/xgene-v2/main.c b/drivers/net/ethernet/apm/xgene-v2/main.c index 02b4f3af02b54..848be6bf2fd1f 100644 --- a/drivers/net/ethernet/apm/xgene-v2/main.c +++ b/drivers/net/ethernet/apm/xgene-v2/main.c @@ -677,11 +677,13 @@ static int xge_probe(struct platform_device *pdev) ret = register_netdev(ndev); if (ret) { netdev_err(ndev, "Failed to register netdev\n"); - goto err; + goto err_mdio_remove; } return 0; +err_mdio_remove: + xge_mdio_remove(ndev); err: free_netdev(ndev); From 6cf8c886fe5f952b8fcf091425080d33e79ba327 Mon Sep 17 00:00:00 2001 From: Maxim Kiselev Date: Fri, 20 Aug 2021 18:39:51 +0300 Subject: [PATCH 0438/5344] net: marvell: fix MVNETA_TX_IN_PRGRS bit number [ Upstream commit 359f4cdd7d78fdf8c098713b05fee950a730f131 ] According to Armada XP datasheet bit at 0 position is corresponding for TxInProg indication. Fixes: c5aff18204da ("net: mvneta: driver for Marvell Armada 370/XP network unit") Signed-off-by: Maxim Kiselev Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/marvell/mvneta.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 7b0543056b101..64aa5510e61a6 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -101,7 +101,7 @@ #define MVNETA_DESC_SWAP BIT(6) #define MVNETA_TX_BRST_SZ_MASK(burst) ((burst) << 22) #define MVNETA_PORT_STATUS 0x2444 -#define MVNETA_TX_IN_PRGRS BIT(1) +#define MVNETA_TX_IN_PRGRS BIT(0) #define MVNETA_TX_FIFO_EMPTY BIT(8) #define MVNETA_RX_MIN_FRAME_SIZE 0x247c /* Only exists on Armada XP and Armada 370 */ From 409febc359f4d7aae2ead215ce3c11cdeb706b1e Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 25 Aug 2021 17:25:40 -0700 Subject: [PATCH 0439/5344] rtnetlink: Return correct error on changing device netns [ Upstream commit 96a6b93b69880b2c978e1b2be9cae6970b605008 ] Currently when device is moved between network namespaces using RTM_NEWLINK message type and one of netns attributes (FLA_NET_NS_PID, IFLA_NET_NS_FD, IFLA_TARGET_NETNSID) but w/o specifying IFLA_IFNAME, and target namespace already has device with same name, userspace will get EINVAL what is confusing and makes debugging harder. Fix it so that userspace gets more appropriate EEXIST instead what makes debugging much easier. Before: # ./ifname.sh + ip netns add ns0 + ip netns exec ns0 ip link add l0 type dummy + ip netns exec ns0 ip link show l0 8: l0: mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ether 66:90:b5:d5:78:69 brd ff:ff:ff:ff:ff:ff + ip link add l0 type dummy + ip link show l0 10: l0: mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ether 6e:c6:1f:15:20:8d brd ff:ff:ff:ff:ff:ff + ip link set l0 netns ns0 RTNETLINK answers: Invalid argument After: # ./ifname.sh + ip netns add ns0 + ip netns exec ns0 ip link add l0 type dummy + ip netns exec ns0 ip link show l0 8: l0: mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ether 1e:4a:72:e3:e3:8f brd ff:ff:ff:ff:ff:ff + ip link add l0 type dummy + ip link show l0 10: l0: mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ether f2:fc:fe:2b:7d:a6 brd ff:ff:ff:ff:ff:ff + ip link set l0 netns ns0 RTNETLINK answers: File exists The problem is that do_setlink() passes its `char *ifname` argument, that it gets from a caller, to __dev_change_net_namespace() as is (as `const char *pat`), but semantics of ifname and pat can be different. For example, __rtnl_newlink() does this: net/core/rtnetlink.c 3270 char ifname[IFNAMSIZ]; ... 3286 if (tb[IFLA_IFNAME]) 3287 nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); 3288 else 3289 ifname[0] = '\0'; ... 3364 if (dev) { ... 3394 return do_setlink(skb, dev, ifm, extack, tb, ifname, status); 3395 } , i.e. do_setlink() gets ifname pointer that is always valid no matter if user specified IFLA_IFNAME or not and then do_setlink() passes this ifname pointer as is to __dev_change_net_namespace() as pat argument. But the pat (pattern) in __dev_change_net_namespace() is used as: net/core/dev.c 11198 err = -EEXIST; 11199 if (__dev_get_by_name(net, dev->name)) { 11200 /* We get here if we can't use the current device name */ 11201 if (!pat) 11202 goto out; 11203 err = dev_get_valid_name(net, dev, pat); 11204 if (err < 0) 11205 goto out; 11206 } As the result the `goto out` path on line 11202 is neven taken and instead of returning EEXIST defined on line 11198, __dev_change_net_namespace() returns an error from dev_get_valid_name() and this, in turn, will be EINVAL for ifname[0] = '\0' set earlier. Fixes: d8a5ec672768 ("[NET]: netlink support for moving devices between network namespaces.") Signed-off-by: Andrey Ignatov Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/core/rtnetlink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 0bad5db23129a..6fbc9cb09dc0e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2414,6 +2414,7 @@ static int do_setlink(const struct sk_buff *skb, return err; if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_TARGET_NETNSID]) { + const char *pat = ifname && ifname[0] ? ifname : NULL; struct net *net = rtnl_link_get_net_capable(skb, dev_net(dev), tb, CAP_NET_ADMIN); if (IS_ERR(net)) { @@ -2421,7 +2422,7 @@ static int do_setlink(const struct sk_buff *skb, goto errout; } - err = dev_change_net_namespace(dev, net, ifname); + err = dev_change_net_namespace(dev, net, pat); put_net(net); if (err) goto errout; From b9dfa3d9a6daf5d0d3cb5e87f00cc74a702da2cc Mon Sep 17 00:00:00 2001 From: Yufeng Mo Date: Thu, 26 Aug 2021 19:21:55 +0800 Subject: [PATCH 0440/5344] net: hns3: clear hardware resource when loading driver [ Upstream commit 1a6d281946c330cee2855f6d0cd796616e54601f ] If a PF is bonded to a virtual machine and the virtual machine exits unexpectedly, some hardware resource cannot be cleared. In this case, loading driver may cause exceptions. Therefore, the hardware resource needs to be cleared when the driver is loaded. Fixes: 46a3df9f9718 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Yufeng Mo Signed-off-by: Salil Mehta Signed-off-by: Guangbin Huang Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- .../hisilicon/hns3/hns3pf/hclge_cmd.h | 3 +++ .../hisilicon/hns3/hns3pf/hclge_main.c | 26 +++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h index e34e0854635c3..d64cded30eeb4 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h @@ -257,6 +257,9 @@ enum hclge_opcode_type { /* Led command */ HCLGE_OPC_LED_STATUS_CFG = 0xB000, + /* clear hardware resource command */ + HCLGE_OPC_CLEAR_HW_RESOURCE = 0x700B, + /* NCL config command */ HCLGE_OPC_QUERY_NCL_CONFIG = 0x7011, /* M7 stats command */ diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 93f3865b679bf..28e260439196d 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -9165,6 +9165,28 @@ static void hclge_clear_resetting_state(struct hclge_dev *hdev) } } +static int hclge_clear_hw_resource(struct hclge_dev *hdev) +{ + struct hclge_desc desc; + int ret; + + hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_CLEAR_HW_RESOURCE, false); + + ret = hclge_cmd_send(&hdev->hw, &desc, 1); + /* This new command is only supported by new firmware, it will + * fail with older firmware. Error value -EOPNOSUPP can only be + * returned by older firmware running this command, to keep code + * backward compatible we will override this value and return + * success. + */ + if (ret && ret != -EOPNOTSUPP) { + dev_err(&hdev->pdev->dev, + "failed to clear hw resource, ret = %d\n", ret); + return ret; + } + return 0; +} + static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) { struct pci_dev *pdev = ae_dev->pdev; @@ -9206,6 +9228,10 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) if (ret) goto err_cmd_uninit; + ret = hclge_clear_hw_resource(hdev); + if (ret) + goto err_cmd_uninit; + ret = hclge_get_cap(hdev); if (ret) { dev_err(&pdev->dev, "get hw capability error, ret = %d.\n", From e28c9046f160b68876571ba64ae238c8173cb40e Mon Sep 17 00:00:00 2001 From: Guojia Liao Date: Thu, 26 Aug 2021 19:21:58 +0800 Subject: [PATCH 0441/5344] net: hns3: fix duplicate node in VLAN list [ Upstream commit 94391fae82f71c98ecc7716a32611fcca73c74eb ] VLAN list should not be added duplicate VLAN node, otherwise it would cause "add failed" when restore VLAN from VLAN list, so this patch adds VLAN ID check before adding node into VLAN list. Fixes: c6075b193462 ("net: hns3: Record VF vlan tables") Signed-off-by: Guojia Liao Signed-off-by: Guangbin Huang Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 28e260439196d..aa402e2671212 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -8006,7 +8006,11 @@ static int hclge_init_vlan_config(struct hclge_dev *hdev) static void hclge_add_vport_vlan_table(struct hclge_vport *vport, u16 vlan_id, bool writen_to_tbl) { - struct hclge_vport_vlan_cfg *vlan; + struct hclge_vport_vlan_cfg *vlan, *tmp; + + list_for_each_entry_safe(vlan, tmp, &vport->vlan_list, node) + if (vlan->vlan_id == vlan_id) + return; vlan = kzalloc(sizeof(*vlan), GFP_KERNEL); if (!vlan) From a78f8bff2596f00aba051b83a92938dc52170dbe Mon Sep 17 00:00:00 2001 From: Guangbin Huang Date: Thu, 26 Aug 2021 19:22:01 +0800 Subject: [PATCH 0442/5344] net: hns3: fix get wrong pfc_en when query PFC configuration [ Upstream commit 8c1671e0d13d4a0ba4fb3a0da932bf3736d7ff73 ] Currently, when query PFC configuration by dcbtool, driver will return PFC enable status based on TC. As all priorities are mapped to TC0 by default, if TC0 is enabled, then all priorities mapped to TC0 will be shown as enabled status when query PFC setting, even though some priorities have never been set. for example: $ dcb pfc show dev eth0 pfc-cap 4 macsec-bypass off delay 0 prio-pfc 0:off 1:off 2:off 3:off 4:off 5:off 6:off 7:off $ dcb pfc set dev eth0 prio-pfc 0:on 1:on 2:on 3:on $ dcb pfc show dev eth0 pfc-cap 4 macsec-bypass off delay 0 prio-pfc 0:on 1:on 2:on 3:on 4:on 5:on 6:on 7:on To fix this problem, just returns user's PFC config parameter saved in driver. Fixes: cacde272dd00 ("net: hns3: Add hclge_dcb module for the support of DCB feature") Signed-off-by: Guangbin Huang Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- .../net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c index a1790af73096d..d16488bab86f5 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c @@ -281,21 +281,12 @@ static int hclge_ieee_getpfc(struct hnae3_handle *h, struct ieee_pfc *pfc) u64 requests[HNAE3_MAX_TC], indications[HNAE3_MAX_TC]; struct hclge_vport *vport = hclge_get_vport(h); struct hclge_dev *hdev = vport->back; - u8 i, j, pfc_map, *prio_tc; int ret; + u8 i; memset(pfc, 0, sizeof(*pfc)); pfc->pfc_cap = hdev->pfc_max; - prio_tc = hdev->tm_info.prio_tc; - pfc_map = hdev->tm_info.hw_pfc_map; - - /* Pfc setting is based on TC */ - for (i = 0; i < hdev->tm_info.num_tc; i++) { - for (j = 0; j < HNAE3_MAX_USER_PRIO; j++) { - if ((prio_tc[j] == i) && (pfc_map & BIT(i))) - pfc->pfc_en |= BIT(j); - } - } + pfc->pfc_en = hdev->tm_info.pfc_en; ret = hclge_pfc_tx_stats_get(hdev, requests); if (ret) From 7e702bff49804c9810a3921f5bc5734a0267310d Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Fri, 30 Jul 2021 12:53:42 -0700 Subject: [PATCH 0443/5344] drm/i915: Fix syncmap memory leak [ Upstream commit a63bcf08f0efb5348105bb8e0e1e8c6671077753 ] A small race exists between intel_gt_retire_requests_timeout and intel_timeline_exit which could result in the syncmap not getting free'd. Rather than work to hard to seal this race, simply cleanup the syncmap on fini. unreferenced object 0xffff88813bc53b18 (size 96): comm "gem_close_race", pid 5410, jiffies 4294917818 (age 1105.600s) hex dump (first 32 bytes): 01 00 00 00 00 00 00 00 00 00 00 00 0a 00 00 00 ................ 00 00 00 00 00 00 00 00 6b 6b 6b 6b 06 00 00 00 ........kkkk.... backtrace: [<00000000120b863a>] __sync_alloc_leaf+0x1e/0x40 [i915] [<00000000042f6959>] __sync_set+0x1bb/0x240 [i915] [<0000000090f0e90f>] i915_request_await_dma_fence+0x1c7/0x400 [i915] [<0000000056a48219>] i915_request_await_object+0x222/0x360 [i915] [<00000000aaac4ee3>] i915_gem_do_execbuffer+0x1bd0/0x2250 [i915] [<000000003c9d830f>] i915_gem_execbuffer2_ioctl+0x405/0xce0 [i915] [<00000000fd7a8e68>] drm_ioctl_kernel+0xb0/0xf0 [drm] [<00000000e721ee87>] drm_ioctl+0x305/0x3c0 [drm] [<000000008b0d8986>] __x64_sys_ioctl+0x71/0xb0 [<0000000076c362a4>] do_syscall_64+0x33/0x80 [<00000000eb7a4831>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Signed-off-by: Matthew Brost Fixes: 531958f6f357 ("drm/i915/gt: Track timeline activeness in enter/exit") Cc: Reviewed-by: John Harrison Signed-off-by: John Harrison Link: https://patchwork.freedesktop.org/patch/msgid/20210730195342.110234-1-matthew.brost@intel.com (cherry picked from commit faf890985e30d5e88cc3a7c50c1bcad32f89ab7c) Signed-off-by: Rodrigo Vivi Signed-off-by: Sasha Levin --- drivers/gpu/drm/i915/gt/intel_timeline.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/intel_timeline.c b/drivers/gpu/drm/i915/gt/intel_timeline.c index 9cb01d9828f1d..c970e3deb0085 100644 --- a/drivers/gpu/drm/i915/gt/intel_timeline.c +++ b/drivers/gpu/drm/i915/gt/intel_timeline.c @@ -289,6 +289,14 @@ void intel_timeline_fini(struct intel_timeline *timeline) i915_gem_object_unpin_map(timeline->hwsp_ggtt->obj); i915_vma_put(timeline->hwsp_ggtt); + + /* + * A small race exists between intel_gt_retire_requests_timeout and + * intel_timeline_exit which could result in the syncmap not getting + * free'd. Rather than work to hard to seal this race, simply cleanup + * the syncmap on fini. + */ + i915_syncmap_free(&timeline->sync); } struct intel_timeline * From f5c79af1c94eec44b3184bdebabc9418b1b6581e Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Fri, 27 Aug 2021 11:29:27 +0200 Subject: [PATCH 0444/5344] usb: gadget: u_audio: fix race condition on endpoint stop [ Upstream commit 068fdad20454f815e61e6f6eb9f051a8b3120e88 ] If the endpoint completion callback is call right after the ep_enabled flag is cleared and before usb_ep_dequeue() is call, we could do a double free on the request and the associated buffer. Fix this by clearing ep_enabled after all the endpoint requests have been dequeued. Fixes: 7de8681be2cd ("usb: gadget: u_audio: Free requests only after callback") Cc: stable Reported-by: Thinh Nguyen Signed-off-by: Jerome Brunet Link: https://lore.kernel.org/r/20210827092927.366482-1-jbrunet@baylibre.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/usb/gadget/function/u_audio.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/usb/gadget/function/u_audio.c b/drivers/usb/gadget/function/u_audio.c index 223029fa84459..4e01ba0ab8ecb 100644 --- a/drivers/usb/gadget/function/u_audio.c +++ b/drivers/usb/gadget/function/u_audio.c @@ -349,8 +349,6 @@ static inline void free_ep(struct uac_rtd_params *prm, struct usb_ep *ep) if (!prm->ep_enabled) return; - prm->ep_enabled = false; - audio_dev = uac->audio_dev; params = &audio_dev->params; @@ -368,11 +366,12 @@ static inline void free_ep(struct uac_rtd_params *prm, struct usb_ep *ep) } } + prm->ep_enabled = false; + if (usb_ep_disable(ep)) dev_err(uac->card->dev, "%s:%d Error!\n", __func__, __LINE__); } - int u_audio_start_capture(struct g_audio *audio_dev) { struct snd_uac_chip *uac = audio_dev->uac; From d7766b9e28270ba36654a5413586355d19016e36 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 6 Jul 2021 12:45:53 +0100 Subject: [PATCH 0445/5344] perf/x86/intel/uncore: Fix integer overflow on 23 bit left shift of a u32 [ Upstream commit 0b3a8738b76fe2087f7bc2bd59f4c78504c79180 ] The u32 variable pci_dword is being masked with 0x1fffffff and then left shifted 23 places. The shift is a u32 operation,so a value of 0x200 or more in pci_dword will overflow the u32 and only the bottow 32 bits are assigned to addr. I don't believe this was the original intent. Fix this by casting pci_dword to a resource_size_t to ensure no overflow occurs. Note that the mask and 12 bit left shift operation does not need this because the mask SNR_IMC_MMIO_MEM0_MASK and shift is always a 32 bit value. Fixes: ee49532b38dd ("perf/x86/intel/uncore: Add IMC uncore support for Snow Ridge") Addresses-Coverity: ("Unintentional integer overflow") Signed-off-by: Colin Ian King Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Kan Liang Link: https://lore.kernel.org/r/20210706114553.28249-1-colin.king@canonical.com Signed-off-by: Sasha Levin --- arch/x86/events/intel/uncore_snbep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 998cfae77a707..e6e2d0d6aa78e 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -4419,7 +4419,7 @@ static void __snr_uncore_mmio_init_box(struct intel_uncore_box *box, return; pci_read_config_dword(pdev, SNR_IMC_MMIO_BASE_OFFSET, &pci_dword); - addr = (pci_dword & SNR_IMC_MMIO_BASE_MASK) << 23; + addr = ((resource_size_t)pci_dword & SNR_IMC_MMIO_BASE_MASK) << 23; pci_read_config_dword(pdev, mem_offset, &pci_dword); addr |= (pci_dword & SNR_IMC_MMIO_MEM0_MASK) << 12; From c51e3f7b3c0182fda32cfdd0ee78106686af0515 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Mon, 26 Jul 2021 10:30:56 +0200 Subject: [PATCH 0446/5344] opp: remove WARN when no valid OPPs remain MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 335ffab3ef864539e814b9a2903b0ae420c1c067 ] This WARN can be triggered per-core and the stack trace is not useful. Replace it with plain dev_err(). Fix a comment while at it. Signed-off-by: Michał Mirosław Signed-off-by: Viresh Kumar Signed-off-by: Sasha Levin --- drivers/opp/of.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 249738e1e0b7a..603c688fe23dc 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -682,8 +682,9 @@ static int _of_add_opp_table_v2(struct device *dev, struct opp_table *opp_table) } } - /* There should be one of more OPP defined */ - if (WARN_ON(!count)) { + /* There should be one or more OPPs defined */ + if (!count) { + dev_err(dev, "%s: no supported OPPs", __func__); ret = -ENOENT; goto remove_static_opp; } From be834d028c09c9e2946a3ec407359839c3a0d4ae Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 21 Jul 2021 17:26:45 +0300 Subject: [PATCH 0447/5344] virtio: Improve vq->broken access to avoid any compiler optimization [ Upstream commit 60f0779862e4ab943810187752c462e85f5fa371 ] Currently vq->broken field is read by virtqueue_is_broken() in busy loop in one context by virtnet_send_command(). vq->broken is set to true in other process context by virtio_break_device(). Reader and writer are accessing it without any synchronization. This may lead to a compiler optimization which may result to optimize reading vq->broken only once. Hence, force reading vq->broken on each invocation of virtqueue_is_broken() and also force writing it so that such update is visible to the readers. It is a theoretical fix that isn't yet encountered in the field. Signed-off-by: Parav Pandit Link: https://lore.kernel.org/r/20210721142648.1525924-2-parav@nvidia.com Signed-off-by: Michael S. Tsirkin Signed-off-by: Sasha Levin --- drivers/virtio/virtio_ring.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index d0f4263812708..e294114585a03 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -2641,7 +2641,7 @@ bool virtqueue_is_broken(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); - return vq->broken; + return READ_ONCE(vq->broken); } EXPORT_SYMBOL_GPL(virtqueue_is_broken); @@ -2656,7 +2656,9 @@ void virtio_break_device(struct virtio_device *dev) spin_lock(&dev->vqs_list_lock); list_for_each_entry(_vq, &dev->vqs, list) { struct vring_virtqueue *vq = to_vvq(_vq); - vq->broken = true; + + /* Pairs with READ_ONCE() in virtqueue_is_broken(). */ + WRITE_ONCE(vq->broken, true); } spin_unlock(&dev->vqs_list_lock); } From 3b1d7160fedf8f65dc64ecaeda4b0a5558c6e7d3 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 21 Jul 2021 17:26:48 +0300 Subject: [PATCH 0448/5344] virtio_pci: Support surprise removal of virtio pci device [ Upstream commit 43bb40c5b92659966bdf4bfe584fde0a3575a049 ] When a virtio pci device undergo surprise removal (aka async removal in PCIe spec), mark the device as broken so that any upper layer drivers can abort any outstanding operation. When a virtio net pci device undergo surprise removal which is used by a NetworkManager, a below call trace was observed. kernel:watchdog: BUG: soft lockup - CPU#1 stuck for 26s! [kworker/1:1:27059] watchdog: BUG: soft lockup - CPU#1 stuck for 52s! [kworker/1:1:27059] CPU: 1 PID: 27059 Comm: kworker/1:1 Tainted: G S W I L 5.13.0-hotplug+ #8 Hardware name: Dell Inc. PowerEdge R640/0H28RR, BIOS 2.9.4 11/06/2020 Workqueue: events linkwatch_event RIP: 0010:virtnet_send_command+0xfc/0x150 [virtio_net] Call Trace: virtnet_set_rx_mode+0xcf/0x2a7 [virtio_net] ? __hw_addr_create_ex+0x85/0xc0 __dev_mc_add+0x72/0x80 igmp6_group_added+0xa7/0xd0 ipv6_mc_up+0x3c/0x60 ipv6_find_idev+0x36/0x80 addrconf_add_dev+0x1e/0xa0 addrconf_dev_config+0x71/0x130 addrconf_notify+0x1f5/0xb40 ? rtnl_is_locked+0x11/0x20 ? __switch_to_asm+0x42/0x70 ? finish_task_switch+0xaf/0x2c0 ? raw_notifier_call_chain+0x3e/0x50 raw_notifier_call_chain+0x3e/0x50 netdev_state_change+0x67/0x90 linkwatch_do_dev+0x3c/0x50 __linkwatch_run_queue+0xd2/0x220 linkwatch_event+0x21/0x30 process_one_work+0x1c8/0x370 worker_thread+0x30/0x380 ? process_one_work+0x370/0x370 kthread+0x118/0x140 ? set_kthread_struct+0x40/0x40 ret_from_fork+0x1f/0x30 Hence, add the ability to abort the command on surprise removal which prevents infinite loop and system lockup. Signed-off-by: Parav Pandit Link: https://lore.kernel.org/r/20210721142648.1525924-5-parav@nvidia.com Signed-off-by: Michael S. Tsirkin Signed-off-by: Sasha Levin --- drivers/virtio/virtio_pci_common.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index 222d630c41fc9..b35bb2d57f62c 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -576,6 +576,13 @@ static void virtio_pci_remove(struct pci_dev *pci_dev) struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); struct device *dev = get_device(&vp_dev->vdev.dev); + /* + * Device is marked broken on surprise removal so that virtio upper + * layers can abort any ongoing operation. + */ + if (!pci_device_is_present(pci_dev)) + virtio_break_device(&vp_dev->vdev); + pci_disable_sriov(pci_dev); unregister_virtio_device(&vp_dev->vdev); From 880726110d76cdfd31b774dee0e4333cf95969ca Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Fri, 25 Jun 2021 08:55:02 +0530 Subject: [PATCH 0449/5344] vringh: Use wiov->used to check for read/write desc order [ Upstream commit e74cfa91f42c50f7f649b0eca46aa049754ccdbd ] As __vringh_iov() traverses a descriptor chain, it populates each descriptor entry into either read or write vring iov and increments that iov's ->used member. So, as we iterate over a descriptor chain, at any point, (riov/wriov)->used value gives the number of descriptor enteries available, which are to be read or written by the device. As all read iovs must precede the write iovs, wiov->used should be zero when we are traversing a read descriptor. Current code checks for wiov->i, to figure out whether any previous entry in the current descriptor chain was a write descriptor. However, iov->i is only incremented, when these vring iovs are consumed, at a later point, and remain 0 in __vringh_iov(). So, correct the check for read and write descriptor order, to use wiov->used. Acked-by: Jason Wang Reviewed-by: Stefano Garzarella Signed-off-by: Neeraj Upadhyay Link: https://lore.kernel.org/r/1624591502-4827-1-git-send-email-neeraju@codeaurora.org Signed-off-by: Michael S. Tsirkin Signed-off-by: Sasha Levin --- drivers/vhost/vringh.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c index 8b7f1071b8897..ebfb0f2d92684 100644 --- a/drivers/vhost/vringh.c +++ b/drivers/vhost/vringh.c @@ -361,7 +361,7 @@ __vringh_iov(struct vringh *vrh, u16 i, iov = wiov; else { iov = riov; - if (unlikely(wiov && wiov->i)) { + if (unlikely(wiov && wiov->used)) { vringh_bad("Readable desc %p after writable", &descs[i]); err = -EINVAL; From ef98b98fcef821ecb7e25be585c1be421487ea12 Mon Sep 17 00:00:00 2001 From: Shai Malin Date: Sun, 15 Aug 2021 14:05:08 +0300 Subject: [PATCH 0450/5344] qed: qed ll2 race condition fixes [ Upstream commit 37110237f31105d679fc0aa7b11cdec867750ea7 ] Avoiding qed ll2 race condition and NULL pointer dereference as part of the remove and recovery flows. Changes form V1: - Change (!p_rx->set_prod_addr). - qed_ll2.c checkpatch fixes. Change from V2: - Revert "qed_ll2.c checkpatch fixes". Signed-off-by: Ariel Elior Signed-off-by: Shai Malin Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/qlogic/qed/qed_ll2.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c index 19a1a58d60f89..c449ecc0add23 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c @@ -353,6 +353,9 @@ static int qed_ll2_txq_completion(struct qed_hwfn *p_hwfn, void *p_cookie) unsigned long flags; int rc = -EINVAL; + if (!p_ll2_conn) + return rc; + spin_lock_irqsave(&p_tx->lock, flags); if (p_tx->b_completing_packet) { rc = -EBUSY; @@ -526,7 +529,16 @@ static int qed_ll2_rxq_completion(struct qed_hwfn *p_hwfn, void *cookie) unsigned long flags = 0; int rc = 0; + if (!p_ll2_conn) + return rc; + spin_lock_irqsave(&p_rx->lock, flags); + + if (!QED_LL2_RX_REGISTERED(p_ll2_conn)) { + spin_unlock_irqrestore(&p_rx->lock, flags); + return 0; + } + cq_new_idx = le16_to_cpu(*p_rx->p_fw_cons); cq_old_idx = qed_chain_get_cons_idx(&p_rx->rcq_chain); @@ -847,6 +859,9 @@ static int qed_ll2_lb_rxq_completion(struct qed_hwfn *p_hwfn, void *p_cookie) struct qed_ll2_info *p_ll2_conn = (struct qed_ll2_info *)p_cookie; int rc; + if (!p_ll2_conn) + return 0; + if (!QED_LL2_RX_REGISTERED(p_ll2_conn)) return 0; @@ -870,6 +885,9 @@ static int qed_ll2_lb_txq_completion(struct qed_hwfn *p_hwfn, void *p_cookie) u16 new_idx = 0, num_bds = 0; int rc; + if (!p_ll2_conn) + return 0; + if (!QED_LL2_TX_REGISTERED(p_ll2_conn)) return 0; @@ -1642,6 +1660,8 @@ int qed_ll2_post_rx_buffer(void *cxt, if (!p_ll2_conn) return -EINVAL; p_rx = &p_ll2_conn->rx_queue; + if (!p_rx->set_prod_addr) + return -EIO; spin_lock_irqsave(&p_rx->lock, flags); if (!list_empty(&p_rx->free_descq)) From f0769114fd5e04cbba1d88ccf834c228e2fc2c1b Mon Sep 17 00:00:00 2001 From: Shai Malin Date: Sun, 15 Aug 2021 14:06:39 +0300 Subject: [PATCH 0451/5344] qed: Fix null-pointer dereference in qed_rdma_create_qp() [ Upstream commit d33d19d313d3466abdf8b0428be7837aff767802 ] Fix a possible null-pointer dereference in qed_rdma_create_qp(). Changes from V2: - Revert checkpatch fixes. Reported-by: TOTE Robot Signed-off-by: Ariel Elior Signed-off-by: Shai Malin Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/qlogic/qed/qed_rdma.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_rdma.c b/drivers/net/ethernet/qlogic/qed/qed_rdma.c index 38b1f402f7ed2..b291971bcf926 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_rdma.c +++ b/drivers/net/ethernet/qlogic/qed/qed_rdma.c @@ -1245,8 +1245,7 @@ qed_rdma_create_qp(void *rdma_cxt, if (!rdma_cxt || !in_params || !out_params || !p_hwfn->p_rdma_info->active) { - DP_ERR(p_hwfn->cdev, - "qed roce create qp failed due to NULL entry (rdma_cxt=%p, in=%p, out=%p, roce_info=?\n", + pr_err("qed roce create qp failed due to NULL entry (rdma_cxt=%p, in=%p, out=%p, roce_info=?\n", rdma_cxt, in_params, out_params); return NULL; } From 16408e7bf751ce0aabbd1a90dc1ce75198cf41e3 Mon Sep 17 00:00:00 2001 From: Mark Yacoub Date: Thu, 12 Aug 2021 15:49:17 -0400 Subject: [PATCH 0452/5344] drm: Copy drm_wait_vblank to user before returning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit fa0b1ef5f7a694f48e00804a391245f3471aa155 ] [Why] Userspace should get back a copy of drm_wait_vblank that's been modified even when drm_wait_vblank_ioctl returns a failure. Rationale: drm_wait_vblank_ioctl modifies the request and expects the user to read it back. When the type is RELATIVE, it modifies it to ABSOLUTE and updates the sequence to become current_vblank_count + sequence (which was RELATIVE), but now it became ABSOLUTE. drmWaitVBlank (in libdrm) expects this to be the case as it modifies the request to be Absolute so it expects the sequence to would have been updated. The change is in compat_drm_wait_vblank, which is called by drm_compat_ioctl. This change of copying the data back regardless of the return number makes it en par with drm_ioctl, which always copies the data before returning. [How] Return from the function after everything has been copied to user. Fixes IGT:kms_flip::modeset-vs-vblank-race-interruptible Tested on ChromeOS Trogdor(msm) Reviewed-by: Michel Dänzer Signed-off-by: Mark Yacoub Signed-off-by: Sean Paul Link: https://patchwork.freedesktop.org/patch/msgid/20210812194917.1703356-1-markyacoub@chromium.org Signed-off-by: Sasha Levin --- drivers/gpu/drm/drm_ioc32.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/gpu/drm/drm_ioc32.c b/drivers/gpu/drm/drm_ioc32.c index 2cf053fb8d54b..1c691bdb89141 100644 --- a/drivers/gpu/drm/drm_ioc32.c +++ b/drivers/gpu/drm/drm_ioc32.c @@ -863,8 +863,6 @@ static int compat_drm_wait_vblank(struct file *file, unsigned int cmd, req.request.sequence = req32.request.sequence; req.request.signal = req32.request.signal; err = drm_ioctl_kernel(file, drm_wait_vblank_ioctl, &req, DRM_UNLOCKED); - if (err) - return err; req32.reply.type = req.reply.type; req32.reply.sequence = req.reply.sequence; @@ -873,7 +871,7 @@ static int compat_drm_wait_vblank(struct file *file, unsigned int cmd, if (copy_to_user(argp, &req32, sizeof(req32))) return -EFAULT; - return 0; + return err; } #if defined(CONFIG_X86) From 5cd6584c3f4699d533dc8b4598e2e4cbf5da1d62 Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Mon, 9 Aug 2021 16:40:48 +1000 Subject: [PATCH 0453/5344] drm/nouveau/disp: power down unused DP links during init [ Upstream commit 6eaa1f3c59a707332e921e32782ffcad49915c5e ] When booted with multiple displays attached, the EFI GOP driver on (at least) Ampere, can leave DP links powered up that aren't being used to display anything. This confuses our tracking of SOR routing, with the likely result being a failed modeset and display engine hang. Fix this by (ab?)using the DisableLT IED script to power-down the link, restoring HW to a state the driver expects. Signed-off-by: Ben Skeggs Reviewed-by: Lyude Paul Signed-off-by: Sasha Levin --- drivers/gpu/drm/nouveau/nvkm/engine/disp/dp.c | 2 +- drivers/gpu/drm/nouveau/nvkm/engine/disp/dp.h | 1 + drivers/gpu/drm/nouveau/nvkm/engine/disp/outp.c | 9 +++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/dp.c b/drivers/gpu/drm/nouveau/nvkm/engine/disp/dp.c index 818d21bd28d31..1d2837c5a8f29 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/dp.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/dp.c @@ -419,7 +419,7 @@ nvkm_dp_train(struct nvkm_dp *dp, u32 dataKBps) return ret; } -static void +void nvkm_dp_disable(struct nvkm_outp *outp, struct nvkm_ior *ior) { struct nvkm_dp *dp = nvkm_dp(outp); diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/dp.h b/drivers/gpu/drm/nouveau/nvkm/engine/disp/dp.h index 428b3f488f033..e484d0c3b0d42 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/dp.h +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/dp.h @@ -32,6 +32,7 @@ struct nvkm_dp { int nvkm_dp_new(struct nvkm_disp *, int index, struct dcb_output *, struct nvkm_outp **); +void nvkm_dp_disable(struct nvkm_outp *, struct nvkm_ior *); /* DPCD Receiver Capabilities */ #define DPCD_RC00_DPCD_REV 0x00000 diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/outp.c b/drivers/gpu/drm/nouveau/nvkm/engine/disp/outp.c index c62030c96fba0..4b1c72fd8f039 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/outp.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/outp.c @@ -22,6 +22,7 @@ * Authors: Ben Skeggs */ #include "outp.h" +#include "dp.h" #include "ior.h" #include @@ -216,6 +217,14 @@ nvkm_outp_init_route(struct nvkm_outp *outp) if (!ior->arm.head || ior->arm.proto != proto) { OUTP_DBG(outp, "no heads (%x %d %d)", ior->arm.head, ior->arm.proto, proto); + + /* The EFI GOP driver on Ampere can leave unused DP links routed, + * which we don't expect. The DisableLT IED script *should* get + * us back to where we need to be. + */ + if (ior->func->route.get && !ior->arm.head && outp->info.type == DCB_OUTPUT_DP) + nvkm_dp_disable(outp, ior); + return; } From bb82f863e62e7e624a41797c86ebc4f2a5af3b14 Mon Sep 17 00:00:00 2001 From: Gerd Rausch Date: Tue, 17 Aug 2021 10:04:37 -0700 Subject: [PATCH 0454/5344] net/rds: dma_map_sg is entitled to merge entries [ Upstream commit fb4b1373dcab086d0619c29310f0466a0b2ceb8a ] Function "dma_map_sg" is entitled to merge adjacent entries and return a value smaller than what was passed as "nents". Subsequently "ib_map_mr_sg" needs to work with this value ("sg_dma_len") rather than the original "nents" parameter ("sg_len"). This old RDS bug was exposed and reliably causes kernel panics (using RDMA operations "rds-stress -D") on x86_64 starting with: commit c588072bba6b ("iommu/vt-d: Convert intel iommu driver to the iommu ops") Simply put: Linux 5.11 and later. Signed-off-by: Gerd Rausch Acked-by: Santosh Shilimkar Link: https://lore.kernel.org/r/60efc69f-1f35-529d-a7ef-da0549cad143@oracle.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/rds/ib_frmr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c index 06ecf9d2d4bf1..ef6acd7211180 100644 --- a/net/rds/ib_frmr.c +++ b/net/rds/ib_frmr.c @@ -131,9 +131,9 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) cpu_relax(); } - ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, + ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_dma_len, &off, PAGE_SIZE); - if (unlikely(ret != ibmr->sg_len)) + if (unlikely(ret != ibmr->sg_dma_len)) return ret < 0 ? ret : -EINVAL; if (cmpxchg(&frmr->fr_state, From 9ff978e236dacdfbac51b74de76168feca0b976c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 23 Feb 2021 12:08:48 +0000 Subject: [PATCH 0455/5344] btrfs: fix race between marking inode needs to be logged and log syncing commit bc0939fcfab0d7efb2ed12896b1af3d819954a14 upstream. We have a race between marking that an inode needs to be logged, either at btrfs_set_inode_last_trans() or at btrfs_page_mkwrite(), and between btrfs_sync_log(). The following steps describe how the race happens. 1) We are at transaction N; 2) Inode I was previously fsynced in the current transaction so it has: inode->logged_trans set to N; 3) The inode's root currently has: root->log_transid set to 1 root->last_log_commit set to 0 Which means only one log transaction was committed to far, log transaction 0. When a log tree is created we set ->log_transid and ->last_log_commit of its parent root to 0 (at btrfs_add_log_tree()); 4) One more range of pages is dirtied in inode I; 5) Some task A starts an fsync against some other inode J (same root), and so it joins log transaction 1. Before task A calls btrfs_sync_log()... 6) Task B starts an fsync against inode I, which currently has the full sync flag set, so it starts delalloc and waits for the ordered extent to complete before calling btrfs_inode_in_log() at btrfs_sync_file(); 7) During ordered extent completion we have btrfs_update_inode() called against inode I, which in turn calls btrfs_set_inode_last_trans(), which does the following: spin_lock(&inode->lock); inode->last_trans = trans->transaction->transid; inode->last_sub_trans = inode->root->log_transid; inode->last_log_commit = inode->root->last_log_commit; spin_unlock(&inode->lock); So ->last_trans is set to N and ->last_sub_trans set to 1. But before setting ->last_log_commit... 8) Task A is at btrfs_sync_log(): - it increments root->log_transid to 2 - starts writeback for all log tree extent buffers - waits for the writeback to complete - writes the super blocks - updates root->last_log_commit to 1 It's a lot of slow steps between updating root->log_transid and root->last_log_commit; 9) The task doing the ordered extent completion, currently at btrfs_set_inode_last_trans(), then finally runs: inode->last_log_commit = inode->root->last_log_commit; spin_unlock(&inode->lock); Which results in inode->last_log_commit being set to 1. The ordered extent completes; 10) Task B is resumed, and it calls btrfs_inode_in_log() which returns true because we have all the following conditions met: inode->logged_trans == N which matches fs_info->generation && inode->last_subtrans (1) <= inode->last_log_commit (1) && inode->last_subtrans (1) <= root->last_log_commit (1) && list inode->extent_tree.modified_extents is empty And as a consequence we return without logging the inode, so the existing logged version of the inode does not point to the extent that was written after the previous fsync. It should be impossible in practice for one task be able to do so much progress in btrfs_sync_log() while another task is at btrfs_set_inode_last_trans() right after it reads root->log_transid and before it reads root->last_log_commit. Even if kernel preemption is enabled we know the task at btrfs_set_inode_last_trans() can not be preempted because it is holding the inode's spinlock. However there is another place where we do the same without holding the spinlock, which is in the memory mapped write path at: vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) { (...) BTRFS_I(inode)->last_trans = fs_info->generation; BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit; (...) So with preemption happening after setting ->last_sub_trans and before setting ->last_log_commit, it is less of a stretch to have another task do enough progress at btrfs_sync_log() such that the task doing the memory mapped write ends up with ->last_sub_trans and ->last_log_commit set to the same value. It is still a big stretch to get there, as the task doing btrfs_sync_log() has to start writeback, wait for its completion and write the super blocks. So fix this in two different ways: 1) For btrfs_set_inode_last_trans(), simply set ->last_log_commit to the value of ->last_sub_trans minus 1; 2) For btrfs_page_mkwrite() only set the inode's ->last_sub_trans, just like we do for buffered and direct writes at btrfs_file_write_iter(), which is all we need to make sure multiple writes and fsyncs to an inode in the same transaction never result in an fsync missing that the inode changed and needs to be logged. Turn this into a helper function and use it both at btrfs_page_mkwrite() and at btrfs_file_write_iter() - this also fixes the problem that at btrfs_page_mkwrite() we were setting those fields without the protection of the inode's spinlock. This is an extremely unlikely race to happen in practice. Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Anand Jain Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/btrfs_inode.h | 15 +++++++++++++++ fs/btrfs/file.c | 10 ++-------- fs/btrfs/inode.c | 4 +--- fs/btrfs/transaction.h | 2 +- 4 files changed, 19 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index f853835c409c1..f3ff57b931580 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -268,6 +268,21 @@ static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode, mod); } +/* + * Called every time after doing a buffered, direct IO or memory mapped write. + * + * This is to ensure that if we write to a file that was previously fsynced in + * the current transaction, then try to fsync it again in the same transaction, + * we will know that there were changes in the file and that it needs to be + * logged. + */ +static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode) +{ + spin_lock(&inode->lock); + inode->last_sub_trans = inode->root->log_transid; + spin_unlock(&inode->lock); +} + static inline int btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) { int ret = 0; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index bbd77db8dd31b..29677510f91c1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2004,14 +2004,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, inode_unlock(inode); - /* - * We also have to set last_sub_trans to the current log transid, - * otherwise subsequent syncs to a file that's been synced in this - * transaction will appear to have already occurred. - */ - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->last_sub_trans = root->log_transid; - spin_unlock(&BTRFS_I(inode)->lock); + btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); + if (num_written > 0) num_written = generic_write_sync(iocb, num_written); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 54b607a3cc3f2..29552d4f6845b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9250,9 +9250,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) set_page_dirty(page); SetPageUptodate(page); - BTRFS_I(inode)->last_trans = fs_info->generation; - BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; - BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit; + btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); unlock_extent_cached(io_tree, page_start, page_end, &cached_state); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index d8a7d460e436a..cbede328bda5b 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -160,7 +160,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->last_trans = trans->transaction->transid; BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; - BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit; + BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans - 1; spin_unlock(&BTRFS_I(inode)->lock); } From 7e93fb384a52ab3588ad682cfcc5998d679cd7d9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 30 Aug 2021 08:55:18 -0700 Subject: [PATCH 0456/5344] vt_kdsetmode: extend console locking commit 2287a51ba822384834dafc1c798453375d1107c7 upstream. As per the long-suffering comment. Reported-by: Minh Yuan Cc: Greg Kroah-Hartman Cc: Jiri Slaby Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- drivers/tty/vt/vt_ioctl.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/tty/vt/vt_ioctl.c b/drivers/tty/vt/vt_ioctl.c index b5d2ad900ec09..167c72726c5a0 100644 --- a/drivers/tty/vt/vt_ioctl.c +++ b/drivers/tty/vt/vt_ioctl.c @@ -484,16 +484,19 @@ int vt_ioctl(struct tty_struct *tty, ret = -EINVAL; goto out; } - /* FIXME: this needs the console lock extending */ - if (vc->vc_mode == (unsigned char) arg) + console_lock(); + if (vc->vc_mode == (unsigned char) arg) { + console_unlock(); break; + } vc->vc_mode = (unsigned char) arg; - if (console != fg_console) + if (console != fg_console) { + console_unlock(); break; + } /* * explicitly blank/unblank the screen if switching modes */ - console_lock(); if (arg == KD_TEXT) do_unblank_screen(1); else From e9e32c7a965174f78a289fa345029e8daaf656d0 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 9 Oct 2019 13:14:57 -0700 Subject: [PATCH 0457/5344] bpf: Track contents of read-only maps as scalars commit a23740ec43ba022dbfd139d0fe3eff193216272b upstream. Maps that are read-only both from BPF program side and user space side have their contents constant, so verifier can track referenced values precisely and use that knowledge for dead code elimination, branch pruning, etc. This patch teaches BPF verifier how to do this. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191009201458.2679171-2-andriin@fb.com Signed-off-by: Rafael David Tinoco Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 57 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5e2456109405a..44cee0f2d9e9b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2778,6 +2778,41 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) reg->smax_value = reg->umax_value; } +static bool bpf_map_is_rdonly(const struct bpf_map *map) +{ + return (map->map_flags & BPF_F_RDONLY_PROG) && map->frozen; +} + +static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val) +{ + void *ptr; + u64 addr; + int err; + + err = map->ops->map_direct_value_addr(map, &addr, off); + if (err) + return err; + ptr = (void *)addr + off; + + switch (size) { + case sizeof(u8): + *val = (u64)*(u8 *)ptr; + break; + case sizeof(u16): + *val = (u64)*(u16 *)ptr; + break; + case sizeof(u32): + *val = (u64)*(u32 *)ptr; + break; + case sizeof(u64): + *val = *(u64 *)ptr; + break; + default: + return -EINVAL; + } + return 0; +} + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@ -2815,9 +2850,27 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (err) return err; err = check_map_access(env, regno, off, size, false); - if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(env, regs, value_regno); + if (!err && t == BPF_READ && value_regno >= 0) { + struct bpf_map *map = reg->map_ptr; + + /* if map is read-only, track its contents as scalars */ + if (tnum_is_const(reg->var_off) && + bpf_map_is_rdonly(map) && + map->ops->map_direct_value_addr) { + int map_off = off + reg->var_off.value; + u64 val = 0; + err = bpf_map_direct_read(map, map_off, size, + &val); + if (err) + return err; + + regs[value_regno].type = SCALAR_VALUE; + __mark_reg_known(®s[value_regno], val); + } else { + mark_reg_unknown(env, regs, value_regno); + } + } } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; From 802987e7b9f3227ae49ba58172dcf75dec57d5e0 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 11 Oct 2019 10:20:53 -0700 Subject: [PATCH 0458/5344] bpf: Fix cast to pointer from integer of different size warning commit 2dedd7d2165565bafa89718eaadfc5d1a7865f66 upstream. Fix "warning: cast to pointer from integer of different size" when casting u64 addr to void *. Fixes: a23740ec43ba ("bpf: Track contents of read-only maps as scalars") Reported-by: kbuild test robot Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191011172053.2980619-1-andriin@fb.com Cc: Rafael David Tinoco Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 44cee0f2d9e9b..eb95cf0f5609b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2792,7 +2792,7 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val) err = map->ops->map_direct_value_addr(map, &addr, off); if (err) return err; - ptr = (void *)addr + off; + ptr = (void *)(long)addr + off; switch (size) { case sizeof(u8): From dba4ec88eb0a3eeb2d1f23f8bc54393a4c66975d Mon Sep 17 00:00:00 2001 From: DENG Qingfang Date: Wed, 11 Aug 2021 17:50:43 +0800 Subject: [PATCH 0459/5344] net: dsa: mt7530: fix VLAN traffic leaks again commit 7428022b50d0fbb4846dd0f00639ea09d36dff02 upstream. When a port leaves a VLAN-aware bridge, the current code does not clear other ports' matrix field bit. If the bridge is later set to VLAN-unaware mode, traffic in the bridge may leak to that port. Remove the VLAN filtering check in mt7530_port_bridge_leave. Fixes: 474a2ddaa192 ("net: dsa: mt7530: fix VLAN traffic leaks") Fixes: 83163f7dca56 ("net: dsa: mediatek: add VLAN support for MT7530") Signed-off-by: DENG Qingfang Reviewed-by: Vladimir Oltean Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/dsa/mt7530.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index e1a3c33fdad90..2d8382eb9add3 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -842,11 +842,8 @@ mt7530_port_bridge_leave(struct dsa_switch *ds, int port, /* Remove this port from the port matrix of the other ports * in the same bridge. If the port is disabled, port matrix * is kept and not being setup until the port becomes enabled. - * And the other port's port matrix cannot be broken when the - * other port is still a VLAN-aware port. */ - if (dsa_is_user_port(ds, i) && i != port && - !dsa_port_is_vlan_filtering(&ds->ports[i])) { + if (dsa_is_user_port(ds, i) && i != port) { if (dsa_to_port(ds, i)->bridge_dev != bridge) continue; if (priv->ports[i].enable) From b422c66e9ef2ebc93f03d1133ec94b49d2275557 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:56:47 -0700 Subject: [PATCH 0460/5344] KVM: x86/mmu: Treat NX as used (not reserved) for all !TDP shadow MMUs commit 112022bdb5bc372e00e6e43cb88ee38ea67b97bd upstream Mark NX as being used for all non-nested shadow MMUs, as KVM will set the NX bit for huge SPTEs if the iTLB mutli-hit mitigation is enabled. Checking the mitigation itself is not sufficient as it can be toggled on at any time and KVM doesn't reset MMU contexts when that happens. KVM could reset the contexts, but that would require purging all SPTEs in all MMUs, for no real benefit. And, KVM already forces EFER.NX=1 when TDP is disabled (for WP=0, SMEP=1, NX=0), so technically NX is never reserved for shadow MMUs. Fixes: b8e8c8303ff2 ("kvm: mmu: ITLB_MULTIHIT mitigation") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-3-seanjc@google.com> Signed-off-by: Paolo Bonzini [sudip: use old path] Signed-off-by: Sudip Mukherjee Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/mmu.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 260c64c205b8c..d3877dd713aef 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4666,7 +4666,15 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { - bool uses_nx = context->nx || + /* + * KVM uses NX when TDP is disabled to handle a variety of scenarios, + * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and + * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0. + * The iTLB multi-hit workaround can be toggled at any time, so assume + * NX can be used by any non-nested shadow MMU to avoid having to reset + * MMU contexts. Note, KVM forces EFER.NX=1 when TDP is disabled. + */ + bool uses_nx = context->nx || !tdp_enabled || context->mmu_role.base.smep_andnot_wp; struct rsvd_bits_validate *shadow_zero_check; int i; From e793809631afdbfc88a5214766ac5b834832d32f Mon Sep 17 00:00:00 2001 From: Petr Vorel Date: Thu, 15 Apr 2021 21:39:13 +0200 Subject: [PATCH 0461/5344] arm64: dts: qcom: msm8994-angler: Fix gpio-reserved-ranges 85-88 commit f890f89d9a80fffbfa7ca791b78927e5b8aba869 upstream. Reserve GPIO pins 85-88 as these aren't meant to be accessible from the application CPUs (causes reboot). Yet another fix similar to 9134586715e3, 5f8d3ab136d0, which is needed to allow angler to boot after 3edfb7bd76bd ("gpiolib: Show correct direction from the beginning"). Fixes: feeaf56ac78d ("arm64: dts: msm8994 SoC and Huawei Angler (Nexus 6P) support") Signed-off-by: Petr Vorel Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20210415193913.1836153-1-petr.vorel@gmail.com Signed-off-by: Bjorn Andersson Signed-off-by: Greg Kroah-Hartman --- arch/arm64/boot/dts/qcom/msm8994-angler-rev-101.dts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/boot/dts/qcom/msm8994-angler-rev-101.dts b/arch/arm64/boot/dts/qcom/msm8994-angler-rev-101.dts index a5f9a6ab512c4..9b989cc30edc8 100644 --- a/arch/arm64/boot/dts/qcom/msm8994-angler-rev-101.dts +++ b/arch/arm64/boot/dts/qcom/msm8994-angler-rev-101.dts @@ -30,3 +30,7 @@ }; }; }; + +&msmgpio { + gpio-reserved-ranges = <85 4>; +}; From f6c933cb74b26457e6855a187c222cc8bedfc81b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 6 Aug 2021 18:24:15 +0800 Subject: [PATCH 0462/5344] btrfs: fix NULL pointer dereference when deleting device by invalid id commit e4571b8c5e9ffa1e85c0c671995bd4dcc5c75091 upstream. [BUG] It's easy to trigger NULL pointer dereference, just by removing a non-existing device id: # mkfs.btrfs -f -m single -d single /dev/test/scratch1 \ /dev/test/scratch2 # mount /dev/test/scratch1 /mnt/btrfs # btrfs device remove 3 /mnt/btrfs Then we have the following kernel NULL pointer dereference: BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 9 PID: 649 Comm: btrfs Not tainted 5.14.0-rc3-custom+ #35 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:btrfs_rm_device+0x4de/0x6b0 [btrfs] btrfs_ioctl+0x18bb/0x3190 [btrfs] ? lock_is_held_type+0xa5/0x120 ? find_held_lock.constprop.0+0x2b/0x80 ? do_user_addr_fault+0x201/0x6a0 ? lock_release+0xd2/0x2d0 ? __x64_sys_ioctl+0x83/0xb0 __x64_sys_ioctl+0x83/0xb0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae [CAUSE] Commit a27a94c2b0c7 ("btrfs: Make btrfs_find_device_by_devspec return btrfs_device directly") moves the "missing" device path check into btrfs_rm_device(). But btrfs_rm_device() itself can have case where it only receives @devid, with NULL as @device_path. In that case, calling strcmp() on NULL will trigger the NULL pointer dereference. Before that commit, we handle the "missing" case inside btrfs_find_device_by_devspec(), which will not check @device_path at all if @devid is provided, thus no way to trigger the bug. [FIX] Before calling strcmp(), also make sure @device_path is not NULL. Fixes: a27a94c2b0c7 ("btrfs: Make btrfs_find_device_by_devspec return btrfs_device directly") CC: stable@vger.kernel.org # 5.4+ Reported-by: butt3rflyh4ck Reviewed-by: Anand Jain Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 94bc3e280b009..8deee49a6b3fa 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2171,7 +2171,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, if (IS_ERR(device)) { if (PTR_ERR(device) == -ENOENT && - strcmp(device_path, "missing") == 0) + device_path && strcmp(device_path, "missing") == 0) ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; else ret = PTR_ERR(device); From 7a22c07ba229823132d925857e7a36901a446505 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Sat, 7 Aug 2021 10:37:02 +0300 Subject: [PATCH 0463/5344] Revert "floppy: reintroduce O_NDELAY fix" commit c7e9d0020361f4308a70cdfd6d5335e273eb8717 upstream. The patch breaks userspace implementations (e.g. fdutils) and introduces regressions in behaviour. Previously, it was possible to O_NDELAY open a floppy device with no media inserted or with write protected media without an error. Some userspace tools use this particular behavior for probing. It's not the first time when we revert this patch. Previous revert is in commit f2791e7eadf4 (Revert "floppy: refactor open() flags handling"). This reverts commit 8a0c014cd20516ade9654fc13b51345ec58e7be8. Link: https://lore.kernel.org/linux-block/de10cb47-34d1-5a88-7751-225ca380f735@compro.net/ Reported-by: Mark Hounschell Cc: Jiri Kosina Cc: Wim Osterholt Cc: Kurt Garloff Cc: Signed-off-by: Denis Efremov Signed-off-by: Greg Kroah-Hartman --- drivers/block/floppy.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 40ea1a425c431..ac97a1e2e5ddc 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4063,22 +4063,21 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) if (UFDCS->rawcmd == 1) UFDCS->rawcmd = 2; - if (mode & (FMODE_READ|FMODE_WRITE)) { - UDRS->last_checked = 0; - clear_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags); - check_disk_change(bdev); - if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags)) - goto out; - if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags)) + if (!(mode & FMODE_NDELAY)) { + if (mode & (FMODE_READ|FMODE_WRITE)) { + UDRS->last_checked = 0; + clear_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags); + check_disk_change(bdev); + if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags)) + goto out; + if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags)) + goto out; + } + res = -EROFS; + if ((mode & FMODE_WRITE) && + !test_bit(FD_DISK_WRITABLE_BIT, &UDRS->flags)) goto out; } - - res = -EROFS; - - if ((mode & FMODE_WRITE) && - !test_bit(FD_DISK_WRITABLE_BIT, &UDRS->flags)) - goto out; - mutex_unlock(&open_lock); mutex_unlock(&floppy_mutex); return 0; From ad1d7959f94505d2552357d3dd72fdfc0eaa8e68 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 27 Aug 2021 20:42:57 +0200 Subject: [PATCH 0464/5344] Revert "parisc: Add assembly implementations for memset, strlen, strcpy, strncpy and strcat" commit f6a3308d6feb351d9854eb8b3f6289a1ac163125 upstream. This reverts commit 83af58f8068ea3f7b3c537c37a30887bfa585069. It turns out that at least the assembly implementation for strncpy() was buggy. Revert the whole commit and return back to the default coding. Signed-off-by: Helge Deller Cc: # v5.4+ Cc: Rasmus Villemoes Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- arch/parisc/include/asm/string.h | 15 ---- arch/parisc/kernel/parisc_ksyms.c | 4 - arch/parisc/lib/Makefile | 4 +- arch/parisc/lib/memset.c | 72 ++++++++++++++++ arch/parisc/lib/string.S | 136 ------------------------------ 5 files changed, 74 insertions(+), 157 deletions(-) create mode 100644 arch/parisc/lib/memset.c delete mode 100644 arch/parisc/lib/string.S diff --git a/arch/parisc/include/asm/string.h b/arch/parisc/include/asm/string.h index 4a0c9dbd62fd0..f6e1132f4e352 100644 --- a/arch/parisc/include/asm/string.h +++ b/arch/parisc/include/asm/string.h @@ -8,19 +8,4 @@ extern void * memset(void *, int, size_t); #define __HAVE_ARCH_MEMCPY void * memcpy(void * dest,const void *src,size_t count); -#define __HAVE_ARCH_STRLEN -extern size_t strlen(const char *s); - -#define __HAVE_ARCH_STRCPY -extern char *strcpy(char *dest, const char *src); - -#define __HAVE_ARCH_STRNCPY -extern char *strncpy(char *dest, const char *src, size_t count); - -#define __HAVE_ARCH_STRCAT -extern char *strcat(char *dest, const char *src); - -#define __HAVE_ARCH_MEMSET -extern void *memset(void *, int, size_t); - #endif diff --git a/arch/parisc/kernel/parisc_ksyms.c b/arch/parisc/kernel/parisc_ksyms.c index 8ed409ecec933..e8a6a751dfd8e 100644 --- a/arch/parisc/kernel/parisc_ksyms.c +++ b/arch/parisc/kernel/parisc_ksyms.c @@ -17,10 +17,6 @@ #include EXPORT_SYMBOL(memset); -EXPORT_SYMBOL(strlen); -EXPORT_SYMBOL(strcpy); -EXPORT_SYMBOL(strncpy); -EXPORT_SYMBOL(strcat); #include EXPORT_SYMBOL(__xchg8); diff --git a/arch/parisc/lib/Makefile b/arch/parisc/lib/Makefile index 2d7a9974dbaef..7b197667faf6c 100644 --- a/arch/parisc/lib/Makefile +++ b/arch/parisc/lib/Makefile @@ -3,7 +3,7 @@ # Makefile for parisc-specific library files # -lib-y := lusercopy.o bitops.o checksum.o io.o memcpy.o \ - ucmpdi2.o delay.o string.o +lib-y := lusercopy.o bitops.o checksum.o io.o memset.o memcpy.o \ + ucmpdi2.o delay.o obj-y := iomap.o diff --git a/arch/parisc/lib/memset.c b/arch/parisc/lib/memset.c new file mode 100644 index 0000000000000..133e4809859a3 --- /dev/null +++ b/arch/parisc/lib/memset.c @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#include +#include + +#define OPSIZ (BITS_PER_LONG/8) +typedef unsigned long op_t; + +void * +memset (void *dstpp, int sc, size_t len) +{ + unsigned int c = sc; + long int dstp = (long int) dstpp; + + if (len >= 8) + { + size_t xlen; + op_t cccc; + + cccc = (unsigned char) c; + cccc |= cccc << 8; + cccc |= cccc << 16; + if (OPSIZ > 4) + /* Do the shift in two steps to avoid warning if long has 32 bits. */ + cccc |= (cccc << 16) << 16; + + /* There are at least some bytes to set. + No need to test for LEN == 0 in this alignment loop. */ + while (dstp % OPSIZ != 0) + { + ((unsigned char *) dstp)[0] = c; + dstp += 1; + len -= 1; + } + + /* Write 8 `op_t' per iteration until less than 8 `op_t' remain. */ + xlen = len / (OPSIZ * 8); + while (xlen > 0) + { + ((op_t *) dstp)[0] = cccc; + ((op_t *) dstp)[1] = cccc; + ((op_t *) dstp)[2] = cccc; + ((op_t *) dstp)[3] = cccc; + ((op_t *) dstp)[4] = cccc; + ((op_t *) dstp)[5] = cccc; + ((op_t *) dstp)[6] = cccc; + ((op_t *) dstp)[7] = cccc; + dstp += 8 * OPSIZ; + xlen -= 1; + } + len %= OPSIZ * 8; + + /* Write 1 `op_t' per iteration until less than OPSIZ bytes remain. */ + xlen = len / OPSIZ; + while (xlen > 0) + { + ((op_t *) dstp)[0] = cccc; + dstp += OPSIZ; + xlen -= 1; + } + len %= OPSIZ; + } + + /* Write the last few bytes. */ + while (len > 0) + { + ((unsigned char *) dstp)[0] = c; + dstp += 1; + len -= 1; + } + + return dstpp; +} diff --git a/arch/parisc/lib/string.S b/arch/parisc/lib/string.S deleted file mode 100644 index 4a64264427a63..0000000000000 --- a/arch/parisc/lib/string.S +++ /dev/null @@ -1,136 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * PA-RISC assembly string functions - * - * Copyright (C) 2019 Helge Deller - */ - -#include -#include - - .section .text.hot - .level PA_ASM_LEVEL - - t0 = r20 - t1 = r21 - t2 = r22 - -ENTRY_CFI(strlen, frame=0,no_calls) - or,COND(<>) arg0,r0,ret0 - b,l,n .Lstrlen_null_ptr,r0 - depwi 0,31,2,ret0 - cmpb,COND(<>) arg0,ret0,.Lstrlen_not_aligned - ldw,ma 4(ret0),t0 - cmpib,tr 0,r0,.Lstrlen_loop - uxor,nbz r0,t0,r0 -.Lstrlen_not_aligned: - uaddcm arg0,ret0,t1 - shladd t1,3,r0,t1 - mtsar t1 - depwi -1,%sar,32,t0 - uxor,nbz r0,t0,r0 -.Lstrlen_loop: - b,l,n .Lstrlen_end_loop,r0 - ldw,ma 4(ret0),t0 - cmpib,tr 0,r0,.Lstrlen_loop - uxor,nbz r0,t0,r0 -.Lstrlen_end_loop: - extrw,u,<> t0,7,8,r0 - addib,tr,n -3,ret0,.Lstrlen_out - extrw,u,<> t0,15,8,r0 - addib,tr,n -2,ret0,.Lstrlen_out - extrw,u,<> t0,23,8,r0 - addi -1,ret0,ret0 -.Lstrlen_out: - bv r0(rp) - uaddcm ret0,arg0,ret0 -.Lstrlen_null_ptr: - bv,n r0(rp) -ENDPROC_CFI(strlen) - - -ENTRY_CFI(strcpy, frame=0,no_calls) - ldb 0(arg1),t0 - stb t0,0(arg0) - ldo 0(arg0),ret0 - ldo 1(arg1),t1 - cmpb,= r0,t0,2f - ldo 1(arg0),t2 -1: ldb 0(t1),arg1 - stb arg1,0(t2) - ldo 1(t1),t1 - cmpb,<> r0,arg1,1b - ldo 1(t2),t2 -2: bv,n r0(rp) -ENDPROC_CFI(strcpy) - - -ENTRY_CFI(strncpy, frame=0,no_calls) - ldb 0(arg1),t0 - stb t0,0(arg0) - ldo 1(arg1),t1 - ldo 0(arg0),ret0 - cmpb,= r0,t0,2f - ldo 1(arg0),arg1 -1: ldo -1(arg2),arg2 - cmpb,COND(=),n r0,arg2,2f - ldb 0(t1),arg0 - stb arg0,0(arg1) - ldo 1(t1),t1 - cmpb,<> r0,arg0,1b - ldo 1(arg1),arg1 -2: bv,n r0(rp) -ENDPROC_CFI(strncpy) - - -ENTRY_CFI(strcat, frame=0,no_calls) - ldb 0(arg0),t0 - cmpb,= t0,r0,2f - ldo 0(arg0),ret0 - ldo 1(arg0),arg0 -1: ldb 0(arg0),t1 - cmpb,<>,n r0,t1,1b - ldo 1(arg0),arg0 -2: ldb 0(arg1),t2 - stb t2,0(arg0) - ldo 1(arg0),arg0 - ldb 0(arg1),t0 - cmpb,<> r0,t0,2b - ldo 1(arg1),arg1 - bv,n r0(rp) -ENDPROC_CFI(strcat) - - -ENTRY_CFI(memset, frame=0,no_calls) - copy arg0,ret0 - cmpb,COND(=) r0,arg0,4f - copy arg0,t2 - cmpb,COND(=) r0,arg2,4f - ldo -1(arg2),arg3 - subi -1,arg3,t0 - subi 0,t0,t1 - cmpiclr,COND(>=) 0,t1,arg2 - ldo -1(t1),arg2 - extru arg2,31,2,arg0 -2: stb arg1,0(t2) - ldo 1(t2),t2 - addib,>= -1,arg0,2b - ldo -1(arg3),arg3 - cmpiclr,COND(<=) 4,arg2,r0 - b,l,n 4f,r0 -#ifdef CONFIG_64BIT - depd,* r0,63,2,arg2 -#else - depw r0,31,2,arg2 -#endif - ldo 1(t2),t2 -3: stb arg1,-1(t2) - stb arg1,0(t2) - stb arg1,1(t2) - stb arg1,2(t2) - addib,COND(>) -4,arg2,3b - ldo 4(t2),t2 -4: bv,n r0(rp) -ENDPROC_CFI(memset) - - .end From 1ee23155bd0715c376a1c54437341a038bf30ff4 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Thu, 26 Aug 2021 12:46:01 -0700 Subject: [PATCH 0465/5344] net: don't unconditionally copy_from_user a struct ifreq for socket ioctls commit d0efb16294d145d157432feda83877ae9d7cdf37 upstream. A common implementation of isatty(3) involves calling a ioctl passing a dummy struct argument and checking whether the syscall failed -- bionic and glibc use TCGETS (passing a struct termios), and musl uses TIOCGWINSZ (passing a struct winsize). If the FD is a socket, we will copy sizeof(struct ifreq) bytes of data from the argument and return -EFAULT if that fails. The result is that the isatty implementations may return a non-POSIX-compliant value in errno in the case where part of the dummy struct argument is inaccessible, as both struct termios and struct winsize are smaller than struct ifreq (at least on arm64). Although there is usually enough stack space following the argument on the stack that this did not present a practical problem up to now, with MTE stack instrumentation it's more likely for the copy to fail, as the memory following the struct may have a different tag. Fix the problem by adding an early check for whether the ioctl is a valid socket ioctl, and return -ENOTTY if it isn't. Fixes: 44c02a2c3dc5 ("dev_ioctl(): move copyin/copyout to callers") Link: https://linux-review.googlesource.com/id/I869da6cf6daabc3e4b7b82ac979683ba05e27d4d Signed-off-by: Peter Collingbourne Cc: # 4.19 Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/netdevice.h | 4 ++++ net/socket.c | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 909e96f7a6ef0..7cf0b416bade9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3730,6 +3730,10 @@ int netdev_rx_handler_register(struct net_device *dev, void netdev_rx_handler_unregister(struct net_device *dev); bool dev_valid_name(const char *name); +static inline bool is_socket_ioctl_cmd(unsigned int cmd) +{ + return _IOC_TYPE(cmd) == SOCK_IOC_TYPE; +} int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_copyout); int dev_ifconf(struct net *net, struct ifconf *, int); diff --git a/net/socket.c b/net/socket.c index 0ae4ee7223dd5..5e0b7c8de1ef8 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1054,7 +1054,7 @@ static long sock_do_ioctl(struct net *net, struct socket *sock, rtnl_unlock(); if (!err && copy_to_user(argp, &ifc, sizeof(struct ifconf))) err = -EFAULT; - } else { + } else if (is_socket_ioctl_cmd(cmd)) { struct ifreq ifr; bool need_copyout; if (copy_from_user(&ifr, argp, sizeof(struct ifreq))) @@ -1063,6 +1063,8 @@ static long sock_do_ioctl(struct net *net, struct socket *sock, if (!err && need_copyout) if (copy_to_user(argp, &ifr, sizeof(struct ifreq))) return -EFAULT; + } else { + err = -ENOTTY; } return err; } @@ -3266,6 +3268,8 @@ static int compat_ifr_data_ioctl(struct net *net, unsigned int cmd, struct ifreq ifreq; u32 data32; + if (!is_socket_ioctl_cmd(cmd)) + return -ENOTTY; if (copy_from_user(ifreq.ifr_name, u_ifreq32->ifr_name, IFNAMSIZ)) return -EFAULT; if (get_user(data32, &u_ifreq32->ifr_data)) From 61cbae5978d30af86b88bd4565df5bc0b1575898 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 3 Sep 2021 10:08:16 +0200 Subject: [PATCH 0466/5344] Linux 5.4.144 Link: https://lore.kernel.org/r/20210901122253.388326997@linuxfoundation.org Tested-by: Florian Fainelli Tested-by: Jon Hunter Tested-by: Shuah Khan Tested-by: Hulk Robot Tested-by: Linux Kernel Functional Testing Tested-by: Sudip Mukherjee Tested-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e99fabc4dfc8c..3c3804197b511 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 PATCHLEVEL = 4 -SUBLEVEL = 143 +SUBLEVEL = 144 EXTRAVERSION = NAME = Kleptomaniac Octopus From a223ad130299f6e914008f050fc5feba5bcb4045 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 20 Aug 2021 23:44:17 -0400 Subject: [PATCH 0467/5344] ext4: fix race writing to an inline_data file while its xattrs are changing commit a54c4613dac1500b40e4ab55199f7c51f028e848 upstream. The location of the system.data extended attribute can change whenever xattr_sem is not taken. So we need to recalculate the i_inline_off field since it mgiht have changed between ext4_write_begin() and ext4_write_end(). This means that caching i_inline_off is probably not helpful, so in the long run we should probably get rid of it and shrink the in-memory ext4 inode slightly, but let's fix the race the simple way for now. Cc: stable@kernel.org Fixes: f19d5870cbf72 ("ext4: add normal write support for inline data") Reported-by: syzbot+13146364637c7363a7de@syzkaller.appspotmail.com Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inline.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 519378a15bc6b..46151bda62368 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -750,6 +750,12 @@ int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, ext4_write_lock_xattr(inode, &no_expand); BUG_ON(!ext4_has_inline_data(inode)); + /* + * ei->i_inline_off may have changed since ext4_write_begin() + * called ext4_try_to_write_inline_data() + */ + (void) ext4_find_inline_data_nolock(inode); + kaddr = kmap_atomic(page); ext4_write_inline_data(inode, &iloc, kaddr, pos, len); kunmap_atomic(kaddr); From 3f33ec84c056ba5f7052ede1169866917239cfa2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 1 Sep 2021 09:40:38 -0700 Subject: [PATCH 0468/5344] fscrypt: add fscrypt_symlink_getattr() for computing st_size commit d18760560593e5af921f51a8c9b64b6109d634c2 upstream. Add a helper function fscrypt_symlink_getattr() which will be called from the various filesystems' ->getattr() methods to read and decrypt the target of encrypted symlinks in order to report the correct st_size. Detailed explanation: As required by POSIX and as documented in various man pages, st_size for a symlink is supposed to be the length of the symlink target. Unfortunately, st_size has always been wrong for encrypted symlinks because st_size is populated from i_size from disk, which intentionally contains the length of the encrypted symlink target. That's slightly greater than the length of the decrypted symlink target (which is the symlink target that userspace usually sees), and usually won't match the length of the no-key encoded symlink target either. This hadn't been fixed yet because reporting the correct st_size would require reading the symlink target from disk and decrypting or encoding it, which historically has been considered too heavyweight to do in ->getattr(). Also historically, the wrong st_size had only broken a test (LTP lstat03) and there were no known complaints from real users. (This is probably because the st_size of symlinks isn't used too often, and when it is, typically it's for a hint for what buffer size to pass to readlink() -- which a slightly-too-large size still works for.) However, a couple things have changed now. First, there have recently been complaints about the current behavior from real users: - Breakage in rpmbuild: https://github.com/rpm-software-management/rpm/issues/1682 https://github.com/google/fscrypt/issues/305 - Breakage in toybox cpio: https://www.mail-archive.com/toybox@lists.landley.net/msg07193.html - Breakage in libgit2: https://issuetracker.google.com/issues/189629152 (on Android public issue tracker, requires login) Second, we now cache decrypted symlink targets in ->i_link. Therefore, taking the performance hit of reading and decrypting the symlink target in ->getattr() wouldn't be as big a deal as it used to be, since usually it will just save having to do the same thing later. Also note that eCryptfs ended up having to read and decrypt symlink targets in ->getattr() as well, to fix this same issue; see commit 3a60a1686f0d ("eCryptfs: Decrypt symlink target for stat size"). So, let's just bite the bullet, and read and decrypt the symlink target in ->getattr() in order to report the correct st_size. Add a function fscrypt_symlink_getattr() which the filesystems will call to do this. (Alternatively, we could store the decrypted size of symlinks on-disk. But there isn't a great place to do so, and encryption is meant to hide the original size to some extent; that property would be lost.) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210702065350.209646-2-ebiggers@kernel.org Signed-off-by: Eric Biggers Signed-off-by: Greg Kroah-Hartman --- fs/crypto/hooks.c | 44 +++++++++++++++++++++++++++++++++++++++++ include/linux/fscrypt.h | 7 +++++++ 2 files changed, 51 insertions(+) diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index a5a40a76b8ed7..82575cfbb04db 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -305,3 +305,47 @@ const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, return ERR_PTR(err); } EXPORT_SYMBOL_GPL(fscrypt_get_symlink); + +/** + * fscrypt_symlink_getattr() - set the correct st_size for encrypted symlinks + * @path: the path for the encrypted symlink being queried + * @stat: the struct being filled with the symlink's attributes + * + * Override st_size of encrypted symlinks to be the length of the decrypted + * symlink target (or the no-key encoded symlink target, if the key is + * unavailable) rather than the length of the encrypted symlink target. This is + * necessary for st_size to match the symlink target that userspace actually + * sees. POSIX requires this, and some userspace programs depend on it. + * + * This requires reading the symlink target from disk if needed, setting up the + * inode's encryption key if possible, and then decrypting or encoding the + * symlink target. This makes lstat() more heavyweight than is normally the + * case. However, decrypted symlink targets will be cached in ->i_link, so + * usually the symlink won't have to be read and decrypted again later if/when + * it is actually followed, readlink() is called, or lstat() is called again. + * + * Return: 0 on success, -errno on failure + */ +int fscrypt_symlink_getattr(const struct path *path, struct kstat *stat) +{ + struct dentry *dentry = path->dentry; + struct inode *inode = d_inode(dentry); + const char *link; + DEFINE_DELAYED_CALL(done); + + /* + * To get the symlink target that userspace will see (whether it's the + * decrypted target or the no-key encoded target), we can just get it in + * the same way the VFS does during path resolution and readlink(). + */ + link = READ_ONCE(inode->i_link); + if (!link) { + link = inode->i_op->get_link(dentry, inode, &done); + if (IS_ERR(link)) + return PTR_ERR(link); + } + stat->size = strlen(link); + do_delayed_call(&done); + return 0; +} +EXPORT_SYMBOL_GPL(fscrypt_symlink_getattr); diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 032e5bcf97012..0d1a53d6c52dc 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -298,6 +298,7 @@ extern int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, extern const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, unsigned int max_size, struct delayed_call *done); +int fscrypt_symlink_getattr(const struct path *path, struct kstat *stat); static inline void fscrypt_set_ops(struct super_block *sb, const struct fscrypt_operations *s_cop) { @@ -585,6 +586,12 @@ static inline const char *fscrypt_get_symlink(struct inode *inode, return ERR_PTR(-EOPNOTSUPP); } +static inline int fscrypt_symlink_getattr(const struct path *path, + struct kstat *stat) +{ + return -EOPNOTSUPP; +} + static inline void fscrypt_set_ops(struct super_block *sb, const struct fscrypt_operations *s_cop) { From 3aa5bffd5eef6b9a68ee80f7ff57b6aa7a630095 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 1 Sep 2021 09:40:39 -0700 Subject: [PATCH 0469/5344] ext4: report correct st_size for encrypted symlinks commit 8c4bca10ceafc43b1ca0a9fab5fa27e13cbce99e upstream. The stat() family of syscalls report the wrong size for encrypted symlinks, which has caused breakage in several userspace programs. Fix this by calling fscrypt_symlink_getattr() after ext4_getattr() for encrypted symlinks. This function computes the correct size by reading and decrypting the symlink target (if it's not already cached). For more details, see the commit which added fscrypt_symlink_getattr(). Fixes: f348c252320b ("ext4 crypto: add symlink encryption") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210702065350.209646-3-ebiggers@kernel.org Signed-off-by: Eric Biggers Signed-off-by: Greg Kroah-Hartman --- fs/ext4/symlink.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index dd05af983092d..a9457fed351ed 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -52,10 +52,19 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry, return paddr; } +static int ext4_encrypted_symlink_getattr(const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) +{ + ext4_getattr(path, stat, request_mask, query_flags); + + return fscrypt_symlink_getattr(path, stat); +} + const struct inode_operations ext4_encrypted_symlink_inode_operations = { .get_link = ext4_encrypted_get_link, .setattr = ext4_setattr, - .getattr = ext4_getattr, + .getattr = ext4_encrypted_symlink_getattr, .listxattr = ext4_listxattr, }; From 9f359a187ed91cd5506548df9d64e623e8a67fd8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 1 Sep 2021 09:40:40 -0700 Subject: [PATCH 0470/5344] f2fs: report correct st_size for encrypted symlinks commit 461b43a8f92e68e96c4424b31e15f2b35f1bbfa9 upstream. The stat() family of syscalls report the wrong size for encrypted symlinks, which has caused breakage in several userspace programs. Fix this by calling fscrypt_symlink_getattr() after f2fs_getattr() for encrypted symlinks. This function computes the correct size by reading and decrypting the symlink target (if it's not already cached). For more details, see the commit which added fscrypt_symlink_getattr(). Fixes: cbaf042a3cc6 ("f2fs crypto: add symlink encryption") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210702065350.209646-4-ebiggers@kernel.org Signed-off-by: Eric Biggers Signed-off-by: Greg Kroah-Hartman --- fs/f2fs/namei.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 3a97ac56821ba..81a18ba18e301 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -1256,9 +1256,18 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry, return target; } +static int f2fs_encrypted_symlink_getattr(const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) +{ + f2fs_getattr(path, stat, request_mask, query_flags); + + return fscrypt_symlink_getattr(path, stat); +} + const struct inode_operations f2fs_encrypted_symlink_inode_operations = { .get_link = f2fs_encrypted_get_link, - .getattr = f2fs_getattr, + .getattr = f2fs_encrypted_symlink_getattr, .setattr = f2fs_setattr, #ifdef CONFIG_F2FS_FS_XATTR .listxattr = f2fs_listxattr, From cf594159d01ccff205facdcdeabb4be771f0ca44 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 1 Sep 2021 09:40:41 -0700 Subject: [PATCH 0471/5344] ubifs: report correct st_size for encrypted symlinks commit 064c734986011390b4d111f1a99372b7f26c3850 upstream. The stat() family of syscalls report the wrong size for encrypted symlinks, which has caused breakage in several userspace programs. Fix this by calling fscrypt_symlink_getattr() after ubifs_getattr() for encrypted symlinks. This function computes the correct size by reading and decrypting the symlink target (if it's not already cached). For more details, see the commit which added fscrypt_symlink_getattr(). Fixes: ca7f85be8d6c ("ubifs: Add support for encrypted symlinks") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210702065350.209646-5-ebiggers@kernel.org Signed-off-by: Eric Biggers Signed-off-by: Greg Kroah-Hartman --- fs/ubifs/file.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 8dada89bbe4da..6069c63d833ae 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1629,6 +1629,16 @@ static const char *ubifs_get_link(struct dentry *dentry, return fscrypt_get_symlink(inode, ui->data, ui->data_len, done); } +static int ubifs_symlink_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) +{ + ubifs_getattr(path, stat, request_mask, query_flags); + + if (IS_ENCRYPTED(d_inode(path->dentry))) + return fscrypt_symlink_getattr(path, stat); + return 0; +} + const struct address_space_operations ubifs_file_address_operations = { .readpage = ubifs_readpage, .writepage = ubifs_writepage, @@ -1654,7 +1664,7 @@ const struct inode_operations ubifs_file_inode_operations = { const struct inode_operations ubifs_symlink_inode_operations = { .get_link = ubifs_get_link, .setattr = ubifs_setattr, - .getattr = ubifs_getattr, + .getattr = ubifs_symlink_getattr, #ifdef CONFIG_UBIFS_FS_XATTR .listxattr = ubifs_listxattr, #endif From cebc2a3400d646636132a1c3436cb348e9e2af1d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 26 May 2021 00:03:37 -0700 Subject: [PATCH 0472/5344] xtensa: fix kconfig unmet dependency warning for HAVE_FUTEX_CMPXCHG commit ed5aacc81cd41efc4d561e14af408d1003f7b855 upstream. XTENSA should only select HAVE_FUTEX_CMPXCHG when FUTEX is set/enabled. This prevents a kconfig warning. WARNING: unmet direct dependencies detected for HAVE_FUTEX_CMPXCHG Depends on [n]: FUTEX [=n] Selected by [y]: - XTENSA [=y] && !MMU [=n] Fixes: d951ba21b959 ("xtensa: nommu: select HAVE_FUTEX_CMPXCHG") Signed-off-by: Randy Dunlap Cc: Max Filippov Cc: Chris Zankel Cc: linux-xtensa@linux-xtensa.org Message-Id: <20210526070337.28130-1-rdunlap@infradead.org> Signed-off-by: Max Filippov Signed-off-by: Greg Kroah-Hartman --- arch/xtensa/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 8352037322dfb..499bdd1f0c715 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -27,7 +27,7 @@ config XTENSA select HAVE_DMA_CONTIGUOUS select HAVE_EXIT_THREAD select HAVE_FUNCTION_TRACER - select HAVE_FUTEX_CMPXCHG if !MMU + select HAVE_FUTEX_CMPXCHG if !MMU && FUTEX select HAVE_HW_BREAKPOINT if PERF_EVENTS select HAVE_IRQ_TIME_ACCOUNTING select HAVE_OPROFILE From c1c80738801484d30fb8693d4b6580f27af8621f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Ha=C5=82asa?= Date: Mon, 7 Jun 2021 12:49:07 +0200 Subject: [PATCH 0473/5344] gpu: ipu-v3: Fix i.MX IPU-v3 offset calculations for (semi)planar U/V formats [ Upstream commit 7cca7c8096e2c8a4149405438329b5035d0744f0 ] Video captured in 1400x1050 resolution (bytesperline aka stride = 1408 bytes) is invalid. Fix it. Signed-off-by: Krzysztof Halasa Link: https://lore.kernel.org/r/m3y2bmq7a4.fsf@t19.piap.pl [p.zabel@pengutronix.de: added "gpu: ipu-v3:" prefix to commit description] Signed-off-by: Philipp Zabel Signed-off-by: Sasha Levin --- drivers/gpu/ipu-v3/ipu-cpmem.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/ipu-v3/ipu-cpmem.c b/drivers/gpu/ipu-v3/ipu-cpmem.c index a1c85d1521f5c..82b244cb313e6 100644 --- a/drivers/gpu/ipu-v3/ipu-cpmem.c +++ b/drivers/gpu/ipu-v3/ipu-cpmem.c @@ -585,21 +585,21 @@ static const struct ipu_rgb def_bgra_16 = { .bits_per_pixel = 16, }; -#define Y_OFFSET(pix, x, y) ((x) + pix->width * (y)) -#define U_OFFSET(pix, x, y) ((pix->width * pix->height) + \ - (pix->width * ((y) / 2) / 2) + (x) / 2) -#define V_OFFSET(pix, x, y) ((pix->width * pix->height) + \ - (pix->width * pix->height / 4) + \ - (pix->width * ((y) / 2) / 2) + (x) / 2) -#define U2_OFFSET(pix, x, y) ((pix->width * pix->height) + \ - (pix->width * (y) / 2) + (x) / 2) -#define V2_OFFSET(pix, x, y) ((pix->width * pix->height) + \ - (pix->width * pix->height / 2) + \ - (pix->width * (y) / 2) + (x) / 2) -#define UV_OFFSET(pix, x, y) ((pix->width * pix->height) + \ - (pix->width * ((y) / 2)) + (x)) -#define UV2_OFFSET(pix, x, y) ((pix->width * pix->height) + \ - (pix->width * y) + (x)) +#define Y_OFFSET(pix, x, y) ((x) + pix->bytesperline * (y)) +#define U_OFFSET(pix, x, y) ((pix->bytesperline * pix->height) + \ + (pix->bytesperline * ((y) / 2) / 2) + (x) / 2) +#define V_OFFSET(pix, x, y) ((pix->bytesperline * pix->height) + \ + (pix->bytesperline * pix->height / 4) + \ + (pix->bytesperline * ((y) / 2) / 2) + (x) / 2) +#define U2_OFFSET(pix, x, y) ((pix->bytesperline * pix->height) + \ + (pix->bytesperline * (y) / 2) + (x) / 2) +#define V2_OFFSET(pix, x, y) ((pix->bytesperline * pix->height) + \ + (pix->bytesperline * pix->height / 2) + \ + (pix->bytesperline * (y) / 2) + (x) / 2) +#define UV_OFFSET(pix, x, y) ((pix->bytesperline * pix->height) + \ + (pix->bytesperline * ((y) / 2)) + (x)) +#define UV2_OFFSET(pix, x, y) ((pix->bytesperline * pix->height) + \ + (pix->bytesperline * y) + (x)) #define NUM_ALPHA_CHANNELS 7 From e112cb93d8df7c1c89a4117d25138fada18ce8b7 Mon Sep 17 00:00:00 2001 From: Sai Krishna Potthuri Date: Wed, 23 Jun 2021 13:46:20 +0200 Subject: [PATCH 0474/5344] reset: reset-zynqmp: Fixed the argument data type [ Upstream commit ed104ca4bd9c405b41e968ad4ece51f6462e90b6 ] This patch changes the data type of the variable 'val' from int to u32. Addresses-Coverity: argument of type "int *" is incompatible with parameter of type "u32 *" Signed-off-by: Sai Krishna Potthuri Signed-off-by: Michal Simek Link: https://lore.kernel.org/r/925cebbe4eb73c7d0a536da204748d33c7100d8c.1624448778.git.michal.simek@xilinx.com Signed-off-by: Philipp Zabel Signed-off-by: Sasha Levin --- drivers/reset/reset-zynqmp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/reset/reset-zynqmp.c b/drivers/reset/reset-zynqmp.c index 99e75d92dadab..8a7473b6ba585 100644 --- a/drivers/reset/reset-zynqmp.c +++ b/drivers/reset/reset-zynqmp.c @@ -46,7 +46,8 @@ static int zynqmp_reset_status(struct reset_controller_dev *rcdev, unsigned long id) { struct zynqmp_reset_data *priv = to_zynqmp_reset_data(rcdev); - int val, err; + int err; + u32 val; err = priv->eemi_ops->reset_get_status(ZYNQMP_RESET_ID + id, &val); if (err) From aea986d96636bd9751fd74eba0ec75d0f4c09ca3 Mon Sep 17 00:00:00 2001 From: Shai Malin Date: Sun, 22 Aug 2021 22:21:14 +0300 Subject: [PATCH 0475/5344] qed: Fix the VF msix vectors flow [ Upstream commit b0cd08537db8d2fbb227cdb2e5835209db295a24 ] For VFs we should return with an error in case we didn't get the exact number of msix vectors as we requested. Not doing that will lead to a crash when starting queues for this VF. Signed-off-by: Prabhakar Kushwaha Signed-off-by: Ariel Elior Signed-off-by: Shai Malin Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/qlogic/qed/qed_main.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index bc1f5b36b5bf2..1db49424aa43c 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -559,7 +559,12 @@ static int qed_enable_msix(struct qed_dev *cdev, rc = cnt; } - if (rc > 0) { + /* For VFs, we should return with an error in case we didn't get the + * exact number of msix vectors as we requested. + * Not doing that will lead to a crash when starting queues for + * this VF. + */ + if ((IS_PF(cdev) && rc > 0) || (IS_VF(cdev) && rc == cnt)) { /* MSI-x configuration was achieved */ int_params->out.int_mode = QED_INT_MODE_MSIX; int_params->out.num_vectors = rc; From 11b28f8f10b0dcf8c38bb06a24cb9b36cfd1d8a7 Mon Sep 17 00:00:00 2001 From: Harini Katakam Date: Tue, 24 Aug 2021 15:32:09 +0530 Subject: [PATCH 0476/5344] net: macb: Add a NULL check on desc_ptp [ Upstream commit 85520079afce885b80647fbd0d13d8f03d057167 ] macb_ptp_desc will not return NULL under most circumstances with correct Kconfig and IP design config register. But for the sake of the extreme corner case, check for NULL when using the helper. In case of rx_tstamp, no action is necessary except to return (similar to timestamp disabled) and warn. In case of TX, return -EINVAL to let the skb be free. Perform this check before marking skb in progress. Fixes coverity warning: (4) Event dereference: Dereferencing a null pointer "desc_ptp" Signed-off-by: Harini Katakam Reviewed-by: Radhey Shyam Pandey Signed-off-by: Michal Simek Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/cadence/macb_ptp.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cadence/macb_ptp.c b/drivers/net/ethernet/cadence/macb_ptp.c index 43a3f0dbf857c..f5aec28c77306 100644 --- a/drivers/net/ethernet/cadence/macb_ptp.c +++ b/drivers/net/ethernet/cadence/macb_ptp.c @@ -275,6 +275,12 @@ void gem_ptp_rxstamp(struct macb *bp, struct sk_buff *skb, if (GEM_BFEXT(DMA_RXVALID, desc->addr)) { desc_ptp = macb_ptp_desc(bp, desc); + /* Unlikely but check */ + if (!desc_ptp) { + dev_warn_ratelimited(&bp->pdev->dev, + "Timestamp not supported in BD\n"); + return; + } gem_hw_timestamp(bp, desc_ptp->ts_1, desc_ptp->ts_2, &ts); memset(shhwtstamps, 0, sizeof(struct skb_shared_hwtstamps)); shhwtstamps->hwtstamp = ktime_set(ts.tv_sec, ts.tv_nsec); @@ -307,8 +313,11 @@ int gem_ptp_txstamp(struct macb_queue *queue, struct sk_buff *skb, if (CIRC_SPACE(head, tail, PTP_TS_BUFFER_SIZE) == 0) return -ENOMEM; - skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; desc_ptp = macb_ptp_desc(queue->bp, desc); + /* Unlikely but check */ + if (!desc_ptp) + return -EINVAL; + skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; tx_timestamp = &queue->tx_timestamps[head]; tx_timestamp->skb = skb; /* ensure ts_1/ts_2 is loaded after ctrl (TX_USED check) */ From bdba8e16777a321a4f0f203f13781e951d50cee9 Mon Sep 17 00:00:00 2001 From: Shai Malin Date: Tue, 24 Aug 2021 19:52:49 +0300 Subject: [PATCH 0477/5344] qede: Fix memset corruption [ Upstream commit e543468869e2532f5d7926e8f417782b48eca3dc ] Thanks to Kees Cook who detected the problem of memset that starting from not the first member, but sized for the whole struct. The better change will be to remove the redundant memset and to clear only the msix_cnt member. Signed-off-by: Prabhakar Kushwaha Signed-off-by: Ariel Elior Signed-off-by: Shai Malin Reported-by: Kees Cook Reviewed-by: Kees Cook Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/qlogic/qede/qede_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index ce3e62e73e4cd..1133f6fe21a0e 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -1773,6 +1773,7 @@ static void qede_sync_free_irqs(struct qede_dev *edev) } edev->int_info.used_cnt = 0; + edev->int_info.msix_cnt = 0; } static int qede_req_msix_irqs(struct qede_dev *edev) @@ -2317,7 +2318,6 @@ static int qede_load(struct qede_dev *edev, enum qede_load_mode mode, goto out; err4: qede_sync_free_irqs(edev); - memset(&edev->int_info.msix_cnt, 0, sizeof(struct qed_int_info)); err3: qede_napi_disable_remove(edev); err2: From 84ce9552f980f26a5c66f576fe747737915306d8 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Tue, 24 Aug 2021 12:06:22 +0800 Subject: [PATCH 0478/5344] perf/x86/intel/pt: Fix mask of num_address_ranges [ Upstream commit c53c6b7409f4cd9e542991b53d597fbe2751d7db ] Per SDM, bit 2:0 of CPUID(0x14,1).EAX[2:0] reports the number of configurable address ranges for filtering, not bit 1:0. Signed-off-by: Xiaoyao Li Signed-off-by: Peter Zijlstra (Intel) Acked-by: Alexander Shishkin Link: https://lkml.kernel.org/r/20210824040622.4081502-1-xiaoyao.li@intel.com Signed-off-by: Sasha Levin --- arch/x86/events/intel/pt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 05e43d0f430bc..da289a44d5116 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -62,7 +62,7 @@ static struct pt_cap_desc { PT_CAP(single_range_output, 0, CPUID_ECX, BIT(2)), PT_CAP(output_subsys, 0, CPUID_ECX, BIT(3)), PT_CAP(payloads_lip, 0, CPUID_ECX, BIT(31)), - PT_CAP(num_address_ranges, 1, CPUID_EAX, 0x3), + PT_CAP(num_address_ranges, 1, CPUID_EAX, 0x7), PT_CAP(mtc_periods, 1, CPUID_EAX, 0xffff0000), PT_CAP(cycle_thresholds, 1, CPUID_EBX, 0xffff), PT_CAP(psb_periods, 1, CPUID_EBX, 0xffff0000), From ab128f846d6a20af23ec0d73917995e533d964b5 Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Tue, 17 Aug 2021 17:10:42 -0500 Subject: [PATCH 0479/5344] perf/x86/amd/ibs: Work around erratum #1197 [ Upstream commit 26db2e0c51fe83e1dd852c1321407835b481806e ] Erratum #1197 "IBS (Instruction Based Sampling) Register State May be Incorrect After Restore From CC6" is published in a document: "Revision Guide for AMD Family 19h Models 00h-0Fh Processors" 56683 Rev. 1.04 July 2021 https://bugzilla.kernel.org/show_bug.cgi?id=206537 Implement the erratum's suggested workaround and ignore IBS samples if MSRC001_1031 == 0. Signed-off-by: Kim Phillips Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210817221048.88063-3-kim.phillips@amd.com Signed-off-by: Sasha Levin --- arch/x86/events/amd/ibs.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 0a54b8c2a457a..b7baaa9733173 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -90,6 +90,7 @@ struct perf_ibs { unsigned long offset_mask[1]; int offset_max; unsigned int fetch_count_reset_broken : 1; + unsigned int fetch_ignore_if_zero_rip : 1; struct cpu_perf_ibs __percpu *pcpu; struct attribute **format_attrs; @@ -664,6 +665,10 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) { regs.flags &= ~PERF_EFLAGS_EXACT; } else { + /* Workaround for erratum #1197 */ + if (perf_ibs->fetch_ignore_if_zero_rip && !(ibs_data.regs[1])) + goto out; + set_linear_ip(®s, ibs_data.regs[1]); regs.flags |= PERF_EFLAGS_EXACT; } @@ -757,6 +762,9 @@ static __init void perf_event_ibs_init(void) if (boot_cpu_data.x86 >= 0x16 && boot_cpu_data.x86 <= 0x18) perf_ibs_fetch.fetch_count_reset_broken = 1; + if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model < 0x10) + perf_ibs_fetch.fetch_ignore_if_zero_rip = 1; + perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); if (ibs_caps & IBS_CAPS_OPCNT) { From cb725eb3c1e45d0f152ae681817882133c92fb2d Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Tue, 17 Aug 2021 17:10:43 -0500 Subject: [PATCH 0480/5344] perf/x86/amd/power: Assign pmu.module [ Upstream commit ccf26483416a339c114409f6e7cd02abdeaf8052 ] Assign pmu.module so the driver can't be unloaded whilst in use. Signed-off-by: Kim Phillips Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210817221048.88063-4-kim.phillips@amd.com Signed-off-by: Sasha Levin --- arch/x86/events/amd/power.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/amd/power.c b/arch/x86/events/amd/power.c index abef51320e3a5..c4892b7d0c36b 100644 --- a/arch/x86/events/amd/power.c +++ b/arch/x86/events/amd/power.c @@ -217,6 +217,7 @@ static struct pmu pmu_class = { .stop = pmu_event_stop, .read = pmu_event_read, .capabilities = PERF_PMU_CAP_NO_EXCLUDE, + .module = THIS_MODULE, }; static int power_cpu_exit(unsigned int cpu) From bfe0cf96947b1ae21f4e8efe20fab1448c421ed7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Aug 2021 18:32:50 +0200 Subject: [PATCH 0481/5344] cryptoloop: add a deprecation warning [ Upstream commit 222013f9ac30b9cec44301daa8dbd0aae38abffb ] Support for cryptoloop has been officially marked broken and deprecated in favor of dm-crypt (which supports the same broken algorithms if needed) in Linux 2.6.4 (released in March 2004), and support for it has been entirely removed from losetup in util-linux 2.23 (released in April 2013). Add a warning and a deprecation schedule. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210827163250.255325-1-hch@lst.de Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- drivers/block/Kconfig | 4 ++-- drivers/block/cryptoloop.c | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 0fc27ac14f29c..9f6782329a237 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -230,7 +230,7 @@ config BLK_DEV_LOOP_MIN_COUNT dynamically allocated with the /dev/loop-control interface. config BLK_DEV_CRYPTOLOOP - tristate "Cryptoloop Support" + tristate "Cryptoloop Support (DEPRECATED)" select CRYPTO select CRYPTO_CBC depends on BLK_DEV_LOOP @@ -242,7 +242,7 @@ config BLK_DEV_CRYPTOLOOP WARNING: This device is not safe for journaled file systems like ext3 or Reiserfs. Please use the Device Mapper crypto module instead, which can be configured to be on-disk compatible with the - cryptoloop device. + cryptoloop device. cryptoloop support will be removed in Linux 5.16. source "drivers/block/drbd/Kconfig" diff --git a/drivers/block/cryptoloop.c b/drivers/block/cryptoloop.c index 3cabc335ae744..f0a91faa43a89 100644 --- a/drivers/block/cryptoloop.c +++ b/drivers/block/cryptoloop.c @@ -189,6 +189,8 @@ init_cryptoloop(void) if (rc) printk(KERN_ERR "cryptoloop: loop_register_transfer failed\n"); + else + pr_warn("the cryptoloop driver has been deprecated and will be removed in in Linux 5.16\n"); return rc; } From a7d01e83b7b950da335040c78c2b1b2de5ea12bf Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Mon, 4 Nov 2019 18:15:15 +0100 Subject: [PATCH 0482/5344] ARM: 8918/2: only build return_address() if needed commit fb033c95c94ca1ee3d16e04ebdb85d65fb55fff8 upstream. The system currently warns if the config conditions for building return_address in arch/arm/kernel/return_address.c are not met, leaving just an EXPORT_SYMBOL_GPL(return_address) of a function defined to be 'static linline'. This is a result of aeea3592a13b ("ARM: 8158/1: LLVMLinux: use static inline in ARM ftrace.h"). Since we're not going to build anything other than an exported symbol for something that is already being defined to be an inline-able return of NULL, just avoid building the code to remove the following warning: Fixes: aeea3592a13b ("ARM: 8158/1: LLVMLinux: use static inline in ARM ftrace.h") Signed-off-by: Ben Dooks Signed-off-by: Russell King Signed-off-by: Greg Kroah-Hartman --- arch/arm/kernel/Makefile | 6 +++++- arch/arm/kernel/return_address.c | 4 ---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile index 70fd896c132a6..dc31426cae6d8 100644 --- a/arch/arm/kernel/Makefile +++ b/arch/arm/kernel/Makefile @@ -17,10 +17,14 @@ CFLAGS_REMOVE_return_address.o = -pg # Object file lists. obj-y := elf.o entry-common.o irq.o opcodes.o \ - process.o ptrace.o reboot.o return_address.o \ + process.o ptrace.o reboot.o \ setup.o signal.o sigreturn_codes.o \ stacktrace.o sys_arm.o time.o traps.o +ifneq ($(CONFIG_ARM_UNWIND),y) +obj-$(CONFIG_FRAME_POINTER) += return_address.o +endif + obj-$(CONFIG_ATAGS) += atags_parse.o obj-$(CONFIG_ATAGS_PROC) += atags_proc.o obj-$(CONFIG_DEPRECATED_PARAM_STRUCT) += atags_compat.o diff --git a/arch/arm/kernel/return_address.c b/arch/arm/kernel/return_address.c index b0d2f1fe891d1..7b42ac010fdfd 100644 --- a/arch/arm/kernel/return_address.c +++ b/arch/arm/kernel/return_address.c @@ -7,8 +7,6 @@ */ #include #include - -#if defined(CONFIG_FRAME_POINTER) && !defined(CONFIG_ARM_UNWIND) #include #include @@ -53,6 +51,4 @@ void *return_address(unsigned int level) return NULL; } -#endif /* if defined(CONFIG_FRAME_POINTER) && !defined(CONFIG_ARM_UNWIND) */ - EXPORT_SYMBOL_GPL(return_address); From 903d05d6f6027f7a99f8d8548bf67728803505fc Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 20 Aug 2021 16:32:14 +0200 Subject: [PATCH 0483/5344] ALSA: hda/realtek: Workaround for conflicting SSID on ASUS ROG Strix G17 commit 13d9c6b998aaa76fd098133277a28a21f2cc2264 upstream. ASUS ROG Strix G17 has the very same PCI and codec SSID (1043:103f) as ASUS TX300, and unfortunately, the existing quirk for TX300 is broken on ASUS ROG. Actually the device works without the quirk, so we'll need to clear the quirk before applying for this device. Since ASUS ROG has a different codec (ALC294 - while TX300 has ALC282), this patch adds a workaround for the device, just clearing the codec->fixup_id by checking the codec vendor_id. It's a bit ugly to add such a workaround there, but it seems to be the simplest way. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=214101 Cc: Link: https://lore.kernel.org/r/20210820143214.3654-1-tiwai@suse.de Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index f486e680aed1d..abe371c01fba2 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9160,6 +9160,16 @@ static int patch_alc269(struct hda_codec *codec) snd_hda_pick_fixup(codec, alc269_fixup_models, alc269_fixup_tbl, alc269_fixups); + /* FIXME: both TX300 and ROG Strix G17 have the same SSID, and + * the quirk breaks the latter (bko#214101). + * Clear the wrong entry. + */ + if (codec->fixup_id == ALC282_FIXUP_ASUS_TX300 && + codec->core.vendor_id == 0x10ec0294) { + codec_dbg(codec, "Clear wrong fixup for ASUS ROG Strix G17\n"); + codec->fixup_id = HDA_FIXUP_ID_NOT_SET; + } + snd_hda_pick_pin_fixup(codec, alc269_pin_fixup_tbl, alc269_fixups, true); snd_hda_pick_pin_fixup(codec, alc269_fallback_pin_fixup_tbl, alc269_fixups, false); snd_hda_pick_fixup(codec, NULL, alc269_fixup_vendor_tbl, From ba5a5beaf0bef2c50a08c759aec491cbba048773 Mon Sep 17 00:00:00 2001 From: Zubin Mithra Date: Fri, 27 Aug 2021 08:37:35 -0700 Subject: [PATCH 0484/5344] ALSA: pcm: fix divide error in snd_pcm_lib_ioctl commit f3eef46f0518a2b32ca1244015820c35a22cfe4a upstream. Syzkaller reported a divide error in snd_pcm_lib_ioctl. fifo_size is of type snd_pcm_uframes_t(unsigned long). If frame_size is 0x100000000, the error occurs. Fixes: a9960e6a293e ("ALSA: pcm: fix fifo_size frame calculation") Signed-off-by: Zubin Mithra Reviewed-by: Guenter Roeck Cc: Link: https://lore.kernel.org/r/20210827153735.789452-1-zsm@chromium.org Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/pcm_lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/core/pcm_lib.c b/sound/core/pcm_lib.c index 1662573a40309..fd300c3adddec 100644 --- a/sound/core/pcm_lib.c +++ b/sound/core/pcm_lib.c @@ -1736,7 +1736,7 @@ static int snd_pcm_lib_ioctl_fifo_size(struct snd_pcm_substream *substream, channels = params_channels(params); frame_size = snd_pcm_format_size(format, channels); if (frame_size > 0) - params->fifo_size /= (unsigned)frame_size; + params->fifo_size /= frame_size; } return 0; } From d84850b0277c88a8f956390e2da1c69e6f50a638 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Wed, 15 Jan 2020 16:08:12 -0800 Subject: [PATCH 0485/5344] ARC: wireup clone3 syscall commit bd71c453db91ecb464405411f2821d040f2a0d44 upstream. Signed-off-by: Vineet Gupta Signed-off-by: Greg Kroah-Hartman --- arch/arc/Kconfig | 1 + arch/arc/include/asm/syscalls.h | 1 + arch/arc/include/uapi/asm/unistd.h | 1 + arch/arc/kernel/entry.S | 12 ++++++++++++ arch/arc/kernel/process.c | 7 +++---- arch/arc/kernel/sys.c | 1 + 6 files changed, 19 insertions(+), 4 deletions(-) diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 8383155c8c824..a9d0b5310165f 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -29,6 +29,7 @@ config ARC select GENERIC_SMP_IDLE_THREAD select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK + select HAVE_COPY_THREAD_TLS select HAVE_DEBUG_STACKOVERFLOW select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_IOREMAP_PROT diff --git a/arch/arc/include/asm/syscalls.h b/arch/arc/include/asm/syscalls.h index 7ddba13e9b599..c3f4714a4f5cd 100644 --- a/arch/arc/include/asm/syscalls.h +++ b/arch/arc/include/asm/syscalls.h @@ -11,6 +11,7 @@ #include int sys_clone_wrapper(int, int, int, int, int); +int sys_clone3_wrapper(void *, size_t); int sys_cacheflush(uint32_t, uint32_t uint32_t); int sys_arc_settls(void *); int sys_arc_gettls(void); diff --git a/arch/arc/include/uapi/asm/unistd.h b/arch/arc/include/uapi/asm/unistd.h index 5eafa11151623..fa2713ae6bea5 100644 --- a/arch/arc/include/uapi/asm/unistd.h +++ b/arch/arc/include/uapi/asm/unistd.h @@ -21,6 +21,7 @@ #define __ARCH_WANT_SET_GET_RLIMIT #define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_CLONE +#define __ARCH_WANT_SYS_CLONE3 #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_TIME32_SYSCALLS diff --git a/arch/arc/kernel/entry.S b/arch/arc/kernel/entry.S index a0caf81dff038..1bd4d883e6851 100644 --- a/arch/arc/kernel/entry.S +++ b/arch/arc/kernel/entry.S @@ -35,6 +35,18 @@ ENTRY(sys_clone_wrapper) b .Lret_from_system_call END(sys_clone_wrapper) +ENTRY(sys_clone3_wrapper) + SAVE_CALLEE_SAVED_USER + bl @sys_clone3 + DISCARD_CALLEE_SAVED_USER + + GET_CURR_THR_INFO_FLAGS r10 + btst r10, TIF_SYSCALL_TRACE + bnz tracesys_exit + + b .Lret_from_system_call +END(sys_clone3_wrapper) + ENTRY(ret_from_fork) ; when the forked child comes here from the __switch_to function ; r0 has the last task pointer. diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c index 2c6d3865cc011..7cd10fabd60f4 100644 --- a/arch/arc/kernel/process.c +++ b/arch/arc/kernel/process.c @@ -171,9 +171,8 @@ asmlinkage void ret_from_fork(void); * | user_r25 | * ------------------ <===== END of PAGE */ -int copy_thread(unsigned long clone_flags, - unsigned long usp, unsigned long kthread_arg, - struct task_struct *p) +int copy_thread_tls(unsigned long clone_flags, unsigned long usp, + unsigned long kthread_arg, struct task_struct *p, unsigned long tls) { struct pt_regs *c_regs; /* child's pt_regs */ unsigned long *childksp; /* to unwind out of __switch_to() */ @@ -231,7 +230,7 @@ int copy_thread(unsigned long clone_flags, * set task's userland tls data ptr from 4th arg * clone C-lib call is difft from clone sys-call */ - task_thread_info(p)->thr_ptr = regs->r3; + task_thread_info(p)->thr_ptr = tls; } else { /* Normal fork case: set parent's TLS ptr in child */ task_thread_info(p)->thr_ptr = diff --git a/arch/arc/kernel/sys.c b/arch/arc/kernel/sys.c index fddecc76efb7c..1069446bdc589 100644 --- a/arch/arc/kernel/sys.c +++ b/arch/arc/kernel/sys.c @@ -7,6 +7,7 @@ #include #define sys_clone sys_clone_wrapper +#define sys_clone3 sys_clone3_wrapper #undef __SYSCALL #define __SYSCALL(nr, call) [nr] = (call), From 39d93672682f6a2f9774eae7b0abd06cd3457265 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Wed, 7 Jul 2021 19:54:30 +0200 Subject: [PATCH 0486/5344] media: stkwebcam: fix memory leak in stk_camera_probe commit 514e97674400462cc09c459a1ddfb9bf39017223 upstream. My local syzbot instance hit memory leak in usb_set_configuration(). The problem was in unputted usb interface. In case of errors after usb_get_intf() the reference should be putted to correclty free memory allocated for this interface. Fixes: ec16dae5453e ("V4L/DVB (7019): V4L: add support for Syntek DC1125 webcams") Cc: stable@vger.kernel.org Signed-off-by: Pavel Skripkin Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/usb/stkwebcam/stk-webcam.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/media/usb/stkwebcam/stk-webcam.c b/drivers/media/usb/stkwebcam/stk-webcam.c index 21f90a8874852..7d6a4ff6e1421 100644 --- a/drivers/media/usb/stkwebcam/stk-webcam.c +++ b/drivers/media/usb/stkwebcam/stk-webcam.c @@ -1346,7 +1346,7 @@ static int stk_camera_probe(struct usb_interface *interface, if (!dev->isoc_ep) { pr_err("Could not find isoc-in endpoint\n"); err = -ENODEV; - goto error; + goto error_put; } dev->vsettings.palette = V4L2_PIX_FMT_RGB565; dev->vsettings.mode = MODE_VGA; @@ -1359,10 +1359,12 @@ static int stk_camera_probe(struct usb_interface *interface, err = stk_register_video_device(dev); if (err) - goto error; + goto error_put; return 0; +error_put: + usb_put_intf(interface); error: v4l2_ctrl_handler_free(hdl); v4l2_device_unregister(&dev->v4l2_dev); From 84bf57eb2f3e33882704b2a4bbe895214445fe88 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 11 Jan 2021 14:09:04 -0800 Subject: [PATCH 0487/5344] USB: serial: mos7720: improve OOM-handling in read_mos_reg() commit 161a582bd1d8681095f158d11bc679a58f1d026b upstream. clang static analysis reports this problem mos7720.c:352:2: warning: Undefined or garbage value returned to caller return d; ^~~~~~~~ In the parport_mos7715_read_data()'s call to read_mos_reg(), 'd' is only set after the alloc block. buf = kmalloc(1, GFP_KERNEL); if (!buf) return -ENOMEM; Although the problem is reported in parport_most7715_read_data(), none of the callee's of read_mos_reg() check the return status. Make sure to clear the return-value buffer also on allocation failures. Fixes: 0d130367abf5 ("USB: serial: mos7720: fix control-message error handling") Signed-off-by: Tom Rix Link: https://lore.kernel.org/r/20210111220904.1035957-1-trix@redhat.com [ johan: only clear the buffer on errors, amend commit message ] Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/mos7720.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/serial/mos7720.c b/drivers/usb/serial/mos7720.c index aefc1b58d9563..f5caf38ba2be0 100644 --- a/drivers/usb/serial/mos7720.c +++ b/drivers/usb/serial/mos7720.c @@ -226,8 +226,10 @@ static int read_mos_reg(struct usb_serial *serial, unsigned int serial_portnum, int status; buf = kmalloc(1, GFP_KERNEL); - if (!buf) + if (!buf) { + *data = 0; return -ENOMEM; + } status = usb_control_msg(usbdev, pipe, request, requesttype, value, index, buf, 1, MOS_WDR_TIMEOUT); From aecb6fb9257b7e015194ff97c81a78c8d4beb2a3 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 12 Oct 2020 10:50:14 -0400 Subject: [PATCH 0488/5344] ipv4/icmp: l3mdev: Perform icmp error route lookup on source device routing table (v2) commit e1e84eb58eb494b77c8389fc6308b5042dcce791 upstream. As per RFC792, ICMP errors should be sent to the source host. However, in configurations with Virtual Routing and Forwarding tables, looking up which routing table to use is currently done by using the destination net_device. commit 9d1a6c4ea43e ("net: icmp_route_lookup should use rt dev to determine L3 domain") changes the interface passed to l3mdev_master_ifindex() and inet_addr_type_dev_table() from skb_in->dev to skb_dst(skb_in)->dev. This effectively uses the destination device rather than the source device for choosing which routing table should be used to lookup where to send the ICMP error. Therefore, if the source and destination interfaces are within separate VRFs, or one in the global routing table and the other in a VRF, looking up the source host in the destination interface's routing table will fail if the destination interface's routing table contains no route to the source host. One observable effect of this issue is that traceroute does not work in the following cases: - Route leaking between global routing table and VRF - Route leaking between VRFs Preferably use the source device routing table when sending ICMP error messages. If no source device is set, fall-back on the destination device routing table. Else, use the main routing table (index 0). [ It has been pointed out that a similar issue may exist with ICMP errors triggered when forwarding between network namespaces. It would be worthwhile to investigate, but is outside of the scope of this investigation. ] [ It has also been pointed out that a similar issue exists with unreachable / fragmentation needed messages, which can be triggered by changing the MTU of eth1 in r1 to 1400 and running: ip netns exec h1 ping -s 1450 -Mdo -c1 172.16.2.2 Some investigation points to raw_icmp_error() and raw_err() as being involved in this last scenario. The focus of this patch is TTL expired ICMP messages, which go through icmp_route_lookup. Investigation of failure modes related to raw_icmp_error() is beyond this investigation's scope. ] Fixes: 9d1a6c4ea43e ("net: icmp_route_lookup should use rt dev to determine L3 domain") Link: https://tools.ietf.org/html/rfc792 Signed-off-by: Mathieu Desnoyers Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- net/ipv4/icmp.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index c88612242c89f..f86f948a4b4c1 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -460,6 +460,23 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) local_bh_enable(); } +/* + * The device used for looking up which routing table to use for sending an ICMP + * error is preferably the source whenever it is set, which should ensure the + * icmp error can be sent to the source host, else lookup using the routing + * table of the destination device, else use the main routing table (index 0). + */ +static struct net_device *icmp_get_route_lookup_dev(struct sk_buff *skb) +{ + struct net_device *route_lookup_dev = NULL; + + if (skb->dev) + route_lookup_dev = skb->dev; + else if (skb_dst(skb)) + route_lookup_dev = skb_dst(skb)->dev; + return route_lookup_dev; +} + static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, struct sk_buff *skb_in, @@ -468,6 +485,7 @@ static struct rtable *icmp_route_lookup(struct net *net, int type, int code, struct icmp_bxm *param) { + struct net_device *route_lookup_dev; struct rtable *rt, *rt2; struct flowi4 fl4_dec; int err; @@ -482,7 +500,8 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; - fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev); + route_lookup_dev = icmp_get_route_lookup_dev(skb_in); + fl4->flowi4_oif = l3mdev_master_ifindex(route_lookup_dev); security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); rt = ip_route_output_key_hash(net, fl4, skb_in); @@ -506,7 +525,7 @@ static struct rtable *icmp_route_lookup(struct net *net, if (err) goto relookup_failed; - if (inet_addr_type_dev_table(net, skb_dst(skb_in)->dev, + if (inet_addr_type_dev_table(net, route_lookup_dev, fl4_dec.saddr) == RTN_LOCAL) { rt2 = __ip_route_output_key(net, &fl4_dec); if (IS_ERR(rt2)) From ef724db54c3b4f2983c7cf883b4395a39541a793 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 25 Mar 2020 09:42:57 -0700 Subject: [PATCH 0489/5344] powerpc/boot: Delete unneeded .globl _zimage_start commit 968339fad422a58312f67718691b717dac45c399 upstream. .globl sets the symbol binding to STB_GLOBAL while .weak sets the binding to STB_WEAK. GNU as let .weak override .globl since binutils-gdb 5ca547dc2399a0a5d9f20626d4bf5547c3ccfddd (1996). Clang integrated assembler let the last win but it may error in the future. Since it is a convention that only one binding directive is used, just delete .globl. Fixes: ee9d21b3b358 ("powerpc/boot: Ensure _zimage_start is a weak symbol") Signed-off-by: Fangrui Song Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200325164257.170229-1-maskray@google.com Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/boot/crt0.S | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/powerpc/boot/crt0.S b/arch/powerpc/boot/crt0.S index 92608f34d3123..1d83966f5ef64 100644 --- a/arch/powerpc/boot/crt0.S +++ b/arch/powerpc/boot/crt0.S @@ -44,9 +44,6 @@ p_end: .long _end p_pstack: .long _platform_stack_top #endif - .globl _zimage_start - /* Clang appears to require the .weak directive to be after the symbol - * is defined. See https://bugs.llvm.org/show_bug.cgi?id=38921 */ .weak _zimage_start _zimage_start: .globl _zimage_start_lib From 66da6bb1fc794903233c98bdc62daa60d76a04b5 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 14 Dec 2020 19:11:25 -0800 Subject: [PATCH 0490/5344] mm/page_alloc: speed up the iteration of max_order commit 7ad69832f37e3cea8557db6df7c793905f1135e8 upstream. When we free a page whose order is very close to MAX_ORDER and greater than pageblock_order, it wastes some CPU cycles to increase max_order to MAX_ORDER one by one and check the pageblock migratetype of that page repeatedly especially when MAX_ORDER is much larger than pageblock_order. We also should not be checking migratetype of buddy when "order == MAX_ORDER - 1" as the buddy pfn may be invalid, so adjust the condition. With the new check, we don't need the max_order check anymore, so we replace it. Also adjust max_order initialization so that it's lower by one than previously, which makes the code hopefully more clear. Link: https://lkml.kernel.org/r/20201204155109.55451-1-songmuchun@bytedance.com Fixes: d9dddbf55667 ("mm/page_alloc: prevent merging between isolated and other pageblocks") Signed-off-by: Muchun Song Acked-by: Vlastimil Babka Reviewed-by: Oscar Salvador Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/page_alloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e6e708a7e65c0..b63208341a3f0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -906,7 +906,7 @@ static inline void __free_one_page(struct page *page, unsigned int max_order; struct capture_control *capc = task_capc(zone); - max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); + max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order); VM_BUG_ON(!zone_is_initialized(zone)); VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); @@ -919,7 +919,7 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON_PAGE(bad_range(zone, page), page); continue_merging: - while (order < max_order - 1) { + while (order < max_order) { if (compaction_capture(capc, page, order, migratetype)) { __mod_zone_freepage_state(zone, -(1 << order), migratetype); @@ -945,7 +945,7 @@ static inline void __free_one_page(struct page *page, pfn = combined_pfn; order++; } - if (max_order < MAX_ORDER) { + if (order < MAX_ORDER - 1) { /* If we are here, it means order is >= pageblock_order. * We want to prevent merge between freepages on isolate * pageblock and normal pageblock. Without this, pageblock @@ -966,7 +966,7 @@ static inline void __free_one_page(struct page *page, is_migrate_isolate(buddy_mt))) goto done_merging; } - max_order++; + max_order = order + 1; goto continue_merging; } From 9eab2538689a2d9f04a2e6687734423e4c8f1e28 Mon Sep 17 00:00:00 2001 From: Hayes Wang Date: Fri, 6 Aug 2021 17:15:55 +0800 Subject: [PATCH 0491/5344] Revert "r8169: avoid link-up interrupt issue on RTL8106e if user enables ASPM" commit 2115d3d482656ea702f7cf308c0ded3500282903 upstream. This reverts commit 1ee8856de82faec9bc8bd0f2308a7f27e30ba207. This is used to re-enable ASPM on RTL8106e, if it is possible. Signed-off-by: Hayes Wang Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/realtek/r8169_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index fb51548c57e94..623d86e5e970f 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -4713,6 +4713,7 @@ static void rtl_hw_start_8168g(struct rtl8169_private *tp) rtl_eri_clear_bits(tp, 0x1b0, ERIAR_MASK_0011, BIT(12)); rtl_pcie_state_l2l3_disable(tp); + rtl_hw_aspm_clkreq_enable(tp, true); } static void rtl_hw_start_8168g_1(struct rtl8169_private *tp) From 956c12078850aadb325ea9a32f0fdac0fd1e7681 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 25 Aug 2021 13:41:42 +0800 Subject: [PATCH 0492/5344] Revert "btrfs: compression: don't try to compress if we don't have enough pages" commit 4e9655763b82a91e4c341835bb504a2b1590f984 upstream. This reverts commit f2165627319ffd33a6217275e5690b1ab5c45763. [BUG] It's no longer possible to create compressed inline extent after commit f2165627319f ("btrfs: compression: don't try to compress if we don't have enough pages"). [CAUSE] For compression code, there are several possible reasons we have a range that needs to be compressed while it's no more than one page. - Compressed inline write The data is always smaller than one sector and the test lacks the condition to properly recognize a non-inline extent. - Compressed subpage write For the incoming subpage compressed write support, we require page alignment of the delalloc range. And for 64K page size, we can compress just one page into smaller sectors. For those reasons, the requirement for the data to be more than one page is not correct, and is already causing regression for compressed inline data writeback. The idea of skipping one page to avoid wasting CPU time could be revisited in the future. [FIX] Fix it by reverting the offending commit. Reported-by: Zygo Blaxell Link: https://lore.kernel.org/linux-btrfs/afa2742.c084f5d6.17b6b08dffc@tnonline.net Fixes: f2165627319f ("btrfs: compression: don't try to compress if we don't have enough pages") CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 29552d4f6845b..33b8fedab6c67 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -543,7 +543,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) * inode has not been flagged as nocompress. This flag can * change at any time if we discover bad compression ratios. */ - if (nr_pages > 1 && inode_need_compress(inode, start, end)) { + if (inode_need_compress(inode, start, end)) { WARN_ON(pages); pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) { From 47de5f3223d993e0bc69ee2b5f140f08eea38b6e Mon Sep 17 00:00:00 2001 From: Alexander Tsoy Date: Tue, 31 Aug 2021 03:25:31 +0300 Subject: [PATCH 0493/5344] ALSA: usb-audio: Add registration quirk for JBL Quantum 800 commit c8b177b6e3a005bd8fb0395a4bc5db3470301c28 upstream. Add another device ID for JBL Quantum 800. It requires the same quirk as other JBL Quantum devices. Signed-off-by: Alexander Tsoy Cc: Link: https://lore.kernel.org/r/20210831002531.116957-1-alexander@tsoy.me Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/quirks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 4d20f3f45f7e2..d5d828817c5e8 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1841,6 +1841,7 @@ static const struct registration_quirk registration_quirks[] = { REG_QUIRK_ENTRY(0x0951, 0x16ed, 2), /* Kingston HyperX Cloud Alpha S */ REG_QUIRK_ENTRY(0x0951, 0x16ea, 2), /* Kingston HyperX Cloud Flight S */ REG_QUIRK_ENTRY(0x0ecb, 0x1f46, 2), /* JBL Quantum 600 */ + REG_QUIRK_ENTRY(0x0ecb, 0x1f47, 2), /* JBL Quantum 800 */ REG_QUIRK_ENTRY(0x0ecb, 0x2039, 2), /* JBL Quantum 400 */ REG_QUIRK_ENTRY(0x0ecb, 0x203c, 2), /* JBL Quantum 600 */ REG_QUIRK_ENTRY(0x0ecb, 0x203e, 2), /* JBL Quantum 800 */ From 5a5af8c73fab4b194bc82fa848ac808df932d732 Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Fri, 27 Aug 2021 15:32:27 +0900 Subject: [PATCH 0494/5344] usb: host: xhci-rcar: Don't reload firmware after the completion commit 57f3ffdc11143f56f1314972fe86fe17a0dcde85 upstream. According to the datasheet, "Upon the completion of FW Download, there is no need to write or reload FW.". Otherwise, it's possible to cause unexpected behaviors. So, adds such a condition. Fixes: 4ac8918f3a73 ("usb: host: xhci-plat: add support for the R-Car H2 and M2 xHCI controllers") Cc: stable@vger.kernel.org # v3.17+ Signed-off-by: Yoshihiro Shimoda Link: https://lore.kernel.org/r/20210827063227.81990-1-yoshihiro.shimoda.uh@renesas.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-rcar.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/usb/host/xhci-rcar.c b/drivers/usb/host/xhci-rcar.c index c1025d321a417..3da75b367f952 100644 --- a/drivers/usb/host/xhci-rcar.c +++ b/drivers/usb/host/xhci-rcar.c @@ -134,6 +134,13 @@ static int xhci_rcar_download_firmware(struct usb_hcd *hcd) const struct soc_device_attribute *attr; const char *firmware_name; + /* + * According to the datasheet, "Upon the completion of FW Download, + * there is no need to write or reload FW". + */ + if (readl(regs + RCAR_USB3_DL_CTRL) & RCAR_USB3_DL_CTRL_FW_SUCCESS) + return 0; + attr = soc_device_match(rcar_quirks_match); if (attr) quirks = (uintptr_t)attr->data; From d77a667ca9cb56c20be34dc2a52defbd2120e60c Mon Sep 17 00:00:00 2001 From: Chunfeng Yun Date: Fri, 13 Aug 2021 14:30:48 +0800 Subject: [PATCH 0495/5344] usb: mtu3: use @mult for HS isoc or intr commit fd7cb394ec7efccc3985feb0978cee4d352e1817 upstream. For HS isoc or intr, should use @mult but not @burst to save mult value. Fixes: 4d79e042ed8b ("usb: mtu3: add support for usb3.1 IP") Cc: stable@vger.kernel.org Signed-off-by: Chunfeng Yun Link: https://lore.kernel.org/r/1628836253-7432-2-git-send-email-chunfeng.yun@mediatek.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/mtu3/mtu3_gadget.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/mtu3/mtu3_gadget.c b/drivers/usb/mtu3/mtu3_gadget.c index 08db02f3172df..a4d41dda70975 100644 --- a/drivers/usb/mtu3/mtu3_gadget.c +++ b/drivers/usb/mtu3/mtu3_gadget.c @@ -100,7 +100,7 @@ static int mtu3_ep_enable(struct mtu3_ep *mep) usb_endpoint_xfer_int(desc)) { interval = desc->bInterval; interval = clamp_val(interval, 1, 16) - 1; - burst = (max_packet & GENMASK(12, 11)) >> 11; + mult = (max_packet & GENMASK(12, 11)) >> 11; } break; default: From ecd5160b6eb378bc4c9323ba1b876a51a443cf5c Mon Sep 17 00:00:00 2001 From: Chunfeng Yun Date: Fri, 13 Aug 2021 14:30:49 +0800 Subject: [PATCH 0496/5344] usb: mtu3: fix the wrong HS mult value commit 44e4439d8f9f8d0e9da767d1f31e7c211081feca upstream. usb_endpoint_maxp() returns actual max packet size, @mult will always be zero, fix it by using usb_endpoint_maxp_mult() instead to get mult. Fixes: 4d79e042ed8b ("usb: mtu3: add support for usb3.1 IP") Cc: stable@vger.kernel.org Signed-off-by: Chunfeng Yun Link: https://lore.kernel.org/r/1628836253-7432-3-git-send-email-chunfeng.yun@mediatek.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/mtu3/mtu3_gadget.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/usb/mtu3/mtu3_gadget.c b/drivers/usb/mtu3/mtu3_gadget.c index a4d41dda70975..619c4598e64ea 100644 --- a/drivers/usb/mtu3/mtu3_gadget.c +++ b/drivers/usb/mtu3/mtu3_gadget.c @@ -72,14 +72,12 @@ static int mtu3_ep_enable(struct mtu3_ep *mep) u32 interval = 0; u32 mult = 0; u32 burst = 0; - int max_packet; int ret; desc = mep->desc; comp_desc = mep->comp_desc; mep->type = usb_endpoint_type(desc); - max_packet = usb_endpoint_maxp(desc); - mep->maxp = max_packet & GENMASK(10, 0); + mep->maxp = usb_endpoint_maxp(desc); switch (mtu->g.speed) { case USB_SPEED_SUPER: @@ -100,7 +98,7 @@ static int mtu3_ep_enable(struct mtu3_ep *mep) usb_endpoint_xfer_int(desc)) { interval = desc->bInterval; interval = clamp_val(interval, 1, 16) - 1; - mult = (max_packet & GENMASK(12, 11)) >> 11; + mult = usb_endpoint_maxp_mult(desc) - 1; } break; default: From 413fd914ef3f7aba5a1d636535d47ca94e479fd2 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Fri, 20 Aug 2021 15:34:58 +0300 Subject: [PATCH 0497/5344] xhci: fix unsafe memory usage in xhci tracing commit cbf286e8ef8337308c259ff5b9ce2e74d403be5a upstream. Removes static char buffer usage in the following decode functions: xhci_decode_trb() xhci_decode_ptortsc() Caller must provide a buffer to use. In tracing use __get_str() as recommended to pass buffer. Minor chanes are needed in xhci debugfs code as these functions are also used there. Changes include moving XHCI_MSG_MAX definititon from xhci-trace.h to xhci.h Cc: Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20210820123503.2605901-2-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-debugfs.c | 6 ++-- drivers/usb/host/xhci-trace.h | 8 ++--- drivers/usb/host/xhci.h | 52 ++++++++++++++++++--------------- 3 files changed, 36 insertions(+), 30 deletions(-) diff --git a/drivers/usb/host/xhci-debugfs.c b/drivers/usb/host/xhci-debugfs.c index 448d7b11dec4c..f5c8e4eb62ae6 100644 --- a/drivers/usb/host/xhci-debugfs.c +++ b/drivers/usb/host/xhci-debugfs.c @@ -197,12 +197,13 @@ static void xhci_ring_dump_segment(struct seq_file *s, int i; dma_addr_t dma; union xhci_trb *trb; + char str[XHCI_MSG_MAX]; for (i = 0; i < TRBS_PER_SEGMENT; i++) { trb = &seg->trbs[i]; dma = seg->dma + i * sizeof(*trb); seq_printf(s, "%pad: %s\n", &dma, - xhci_decode_trb(le32_to_cpu(trb->generic.field[0]), + xhci_decode_trb(str, XHCI_MSG_MAX, le32_to_cpu(trb->generic.field[0]), le32_to_cpu(trb->generic.field[1]), le32_to_cpu(trb->generic.field[2]), le32_to_cpu(trb->generic.field[3]))); @@ -340,9 +341,10 @@ static int xhci_portsc_show(struct seq_file *s, void *unused) { struct xhci_port *port = s->private; u32 portsc; + char str[XHCI_MSG_MAX]; portsc = readl(port->addr); - seq_printf(s, "%s\n", xhci_decode_portsc(portsc)); + seq_printf(s, "%s\n", xhci_decode_portsc(str, portsc)); return 0; } diff --git a/drivers/usb/host/xhci-trace.h b/drivers/usb/host/xhci-trace.h index 87da9098fb347..dab2af3f2c4f7 100644 --- a/drivers/usb/host/xhci-trace.h +++ b/drivers/usb/host/xhci-trace.h @@ -25,8 +25,6 @@ #include "xhci.h" #include "xhci-dbgcap.h" -#define XHCI_MSG_MAX 500 - DECLARE_EVENT_CLASS(xhci_log_msg, TP_PROTO(struct va_format *vaf), TP_ARGS(vaf), @@ -122,6 +120,7 @@ DECLARE_EVENT_CLASS(xhci_log_trb, __field(u32, field1) __field(u32, field2) __field(u32, field3) + __dynamic_array(char, str, XHCI_MSG_MAX) ), TP_fast_assign( __entry->type = ring->type; @@ -131,7 +130,7 @@ DECLARE_EVENT_CLASS(xhci_log_trb, __entry->field3 = le32_to_cpu(trb->field[3]); ), TP_printk("%s: %s", xhci_ring_type_string(__entry->type), - xhci_decode_trb(__entry->field0, __entry->field1, + xhci_decode_trb(__get_str(str), XHCI_MSG_MAX, __entry->field0, __entry->field1, __entry->field2, __entry->field3) ) ); @@ -523,6 +522,7 @@ DECLARE_EVENT_CLASS(xhci_log_portsc, TP_STRUCT__entry( __field(u32, portnum) __field(u32, portsc) + __dynamic_array(char, str, XHCI_MSG_MAX) ), TP_fast_assign( __entry->portnum = portnum; @@ -530,7 +530,7 @@ DECLARE_EVENT_CLASS(xhci_log_portsc, ), TP_printk("port-%d: %s", __entry->portnum, - xhci_decode_portsc(__entry->portsc) + xhci_decode_portsc(__get_str(str), __entry->portsc) ) ); diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h index 834f32fe99308..02df309e44093 100644 --- a/drivers/usb/host/xhci.h +++ b/drivers/usb/host/xhci.h @@ -22,6 +22,9 @@ #include "xhci-ext-caps.h" #include "pci-quirks.h" +/* max buffer size for trace and debug messages */ +#define XHCI_MSG_MAX 500 + /* xHCI PCI Configuration Registers */ #define XHCI_SBRN_OFFSET (0x60) @@ -2217,15 +2220,14 @@ static inline char *xhci_slot_state_string(u32 state) } } -static inline const char *xhci_decode_trb(u32 field0, u32 field1, u32 field2, - u32 field3) +static inline const char *xhci_decode_trb(char *str, size_t size, + u32 field0, u32 field1, u32 field2, u32 field3) { - static char str[256]; int type = TRB_FIELD_TO_TYPE(field3); switch (type) { case TRB_LINK: - sprintf(str, + snprintf(str, size, "LINK %08x%08x intr %d type '%s' flags %c:%c:%c:%c", field1, field0, GET_INTR_TARGET(field2), xhci_trb_type_string(type), @@ -2242,7 +2244,7 @@ static inline const char *xhci_decode_trb(u32 field0, u32 field1, u32 field2, case TRB_HC_EVENT: case TRB_DEV_NOTE: case TRB_MFINDEX_WRAP: - sprintf(str, + snprintf(str, size, "TRB %08x%08x status '%s' len %d slot %d ep %d type '%s' flags %c:%c", field1, field0, xhci_trb_comp_code_string(GET_COMP_CODE(field2)), @@ -2255,7 +2257,8 @@ static inline const char *xhci_decode_trb(u32 field0, u32 field1, u32 field2, break; case TRB_SETUP: - sprintf(str, "bRequestType %02x bRequest %02x wValue %02x%02x wIndex %02x%02x wLength %d length %d TD size %d intr %d type '%s' flags %c:%c:%c", + snprintf(str, size, + "bRequestType %02x bRequest %02x wValue %02x%02x wIndex %02x%02x wLength %d length %d TD size %d intr %d type '%s' flags %c:%c:%c", field0 & 0xff, (field0 & 0xff00) >> 8, (field0 & 0xff000000) >> 24, @@ -2272,7 +2275,8 @@ static inline const char *xhci_decode_trb(u32 field0, u32 field1, u32 field2, field3 & TRB_CYCLE ? 'C' : 'c'); break; case TRB_DATA: - sprintf(str, "Buffer %08x%08x length %d TD size %d intr %d type '%s' flags %c:%c:%c:%c:%c:%c:%c", + snprintf(str, size, + "Buffer %08x%08x length %d TD size %d intr %d type '%s' flags %c:%c:%c:%c:%c:%c:%c", field1, field0, TRB_LEN(field2), GET_TD_SIZE(field2), GET_INTR_TARGET(field2), xhci_trb_type_string(type), @@ -2285,7 +2289,8 @@ static inline const char *xhci_decode_trb(u32 field0, u32 field1, u32 field2, field3 & TRB_CYCLE ? 'C' : 'c'); break; case TRB_STATUS: - sprintf(str, "Buffer %08x%08x length %d TD size %d intr %d type '%s' flags %c:%c:%c:%c", + snprintf(str, size, + "Buffer %08x%08x length %d TD size %d intr %d type '%s' flags %c:%c:%c:%c", field1, field0, TRB_LEN(field2), GET_TD_SIZE(field2), GET_INTR_TARGET(field2), xhci_trb_type_string(type), @@ -2298,7 +2303,7 @@ static inline const char *xhci_decode_trb(u32 field0, u32 field1, u32 field2, case TRB_ISOC: case TRB_EVENT_DATA: case TRB_TR_NOOP: - sprintf(str, + snprintf(str, size, "Buffer %08x%08x length %d TD size %d intr %d type '%s' flags %c:%c:%c:%c:%c:%c:%c:%c", field1, field0, TRB_LEN(field2), GET_TD_SIZE(field2), GET_INTR_TARGET(field2), @@ -2315,21 +2320,21 @@ static inline const char *xhci_decode_trb(u32 field0, u32 field1, u32 field2, case TRB_CMD_NOOP: case TRB_ENABLE_SLOT: - sprintf(str, + snprintf(str, size, "%s: flags %c", xhci_trb_type_string(type), field3 & TRB_CYCLE ? 'C' : 'c'); break; case TRB_DISABLE_SLOT: case TRB_NEG_BANDWIDTH: - sprintf(str, + snprintf(str, size, "%s: slot %d flags %c", xhci_trb_type_string(type), TRB_TO_SLOT_ID(field3), field3 & TRB_CYCLE ? 'C' : 'c'); break; case TRB_ADDR_DEV: - sprintf(str, + snprintf(str, size, "%s: ctx %08x%08x slot %d flags %c:%c", xhci_trb_type_string(type), field1, field0, @@ -2338,7 +2343,7 @@ static inline const char *xhci_decode_trb(u32 field0, u32 field1, u32 field2, field3 & TRB_CYCLE ? 'C' : 'c'); break; case TRB_CONFIG_EP: - sprintf(str, + snprintf(str, size, "%s: ctx %08x%08x slot %d flags %c:%c", xhci_trb_type_string(type), field1, field0, @@ -2347,7 +2352,7 @@ static inline const char *xhci_decode_trb(u32 field0, u32 field1, u32 field2, field3 & TRB_CYCLE ? 'C' : 'c'); break; case TRB_EVAL_CONTEXT: - sprintf(str, + snprintf(str, size, "%s: ctx %08x%08x slot %d flags %c", xhci_trb_type_string(type), field1, field0, @@ -2355,7 +2360,7 @@ static inline const char *xhci_decode_trb(u32 field0, u32 field1, u32 field2, field3 & TRB_CYCLE ? 'C' : 'c'); break; case TRB_RESET_EP: - sprintf(str, + snprintf(str, size, "%s: ctx %08x%08x slot %d ep %d flags %c:%c", xhci_trb_type_string(type), field1, field0, @@ -2376,7 +2381,7 @@ static inline const char *xhci_decode_trb(u32 field0, u32 field1, u32 field2, field3 & TRB_CYCLE ? 'C' : 'c'); break; case TRB_SET_DEQ: - sprintf(str, + snprintf(str, size, "%s: deq %08x%08x stream %d slot %d ep %d flags %c", xhci_trb_type_string(type), field1, field0, @@ -2387,14 +2392,14 @@ static inline const char *xhci_decode_trb(u32 field0, u32 field1, u32 field2, field3 & TRB_CYCLE ? 'C' : 'c'); break; case TRB_RESET_DEV: - sprintf(str, + snprintf(str, size, "%s: slot %d flags %c", xhci_trb_type_string(type), TRB_TO_SLOT_ID(field3), field3 & TRB_CYCLE ? 'C' : 'c'); break; case TRB_FORCE_EVENT: - sprintf(str, + snprintf(str, size, "%s: event %08x%08x vf intr %d vf id %d flags %c", xhci_trb_type_string(type), field1, field0, @@ -2403,14 +2408,14 @@ static inline const char *xhci_decode_trb(u32 field0, u32 field1, u32 field2, field3 & TRB_CYCLE ? 'C' : 'c'); break; case TRB_SET_LT: - sprintf(str, + snprintf(str, size, "%s: belt %d flags %c", xhci_trb_type_string(type), TRB_TO_BELT(field3), field3 & TRB_CYCLE ? 'C' : 'c'); break; case TRB_GET_BW: - sprintf(str, + snprintf(str, size, "%s: ctx %08x%08x slot %d speed %d flags %c", xhci_trb_type_string(type), field1, field0, @@ -2419,7 +2424,7 @@ static inline const char *xhci_decode_trb(u32 field0, u32 field1, u32 field2, field3 & TRB_CYCLE ? 'C' : 'c'); break; case TRB_FORCE_HEADER: - sprintf(str, + snprintf(str, size, "%s: info %08x%08x%08x pkt type %d roothub port %d flags %c", xhci_trb_type_string(type), field2, field1, field0 & 0xffffffe0, @@ -2428,7 +2433,7 @@ static inline const char *xhci_decode_trb(u32 field0, u32 field1, u32 field2, field3 & TRB_CYCLE ? 'C' : 'c'); break; default: - sprintf(str, + snprintf(str, size, "type '%s' -> raw %08x %08x %08x %08x", xhci_trb_type_string(type), field0, field1, field2, field3); @@ -2553,9 +2558,8 @@ static inline const char *xhci_portsc_link_state_string(u32 portsc) return "Unknown"; } -static inline const char *xhci_decode_portsc(u32 portsc) +static inline const char *xhci_decode_portsc(char *str, u32 portsc) { - static char str[256]; int ret; ret = sprintf(str, "%s %s %s Link:%s PortSpeed:%d ", From ac5ddcf3b93cf04e29fe29a2febb78708022ea66 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 30 May 2021 12:24:47 -0400 Subject: [PATCH 0498/5344] x86/reboot: Limit Dell Optiplex 990 quirk to early BIOS versions commit a729691b541f6e63043beae72e635635abe5dc09 upstream. When this platform was relatively new in November 2011, with early BIOS revisions, a reboot quirk was added in commit 6be30bb7d750 ("x86/reboot: Blacklist Dell OptiPlex 990 known to require PCI reboot") However, this quirk (and several others) are open-ended to all BIOS versions and left no automatic expiry if/when the system BIOS fixed the issue, meaning that nobody is likely to come along and re-test. What is really problematic with using PCI reboot as this quirk does, is that it causes this platform to do a full power down, wait one second, and then power back on. This is less than ideal if one is using it for boot testing and/or bisecting kernels when legacy rotating hard disks are installed. It was only by chance that the quirk was noticed in dmesg - and when disabled it turned out that it wasn't required anymore (BIOS A24), and a default reboot would work fine without the "harshness" of power cycling the machine (and disks) down and up like the PCI reboot does. Doing a bit more research, it seems that the "newest" BIOS for which the issue was reported[1] was version A06, however Dell[2] seemed to suggest only up to and including version A05, with the A06 having a large number of fixes[3] listed. As is typical with a new platform, the initial BIOS updates come frequently and then taper off (and in this case, with a revival for CPU CVEs); a search for O990-A.exe reveals the following dates: A02 16 Mar 2011 A03 11 May 2011 A06 14 Sep 2011 A07 24 Oct 2011 A10 08 Dec 2011 A14 06 Sep 2012 A16 15 Oct 2012 A18 30 Sep 2013 A19 23 Sep 2015 A20 02 Jun 2017 A23 07 Mar 2018 A24 21 Aug 2018 While it's overkill to flash and test each of the above, it would seem likely that the issue was contained within A0x BIOS versions, given the dates above and the dates of issue reports[4] from distros. So rather than just throw out the quirk entirely, limit the scope to just those early BIOS versions, in case people are still running systems from 2011 with the original as-shipped early A0x BIOS versions. [1] https://lore.kernel.org/lkml/1320373471-3942-1-git-send-email-trenn@suse.de/ [2] https://www.dell.com/support/kbdoc/en-ca/000131908/linux-based-operating-systems-stall-upon-reboot-on-optiplex-390-790-990-systems [3] https://www.dell.com/support/home/en-ca/drivers/driversdetails?driverid=85j10 [4] https://bugs.launchpad.net/ubuntu/+source/linux/+bug/768039 Fixes: 6be30bb7d750 ("x86/reboot: Blacklist Dell OptiPlex 990 known to require PCI reboot") Signed-off-by: Paul Gortmaker Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210530162447.996461-4-paul.gortmaker@windriver.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/reboot.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index b1b96d461bc76..d65d1afb27161 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -388,10 +388,11 @@ static const struct dmi_system_id reboot_dmi_table[] __initconst = { }, { /* Handle problems with rebooting on the OptiPlex 990. */ .callback = set_pci_reboot, - .ident = "Dell OptiPlex 990", + .ident = "Dell OptiPlex 990 BIOS A0x", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"), + DMI_MATCH(DMI_BIOS_VERSION, "A0"), }, }, { /* Handle problems with rebooting on Dell 300's */ From 4b5410aeab782979f23c474749bc2683be23cab0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Thu, 24 Jun 2021 19:14:17 +0200 Subject: [PATCH 0499/5344] PCI: Call Max Payload Size-related fixup quirks early MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit b8da302e2955fe4d41eb9d48199242674d77dbe0 upstream. pci_device_add() calls HEADER fixups after pci_configure_device(), which configures Max Payload Size. Convert MPS-related fixups to EARLY fixups so pci_configure_mps() takes them into account. Fixes: 27d868b5e6cfa ("PCI: Set MPS to match upstream bridge") Link: https://lore.kernel.org/r/20210624171418.27194-1-kabel@kernel.org Signed-off-by: Marek Behún Signed-off-by: Bjorn Helgaas Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/pci/quirks.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 0241f0dcc093f..97c343d31f989 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3246,12 +3246,12 @@ static void fixup_mpss_256(struct pci_dev *dev) { dev->pcie_mpss = 1; /* 256 bytes */ } -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SOLARFLARE, - PCI_DEVICE_ID_SOLARFLARE_SFC4000A_0, fixup_mpss_256); -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SOLARFLARE, - PCI_DEVICE_ID_SOLARFLARE_SFC4000A_1, fixup_mpss_256); -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SOLARFLARE, - PCI_DEVICE_ID_SOLARFLARE_SFC4000B, fixup_mpss_256); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SOLARFLARE, + PCI_DEVICE_ID_SOLARFLARE_SFC4000A_0, fixup_mpss_256); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SOLARFLARE, + PCI_DEVICE_ID_SOLARFLARE_SFC4000A_1, fixup_mpss_256); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SOLARFLARE, + PCI_DEVICE_ID_SOLARFLARE_SFC4000B, fixup_mpss_256); /* * Intel 5000 and 5100 Memory controllers have an erratum with read completion From 9d1e4c800823f03042e07d68ad09d1660408c98a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 12 Sep 2021 08:56:42 +0200 Subject: [PATCH 0500/5344] Linux 5.4.145 Link: https://lore.kernel.org/r/20210910122917.149278545@linuxfoundation.org Tested-by: Florian Fainelli Tested-by: Shuah Khan Tested-by: Hulk Robot Tested-by: Sudip Mukherjee Tested-by: Guenter Roeck Tested-by: Linux Kernel Functional Testing Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 3c3804197b511..c32a36c8ffc90 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 PATCHLEVEL = 4 -SUBLEVEL = 144 +SUBLEVEL = 145 EXTRAVERSION = NAME = Kleptomaniac Octopus From 50338defdf40060dd2510f7c98fc7af605647a6b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 30 Jun 2021 17:35:18 +0200 Subject: [PATCH 0501/5344] locking/mutex: Fix HANDOFF condition [ Upstream commit 048661a1f963e9517630f080687d48af79ed784c ] Yanfei reported that setting HANDOFF should not depend on recomputing @first, only on @first state. Which would then give: if (ww_ctx || !first) first = __mutex_waiter_is_first(lock, &waiter); if (first) __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF); But because 'ww_ctx || !first' is basically 'always' and the test for first is relatively cheap, omit that first branch entirely. Reported-by: Yanfei Xu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Waiman Long Reviewed-by: Yanfei Xu Link: https://lore.kernel.org/r/20210630154114.896786297@infradead.org Signed-off-by: Sasha Levin --- kernel/locking/mutex.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index c0c7784f074b2..b02fff28221f0 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -938,7 +938,6 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) { struct mutex_waiter waiter; - bool first = false; struct ww_mutex *ww; int ret; @@ -1017,6 +1016,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, set_current_state(state); for (;;) { + bool first; + /* * Once we hold wait_lock, we're serialized against * mutex_unlock() handing the lock off to us, do a trylock @@ -1045,15 +1046,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, spin_unlock(&lock->wait_lock); schedule_preempt_disabled(); - /* - * ww_mutex needs to always recheck its position since its waiter - * list is not FIFO ordered. - */ - if (ww_ctx || !first) { - first = __mutex_waiter_is_first(lock, &waiter); - if (first) - __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF); - } + first = __mutex_waiter_is_first(lock, &waiter); + if (first) + __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF); set_current_state(state); /* From 33181dc782f36408da6ce8cffe1b6c9338235947 Mon Sep 17 00:00:00 2001 From: Jeongtae Park Date: Thu, 1 Jul 2021 23:26:30 +0900 Subject: [PATCH 0502/5344] regmap: fix the offset of register error log [ Upstream commit 1852f5ed358147095297a09cc3c6f160208a676d ] This patch fixes the offset of register error log by using regmap_get_offset(). Signed-off-by: Jeongtae Park Link: https://lore.kernel.org/r/20210701142630.44936-1-jeongtae.park@gmail.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- drivers/base/regmap/regmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c index e0893f1b14522..43c0452a8ba91 100644 --- a/drivers/base/regmap/regmap.c +++ b/drivers/base/regmap/regmap.c @@ -1505,7 +1505,7 @@ static int _regmap_raw_write_impl(struct regmap *map, unsigned int reg, if (ret) { dev_err(map->dev, "Error in caching of register: %x ret: %d\n", - reg + i, ret); + reg + regmap_get_offset(map, i), ret); return ret; } } From f455df712fdf7bbc7b397702d6cef81c09c890ed Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Thu, 1 Jul 2021 14:56:37 -0400 Subject: [PATCH 0503/5344] crypto: mxs-dcp - Check for DMA mapping errors [ Upstream commit df6313d707e575a679ada3313358289af24454c0 ] After calling dma_map_single(), we must also call dma_mapping_error(). This fixes the following warning when compiling with CONFIG_DMA_API_DEBUG: [ 311.241478] WARNING: CPU: 0 PID: 428 at kernel/dma/debug.c:1027 check_unmap+0x79c/0x96c [ 311.249547] DMA-API: mxs-dcp 2280000.crypto: device driver failed to check map error[device address=0x00000000860cb080] [size=32 bytes] [mapped as single] Signed-off-by: Sean Anderson Reviewed-by: Richard Weinberger Signed-off-by: Herbert Xu Signed-off-by: Sasha Levin --- drivers/crypto/mxs-dcp.c | 45 +++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/drivers/crypto/mxs-dcp.c b/drivers/crypto/mxs-dcp.c index f8a48a84df2ab..66fa524b6261e 100644 --- a/drivers/crypto/mxs-dcp.c +++ b/drivers/crypto/mxs-dcp.c @@ -168,15 +168,19 @@ static struct dcp *global_sdcp; static int mxs_dcp_start_dma(struct dcp_async_ctx *actx) { + int dma_err; struct dcp *sdcp = global_sdcp; const int chan = actx->chan; uint32_t stat; unsigned long ret; struct dcp_dma_desc *desc = &sdcp->coh->desc[actx->chan]; - dma_addr_t desc_phys = dma_map_single(sdcp->dev, desc, sizeof(*desc), DMA_TO_DEVICE); + dma_err = dma_mapping_error(sdcp->dev, desc_phys); + if (dma_err) + return dma_err; + reinit_completion(&sdcp->completion[chan]); /* Clear status register. */ @@ -214,18 +218,29 @@ static int mxs_dcp_start_dma(struct dcp_async_ctx *actx) static int mxs_dcp_run_aes(struct dcp_async_ctx *actx, struct ablkcipher_request *req, int init) { + dma_addr_t key_phys, src_phys, dst_phys; struct dcp *sdcp = global_sdcp; struct dcp_dma_desc *desc = &sdcp->coh->desc[actx->chan]; struct dcp_aes_req_ctx *rctx = ablkcipher_request_ctx(req); int ret; - dma_addr_t key_phys = dma_map_single(sdcp->dev, sdcp->coh->aes_key, - 2 * AES_KEYSIZE_128, - DMA_TO_DEVICE); - dma_addr_t src_phys = dma_map_single(sdcp->dev, sdcp->coh->aes_in_buf, - DCP_BUF_SZ, DMA_TO_DEVICE); - dma_addr_t dst_phys = dma_map_single(sdcp->dev, sdcp->coh->aes_out_buf, - DCP_BUF_SZ, DMA_FROM_DEVICE); + key_phys = dma_map_single(sdcp->dev, sdcp->coh->aes_key, + 2 * AES_KEYSIZE_128, DMA_TO_DEVICE); + ret = dma_mapping_error(sdcp->dev, key_phys); + if (ret) + return ret; + + src_phys = dma_map_single(sdcp->dev, sdcp->coh->aes_in_buf, + DCP_BUF_SZ, DMA_TO_DEVICE); + ret = dma_mapping_error(sdcp->dev, src_phys); + if (ret) + goto err_src; + + dst_phys = dma_map_single(sdcp->dev, sdcp->coh->aes_out_buf, + DCP_BUF_SZ, DMA_FROM_DEVICE); + ret = dma_mapping_error(sdcp->dev, dst_phys); + if (ret) + goto err_dst; if (actx->fill % AES_BLOCK_SIZE) { dev_err(sdcp->dev, "Invalid block size!\n"); @@ -263,10 +278,12 @@ static int mxs_dcp_run_aes(struct dcp_async_ctx *actx, ret = mxs_dcp_start_dma(actx); aes_done_run: + dma_unmap_single(sdcp->dev, dst_phys, DCP_BUF_SZ, DMA_FROM_DEVICE); +err_dst: + dma_unmap_single(sdcp->dev, src_phys, DCP_BUF_SZ, DMA_TO_DEVICE); +err_src: dma_unmap_single(sdcp->dev, key_phys, 2 * AES_KEYSIZE_128, DMA_TO_DEVICE); - dma_unmap_single(sdcp->dev, src_phys, DCP_BUF_SZ, DMA_TO_DEVICE); - dma_unmap_single(sdcp->dev, dst_phys, DCP_BUF_SZ, DMA_FROM_DEVICE); return ret; } @@ -565,6 +582,10 @@ static int mxs_dcp_run_sha(struct ahash_request *req) dma_addr_t buf_phys = dma_map_single(sdcp->dev, sdcp->coh->sha_in_buf, DCP_BUF_SZ, DMA_TO_DEVICE); + ret = dma_mapping_error(sdcp->dev, buf_phys); + if (ret) + return ret; + /* Fill in the DMA descriptor. */ desc->control0 = MXS_DCP_CONTROL0_DECR_SEMAPHORE | MXS_DCP_CONTROL0_INTERRUPT | @@ -597,6 +618,10 @@ static int mxs_dcp_run_sha(struct ahash_request *req) if (rctx->fini) { digest_phys = dma_map_single(sdcp->dev, sdcp->coh->sha_out_buf, DCP_SHA_PAY_SZ, DMA_FROM_DEVICE); + ret = dma_mapping_error(sdcp->dev, digest_phys); + if (ret) + goto done_run; + desc->control0 |= MXS_DCP_CONTROL0_HASH_TERM; desc->payload = digest_phys; } From 73471f794ef4d57458eea0a17d7fb408c727376e Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 27 Jul 2021 11:11:01 +0100 Subject: [PATCH 0504/5344] sched/deadline: Fix reset_on_fork reporting of DL tasks [ Upstream commit f95091536f78971b269ec321b057b8d630b0ad8a ] It is possible for sched_getattr() to incorrectly report the state of the reset_on_fork flag when called on a deadline task. Indeed, if the flag was set on a deadline task using sched_setattr() with flags (SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_KEEP_PARAMS), then p->sched_reset_on_fork will be set, but __setscheduler() will bail out early, which means that the dl_se->flags will not get updated by __setscheduler_params()->__setparam_dl(). Consequently, if sched_getattr() is then called on the task, __getparam_dl() will override kattr.sched_flags with the now out-of-date copy in dl_se->flags and report the stale value to userspace. To fix this, make sure to only copy the flags that are relevant to sched_deadline to and from the dl_se->flags field. Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210727101103.2729607-2-qperret@google.com Signed-off-by: Sasha Levin --- kernel/sched/deadline.c | 7 ++++--- kernel/sched/sched.h | 2 ++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 3cf776d5bce8f..7ab0b80cb12d1 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2622,7 +2622,7 @@ void __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_runtime = attr->sched_runtime; dl_se->dl_deadline = attr->sched_deadline; dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; - dl_se->flags = attr->sched_flags; + dl_se->flags = attr->sched_flags & SCHED_DL_FLAGS; dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); } @@ -2635,7 +2635,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr) attr->sched_runtime = dl_se->dl_runtime; attr->sched_deadline = dl_se->dl_deadline; attr->sched_period = dl_se->dl_period; - attr->sched_flags = dl_se->flags; + attr->sched_flags &= ~SCHED_DL_FLAGS; + attr->sched_flags |= dl_se->flags; } /* @@ -2710,7 +2711,7 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) if (dl_se->dl_runtime != attr->sched_runtime || dl_se->dl_deadline != attr->sched_deadline || dl_se->dl_period != attr->sched_period || - dl_se->flags != attr->sched_flags) + dl_se->flags != (attr->sched_flags & SCHED_DL_FLAGS)) return true; return false; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 8d484647fe96d..c92c74de005c1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -219,6 +219,8 @@ static inline void update_avg(u64 *avg, u64 sample) */ #define SCHED_FLAG_SUGOV 0x10000000 +#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV) + static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se) { #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL From cde2a5197ee49e1c416ae53f1637566f1448227d Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 1 Aug 2021 15:30:59 +0200 Subject: [PATCH 0505/5344] power: supply: axp288_fuel_gauge: Report register-address on readb / writeb errors [ Upstream commit caa534c3ba40c6e8352b42cbbbca9ba481814ac8 ] When fuel_gauge_reg_readb()/_writeb() fails, report which register we were trying to read / write when the error happened. Also reword the message a bit: - Drop the axp288 prefix, dev_err() already prints this - Switch from telegram / abbreviated style to a normal sentence, aligning the message with those from fuel_gauge_read_*bit_word() Signed-off-by: Hans de Goede Signed-off-by: Sebastian Reichel Signed-off-by: Sasha Levin --- drivers/power/supply/axp288_fuel_gauge.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/power/supply/axp288_fuel_gauge.c b/drivers/power/supply/axp288_fuel_gauge.c index f40fa0e63b6e5..993e4a4a34b38 100644 --- a/drivers/power/supply/axp288_fuel_gauge.c +++ b/drivers/power/supply/axp288_fuel_gauge.c @@ -149,7 +149,7 @@ static int fuel_gauge_reg_readb(struct axp288_fg_info *info, int reg) } if (ret < 0) { - dev_err(&info->pdev->dev, "axp288 reg read err:%d\n", ret); + dev_err(&info->pdev->dev, "Error reading reg 0x%02x err: %d\n", reg, ret); return ret; } @@ -163,7 +163,7 @@ static int fuel_gauge_reg_writeb(struct axp288_fg_info *info, int reg, u8 val) ret = regmap_write(info->regmap, reg, (unsigned int)val); if (ret < 0) - dev_err(&info->pdev->dev, "axp288 reg write err:%d\n", ret); + dev_err(&info->pdev->dev, "Error writing reg 0x%02x err: %d\n", reg, ret); return ret; } From 92dc530f73ca05cda29d605011af806b8d4f6b05 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Tue, 27 Jul 2021 13:23:34 +0300 Subject: [PATCH 0506/5344] crypto: omap-sham - clear dma flags only after omap_sham_update_dma_stop() [ Upstream commit fe28140b3393b0ba1eb95cc109f974a7e58b26fd ] We should not clear FLAGS_DMA_ACTIVE before omap_sham_update_dma_stop() is done calling dma_unmap_sg(). We already clear FLAGS_DMA_ACTIVE at the end of omap_sham_update_dma_stop(). The early clearing of FLAGS_DMA_ACTIVE is not causing issues as we do not need to defer anything based on FLAGS_DMA_ACTIVE currently. So this can be applied as clean-up. Cc: Lokesh Vutla Cc: Tero Kristo Signed-off-by: Tony Lindgren Signed-off-by: Herbert Xu Signed-off-by: Sasha Levin --- drivers/crypto/omap-sham.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/omap-sham.c b/drivers/crypto/omap-sham.c index f80db1eb29945..f8a146554b1f3 100644 --- a/drivers/crypto/omap-sham.c +++ b/drivers/crypto/omap-sham.c @@ -1734,7 +1734,7 @@ static void omap_sham_done_task(unsigned long data) if (test_and_clear_bit(FLAGS_OUTPUT_READY, &dd->flags)) goto finish; } else if (test_bit(FLAGS_DMA_READY, &dd->flags)) { - if (test_and_clear_bit(FLAGS_DMA_ACTIVE, &dd->flags)) { + if (test_bit(FLAGS_DMA_ACTIVE, &dd->flags)) { omap_sham_update_dma_stop(dd); if (dd->err) { err = dd->err; From b33b5754081ca27714596e86d05d76ebea431815 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 4 Aug 2021 15:59:25 +0200 Subject: [PATCH 0507/5344] sched/deadline: Fix missing clock update in migrate_task_rq_dl() [ Upstream commit b4da13aa28d4fd0071247b7b41c579ee8a86c81a ] A missing clock update is causing the following warning: rq->clock_update_flags < RQCF_ACT_SKIP WARNING: CPU: 112 PID: 2041 at kernel/sched/sched.h:1453 sub_running_bw.isra.0+0x190/0x1a0 ... CPU: 112 PID: 2041 Comm: sugov:112 Tainted: G W 5.14.0-rc1 #1 Hardware name: WIWYNN Mt.Jade Server System B81.030Z1.0007/Mt.Jade Motherboard, BIOS 1.6.20210526 (SCP: 1.06.20210526) 2021/05/26 ... Call trace: sub_running_bw.isra.0+0x190/0x1a0 migrate_task_rq_dl+0xf8/0x1e0 set_task_cpu+0xa8/0x1f0 try_to_wake_up+0x150/0x3d4 wake_up_q+0x64/0xc0 __up_write+0xd0/0x1c0 up_write+0x4c/0x2b0 cppc_set_perf+0x120/0x2d0 cppc_cpufreq_set_target+0xe0/0x1a4 [cppc_cpufreq] __cpufreq_driver_target+0x74/0x140 sugov_work+0x64/0x80 kthread_worker_fn+0xe0/0x230 kthread+0x138/0x140 ret_from_fork+0x10/0x18 The task causing this is the `cppc_fie` DL task introduced by commit 1eb5dde674f5 ("cpufreq: CPPC: Add support for frequency invariance"). With CONFIG_ACPI_CPPC_CPUFREQ_FIE=y and schedutil cpufreq governor on slow-switching system (like on this Ampere Altra WIWYNN Mt. Jade Arm Server): DL task `curr=sugov:112` lets `p=cppc_fie` migrate and since the latter is in `non_contending` state, migrate_task_rq_dl() calls sub_running_bw()->__sub_running_bw()->cpufreq_update_util()-> rq_clock()->assert_clock_updated() on p. Fix this by updating the clock for a non_contending task in migrate_task_rq_dl() before calling sub_running_bw(). Reported-by: Bruno Goncalves Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Bristot de Oliveira Acked-by: Juri Lelli Link: https://lore.kernel.org/r/20210804135925.3734605-1-dietmar.eggemann@arm.com Signed-off-by: Sasha Levin --- kernel/sched/deadline.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 7ab0b80cb12d1..2bda9fdba31c4 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1654,6 +1654,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused */ raw_spin_lock(&rq->lock); if (p->dl.dl_non_contending) { + update_rq_clock(rq); sub_running_bw(&p->dl, &rq->dl); p->dl.dl_non_contending = 0; /* From c933d541a3567e3b81a21337b66513d6197fbed3 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Sat, 22 May 2021 00:56:23 +0900 Subject: [PATCH 0508/5344] rcu/tree: Handle VM stoppage in stall detection [ Upstream commit ccfc9dd6914feaa9a81f10f9cce56eb0f7712264 ] The soft watchdog timer function checks if a virtual machine was suspended and hence what looks like a lockup in fact is a false positive. This is what kvm_check_and_clear_guest_paused() does: it tests guest PVCLOCK_GUEST_STOPPED (which is set by the host) and if it's set then we need to touch all watchdogs and bail out. Watchdog timer function runs from IRQ, so PVCLOCK_GUEST_STOPPED check works fine. There is, however, one more watchdog that runs from IRQ, so watchdog timer fn races with it, and that watchdog is not aware of PVCLOCK_GUEST_STOPPED - RCU stall detector. apic_timer_interrupt() smp_apic_timer_interrupt() hrtimer_interrupt() __hrtimer_run_queues() tick_sched_timer() tick_sched_handle() update_process_times() rcu_sched_clock_irq() This triggers RCU stalls on our devices during VM resume. If tick_sched_handle()->rcu_sched_clock_irq() runs on a VCPU before watchdog_timer_fn()->kvm_check_and_clear_guest_paused() then there is nothing on this VCPU that touches watchdogs and RCU reads stale gp stall timestamp and new jiffies value, which makes it think that RCU has stalled. Make RCU stall watchdog aware of PVCLOCK_GUEST_STOPPED and don't report RCU stalls when we resume the VM. Signed-off-by: Sergey Senozhatsky Signed-off-by: Signed-off-by: Paul E. McKenney Signed-off-by: Sasha Levin --- kernel/rcu/tree_stall.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index c0b8c458d8a6a..b8c9744ad595b 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -7,6 +7,8 @@ * Author: Paul E. McKenney */ +#include + ////////////////////////////////////////////////////////////////////////////// // // Controlling CPU stall warnings, including delay calculation. @@ -525,6 +527,14 @@ static void check_cpu_stall(struct rcu_data *rdp) (READ_ONCE(rnp->qsmask) & rdp->grpmask) && cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { + /* + * If a virtual machine is stopped by the host it can look to + * the watchdog like an RCU stall. Check to see if the host + * stopped the vm. + */ + if (kvm_check_and_clear_guest_paused()) + return; + /* We haven't checked in, so go dump stack. */ print_cpu_stall(); if (rcu_cpu_stall_ftrace_dump) @@ -534,6 +544,14 @@ static void check_cpu_stall(struct rcu_data *rdp) ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { + /* + * If a virtual machine is stopped by the host it can look to + * the watchdog like an RCU stall. Check to see if the host + * stopped the vm. + */ + if (kvm_check_and_clear_guest_paused()) + return; + /* They had a few time units to dump stack, so complain. */ print_other_cpu_stall(gs2); if (rcu_cpu_stall_ftrace_dump) From c7a237f9e204c1e92df52ff816a1cf273553381b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 26 Jul 2021 14:55:10 +0200 Subject: [PATCH 0509/5344] posix-cpu-timers: Force next expiration recalc after itimer reset [ Upstream commit 406dd42bd1ba0c01babf9cde169bb319e52f6147 ] When an itimer deactivates a previously armed expiration, it simply doesn't do anything. As a result the process wide cputime counter keeps running and the tick dependency stays set until it reaches the old ghost expiration value. This can be reproduced with the following snippet: void trigger_process_counter(void) { struct itimerval n = {}; n.it_value.tv_sec = 100; setitimer(ITIMER_VIRTUAL, &n, NULL); n.it_value.tv_sec = 0; setitimer(ITIMER_VIRTUAL, &n, NULL); } Fix this with resetting the relevant base expiration. This is similar to disarming a timer. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210726125513.271824-4-frederic@kernel.org Signed-off-by: Sasha Levin --- kernel/time/posix-cpu-timers.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index eacb0ca301932..30e061b210b7c 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1201,8 +1201,6 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid, } } - if (!*newval) - return; *newval += now; } From 4ac828f7b20e2be3bfda092c7c74433cd9b31d72 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Jul 2021 15:39:46 +0200 Subject: [PATCH 0510/5344] hrtimer: Avoid double reprogramming in __hrtimer_start_range_ns() [ Upstream commit 627ef5ae2df8eeccb20d5af0e4cfa4df9e61ed28 ] If __hrtimer_start_range_ns() is invoked with an already armed hrtimer then the timer has to be canceled first and then added back. If the timer is the first expiring timer then on removal the clockevent device is reprogrammed to the next expiring timer to avoid that the pending expiry fires needlessly. If the new expiry time ends up to be the first expiry again then the clock event device has to reprogrammed again. Avoid this by checking whether the timer is the first to expire and in that case, keep the timer on the current CPU and delay the reprogramming up to the point where the timer has been enqueued again. Reported-by: Lorenzo Colitti Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210713135157.873137732@linutronix.de Signed-off-by: Sasha Levin --- kernel/time/hrtimer.c | 60 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 7 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 1f3e3a17f67e0..39beb9aaa24b2 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1031,12 +1031,13 @@ static void __remove_hrtimer(struct hrtimer *timer, * remove hrtimer, called with base lock held */ static inline int -remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart) +remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, + bool restart, bool keep_local) { u8 state = timer->state; if (state & HRTIMER_STATE_ENQUEUED) { - int reprogram; + bool reprogram; /* * Remove the timer and force reprogramming when high @@ -1049,8 +1050,16 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest debug_deactivate(timer); reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); + /* + * If the timer is not restarted then reprogramming is + * required if the timer is local. If it is local and about + * to be restarted, avoid programming it twice (on removal + * and a moment later when it's requeued). + */ if (!restart) state = HRTIMER_STATE_INACTIVE; + else + reprogram &= !keep_local; __remove_hrtimer(timer, base, state, reprogram); return 1; @@ -1104,9 +1113,31 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, struct hrtimer_clock_base *base) { struct hrtimer_clock_base *new_base; + bool force_local, first; - /* Remove an active timer from the queue: */ - remove_hrtimer(timer, base, true); + /* + * If the timer is on the local cpu base and is the first expiring + * timer then this might end up reprogramming the hardware twice + * (on removal and on enqueue). To avoid that by prevent the + * reprogram on removal, keep the timer local to the current CPU + * and enforce reprogramming after it is queued no matter whether + * it is the new first expiring timer again or not. + */ + force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases); + force_local &= base->cpu_base->next_timer == timer; + + /* + * Remove an active timer from the queue. In case it is not queued + * on the current CPU, make sure that remove_hrtimer() updates the + * remote data correctly. + * + * If it's on the current CPU and the first expiring timer, then + * skip reprogramming, keep the timer local and enforce + * reprogramming later if it was the first expiring timer. This + * avoids programming the underlying clock event twice (once at + * removal and once after enqueue). + */ + remove_hrtimer(timer, base, true, force_local); if (mode & HRTIMER_MODE_REL) tim = ktime_add_safe(tim, base->get_time()); @@ -1116,9 +1147,24 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, hrtimer_set_expires_range_ns(timer, tim, delta_ns); /* Switch the timer base, if necessary: */ - new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); + if (!force_local) { + new_base = switch_hrtimer_base(timer, base, + mode & HRTIMER_MODE_PINNED); + } else { + new_base = base; + } + + first = enqueue_hrtimer(timer, new_base, mode); + if (!force_local) + return first; - return enqueue_hrtimer(timer, new_base, mode); + /* + * Timer was forced to stay on the current CPU to avoid + * reprogramming on removal and enqueue. Force reprogram the + * hardware by evaluating the new first expiring timer. + */ + hrtimer_force_reprogram(new_base->cpu_base, 1); + return 0; } /** @@ -1184,7 +1230,7 @@ int hrtimer_try_to_cancel(struct hrtimer *timer) base = lock_hrtimer_base(timer, &flags); if (!hrtimer_callback_running(timer)) - ret = remove_hrtimer(timer, base, false); + ret = remove_hrtimer(timer, base, false, false); unlock_hrtimer_base(timer, &flags); From 7bc7566f45bdb2b1b77146d6861847c735bea6db Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Jul 2021 15:39:48 +0200 Subject: [PATCH 0511/5344] hrtimer: Ensure timerfd notification for HIGHRES=n [ Upstream commit 8c3b5e6ec0fee18bc2ce38d1dfe913413205f908 ] If high resolution timers are disabled the timerfd notification about a clock was set event is not happening for all cases which use clock_was_set_delayed() because that's a NOP for HIGHRES=n, which is wrong. Make clock_was_set_delayed() unconditially available to fix that. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210713135158.196661266@linutronix.de Signed-off-by: Sasha Levin --- include/linux/hrtimer.h | 5 ----- kernel/time/hrtimer.c | 32 ++++++++++++++++---------------- kernel/time/tick-internal.h | 3 +++ 3 files changed, 19 insertions(+), 21 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 1f98b52118f0a..48be92aded5ee 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -317,16 +317,12 @@ struct clock_event_device; extern void hrtimer_interrupt(struct clock_event_device *dev); -extern void clock_was_set_delayed(void); - extern unsigned int hrtimer_resolution; #else #define hrtimer_resolution (unsigned int)LOW_RES_NSEC -static inline void clock_was_set_delayed(void) { } - #endif static inline ktime_t @@ -350,7 +346,6 @@ hrtimer_expires_remaining_adjusted(const struct hrtimer *timer) timer->base->get_time()); } -extern void clock_was_set(void); #ifdef CONFIG_TIMERFD extern void timerfd_clock_was_set(void); #else diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 39beb9aaa24b2..e1e8d5dab0c59 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -759,22 +759,6 @@ static void hrtimer_switch_to_hres(void) retrigger_next_event(NULL); } -static void clock_was_set_work(struct work_struct *work) -{ - clock_was_set(); -} - -static DECLARE_WORK(hrtimer_work, clock_was_set_work); - -/* - * Called from timekeeping and resume code to reprogram the hrtimer - * interrupt device on all cpus. - */ -void clock_was_set_delayed(void) -{ - schedule_work(&hrtimer_work); -} - #else static inline int hrtimer_is_hres_enabled(void) { return 0; } @@ -892,6 +876,22 @@ void clock_was_set(void) timerfd_clock_was_set(); } +static void clock_was_set_work(struct work_struct *work) +{ + clock_was_set(); +} + +static DECLARE_WORK(hrtimer_work, clock_was_set_work); + +/* + * Called from timekeeping and resume code to reprogram the hrtimer + * interrupt device on all cpus and to notify timerfd. + */ +void clock_was_set_delayed(void) +{ + schedule_work(&hrtimer_work); +} + /* * During resume we might have to reprogram the high resolution timer * interrupt on all online CPUs. However, all other CPUs will be diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 7b24961367292..5294f5b1f9550 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -165,3 +165,6 @@ DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem); void timer_clear_idle(void); + +void clock_was_set(void); +void clock_was_set_delayed(void); From ed6234fc6b2ebcb04b22b2a0842af6f15b6b8028 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 3 May 2021 11:39:03 +0200 Subject: [PATCH 0512/5344] udf: Check LVID earlier [ Upstream commit 781d2a9a2fc7d0be53a072794dc03ef6de770f3d ] We were checking validity of LVID entries only when getting implementation use information from LVID in udf_sb_lvidiu(). However if the LVID is suitably corrupted, it can cause problems also to code such as udf_count_free() which doesn't use udf_sb_lvidiu(). So check validity of LVID already when loading it from the disk and just disable LVID altogether when it is not valid. Reported-by: syzbot+7fbfe5fed73ebb675748@syzkaller.appspotmail.com Signed-off-by: Jan Kara Signed-off-by: Sasha Levin --- fs/udf/super.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/fs/udf/super.c b/fs/udf/super.c index 8bb001c7927f0..a1efcf0593cb8 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -108,16 +108,10 @@ struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct super_block *sb) return NULL; lvid = (struct logicalVolIntegrityDesc *)UDF_SB(sb)->s_lvid_bh->b_data; partnum = le32_to_cpu(lvid->numOfPartitions); - if ((sb->s_blocksize - sizeof(struct logicalVolIntegrityDescImpUse) - - offsetof(struct logicalVolIntegrityDesc, impUse)) / - (2 * sizeof(uint32_t)) < partnum) { - udf_err(sb, "Logical volume integrity descriptor corrupted " - "(numOfPartitions = %u)!\n", partnum); - return NULL; - } /* The offset is to skip freeSpaceTable and sizeTable arrays */ offset = partnum * 2 * sizeof(uint32_t); - return (struct logicalVolIntegrityDescImpUse *)&(lvid->impUse[offset]); + return (struct logicalVolIntegrityDescImpUse *) + (((uint8_t *)(lvid + 1)) + offset); } /* UDF filesystem type */ @@ -1548,6 +1542,7 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ struct udf_sb_info *sbi = UDF_SB(sb); struct logicalVolIntegrityDesc *lvid; int indirections = 0; + u32 parts, impuselen; while (++indirections <= UDF_MAX_LVID_NESTING) { final_bh = NULL; @@ -1574,15 +1569,27 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ lvid = (struct logicalVolIntegrityDesc *)final_bh->b_data; if (lvid->nextIntegrityExt.extLength == 0) - return; + goto check; loc = leea_to_cpu(lvid->nextIntegrityExt); } udf_warn(sb, "Too many LVID indirections (max %u), ignoring.\n", UDF_MAX_LVID_NESTING); +out_err: brelse(sbi->s_lvid_bh); sbi->s_lvid_bh = NULL; + return; +check: + parts = le32_to_cpu(lvid->numOfPartitions); + impuselen = le32_to_cpu(lvid->lengthOfImpUse); + if (parts >= sb->s_blocksize || impuselen >= sb->s_blocksize || + sizeof(struct logicalVolIntegrityDesc) + impuselen + + 2 * parts * sizeof(u32) > sb->s_blocksize) { + udf_warn(sb, "Corrupted LVID (parts=%u, impuselen=%u), " + "ignoring.\n", parts, impuselen); + goto out_err; + } } /* From 427b4317da79cda5b90967ca2457362a05c1f65d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Sun, 8 Aug 2021 18:24:36 +0200 Subject: [PATCH 0513/5344] udf: Fix iocharset=utf8 mount option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit b645333443712d2613e4e863f81090d5dc509657 ] Currently iocharset=utf8 mount option is broken. To use UTF-8 as iocharset, it is required to use utf8 mount option. Fix iocharset=utf8 mount option to use be equivalent to the utf8 mount option. If UTF-8 as iocharset is used then s_nls_map is set to NULL. So simplify code around, remove UDF_FLAG_NLS_MAP and UDF_FLAG_UTF8 flags as to distinguish between UTF-8 and non-UTF-8 it is needed just to check if s_nls_map set to NULL or not. Link: https://lore.kernel.org/r/20210808162453.1653-4-pali@kernel.org Signed-off-by: Pali Rohár Signed-off-by: Jan Kara Signed-off-by: Sasha Levin --- fs/udf/super.c | 50 ++++++++++++++++++------------------------------ fs/udf/udf_sb.h | 2 -- fs/udf/unicode.c | 4 ++-- 3 files changed, 21 insertions(+), 35 deletions(-) diff --git a/fs/udf/super.c b/fs/udf/super.c index a1efcf0593cb8..5663bae95700c 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -343,10 +343,10 @@ static int udf_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",lastblock=%u", sbi->s_last_block); if (sbi->s_anchor != 0) seq_printf(seq, ",anchor=%u", sbi->s_anchor); - if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) - seq_puts(seq, ",utf8"); - if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP) && sbi->s_nls_map) + if (sbi->s_nls_map) seq_printf(seq, ",iocharset=%s", sbi->s_nls_map->charset); + else + seq_puts(seq, ",iocharset=utf8"); return 0; } @@ -551,19 +551,24 @@ static int udf_parse_options(char *options, struct udf_options *uopt, /* Ignored (never implemented properly) */ break; case Opt_utf8: - uopt->flags |= (1 << UDF_FLAG_UTF8); + if (!remount) { + unload_nls(uopt->nls_map); + uopt->nls_map = NULL; + } break; case Opt_iocharset: if (!remount) { - if (uopt->nls_map) - unload_nls(uopt->nls_map); - /* - * load_nls() failure is handled later in - * udf_fill_super() after all options are - * parsed. - */ + unload_nls(uopt->nls_map); + uopt->nls_map = NULL; + } + /* When nls_map is not loaded then UTF-8 is used */ + if (!remount && strcmp(args[0].from, "utf8") != 0) { uopt->nls_map = load_nls(args[0].from); - uopt->flags |= (1 << UDF_FLAG_NLS_MAP); + if (!uopt->nls_map) { + pr_err("iocharset %s not found\n", + args[0].from); + return 0; + } } break; case Opt_uforget: @@ -2152,21 +2157,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) if (!udf_parse_options((char *)options, &uopt, false)) goto parse_options_failure; - if (uopt.flags & (1 << UDF_FLAG_UTF8) && - uopt.flags & (1 << UDF_FLAG_NLS_MAP)) { - udf_err(sb, "utf8 cannot be combined with iocharset\n"); - goto parse_options_failure; - } - if ((uopt.flags & (1 << UDF_FLAG_NLS_MAP)) && !uopt.nls_map) { - uopt.nls_map = load_nls_default(); - if (!uopt.nls_map) - uopt.flags &= ~(1 << UDF_FLAG_NLS_MAP); - else - udf_debug("Using default NLS map\n"); - } - if (!(uopt.flags & (1 << UDF_FLAG_NLS_MAP))) - uopt.flags |= (1 << UDF_FLAG_UTF8); - fileset.logicalBlockNum = 0xFFFFFFFF; fileset.partitionReferenceNum = 0xFFFF; @@ -2321,8 +2311,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) error_out: iput(sbi->s_vat_inode); parse_options_failure: - if (uopt.nls_map) - unload_nls(uopt.nls_map); + unload_nls(uopt.nls_map); if (lvid_open) udf_close_lvid(sb); brelse(sbi->s_lvid_bh); @@ -2372,8 +2361,7 @@ static void udf_put_super(struct super_block *sb) sbi = UDF_SB(sb); iput(sbi->s_vat_inode); - if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) - unload_nls(sbi->s_nls_map); + unload_nls(sbi->s_nls_map); if (!sb_rdonly(sb)) udf_close_lvid(sb); brelse(sbi->s_lvid_bh); diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 3d83be54c4748..8eace7a633d38 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -20,8 +20,6 @@ #define UDF_FLAG_UNDELETE 6 #define UDF_FLAG_UNHIDE 7 #define UDF_FLAG_VARCONV 8 -#define UDF_FLAG_NLS_MAP 9 -#define UDF_FLAG_UTF8 10 #define UDF_FLAG_UID_FORGET 11 /* save -1 for uid to disk */ #define UDF_FLAG_GID_FORGET 12 #define UDF_FLAG_UID_SET 13 diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c index 5fcfa96463ebb..622569007b530 100644 --- a/fs/udf/unicode.c +++ b/fs/udf/unicode.c @@ -177,7 +177,7 @@ static int udf_name_from_CS0(struct super_block *sb, return 0; } - if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) + if (UDF_SB(sb)->s_nls_map) conv_f = UDF_SB(sb)->s_nls_map->uni2char; else conv_f = NULL; @@ -285,7 +285,7 @@ static int udf_name_to_CS0(struct super_block *sb, if (ocu_max_len <= 0) return 0; - if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) + if (UDF_SB(sb)->s_nls_map) conv_f = UDF_SB(sb)->s_nls_map->char2uni; else conv_f = NULL; From c78ff82913a3f8813b6a1c359abf840bc3828342 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Sun, 8 Aug 2021 18:24:37 +0200 Subject: [PATCH 0514/5344] isofs: joliet: Fix iocharset=utf8 mount option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 28ce50f8d96ec9035f60c9348294ea26b94db944 ] Currently iocharset=utf8 mount option is broken. To use UTF-8 as iocharset, it is required to use utf8 mount option. Fix iocharset=utf8 mount option to use be equivalent to the utf8 mount option. If UTF-8 as iocharset is used then s_nls_iocharset is set to NULL. So simplify code around, remove s_utf8 field as to distinguish between UTF-8 and non-UTF-8 it is needed just to check if s_nls_iocharset is set to NULL or not. Link: https://lore.kernel.org/r/20210808162453.1653-5-pali@kernel.org Signed-off-by: Pali Rohár Signed-off-by: Jan Kara Signed-off-by: Sasha Levin --- fs/isofs/inode.c | 27 +++++++++++++-------------- fs/isofs/isofs.h | 1 - fs/isofs/joliet.c | 4 +--- 3 files changed, 14 insertions(+), 18 deletions(-) diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 62c0462dc89f3..bf30f6ce8dd10 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -155,7 +155,6 @@ struct iso9660_options{ unsigned int overriderockperm:1; unsigned int uid_set:1; unsigned int gid_set:1; - unsigned int utf8:1; unsigned char map; unsigned char check; unsigned int blocksize; @@ -355,7 +354,6 @@ static int parse_options(char *options, struct iso9660_options *popt) popt->gid = GLOBAL_ROOT_GID; popt->uid = GLOBAL_ROOT_UID; popt->iocharset = NULL; - popt->utf8 = 0; popt->overriderockperm = 0; popt->session=-1; popt->sbsector=-1; @@ -388,10 +386,13 @@ static int parse_options(char *options, struct iso9660_options *popt) case Opt_cruft: popt->cruft = 1; break; +#ifdef CONFIG_JOLIET case Opt_utf8: - popt->utf8 = 1; + kfree(popt->iocharset); + popt->iocharset = kstrdup("utf8", GFP_KERNEL); + if (!popt->iocharset) + return 0; break; -#ifdef CONFIG_JOLIET case Opt_iocharset: kfree(popt->iocharset); popt->iocharset = match_strdup(&args[0]); @@ -494,7 +495,6 @@ static int isofs_show_options(struct seq_file *m, struct dentry *root) if (sbi->s_nocompress) seq_puts(m, ",nocompress"); if (sbi->s_overriderockperm) seq_puts(m, ",overriderockperm"); if (sbi->s_showassoc) seq_puts(m, ",showassoc"); - if (sbi->s_utf8) seq_puts(m, ",utf8"); if (sbi->s_check) seq_printf(m, ",check=%c", sbi->s_check); if (sbi->s_mapping) seq_printf(m, ",map=%c", sbi->s_mapping); @@ -517,9 +517,10 @@ static int isofs_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",fmode=%o", sbi->s_fmode); #ifdef CONFIG_JOLIET - if (sbi->s_nls_iocharset && - strcmp(sbi->s_nls_iocharset->charset, CONFIG_NLS_DEFAULT) != 0) + if (sbi->s_nls_iocharset) seq_printf(m, ",iocharset=%s", sbi->s_nls_iocharset->charset); + else + seq_puts(m, ",iocharset=utf8"); #endif return 0; } @@ -867,14 +868,13 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent) sbi->s_nls_iocharset = NULL; #ifdef CONFIG_JOLIET - if (joliet_level && opt.utf8 == 0) { + if (joliet_level) { char *p = opt.iocharset ? opt.iocharset : CONFIG_NLS_DEFAULT; - sbi->s_nls_iocharset = load_nls(p); - if (! sbi->s_nls_iocharset) { - /* Fail only if explicit charset specified */ - if (opt.iocharset) + if (strcmp(p, "utf8") != 0) { + sbi->s_nls_iocharset = opt.iocharset ? + load_nls(opt.iocharset) : load_nls_default(); + if (!sbi->s_nls_iocharset) goto out_freesbi; - sbi->s_nls_iocharset = load_nls_default(); } } #endif @@ -890,7 +890,6 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent) sbi->s_gid = opt.gid; sbi->s_uid_set = opt.uid_set; sbi->s_gid_set = opt.gid_set; - sbi->s_utf8 = opt.utf8; sbi->s_nocompress = opt.nocompress; sbi->s_overriderockperm = opt.overriderockperm; /* diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h index 055ec6c586f7f..dcdc191ed1834 100644 --- a/fs/isofs/isofs.h +++ b/fs/isofs/isofs.h @@ -44,7 +44,6 @@ struct isofs_sb_info { unsigned char s_session; unsigned int s_high_sierra:1; unsigned int s_rock:2; - unsigned int s_utf8:1; unsigned int s_cruft:1; /* Broken disks with high byte of length * containing junk */ unsigned int s_nocompress:1; diff --git a/fs/isofs/joliet.c b/fs/isofs/joliet.c index be8b6a9d0b926..c0f04a1e7f695 100644 --- a/fs/isofs/joliet.c +++ b/fs/isofs/joliet.c @@ -41,14 +41,12 @@ uni16_to_x8(unsigned char *ascii, __be16 *uni, int len, struct nls_table *nls) int get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, struct inode * inode) { - unsigned char utf8; struct nls_table *nls; unsigned char len = 0; - utf8 = ISOFS_SB(inode->i_sb)->s_utf8; nls = ISOFS_SB(inode->i_sb)->s_nls_iocharset; - if (utf8) { + if (!nls) { len = utf16s_to_utf8s((const wchar_t *) de->name, de->name_len[0] >> 1, UTF16_BIG_ENDIAN, outname, PAGE_SIZE); From 1832faa2ee5052aaa8810c64c6b98f301d5fb098 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 08:40:26 +0200 Subject: [PATCH 0515/5344] bcache: add proper error unwinding in bcache_device_init [ Upstream commit 224b0683228c5f332f9cee615d85e75e9a347170 ] Except for the IDA none of the allocations in bcache_device_init is unwound on error, fix that. Signed-off-by: Christoph Hellwig Acked-by: Coly Li Link: https://lore.kernel.org/r/20210809064028.1198327-7-hch@lst.de Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- drivers/md/bcache/super.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index b0d569032dd4e..efdf6ce0443ea 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -839,20 +839,20 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long); d->full_dirty_stripes = kvzalloc(n, GFP_KERNEL); if (!d->full_dirty_stripes) - return -ENOMEM; + goto out_free_stripe_sectors_dirty; idx = ida_simple_get(&bcache_device_idx, 0, BCACHE_DEVICE_IDX_MAX, GFP_KERNEL); if (idx < 0) - return idx; + goto out_free_full_dirty_stripes; if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio), BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER)) - goto err; + goto out_ida_remove; d->disk = alloc_disk(BCACHE_MINORS); if (!d->disk) - goto err; + goto out_bioset_exit; set_capacity(d->disk, sectors); snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx); @@ -887,8 +887,14 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, return 0; -err: +out_bioset_exit: + bioset_exit(&d->bio_split); +out_ida_remove: ida_simple_remove(&bcache_device_idx, idx); +out_free_full_dirty_stripes: + kvfree(d->full_dirty_stripes); +out_free_stripe_sectors_dirty: + kvfree(d->stripe_sectors_dirty); return -ENOMEM; } From 244094b7ed84983beae5ce24d40eba4c8af2c4b3 Mon Sep 17 00:00:00 2001 From: Ruozhu Li Date: Sat, 7 Aug 2021 11:50:23 +0800 Subject: [PATCH 0516/5344] nvme-tcp: don't update queue count when failing to set io queues [ Upstream commit 664227fde63844d69e9ec9e90a8a7801e6ff072d ] We update ctrl->queue_count and schedule another reconnect when io queue count is zero.But we will never try to create any io queue in next reco- nnection, because ctrl->queue_count already set to zero.We will end up having an admin-only session in Live state, which is exactly what we try to avoid in the original patch. Update ctrl->queue_count after queue_count zero checking to fix it. Signed-off-by: Ruozhu Li Signed-off-by: Christoph Hellwig Signed-off-by: Sasha Levin --- drivers/nvme/host/tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 718152adc6254..f6427a10a9908 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1649,13 +1649,13 @@ static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) if (ret) return ret; - ctrl->queue_count = nr_io_queues + 1; - if (ctrl->queue_count < 2) { + if (nr_io_queues == 0) { dev_err(ctrl->device, "unable to set any I/O queues\n"); return -ENOMEM; } + ctrl->queue_count = nr_io_queues + 1; dev_info(ctrl->device, "creating %d I/O queues.\n", nr_io_queues); From 04945ef890420edd261004963ef54444a42f034f Mon Sep 17 00:00:00 2001 From: Ruozhu Li Date: Wed, 28 Jul 2021 17:41:20 +0800 Subject: [PATCH 0517/5344] nvme-rdma: don't update queue count when failing to set io queues [ Upstream commit 85032874f80ba17bf187de1d14d9603bf3f582b8 ] We update ctrl->queue_count and schedule another reconnect when io queue count is zero.But we will never try to create any io queue in next reco- nnection, because ctrl->queue_count already set to zero.We will end up having an admin-only session in Live state, which is exactly what we try to avoid in the original patch. Update ctrl->queue_count after queue_count zero checking to fix it. Signed-off-by: Ruozhu Li Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Sasha Levin --- drivers/nvme/host/rdma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index b8c0f75bfb7ba..dcc3d2393605e 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -665,13 +665,13 @@ static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl) if (ret) return ret; - ctrl->ctrl.queue_count = nr_io_queues + 1; - if (ctrl->ctrl.queue_count < 2) { + if (nr_io_queues == 0) { dev_err(ctrl->ctrl.device, "unable to set any I/O queues\n"); return -ENOMEM; } + ctrl->ctrl.queue_count = nr_io_queues + 1; dev_info(ctrl->ctrl.device, "creating %d I/O queues.\n", nr_io_queues); From 50465ac87a1f0a543b4e43298770135abd3a8309 Mon Sep 17 00:00:00 2001 From: Amit Engel Date: Sun, 8 Aug 2021 09:20:14 +0300 Subject: [PATCH 0518/5344] nvmet: pass back cntlid on successful completion [ Upstream commit e804d5abe2d74cfe23f5f83be580d1cdc9307111 ] According to the NVMe specification, the response dword 0 value of the Connect command is based on status code: return cntlid for successful compeltion return IPO and IATTR for connect invalid parameters. Fix a missing error information for a zero sized queue, and return the cntlid also for I/O queue Connect commands. Signed-off-by: Amit Engel Signed-off-by: Christoph Hellwig Signed-off-by: Sasha Levin --- drivers/nvme/target/fabrics-cmd.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index feef15c38ec91..880bcfd5b0f22 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -120,6 +120,7 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) if (!sqsize) { pr_warn("queue size zero!\n"); req->error_loc = offsetof(struct nvmf_connect_command, sqsize); + req->cqe->result.u32 = IPO_IATTR_CONNECT_SQE(sqsize); ret = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; goto err; } @@ -260,11 +261,11 @@ static void nvmet_execute_io_connect(struct nvmet_req *req) } status = nvmet_install_queue(ctrl, req); - if (status) { - /* pass back cntlid that had the issue of installing queue */ - req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid); + if (status) goto out_ctrl_put; - } + + /* pass back cntlid for successful completion */ + req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid); pr_debug("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid); From e2530592e9ee7f4a8c526ed7f9c7a4cdb27907e7 Mon Sep 17 00:00:00 2001 From: Sebastian Krzyszkowiak Date: Mon, 16 Aug 2021 18:50:14 +0200 Subject: [PATCH 0519/5344] power: supply: max17042_battery: fix typo in MAx17042_TOFF [ Upstream commit ed0d0a0506025f06061325cedae1bbebd081620a ] Signed-off-by: Sebastian Krzyszkowiak Signed-off-by: Sebastian Reichel Signed-off-by: Sasha Levin --- drivers/power/supply/max17042_battery.c | 2 +- include/linux/power/max17042_battery.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/power/supply/max17042_battery.c b/drivers/power/supply/max17042_battery.c index fa862f0380c41..ab4740c3bf573 100644 --- a/drivers/power/supply/max17042_battery.c +++ b/drivers/power/supply/max17042_battery.c @@ -726,7 +726,7 @@ static inline void max17042_override_por_values(struct max17042_chip *chip) struct max17042_config_data *config = chip->pdata->config_data; max17042_override_por(map, MAX17042_TGAIN, config->tgain); - max17042_override_por(map, MAx17042_TOFF, config->toff); + max17042_override_por(map, MAX17042_TOFF, config->toff); max17042_override_por(map, MAX17042_CGAIN, config->cgain); max17042_override_por(map, MAX17042_COFF, config->coff); diff --git a/include/linux/power/max17042_battery.h b/include/linux/power/max17042_battery.h index 4badd53229490..2f9ff5017f122 100644 --- a/include/linux/power/max17042_battery.h +++ b/include/linux/power/max17042_battery.h @@ -69,7 +69,7 @@ enum max17042_register { MAX17042_RelaxCFG = 0x2A, MAX17042_MiscCFG = 0x2B, MAX17042_TGAIN = 0x2C, - MAx17042_TOFF = 0x2D, + MAX17042_TOFF = 0x2D, MAX17042_CGAIN = 0x2E, MAX17042_COFF = 0x2F, From 1911cc466f5244924bd2f17a9473d9df35fa7c81 Mon Sep 17 00:00:00 2001 From: Vineeth Vijayan Date: Sun, 25 Apr 2021 10:52:38 +0200 Subject: [PATCH 0520/5344] s390/cio: add dev_busid sysfs entry for each subchannel [ Upstream commit d3683c055212bf910d4e318f7944910ce10dbee6 ] Introduce dev_busid, which exports the device-id associated with the io-subchannel (and message-subchannel). The dev_busid indicates that of the device which may be physically installed on the corrosponding subchannel. The dev_busid value "none" indicates that the subchannel is not valid, there is no I/O device currently associated with the subchannel. The dev_busid information would be helpful to write device-specific udev-rules associated with the subchannel. The dev_busid interface would be available even when the sch is not bound to any driver or if there is no operational device connected on it. Hence this attribute can be used to write udev-rules which are specific to the device associated with the subchannel. Signed-off-by: Vineeth Vijayan Reviewed-by: Peter Oberparleiter Signed-off-by: Vasily Gorbik Signed-off-by: Sasha Levin --- drivers/s390/cio/css.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c index 5734a78dbb8e6..7950ac59b1744 100644 --- a/drivers/s390/cio/css.c +++ b/drivers/s390/cio/css.c @@ -426,9 +426,26 @@ static ssize_t pimpampom_show(struct device *dev, } static DEVICE_ATTR_RO(pimpampom); +static ssize_t dev_busid_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct subchannel *sch = to_subchannel(dev); + struct pmcw *pmcw = &sch->schib.pmcw; + + if ((pmcw->st == SUBCHANNEL_TYPE_IO || + pmcw->st == SUBCHANNEL_TYPE_MSG) && pmcw->dnv) + return sysfs_emit(buf, "0.%x.%04x\n", sch->schid.ssid, + pmcw->dev); + else + return sysfs_emit(buf, "none\n"); +} +static DEVICE_ATTR_RO(dev_busid); + static struct attribute *io_subchannel_type_attrs[] = { &dev_attr_chpids.attr, &dev_attr_pimpampom.attr, + &dev_attr_dev_busid.attr, NULL, }; ATTRIBUTE_GROUPS(io_subchannel_type); From 72966d31ab79be37822b83f9a93e65b074fd9cee Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 16 Aug 2021 10:44:47 +0900 Subject: [PATCH 0521/5344] libata: fix ata_host_start() [ Upstream commit 355a8031dc174450ccad2a61c513ad7222d87a97 ] The loop on entry of ata_host_start() may not initialize host->ops to a non NULL value. The test on the host_stop field of host->ops must then be preceded by a check that host->ops is not NULL. Reported-by: kernel test robot Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210816014456.2191776-3-damien.lemoal@wdc.com Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- drivers/ata/libata-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index f67b3fb33d579..7788af0ca1090 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -6394,7 +6394,7 @@ int ata_host_start(struct ata_host *host) have_stop = 1; } - if (host->ops->host_stop) + if (host->ops && host->ops->host_stop) have_stop = 1; if (have_stop) { From f0f3b824bf9a8f8f3ed07c85330cb4e072320567 Mon Sep 17 00:00:00 2001 From: Giovanni Cabiddu Date: Thu, 12 Aug 2021 21:21:13 +0100 Subject: [PATCH 0522/5344] crypto: qat - do not ignore errors from enable_vf2pf_comms() [ Upstream commit 5147f0906d50a9d26f2b8698cd06b5680e9867ff ] The function adf_dev_init() ignores the error code reported by enable_vf2pf_comms(). If the latter fails, e.g. the VF is not compatible with the pf, then the load of the VF driver progresses. This patch changes adf_dev_init() so that the error code from enable_vf2pf_comms() is returned to the caller. Signed-off-by: Giovanni Cabiddu Reviewed-by: Marco Chiappero Reviewed-by: Fiona Trahe Signed-off-by: Herbert Xu Signed-off-by: Sasha Levin --- drivers/crypto/qat/qat_common/adf_init.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/qat/qat_common/adf_init.c b/drivers/crypto/qat/qat_common/adf_init.c index 26556c7130497..7a7d43c475342 100644 --- a/drivers/crypto/qat/qat_common/adf_init.c +++ b/drivers/crypto/qat/qat_common/adf_init.c @@ -105,6 +105,7 @@ int adf_dev_init(struct adf_accel_dev *accel_dev) struct service_hndl *service; struct list_head *list_itr; struct adf_hw_device_data *hw_data = accel_dev->hw_device; + int ret; if (!hw_data) { dev_err(&GET_DEV(accel_dev), @@ -171,9 +172,9 @@ int adf_dev_init(struct adf_accel_dev *accel_dev) } hw_data->enable_error_correction(accel_dev); - hw_data->enable_vf2pf_comms(accel_dev); + ret = hw_data->enable_vf2pf_comms(accel_dev); - return 0; + return ret; } EXPORT_SYMBOL_GPL(adf_dev_init); From 3733fb13cc3b7b26277628d4c763956157f8f436 Mon Sep 17 00:00:00 2001 From: Giovanni Cabiddu Date: Thu, 12 Aug 2021 21:21:14 +0100 Subject: [PATCH 0523/5344] crypto: qat - handle both source of interrupt in VF ISR [ Upstream commit 0a73c762e1eee33a5e5dc0e3488f1b7cd17249b3 ] The top half of the VF drivers handled only a source at the time. If an interrupt for PF2VF and bundle occurred at the same time, the ISR scheduled only the bottom half for PF2VF. This patch fixes the VF top half so that if both sources of interrupt trigger at the same time, both bottom halves are scheduled. This patch is based on earlier work done by Conor McLoughlin. Signed-off-by: Giovanni Cabiddu Reviewed-by: Marco Chiappero Reviewed-by: Fiona Trahe Signed-off-by: Herbert Xu Signed-off-by: Sasha Levin --- drivers/crypto/qat/qat_common/adf_vf_isr.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/crypto/qat/qat_common/adf_vf_isr.c b/drivers/crypto/qat/qat_common/adf_vf_isr.c index df9a1f35b8320..ef90902c8200d 100644 --- a/drivers/crypto/qat/qat_common/adf_vf_isr.c +++ b/drivers/crypto/qat/qat_common/adf_vf_isr.c @@ -203,6 +203,7 @@ static irqreturn_t adf_isr(int irq, void *privdata) struct adf_bar *pmisc = &GET_BARS(accel_dev)[hw_data->get_misc_bar_id(hw_data)]; void __iomem *pmisc_bar_addr = pmisc->virt_addr; + bool handled = false; u32 v_int; /* Read VF INT source CSR to determine the source of VF interrupt */ @@ -215,7 +216,7 @@ static irqreturn_t adf_isr(int irq, void *privdata) /* Schedule tasklet to handle interrupt BH */ tasklet_hi_schedule(&accel_dev->vf.pf2vf_bh_tasklet); - return IRQ_HANDLED; + handled = true; } /* Check bundle interrupt */ @@ -227,10 +228,10 @@ static irqreturn_t adf_isr(int irq, void *privdata) WRITE_CSR_INT_FLAG_AND_COL(bank->csr_addr, bank->bank_number, 0); tasklet_hi_schedule(&bank->resp_handler); - return IRQ_HANDLED; + handled = true; } - return IRQ_NONE; + return handled ? IRQ_HANDLED : IRQ_NONE; } static int adf_request_msi_irq(struct adf_accel_dev *accel_dev) From 7da9e1ad5684b566a2e41ff7b113abdec55729cd Mon Sep 17 00:00:00 2001 From: Marco Chiappero Date: Thu, 12 Aug 2021 21:21:19 +0100 Subject: [PATCH 0524/5344] crypto: qat - fix reuse of completion variable [ Upstream commit 3d655732b0199562267a05c7ff69ecdd11632939 ] Use reinit_completion() to set to a clean state a completion variable, used to coordinate the VF to PF request-response flow, before every new VF request. Signed-off-by: Marco Chiappero Co-developed-by: Giovanni Cabiddu Signed-off-by: Giovanni Cabiddu Reviewed-by: Fiona Trahe Signed-off-by: Herbert Xu Signed-off-by: Sasha Levin --- drivers/crypto/qat/qat_common/adf_pf2vf_msg.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/crypto/qat/qat_common/adf_pf2vf_msg.c b/drivers/crypto/qat/qat_common/adf_pf2vf_msg.c index b3875fdf6cd72..9dab2cc11fdfa 100644 --- a/drivers/crypto/qat/qat_common/adf_pf2vf_msg.c +++ b/drivers/crypto/qat/qat_common/adf_pf2vf_msg.c @@ -361,6 +361,8 @@ static int adf_vf2pf_request_version(struct adf_accel_dev *accel_dev) msg |= ADF_PFVF_COMPATIBILITY_VERSION << ADF_VF2PF_COMPAT_VER_REQ_SHIFT; BUILD_BUG_ON(ADF_PFVF_COMPATIBILITY_VERSION > 255); + reinit_completion(&accel_dev->vf.iov_msg_completion); + /* Send request from VF to PF */ ret = adf_iov_putmsg(accel_dev, msg, 0); if (ret) { From fd15bcdc8b899c256b088f61c38de52bde6ffc89 Mon Sep 17 00:00:00 2001 From: Marco Chiappero Date: Thu, 12 Aug 2021 21:21:22 +0100 Subject: [PATCH 0525/5344] crypto: qat - fix naming for init/shutdown VF to PF notifications [ Upstream commit b90c1c4d3fa8cd90f4e8245b13564380fd0bfad1 ] At start and shutdown, VFs notify the PF about their state. These notifications are carried out through a message exchange using the PFVF protocol. Function names lead to believe they do perform init or shutdown logic. This is to fix the naming to better reflect their purpose. Signed-off-by: Marco Chiappero Co-developed-by: Giovanni Cabiddu Signed-off-by: Giovanni Cabiddu Reviewed-by: Fiona Trahe Signed-off-by: Herbert Xu Signed-off-by: Sasha Levin --- drivers/crypto/qat/qat_c3xxxvf/adf_c3xxxvf_hw_data.c | 4 ++-- drivers/crypto/qat/qat_c62xvf/adf_c62xvf_hw_data.c | 4 ++-- drivers/crypto/qat/qat_common/adf_common_drv.h | 8 ++++---- drivers/crypto/qat/qat_common/adf_vf2pf_msg.c | 12 ++++++------ .../qat/qat_dh895xccvf/adf_dh895xccvf_hw_data.c | 4 ++-- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/crypto/qat/qat_c3xxxvf/adf_c3xxxvf_hw_data.c b/drivers/crypto/qat/qat_c3xxxvf/adf_c3xxxvf_hw_data.c index d2d0ae445fd89..7c7d49a8a4034 100644 --- a/drivers/crypto/qat/qat_c3xxxvf/adf_c3xxxvf_hw_data.c +++ b/drivers/crypto/qat/qat_c3xxxvf/adf_c3xxxvf_hw_data.c @@ -123,10 +123,10 @@ void adf_init_hw_data_c3xxxiov(struct adf_hw_device_data *hw_data) hw_data->enable_error_correction = adf_vf_void_noop; hw_data->init_admin_comms = adf_vf_int_noop; hw_data->exit_admin_comms = adf_vf_void_noop; - hw_data->send_admin_init = adf_vf2pf_init; + hw_data->send_admin_init = adf_vf2pf_notify_init; hw_data->init_arb = adf_vf_int_noop; hw_data->exit_arb = adf_vf_void_noop; - hw_data->disable_iov = adf_vf2pf_shutdown; + hw_data->disable_iov = adf_vf2pf_notify_shutdown; hw_data->get_accel_mask = get_accel_mask; hw_data->get_ae_mask = get_ae_mask; hw_data->get_num_accels = get_num_accels; diff --git a/drivers/crypto/qat/qat_c62xvf/adf_c62xvf_hw_data.c b/drivers/crypto/qat/qat_c62xvf/adf_c62xvf_hw_data.c index 38e4bc04f407b..90e8a7564756b 100644 --- a/drivers/crypto/qat/qat_c62xvf/adf_c62xvf_hw_data.c +++ b/drivers/crypto/qat/qat_c62xvf/adf_c62xvf_hw_data.c @@ -123,10 +123,10 @@ void adf_init_hw_data_c62xiov(struct adf_hw_device_data *hw_data) hw_data->enable_error_correction = adf_vf_void_noop; hw_data->init_admin_comms = adf_vf_int_noop; hw_data->exit_admin_comms = adf_vf_void_noop; - hw_data->send_admin_init = adf_vf2pf_init; + hw_data->send_admin_init = adf_vf2pf_notify_init; hw_data->init_arb = adf_vf_int_noop; hw_data->exit_arb = adf_vf_void_noop; - hw_data->disable_iov = adf_vf2pf_shutdown; + hw_data->disable_iov = adf_vf2pf_notify_shutdown; hw_data->get_accel_mask = get_accel_mask; hw_data->get_ae_mask = get_ae_mask; hw_data->get_num_accels = get_num_accels; diff --git a/drivers/crypto/qat/qat_common/adf_common_drv.h b/drivers/crypto/qat/qat_common/adf_common_drv.h index d78f8d5c89c3f..289dd7e48d4a4 100644 --- a/drivers/crypto/qat/qat_common/adf_common_drv.h +++ b/drivers/crypto/qat/qat_common/adf_common_drv.h @@ -239,8 +239,8 @@ void adf_enable_vf2pf_interrupts(struct adf_accel_dev *accel_dev, void adf_enable_pf2vf_interrupts(struct adf_accel_dev *accel_dev); void adf_disable_pf2vf_interrupts(struct adf_accel_dev *accel_dev); -int adf_vf2pf_init(struct adf_accel_dev *accel_dev); -void adf_vf2pf_shutdown(struct adf_accel_dev *accel_dev); +int adf_vf2pf_notify_init(struct adf_accel_dev *accel_dev); +void adf_vf2pf_notify_shutdown(struct adf_accel_dev *accel_dev); int adf_init_pf_wq(void); void adf_exit_pf_wq(void); int adf_init_vf_wq(void); @@ -263,12 +263,12 @@ static inline void adf_disable_pf2vf_interrupts(struct adf_accel_dev *accel_dev) { } -static inline int adf_vf2pf_init(struct adf_accel_dev *accel_dev) +static inline int adf_vf2pf_notify_init(struct adf_accel_dev *accel_dev) { return 0; } -static inline void adf_vf2pf_shutdown(struct adf_accel_dev *accel_dev) +static inline void adf_vf2pf_notify_shutdown(struct adf_accel_dev *accel_dev) { } diff --git a/drivers/crypto/qat/qat_common/adf_vf2pf_msg.c b/drivers/crypto/qat/qat_common/adf_vf2pf_msg.c index cd5f37dffe8a6..1830194567e84 100644 --- a/drivers/crypto/qat/qat_common/adf_vf2pf_msg.c +++ b/drivers/crypto/qat/qat_common/adf_vf2pf_msg.c @@ -49,14 +49,14 @@ #include "adf_pf2vf_msg.h" /** - * adf_vf2pf_init() - send init msg to PF + * adf_vf2pf_notify_init() - send init msg to PF * @accel_dev: Pointer to acceleration VF device. * * Function sends an init messge from the VF to a PF * * Return: 0 on success, error code otherwise. */ -int adf_vf2pf_init(struct adf_accel_dev *accel_dev) +int adf_vf2pf_notify_init(struct adf_accel_dev *accel_dev) { u32 msg = (ADF_VF2PF_MSGORIGIN_SYSTEM | (ADF_VF2PF_MSGTYPE_INIT << ADF_VF2PF_MSGTYPE_SHIFT)); @@ -69,17 +69,17 @@ int adf_vf2pf_init(struct adf_accel_dev *accel_dev) set_bit(ADF_STATUS_PF_RUNNING, &accel_dev->status); return 0; } -EXPORT_SYMBOL_GPL(adf_vf2pf_init); +EXPORT_SYMBOL_GPL(adf_vf2pf_notify_init); /** - * adf_vf2pf_shutdown() - send shutdown msg to PF + * adf_vf2pf_notify_shutdown() - send shutdown msg to PF * @accel_dev: Pointer to acceleration VF device. * * Function sends a shutdown messge from the VF to a PF * * Return: void */ -void adf_vf2pf_shutdown(struct adf_accel_dev *accel_dev) +void adf_vf2pf_notify_shutdown(struct adf_accel_dev *accel_dev) { u32 msg = (ADF_VF2PF_MSGORIGIN_SYSTEM | (ADF_VF2PF_MSGTYPE_SHUTDOWN << ADF_VF2PF_MSGTYPE_SHIFT)); @@ -89,4 +89,4 @@ void adf_vf2pf_shutdown(struct adf_accel_dev *accel_dev) dev_err(&GET_DEV(accel_dev), "Failed to send Shutdown event to PF\n"); } -EXPORT_SYMBOL_GPL(adf_vf2pf_shutdown); +EXPORT_SYMBOL_GPL(adf_vf2pf_notify_shutdown); diff --git a/drivers/crypto/qat/qat_dh895xccvf/adf_dh895xccvf_hw_data.c b/drivers/crypto/qat/qat_dh895xccvf/adf_dh895xccvf_hw_data.c index a3b4dd8099a7b..3a8361c83f0b1 100644 --- a/drivers/crypto/qat/qat_dh895xccvf/adf_dh895xccvf_hw_data.c +++ b/drivers/crypto/qat/qat_dh895xccvf/adf_dh895xccvf_hw_data.c @@ -123,10 +123,10 @@ void adf_init_hw_data_dh895xcciov(struct adf_hw_device_data *hw_data) hw_data->enable_error_correction = adf_vf_void_noop; hw_data->init_admin_comms = adf_vf_int_noop; hw_data->exit_admin_comms = adf_vf_void_noop; - hw_data->send_admin_init = adf_vf2pf_init; + hw_data->send_admin_init = adf_vf2pf_notify_init; hw_data->init_arb = adf_vf_int_noop; hw_data->exit_arb = adf_vf_void_noop; - hw_data->disable_iov = adf_vf2pf_shutdown; + hw_data->disable_iov = adf_vf2pf_notify_shutdown; hw_data->get_accel_mask = get_accel_mask; hw_data->get_ae_mask = get_ae_mask; hw_data->get_num_accels = get_num_accels; From 6ccf2fdb39e1cd484afb456151de216b94fdadcb Mon Sep 17 00:00:00 2001 From: Giovanni Cabiddu Date: Thu, 12 Aug 2021 21:21:28 +0100 Subject: [PATCH 0526/5344] crypto: qat - do not export adf_iov_putmsg() [ Upstream commit 645ae0af1840199086c33e4f841892ebee73f615 ] The function adf_iov_putmsg() is only used inside the intel_qat module therefore should not be exported. Remove EXPORT_SYMBOL for the function adf_iov_putmsg(). Signed-off-by: Giovanni Cabiddu Reviewed-by: Fiona Trahe Signed-off-by: Herbert Xu Signed-off-by: Sasha Levin --- drivers/crypto/qat/qat_common/adf_pf2vf_msg.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/crypto/qat/qat_common/adf_pf2vf_msg.c b/drivers/crypto/qat/qat_common/adf_pf2vf_msg.c index 9dab2cc11fdfa..c64481160b711 100644 --- a/drivers/crypto/qat/qat_common/adf_pf2vf_msg.c +++ b/drivers/crypto/qat/qat_common/adf_pf2vf_msg.c @@ -231,7 +231,6 @@ int adf_iov_putmsg(struct adf_accel_dev *accel_dev, u32 msg, u8 vf_nr) return ret; } -EXPORT_SYMBOL_GPL(adf_iov_putmsg); void adf_vf2pf_req_hndl(struct adf_accel_vf_info *vf_info) { From a46ca70dd3b305d79eccd3fa885f9d43d46eb12f Mon Sep 17 00:00:00 2001 From: Desmond Cheong Zhi Xi Date: Fri, 2 Jul 2021 17:18:31 +0800 Subject: [PATCH 0527/5344] fcntl: fix potential deadlock for &fasync_struct.fa_lock [ Upstream commit 2f488f698fda820f8e6fa0407630154eceb145d6 ] There is an existing lock hierarchy of &dev->event_lock --> &fasync_struct.fa_lock --> &f->f_owner.lock from the following call chain: input_inject_event(): spin_lock_irqsave(&dev->event_lock,...); input_handle_event(): input_pass_values(): input_to_handler(): evdev_events(): evdev_pass_values(): spin_lock(&client->buffer_lock); __pass_event(): kill_fasync(): kill_fasync_rcu(): read_lock(&fa->fa_lock); send_sigio(): read_lock_irqsave(&fown->lock,...); &dev->event_lock is HARDIRQ-safe, so interrupts have to be disabled while grabbing &fasync_struct.fa_lock, otherwise we invert the lock hierarchy. However, since kill_fasync which calls kill_fasync_rcu is an exported symbol, it may not necessarily be called with interrupts disabled. As kill_fasync_rcu may be called with interrupts disabled (for example, in the call chain above), we replace calls to read_lock/read_unlock on &fasync_struct.fa_lock in kill_fasync_rcu with read_lock_irqsave/read_unlock_irqrestore. Signed-off-by: Desmond Cheong Zhi Xi Signed-off-by: Jeff Layton Signed-off-by: Sasha Levin --- fs/fcntl.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/fcntl.c b/fs/fcntl.c index 1b9c6f8944b1f..09e473d11055c 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -993,13 +993,14 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band) { while (fa) { struct fown_struct *fown; + unsigned long flags; if (fa->magic != FASYNC_MAGIC) { printk(KERN_ERR "kill_fasync: bad magic number in " "fasync_struct!\n"); return; } - read_lock(&fa->fa_lock); + read_lock_irqsave(&fa->fa_lock, flags); if (fa->fa_file) { fown = &fa->fa_file->f_owner; /* Don't send SIGURG to processes which have not set a @@ -1008,7 +1009,7 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band) if (!(sig == SIGURG && fown->signum == 0)) send_sigio(fown, fa->fa_fd, band); } - read_unlock(&fa->fa_lock); + read_unlock_irqrestore(&fa->fa_lock, flags); fa = rcu_dereference(fa->fa_next); } } From 4d45edf030ea7032ef512c7aa16505d8b3ed24df Mon Sep 17 00:00:00 2001 From: Stian Skjelstad Date: Sun, 22 Aug 2021 11:33:32 +0200 Subject: [PATCH 0528/5344] udf_get_extendedattr() had no boundary checks. [ Upstream commit 58bc6d1be2f3b0ceecb6027dfa17513ec6aa2abb ] When parsing the ExtendedAttr data, malicous or corrupt attribute length could cause kernel hangs and buffer overruns in some special cases. Link: https://lore.kernel.org/r/20210822093332.25234-1-stian.skjelstad@gmail.com Signed-off-by: Stian Skjelstad Signed-off-by: Jan Kara Signed-off-by: Sasha Levin --- fs/udf/misc.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/udf/misc.c b/fs/udf/misc.c index 401e64cde1be0..853bcff51043f 100644 --- a/fs/udf/misc.c +++ b/fs/udf/misc.c @@ -173,13 +173,22 @@ struct genericFormat *udf_get_extendedattr(struct inode *inode, uint32_t type, else offset = le32_to_cpu(eahd->appAttrLocation); - while (offset < iinfo->i_lenEAttr) { + while (offset + sizeof(*gaf) < iinfo->i_lenEAttr) { + uint32_t attrLength; + gaf = (struct genericFormat *)&ea[offset]; + attrLength = le32_to_cpu(gaf->attrLength); + + /* Detect undersized elements and buffer overflows */ + if ((attrLength < sizeof(*gaf)) || + (attrLength > (iinfo->i_lenEAttr - offset))) + break; + if (le32_to_cpu(gaf->attrType) == type && gaf->attrSubtype == subtype) return gaf; else - offset += le32_to_cpu(gaf->attrLength); + offset += attrLength; } } From 1b1dbe9cc76f522b4c4fa39e91eb47fabc8403d6 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Fri, 6 Aug 2021 12:55:08 +0200 Subject: [PATCH 0529/5344] s390/kasan: fix large PMD pages address alignment check [ Upstream commit ddd63c85ef67ea9ea7282ad35eafb6568047126e ] It is currently possible to initialize a large PMD page when the address is not aligned on page boundary. Signed-off-by: Alexander Gordeev Reviewed-by: Vasily Gorbik Signed-off-by: Vasily Gorbik Signed-off-by: Heiko Carstens Signed-off-by: Sasha Levin --- arch/s390/mm/kasan_init.c | 41 +++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/arch/s390/mm/kasan_init.c b/arch/s390/mm/kasan_init.c index 460f255729402..5182e0836ca71 100644 --- a/arch/s390/mm/kasan_init.c +++ b/arch/s390/mm/kasan_init.c @@ -101,6 +101,9 @@ static void __init kasan_early_vmemmap_populate(unsigned long address, pgt_prot = pgprot_val(PAGE_KERNEL_EXEC); sgt_prot = pgprot_val(SEGMENT_KERNEL_EXEC); + /* + * The first 1MB of 1:1 mapping is mapped with 4KB pages + */ while (address < end) { pg_dir = pgd_offset_k(address); if (pgd_none(*pg_dir)) { @@ -146,30 +149,26 @@ static void __init kasan_early_vmemmap_populate(unsigned long address, pm_dir = pmd_offset(pu_dir, address); if (pmd_none(*pm_dir)) { - if (mode == POPULATE_ZERO_SHADOW && - IS_ALIGNED(address, PMD_SIZE) && + if (IS_ALIGNED(address, PMD_SIZE) && end - address >= PMD_SIZE) { - pmd_populate(&init_mm, pm_dir, - kasan_early_shadow_pte); - address = (address + PMD_SIZE) & PMD_MASK; - continue; - } - /* the first megabyte of 1:1 is mapped with 4k pages */ - if (has_edat && address && end - address >= PMD_SIZE && - mode != POPULATE_ZERO_SHADOW) { - void *page; - - if (mode == POPULATE_ONE2ONE) { - page = (void *)address; - } else { - page = kasan_early_alloc_segment(); - memset(page, 0, _SEGMENT_SIZE); + if (mode == POPULATE_ZERO_SHADOW) { + pmd_populate(&init_mm, pm_dir, kasan_early_shadow_pte); + address = (address + PMD_SIZE) & PMD_MASK; + continue; + } else if (has_edat && address) { + void *page; + + if (mode == POPULATE_ONE2ONE) { + page = (void *)address; + } else { + page = kasan_early_alloc_segment(); + memset(page, 0, _SEGMENT_SIZE); + } + pmd_val(*pm_dir) = __pa(page) | sgt_prot; + address = (address + PMD_SIZE) & PMD_MASK; + continue; } - pmd_val(*pm_dir) = __pa(page) | sgt_prot; - address = (address + PMD_SIZE) & PMD_MASK; - continue; } - pt_dir = kasan_early_pte_alloc(); pmd_populate(&init_mm, pm_dir, pt_dir); } else if (pmd_large(*pm_dir)) { From 5d35e2da5b137deac8a4ec988f84f2b53e9a7ad8 Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Fri, 13 Aug 2021 15:05:03 +0200 Subject: [PATCH 0530/5344] s390/debug: fix debug area life cycle [ Upstream commit 9372a82892c2caa6bccab9a4081166fa769699f8 ] Currently allocation and registration of s390dbf debug areas are tied together. As a result, a debug area cannot be unregistered and re-registered while any process has an associated debugfs file open. Fix this by splitting alloc/release from register/unregister. Signed-off-by: Peter Oberparleiter Signed-off-by: Heiko Carstens Signed-off-by: Sasha Levin --- arch/s390/kernel/debug.c | 102 +++++++++++++++++++++------------------ 1 file changed, 56 insertions(+), 46 deletions(-) diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index 7184d55d87aae..b1aadc3ad065d 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -327,24 +327,6 @@ static debug_info_t *debug_info_create(const char *name, int pages_per_area, goto out; rc->mode = mode & ~S_IFMT; - - /* create root directory */ - rc->debugfs_root_entry = debugfs_create_dir(rc->name, - debug_debugfs_root_entry); - - /* append new element to linked list */ - if (!debug_area_first) { - /* first element in list */ - debug_area_first = rc; - rc->prev = NULL; - } else { - /* append element to end of list */ - debug_area_last->next = rc; - rc->prev = debug_area_last; - } - debug_area_last = rc; - rc->next = NULL; - refcount_set(&rc->ref_count, 1); out: return rc; @@ -404,27 +386,10 @@ static void debug_info_get(debug_info_t *db_info) */ static void debug_info_put(debug_info_t *db_info) { - int i; - if (!db_info) return; - if (refcount_dec_and_test(&db_info->ref_count)) { - for (i = 0; i < DEBUG_MAX_VIEWS; i++) { - if (!db_info->views[i]) - continue; - debugfs_remove(db_info->debugfs_entries[i]); - } - debugfs_remove(db_info->debugfs_root_entry); - if (db_info == debug_area_first) - debug_area_first = db_info->next; - if (db_info == debug_area_last) - debug_area_last = db_info->prev; - if (db_info->prev) - db_info->prev->next = db_info->next; - if (db_info->next) - db_info->next->prev = db_info->prev; + if (refcount_dec_and_test(&db_info->ref_count)) debug_info_free(db_info); - } } /* @@ -648,6 +613,31 @@ static int debug_close(struct inode *inode, struct file *file) return 0; /* success */ } +/* Create debugfs entries and add to internal list. */ +static void _debug_register(debug_info_t *id) +{ + /* create root directory */ + id->debugfs_root_entry = debugfs_create_dir(id->name, + debug_debugfs_root_entry); + + /* append new element to linked list */ + if (!debug_area_first) { + /* first element in list */ + debug_area_first = id; + id->prev = NULL; + } else { + /* append element to end of list */ + debug_area_last->next = id; + id->prev = debug_area_last; + } + debug_area_last = id; + id->next = NULL; + + debug_register_view(id, &debug_level_view); + debug_register_view(id, &debug_flush_view); + debug_register_view(id, &debug_pages_view); +} + /** * debug_register_mode() - creates and initializes debug area. * @@ -677,19 +667,16 @@ debug_info_t *debug_register_mode(const char *name, int pages_per_area, if ((uid != 0) || (gid != 0)) pr_warn("Root becomes the owner of all s390dbf files in sysfs\n"); BUG_ON(!initialized); - mutex_lock(&debug_mutex); /* create new debug_info */ rc = debug_info_create(name, pages_per_area, nr_areas, buf_size, mode); - if (!rc) - goto out; - debug_register_view(rc, &debug_level_view); - debug_register_view(rc, &debug_flush_view); - debug_register_view(rc, &debug_pages_view); -out: - if (!rc) + if (rc) { + mutex_lock(&debug_mutex); + _debug_register(rc); + mutex_unlock(&debug_mutex); + } else { pr_err("Registering debug feature %s failed\n", name); - mutex_unlock(&debug_mutex); + } return rc; } EXPORT_SYMBOL(debug_register_mode); @@ -718,6 +705,27 @@ debug_info_t *debug_register(const char *name, int pages_per_area, } EXPORT_SYMBOL(debug_register); +/* Remove debugfs entries and remove from internal list. */ +static void _debug_unregister(debug_info_t *id) +{ + int i; + + for (i = 0; i < DEBUG_MAX_VIEWS; i++) { + if (!id->views[i]) + continue; + debugfs_remove(id->debugfs_entries[i]); + } + debugfs_remove(id->debugfs_root_entry); + if (id == debug_area_first) + debug_area_first = id->next; + if (id == debug_area_last) + debug_area_last = id->prev; + if (id->prev) + id->prev->next = id->next; + if (id->next) + id->next->prev = id->prev; +} + /** * debug_unregister() - give back debug area. * @@ -731,8 +739,10 @@ void debug_unregister(debug_info_t *id) if (!id) return; mutex_lock(&debug_mutex); - debug_info_put(id); + _debug_unregister(id); mutex_unlock(&debug_mutex); + + debug_info_put(id); } EXPORT_SYMBOL(debug_unregister); From 67c4273f5e393dfe74f7f2adc5b22bdf6936b48c Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Mon, 5 Jul 2021 23:47:27 +0300 Subject: [PATCH 0531/5344] m68k: emu: Fix invalid free in nfeth_cleanup() [ Upstream commit 761608f5cf70e8876c2f0e39ca54b516bdcb7c12 ] In the for loop all nfeth_dev array members should be freed, not only the first one. Freeing only the first array member can cause double-free bugs and memory leaks. Fixes: 9cd7b148312f ("m68k/atari: ARAnyM - Add support for network access") Signed-off-by: Pavel Skripkin Link: https://lore.kernel.org/r/20210705204727.10743-1-paskripkin@gmail.com Signed-off-by: Geert Uytterhoeven Signed-off-by: Sasha Levin --- arch/m68k/emu/nfeth.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/m68k/emu/nfeth.c b/arch/m68k/emu/nfeth.c index a4ebd2445edae..e5831cd293d05 100644 --- a/arch/m68k/emu/nfeth.c +++ b/arch/m68k/emu/nfeth.c @@ -254,8 +254,8 @@ static void __exit nfeth_cleanup(void) for (i = 0; i < MAX_UNIT; i++) { if (nfeth_dev[i]) { - unregister_netdev(nfeth_dev[0]); - free_netdev(nfeth_dev[0]); + unregister_netdev(nfeth_dev[i]); + free_netdev(nfeth_dev[i]); } } free_irq(nfEtherIRQ, nfeth_interrupt); From 63a6af49ff8a909214021631dbd4189d8addcdb3 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 5 Aug 2021 11:21:53 +0100 Subject: [PATCH 0532/5344] sched: Fix UCLAMP_FLAG_IDLE setting [ Upstream commit ca4984a7dd863f3e1c0df775ae3e744bff24c303 ] The UCLAMP_FLAG_IDLE flag is set on a runqueue when dequeueing the last uclamp active task (that is, when buckets.tasks reaches 0 for all buckets) to maintain the last uclamp.max and prevent blocked util from suddenly becoming visible. However, there is an asymmetry in how the flag is set and cleared which can lead to having the flag set whilst there are active tasks on the rq. Specifically, the flag is cleared in the uclamp_rq_inc() path, which is called at enqueue time, but set in uclamp_rq_dec_id() which is called both when dequeueing a task _and_ in the update_uclamp_active() path. As a result, when both uclamp_rq_{dec,ind}_id() are called from update_uclamp_active(), the flag ends up being set but not cleared, hence leaving the runqueue in a broken state. Fix this by clearing the flag in update_uclamp_active() as well. Fixes: e496187da710 ("sched/uclamp: Enforce last task's UCLAMP_MAX") Reported-by: Rick Yiu Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Qais Yousef Tested-by: Dietmar Eggemann Link: https://lore.kernel.org/r/20210805102154.590709-2-qperret@google.com Signed-off-by: Sasha Levin --- kernel/sched/core.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 332da08bc792a..3c132771cffcd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1114,6 +1114,23 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) uclamp_rq_dec_id(rq, p, clamp_id); } +static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p, + enum uclamp_id clamp_id) +{ + if (!p->uclamp[clamp_id].active) + return; + + uclamp_rq_dec_id(rq, p, clamp_id); + uclamp_rq_inc_id(rq, p, clamp_id); + + /* + * Make sure to clear the idle flag if we've transiently reached 0 + * active tasks on rq. + */ + if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE)) + rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE; +} + static inline void uclamp_update_active(struct task_struct *p) { @@ -1137,12 +1154,8 @@ uclamp_update_active(struct task_struct *p) * affecting a valid clamp bucket, the next time it's enqueued, * it will already see the updated clamp bucket value. */ - for_each_clamp_id(clamp_id) { - if (p->uclamp[clamp_id].active) { - uclamp_rq_dec_id(rq, p, clamp_id); - uclamp_rq_inc_id(rq, p, clamp_id); - } - } + for_each_clamp_id(clamp_id) + uclamp_rq_reinc_id(rq, p, clamp_id); task_rq_unlock(rq, p, &rf); } From 8cde129e627d13de78af12d70fec4cd4bf0e8460 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Tue, 10 Aug 2021 11:17:26 +0300 Subject: [PATCH 0533/5344] spi: spi-fsl-dspi: Fix issue with uninitialized dma_slave_config [ Upstream commit 209ab223ad5b18e437289235e3bde12593b94ac4 ] Depending on the DMA driver being used, the struct dma_slave_config may need to be initialized to zero for the unused data. For example, we have three DMA drivers using src_port_window_size and dst_port_window_size. If these are left uninitialized, it can cause DMA failures. For spi-fsl-dspi, this is probably not currently an issue but is still good to fix though. Fixes: 90ba37033cb9 ("spi: spi-fsl-dspi: Add DMA support for Vybrid") Cc: Sanchayan Maity Cc: Vladimir Oltean Cc: Peter Ujfalusi Cc: Vinod Koul Signed-off-by: Tony Lindgren Acked-by: Vladimir Oltean Link: https://lore.kernel.org/r/20210810081727.19491-1-tony@atomide.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- drivers/spi/spi-fsl-dspi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/spi/spi-fsl-dspi.c b/drivers/spi/spi-fsl-dspi.c index 40dccc580e866..3e0200618af30 100644 --- a/drivers/spi/spi-fsl-dspi.c +++ b/drivers/spi/spi-fsl-dspi.c @@ -423,6 +423,7 @@ static int dspi_request_dma(struct fsl_dspi *dspi, phys_addr_t phy_addr) goto err_rx_dma_buf; } + memset(&cfg, 0, sizeof(cfg)); cfg.src_addr = phy_addr + SPI_POPR; cfg.dst_addr = phy_addr + SPI_PUSHR; cfg.src_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES; From 25e4a8196702088346bd3d1ba704506e521c5d80 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Tue, 10 Aug 2021 11:17:27 +0300 Subject: [PATCH 0534/5344] spi: spi-pic32: Fix issue with uninitialized dma_slave_config [ Upstream commit 976c1de1de147bb7f4e0d87482f375221c05aeaf ] Depending on the DMA driver being used, the struct dma_slave_config may need to be initialized to zero for the unused data. For example, we have three DMA drivers using src_port_window_size and dst_port_window_size. If these are left uninitialized, it can cause DMA failures. For spi-pic32, this is probably not currently an issue but is still good to fix though. Fixes: 1bcb9f8ceb67 ("spi: spi-pic32: Add PIC32 SPI master driver") Cc: Purna Chandra Mandal Cc: Peter Ujfalusi Cc: Vinod Koul Signed-off-by: Tony Lindgren Link: https://lore.kernel.org/r/20210810081727.19491-2-tony@atomide.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- drivers/spi/spi-pic32.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/spi/spi-pic32.c b/drivers/spi/spi-pic32.c index 8272bde5d706f..b5268b0d7b4c8 100644 --- a/drivers/spi/spi-pic32.c +++ b/drivers/spi/spi-pic32.c @@ -361,6 +361,7 @@ static int pic32_spi_dma_config(struct pic32_spi *pic32s, u32 dma_width) struct dma_slave_config cfg; int ret; + memset(&cfg, 0, sizeof(cfg)); cfg.device_fc = true; cfg.src_addr = pic32s->dma_base + buf_offset; cfg.dst_addr = pic32s->dma_base + buf_offset; From 12319b62a40e6602b89bc0312d4e52fb38d2584b Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 11 Aug 2021 17:33:32 +0800 Subject: [PATCH 0535/5344] genirq/timings: Fix error return code in irq_timings_test_irqs() [ Upstream commit 290fdc4b7ef14e33d0e30058042b0e9bfd02b89b ] Return a negative error code from the error handling case instead of 0, as done elsewhere in this function. Fixes: f52da98d900e ("genirq/timings: Add selftest for irqs circular buffer") Reported-by: Hulk Robot Signed-off-by: Zhen Lei Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210811093333.2376-1-thunder.leizhen@huawei.com Signed-off-by: Sasha Levin --- kernel/irq/timings.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index b5985da80acf0..7ccc8edce46dc 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -799,12 +799,14 @@ static int __init irq_timings_test_irqs(struct timings_intervals *ti) __irq_timings_store(irq, irqs, ti->intervals[i]); if (irqs->circ_timings[i & IRQ_TIMINGS_MASK] != index) { + ret = -EBADSLT; pr_err("Failed to store in the circular buffer\n"); goto out; } } if (irqs->count != ti->count) { + ret = -ERANGE; pr_err("Count differs\n"); goto out; } From fb006592fd5b086a440dfc948c099008131981ca Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Thu, 5 Aug 2021 16:53:32 +0800 Subject: [PATCH 0536/5344] lib/mpi: use kcalloc in mpi_resize [ Upstream commit b6f756726e4dfe75be1883f6a0202dcecdc801ab ] We should set the additional space to 0 in mpi_resize(). So use kcalloc() instead of kmalloc_array(). In lib/mpi/ec.c: /**************** * Resize the array of A to NLIMBS. the additional space is cleared * (set to 0) [done by m_realloc()] */ int mpi_resize(MPI a, unsigned nlimbs) Like the comment of kernel's mpi_resize() said, the additional space need to be set to 0, but when a->d is not NULL, it does not set. The kernel's mpi lib is from libgcrypt, the mpi resize in libgcrypt is _gcry_mpi_resize() which set the additional space to 0. This bug may cause mpi api which use mpi_resize() get wrong result under the condition of using the additional space without initiation. If this condition is not met, the bug would not be triggered. Currently in kernel, rsa, sm2 and dh use mpi lib, and they works well, so the bug is not triggered in these cases. add_points_edwards() use the additional space directly, so it will get a wrong result. Fixes: cdec9cb5167a ("crypto: GnuPG based MPI lib - source files (part 1)") Signed-off-by: Hongbo Li Signed-off-by: Herbert Xu Signed-off-by: Sasha Levin --- lib/mpi/mpiutil.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/mpi/mpiutil.c b/lib/mpi/mpiutil.c index 20ed0f7667871..00825028cc847 100644 --- a/lib/mpi/mpiutil.c +++ b/lib/mpi/mpiutil.c @@ -91,7 +91,7 @@ int mpi_resize(MPI a, unsigned nlimbs) return 0; /* no need to do it */ if (a->d) { - p = kmalloc_array(nlimbs, sizeof(mpi_limb_t), GFP_KERNEL); + p = kcalloc(nlimbs, sizeof(mpi_limb_t), GFP_KERNEL); if (!p) return -ENOMEM; memcpy(p, a->d, a->alloced * sizeof(mpi_limb_t)); From 12e36b4d492b5c23c9b7d331ef20aa67686e0709 Mon Sep 17 00:00:00 2001 From: Phong Hoang Date: Thu, 22 Apr 2021 14:34:43 +0200 Subject: [PATCH 0537/5344] clocksource/drivers/sh_cmt: Fix wrong setting if don't request IRQ for clock source channel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit be83c3b6e7b8ff22f72827a613bf6f3aa5afadbb ] If CMT instance has at least two channels, one channel will be used as a clock source and another one used as a clock event device. In that case, IRQ is not requested for clock source channel so sh_cmt_clock_event_program_verify() might work incorrectly. Besides, when a channel is only used for clock source, don't need to re-set the next match_value since it should be maximum timeout as it still is. On the other hand, due to no IRQ, total_cycles is not counted up when reaches compare match time (timer counter resets to zero), so sh_cmt_clocksource_read() returns unexpected value. Therefore, use 64-bit clocksoure's mask for 32-bit or 16-bit variants will also lead to wrong delta calculation. Hence, this mask should correspond to timer counter width, and above function just returns the raw value of timer counter register. Fixes: bfa76bb12f23 ("clocksource: sh_cmt: Request IRQ for clock event device only") Fixes: 37e7742c55ba ("clocksource/drivers/sh_cmt: Fix clocksource width for 32-bit machines") Signed-off-by: Phong Hoang Signed-off-by: Niklas Söderlund Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210422123443.73334-1-niklas.soderlund+renesas@ragnatech.se Signed-off-by: Sasha Levin --- drivers/clocksource/sh_cmt.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c index ef773db080e90..a0570213170d8 100644 --- a/drivers/clocksource/sh_cmt.c +++ b/drivers/clocksource/sh_cmt.c @@ -568,7 +568,8 @@ static int sh_cmt_start(struct sh_cmt_channel *ch, unsigned long flag) ch->flags |= flag; /* setup timeout if no clockevent */ - if ((flag == FLAG_CLOCKSOURCE) && (!(ch->flags & FLAG_CLOCKEVENT))) + if (ch->cmt->num_channels == 1 && + flag == FLAG_CLOCKSOURCE && (!(ch->flags & FLAG_CLOCKEVENT))) __sh_cmt_set_next(ch, ch->max_match_value); out: raw_spin_unlock_irqrestore(&ch->lock, flags); @@ -604,20 +605,25 @@ static struct sh_cmt_channel *cs_to_sh_cmt(struct clocksource *cs) static u64 sh_cmt_clocksource_read(struct clocksource *cs) { struct sh_cmt_channel *ch = cs_to_sh_cmt(cs); - unsigned long flags; u32 has_wrapped; - u64 value; - u32 raw; - raw_spin_lock_irqsave(&ch->lock, flags); - value = ch->total_cycles; - raw = sh_cmt_get_counter(ch, &has_wrapped); + if (ch->cmt->num_channels == 1) { + unsigned long flags; + u64 value; + u32 raw; - if (unlikely(has_wrapped)) - raw += ch->match_value + 1; - raw_spin_unlock_irqrestore(&ch->lock, flags); + raw_spin_lock_irqsave(&ch->lock, flags); + value = ch->total_cycles; + raw = sh_cmt_get_counter(ch, &has_wrapped); + + if (unlikely(has_wrapped)) + raw += ch->match_value + 1; + raw_spin_unlock_irqrestore(&ch->lock, flags); + + return value + raw; + } - return value + raw; + return sh_cmt_get_counter(ch, &has_wrapped); } static int sh_cmt_clocksource_enable(struct clocksource *cs) @@ -680,7 +686,7 @@ static int sh_cmt_register_clocksource(struct sh_cmt_channel *ch, cs->disable = sh_cmt_clocksource_disable; cs->suspend = sh_cmt_clocksource_suspend; cs->resume = sh_cmt_clocksource_resume; - cs->mask = CLOCKSOURCE_MASK(sizeof(u64) * 8); + cs->mask = CLOCKSOURCE_MASK(ch->cmt->info->width); cs->flags = CLOCK_SOURCE_IS_CONTINUOUS; dev_info(&ch->cmt->pdev->dev, "ch%u: used as clock source\n", From d4fcbabccfe7a7ba44c99ef2f7138aa1f9b1dec9 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Thu, 12 Aug 2021 12:15:01 +0300 Subject: [PATCH 0538/5344] block: nbd: add sanity check for first_minor [ Upstream commit b1a811633f7321cf1ae2bb76a66805b7720e44c9 ] Syzbot hit WARNING in internal_create_group(). The problem was in too big disk->first_minor. disk->first_minor is initialized by value, which comes from userspace and there wasn't any sanity checks about value correctness. It can cause duplicate creation of sysfs files/links, because disk->first_minor will be passed to MKDEV() which causes truncation to byte. Since maximum minor value is 0xff, let's check if first_minor is correct minor number. NOTE: the root case of the reported warning was in wrong error handling in register_disk(), but we can avoid passing knowingly wrong values to sysfs API, because sysfs error messages can confuse users. For example: user passed 1048576 as index, but sysfs complains about duplicate creation of /dev/block/43:0. It's not obvious how 1048576 becomes 0. Log and reproducer for above example can be found on syzkaller bug report page. Link: https://syzkaller.appspot.com/bug?id=03c2ae9146416edf811958d5fd7acfab75b143d1 Fixes: b0d9111a2d53 ("nbd: use an idr to keep track of nbd devices") Reported-by: syzbot+9937dc42271cd87d4b98@syzkaller.appspotmail.com Reviewed-by: Christoph Hellwig Signed-off-by: Pavel Skripkin Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- drivers/block/nbd.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index bdc750c5d6dda..bc83c75596d47 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1786,7 +1786,17 @@ static int nbd_dev_add(int index) refcount_set(&nbd->refs, 1); INIT_LIST_HEAD(&nbd->list); disk->major = NBD_MAJOR; + + /* Too big first_minor can cause duplicate creation of + * sysfs files/links, since first_minor will be truncated to + * byte in __device_add_disk(). + */ disk->first_minor = index << part_shift; + if (disk->first_minor > 0xff) { + err = -EINVAL; + goto out_free_idr; + } + disk->fops = &nbd_fops; disk->private_data = nbd; sprintf(disk->disk_name, "nbd%d", index); From ff7896c240292008680dc044217c27d8466f0c9b Mon Sep 17 00:00:00 2001 From: Giovanni Cabiddu Date: Thu, 12 Aug 2021 21:21:10 +0100 Subject: [PATCH 0539/5344] crypto: qat - use proper type for vf_mask [ Upstream commit 462354d986b6a89c6449b85f17aaacf44e455216 ] Replace vf_mask type with unsigned long to avoid a stack-out-of-bound. This is to fix the following warning reported by KASAN the first time adf_msix_isr_ae() gets called. [ 692.091987] BUG: KASAN: stack-out-of-bounds in find_first_bit+0x28/0x50 [ 692.092017] Read of size 8 at addr ffff88afdf789e60 by task swapper/32/0 [ 692.092076] Call Trace: [ 692.092089] [ 692.092101] dump_stack+0x9c/0xcf [ 692.092132] print_address_description.constprop.0+0x18/0x130 [ 692.092164] ? find_first_bit+0x28/0x50 [ 692.092185] kasan_report.cold+0x7f/0x111 [ 692.092213] ? static_obj+0x10/0x80 [ 692.092234] ? find_first_bit+0x28/0x50 [ 692.092262] find_first_bit+0x28/0x50 [ 692.092288] adf_msix_isr_ae+0x16e/0x230 [intel_qat] Fixes: ed8ccaef52fa ("crypto: qat - Add support for SRIOV") Signed-off-by: Giovanni Cabiddu Reviewed-by: Marco Chiappero Reviewed-by: Fiona Trahe Signed-off-by: Herbert Xu Signed-off-by: Sasha Levin --- drivers/crypto/qat/qat_common/adf_isr.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/crypto/qat/qat_common/adf_isr.c b/drivers/crypto/qat/qat_common/adf_isr.c index 4898ef41fd9fd..7d319c5c071c4 100644 --- a/drivers/crypto/qat/qat_common/adf_isr.c +++ b/drivers/crypto/qat/qat_common/adf_isr.c @@ -59,6 +59,8 @@ #include "adf_transport_access_macros.h" #include "adf_transport_internal.h" +#define ADF_MAX_NUM_VFS 32 + static int adf_enable_msix(struct adf_accel_dev *accel_dev) { struct adf_accel_pci *pci_dev_info = &accel_dev->accel_pci_dev; @@ -111,7 +113,7 @@ static irqreturn_t adf_msix_isr_ae(int irq, void *dev_ptr) struct adf_bar *pmisc = &GET_BARS(accel_dev)[hw_data->get_misc_bar_id(hw_data)]; void __iomem *pmisc_bar_addr = pmisc->virt_addr; - u32 vf_mask; + unsigned long vf_mask; /* Get the interrupt sources triggered by VFs */ vf_mask = ((ADF_CSR_RD(pmisc_bar_addr, ADF_ERRSOU5) & @@ -132,8 +134,7 @@ static irqreturn_t adf_msix_isr_ae(int irq, void *dev_ptr) * unless the VF is malicious and is attempting to * flood the host OS with VF2PF interrupts. */ - for_each_set_bit(i, (const unsigned long *)&vf_mask, - (sizeof(vf_mask) * BITS_PER_BYTE)) { + for_each_set_bit(i, &vf_mask, ADF_MAX_NUM_VFS) { vf_info = accel_dev->pf.vf_info + i; if (!__ratelimit(&vf_info->vf2pf_ratelimit)) { From 275a5bfd0771c98815d933bdcb0d2f1b6ef44d9d Mon Sep 17 00:00:00 2001 From: Stefan Berger Date: Tue, 29 Jun 2021 17:34:20 -0400 Subject: [PATCH 0540/5344] certs: Trigger creation of RSA module signing key if it's not an RSA key [ Upstream commit ea35e0d5df6c92fa2e124bb1b91d09b2240715ba ] Address a kbuild issue where a developer created an ECDSA key for signing kernel modules and then builds an older version of the kernel, when bi- secting the kernel for example, that does not support ECDSA keys. If openssl is installed, trigger the creation of an RSA module signing key if it is not an RSA key. Fixes: cfc411e7fff3 ("Move certificate handling to its own directory") Cc: David Howells Cc: David Woodhouse Signed-off-by: Stefan Berger Reviewed-by: Jarkko Sakkinen Tested-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen Signed-off-by: Sasha Levin --- certs/Makefile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/certs/Makefile b/certs/Makefile index f4b90bad8690a..2baef6fba029e 100644 --- a/certs/Makefile +++ b/certs/Makefile @@ -46,11 +46,19 @@ endif redirect_openssl = 2>&1 quiet_redirect_openssl = 2>&1 silent_redirect_openssl = 2>/dev/null +openssl_available = $(shell openssl help 2>/dev/null && echo yes) # We do it this way rather than having a boolean option for enabling an # external private key, because 'make randconfig' might enable such a # boolean option and we unfortunately can't make it depend on !RANDCONFIG. ifeq ($(CONFIG_MODULE_SIG_KEY),"certs/signing_key.pem") + +ifeq ($(openssl_available),yes) +X509TEXT=$(shell openssl x509 -in "certs/signing_key.pem" -text 2>/dev/null) + +$(if $(findstring rsaEncryption,$(X509TEXT)),,$(shell rm -f "certs/signing_key.pem")) +endif + $(obj)/signing_key.pem: $(obj)/x509.genkey @$(kecho) "###" @$(kecho) "### Now generating an X.509 key pair to be used for signing modules." From bf54a27ca9717bf6ac5ffad7694151b3bb3d7e09 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Wed, 25 Aug 2021 11:37:03 +0800 Subject: [PATCH 0541/5344] regulator: vctrl: Use locked regulator_get_voltage in probe path [ Upstream commit 98e47570ba985f2310586c80409238200fa3170f ] In commit e9153311491d ("regulator: vctrl-regulator: Avoid deadlock getting and setting the voltage"), all calls to get/set the voltage of the control regulator were switched to unlocked versions to avoid deadlocks. However, the call in the probe path is done without regulator locks held. In this case the locked version should be used. Switch back to the locked regulator_get_voltage() in the probe path to avoid any mishaps. Fixes: e9153311491d ("regulator: vctrl-regulator: Avoid deadlock getting and setting the voltage") Signed-off-by: Chen-Yu Tsai Link: https://lore.kernel.org/r/20210825033704.3307263-2-wenst@chromium.org Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- drivers/regulator/vctrl-regulator.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/regulator/vctrl-regulator.c b/drivers/regulator/vctrl-regulator.c index cbadb1c996790..93d33201ffe0a 100644 --- a/drivers/regulator/vctrl-regulator.c +++ b/drivers/regulator/vctrl-regulator.c @@ -490,7 +490,8 @@ static int vctrl_probe(struct platform_device *pdev) if (ret) return ret; - ctrl_uV = regulator_get_voltage_rdev(vctrl->ctrl_reg->rdev); + /* Use locked consumer API when not in regulator framework */ + ctrl_uV = regulator_get_voltage(vctrl->ctrl_reg); if (ctrl_uV < 0) { dev_err(&pdev->dev, "failed to get control voltage\n"); return ctrl_uV; From 7872149c46f5662b32ed39bc4aa8057aff28d481 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Wed, 25 Aug 2021 11:37:04 +0800 Subject: [PATCH 0542/5344] regulator: vctrl: Avoid lockdep warning in enable/disable ops [ Upstream commit 21e39809fd7c4b8ff3662f23e0168e87594c8ca8 ] vctrl_enable() and vctrl_disable() call regulator_enable() and regulator_disable(), respectively. However, vctrl_* are regulator ops and should not be calling the locked regulator APIs. Doing so results in a lockdep warning. Instead of exporting more internal regulator ops, model the ctrl supply as an actual supply to vctrl-regulator. At probe time this driver still needs to use the consumer API to fetch its constraints, but otherwise lets the regulator core handle the upstream supply for it. The enable/disable/is_enabled ops are not removed, but now only track state internally. This preserves the original behavior with the ops being available, but one could argue that the original behavior was already incorrect: the internal state would not match the upstream supply if that supply had another consumer that enabled the supply, while vctrl-regulator was not enabled. The lockdep warning is as follows: WARNING: possible circular locking dependency detected 5.14.0-rc6 #2 Not tainted ------------------------------------------------------ swapper/0/1 is trying to acquire lock: ffffffc011306d00 (regulator_list_mutex){+.+.}-{3:3}, at: regulator_lock_dependent (arch/arm64/include/asm/current.h:19 include/linux/ww_mutex.h:111 drivers/regulator/core.c:329) but task is already holding lock: ffffff8004a77160 (regulator_ww_class_mutex){+.+.}-{3:3}, at: regulator_lock_recursive (drivers/regulator/core.c:156 drivers/regulator/core.c:263) which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (regulator_ww_class_mutex){+.+.}-{3:3}: __mutex_lock_common (include/asm-generic/atomic-instrumented.h:606 include/asm-generic/atomic-long.h:29 kernel/locking/mutex.c:103 kernel/locking/mutex.c:144 kernel/locking/mutex.c:963) ww_mutex_lock (kernel/locking/mutex.c:1199) regulator_lock_recursive (drivers/regulator/core.c:156 drivers/regulator/core.c:263) regulator_lock_dependent (drivers/regulator/core.c:343) regulator_enable (drivers/regulator/core.c:2808) set_machine_constraints (drivers/regulator/core.c:1536) regulator_register (drivers/regulator/core.c:5486) devm_regulator_register (drivers/regulator/devres.c:196) reg_fixed_voltage_probe (drivers/regulator/fixed.c:289) platform_probe (drivers/base/platform.c:1427) [...] -> #1 (regulator_ww_class_acquire){+.+.}-{0:0}: regulator_lock_dependent (include/linux/ww_mutex.h:129 drivers/regulator/core.c:329) regulator_enable (drivers/regulator/core.c:2808) set_machine_constraints (drivers/regulator/core.c:1536) regulator_register (drivers/regulator/core.c:5486) devm_regulator_register (drivers/regulator/devres.c:196) reg_fixed_voltage_probe (drivers/regulator/fixed.c:289) [...] -> #0 (regulator_list_mutex){+.+.}-{3:3}: __lock_acquire (kernel/locking/lockdep.c:3052 (discriminator 4) kernel/locking/lockdep.c:3174 (discriminator 4) kernel/locking/lockdep.c:3789 (discriminator 4) kernel/locking/lockdep.c:5015 (discriminator 4)) lock_acquire (arch/arm64/include/asm/percpu.h:39 kernel/locking/lockdep.c:438 kernel/locking/lockdep.c:5627) __mutex_lock_common (include/asm-generic/atomic-instrumented.h:606 include/asm-generic/atomic-long.h:29 kernel/locking/mutex.c:103 kernel/locking/mutex.c:144 kernel/locking/mutex.c:963) mutex_lock_nested (kernel/locking/mutex.c:1125) regulator_lock_dependent (arch/arm64/include/asm/current.h:19 include/linux/ww_mutex.h:111 drivers/regulator/core.c:329) regulator_enable (drivers/regulator/core.c:2808) vctrl_enable (drivers/regulator/vctrl-regulator.c:400) _regulator_do_enable (drivers/regulator/core.c:2617) _regulator_enable (drivers/regulator/core.c:2764) regulator_enable (drivers/regulator/core.c:308 drivers/regulator/core.c:2809) _set_opp (drivers/opp/core.c:819 drivers/opp/core.c:1072) dev_pm_opp_set_rate (drivers/opp/core.c:1164) set_target (drivers/cpufreq/cpufreq-dt.c:62) __cpufreq_driver_target (drivers/cpufreq/cpufreq.c:2216 drivers/cpufreq/cpufreq.c:2271) cpufreq_online (drivers/cpufreq/cpufreq.c:1488 (discriminator 2)) cpufreq_add_dev (drivers/cpufreq/cpufreq.c:1563) subsys_interface_register (drivers/base/bus.c:?) cpufreq_register_driver (drivers/cpufreq/cpufreq.c:2819) dt_cpufreq_probe (drivers/cpufreq/cpufreq-dt.c:344) [...] other info that might help us debug this: Chain exists of: regulator_list_mutex --> regulator_ww_class_acquire --> regulator_ww_class_mutex Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(regulator_ww_class_mutex); lock(regulator_ww_class_acquire); lock(regulator_ww_class_mutex); lock(regulator_list_mutex); *** DEADLOCK *** 6 locks held by swapper/0/1: #0: ffffff8002d32188 (&dev->mutex){....}-{3:3}, at: __device_driver_lock (drivers/base/dd.c:1030) #1: ffffffc0111a0520 (cpu_hotplug_lock){++++}-{0:0}, at: cpufreq_register_driver (drivers/cpufreq/cpufreq.c:2792 (discriminator 2)) #2: ffffff8002a8d918 (subsys mutex#9){+.+.}-{3:3}, at: subsys_interface_register (drivers/base/bus.c:1033) #3: ffffff800341bb90 (&policy->rwsem){+.+.}-{3:3}, at: cpufreq_online (include/linux/bitmap.h:285 include/linux/cpumask.h:405 drivers/cpufreq/cpufreq.c:1399) #4: ffffffc011f0b7b8 (regulator_ww_class_acquire){+.+.}-{0:0}, at: regulator_enable (drivers/regulator/core.c:2808) #5: ffffff8004a77160 (regulator_ww_class_mutex){+.+.}-{3:3}, at: regulator_lock_recursive (drivers/regulator/core.c:156 drivers/regulator/core.c:263) stack backtrace: CPU: 1 PID: 1 Comm: swapper/0 Not tainted 5.14.0-rc6 #2 7c8f8996d021ed0f65271e6aeebf7999de74a9fa Hardware name: Google Scarlet (DT) Call trace: dump_backtrace (arch/arm64/kernel/stacktrace.c:161) show_stack (arch/arm64/kernel/stacktrace.c:218) dump_stack_lvl (lib/dump_stack.c:106 (discriminator 2)) dump_stack (lib/dump_stack.c:113) print_circular_bug (kernel/locking/lockdep.c:?) check_noncircular (kernel/locking/lockdep.c:?) __lock_acquire (kernel/locking/lockdep.c:3052 (discriminator 4) kernel/locking/lockdep.c:3174 (discriminator 4) kernel/locking/lockdep.c:3789 (discriminator 4) kernel/locking/lockdep.c:5015 (discriminator 4)) lock_acquire (arch/arm64/include/asm/percpu.h:39 kernel/locking/lockdep.c:438 kernel/locking/lockdep.c:5627) __mutex_lock_common (include/asm-generic/atomic-instrumented.h:606 include/asm-generic/atomic-long.h:29 kernel/locking/mutex.c:103 kernel/locking/mutex.c:144 kernel/locking/mutex.c:963) mutex_lock_nested (kernel/locking/mutex.c:1125) regulator_lock_dependent (arch/arm64/include/asm/current.h:19 include/linux/ww_mutex.h:111 drivers/regulator/core.c:329) regulator_enable (drivers/regulator/core.c:2808) vctrl_enable (drivers/regulator/vctrl-regulator.c:400) _regulator_do_enable (drivers/regulator/core.c:2617) _regulator_enable (drivers/regulator/core.c:2764) regulator_enable (drivers/regulator/core.c:308 drivers/regulator/core.c:2809) _set_opp (drivers/opp/core.c:819 drivers/opp/core.c:1072) dev_pm_opp_set_rate (drivers/opp/core.c:1164) set_target (drivers/cpufreq/cpufreq-dt.c:62) __cpufreq_driver_target (drivers/cpufreq/cpufreq.c:2216 drivers/cpufreq/cpufreq.c:2271) cpufreq_online (drivers/cpufreq/cpufreq.c:1488 (discriminator 2)) cpufreq_add_dev (drivers/cpufreq/cpufreq.c:1563) subsys_interface_register (drivers/base/bus.c:?) cpufreq_register_driver (drivers/cpufreq/cpufreq.c:2819) dt_cpufreq_probe (drivers/cpufreq/cpufreq-dt.c:344) [...] Reported-by: Brian Norris Fixes: f8702f9e4aa7 ("regulator: core: Use ww_mutex for regulators locking") Fixes: e9153311491d ("regulator: vctrl-regulator: Avoid deadlock getting and setting the voltage") Signed-off-by: Chen-Yu Tsai Link: https://lore.kernel.org/r/20210825033704.3307263-3-wenst@chromium.org Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- drivers/regulator/vctrl-regulator.c | 72 +++++++++++++++++------------ 1 file changed, 42 insertions(+), 30 deletions(-) diff --git a/drivers/regulator/vctrl-regulator.c b/drivers/regulator/vctrl-regulator.c index 93d33201ffe0a..d2a37978fc3a8 100644 --- a/drivers/regulator/vctrl-regulator.c +++ b/drivers/regulator/vctrl-regulator.c @@ -37,7 +37,6 @@ struct vctrl_voltage_table { struct vctrl_data { struct regulator_dev *rdev; struct regulator_desc desc; - struct regulator *ctrl_reg; bool enabled; unsigned int min_slew_down_rate; unsigned int ovp_threshold; @@ -82,7 +81,12 @@ static int vctrl_calc_output_voltage(struct vctrl_data *vctrl, int ctrl_uV) static int vctrl_get_voltage(struct regulator_dev *rdev) { struct vctrl_data *vctrl = rdev_get_drvdata(rdev); - int ctrl_uV = regulator_get_voltage_rdev(vctrl->ctrl_reg->rdev); + int ctrl_uV; + + if (!rdev->supply) + return -EPROBE_DEFER; + + ctrl_uV = regulator_get_voltage_rdev(rdev->supply->rdev); return vctrl_calc_output_voltage(vctrl, ctrl_uV); } @@ -92,14 +96,19 @@ static int vctrl_set_voltage(struct regulator_dev *rdev, unsigned int *selector) { struct vctrl_data *vctrl = rdev_get_drvdata(rdev); - struct regulator *ctrl_reg = vctrl->ctrl_reg; - int orig_ctrl_uV = regulator_get_voltage_rdev(ctrl_reg->rdev); - int uV = vctrl_calc_output_voltage(vctrl, orig_ctrl_uV); + int orig_ctrl_uV; + int uV; int ret; + if (!rdev->supply) + return -EPROBE_DEFER; + + orig_ctrl_uV = regulator_get_voltage_rdev(rdev->supply->rdev); + uV = vctrl_calc_output_voltage(vctrl, orig_ctrl_uV); + if (req_min_uV >= uV || !vctrl->ovp_threshold) /* voltage rising or no OVP */ - return regulator_set_voltage_rdev(ctrl_reg->rdev, + return regulator_set_voltage_rdev(rdev->supply->rdev, vctrl_calc_ctrl_voltage(vctrl, req_min_uV), vctrl_calc_ctrl_voltage(vctrl, req_max_uV), PM_SUSPEND_ON); @@ -117,7 +126,7 @@ static int vctrl_set_voltage(struct regulator_dev *rdev, next_uV = max_t(int, req_min_uV, uV - max_drop_uV); next_ctrl_uV = vctrl_calc_ctrl_voltage(vctrl, next_uV); - ret = regulator_set_voltage_rdev(ctrl_reg->rdev, + ret = regulator_set_voltage_rdev(rdev->supply->rdev, next_ctrl_uV, next_ctrl_uV, PM_SUSPEND_ON); @@ -134,7 +143,7 @@ static int vctrl_set_voltage(struct regulator_dev *rdev, err: /* Try to go back to original voltage */ - regulator_set_voltage_rdev(ctrl_reg->rdev, orig_ctrl_uV, orig_ctrl_uV, + regulator_set_voltage_rdev(rdev->supply->rdev, orig_ctrl_uV, orig_ctrl_uV, PM_SUSPEND_ON); return ret; @@ -151,16 +160,18 @@ static int vctrl_set_voltage_sel(struct regulator_dev *rdev, unsigned int selector) { struct vctrl_data *vctrl = rdev_get_drvdata(rdev); - struct regulator *ctrl_reg = vctrl->ctrl_reg; unsigned int orig_sel = vctrl->sel; int ret; + if (!rdev->supply) + return -EPROBE_DEFER; + if (selector >= rdev->desc->n_voltages) return -EINVAL; if (selector >= vctrl->sel || !vctrl->ovp_threshold) { /* voltage rising or no OVP */ - ret = regulator_set_voltage_rdev(ctrl_reg->rdev, + ret = regulator_set_voltage_rdev(rdev->supply->rdev, vctrl->vtable[selector].ctrl, vctrl->vtable[selector].ctrl, PM_SUSPEND_ON); @@ -179,7 +190,7 @@ static int vctrl_set_voltage_sel(struct regulator_dev *rdev, else next_sel = vctrl->vtable[vctrl->sel].ovp_min_sel; - ret = regulator_set_voltage_rdev(ctrl_reg->rdev, + ret = regulator_set_voltage_rdev(rdev->supply->rdev, vctrl->vtable[next_sel].ctrl, vctrl->vtable[next_sel].ctrl, PM_SUSPEND_ON); @@ -202,7 +213,7 @@ static int vctrl_set_voltage_sel(struct regulator_dev *rdev, err: if (vctrl->sel != orig_sel) { /* Try to go back to original voltage */ - if (!regulator_set_voltage_rdev(ctrl_reg->rdev, + if (!regulator_set_voltage_rdev(rdev->supply->rdev, vctrl->vtable[orig_sel].ctrl, vctrl->vtable[orig_sel].ctrl, PM_SUSPEND_ON)) @@ -234,10 +245,6 @@ static int vctrl_parse_dt(struct platform_device *pdev, u32 pval; u32 vrange_ctrl[2]; - vctrl->ctrl_reg = devm_regulator_get(&pdev->dev, "ctrl"); - if (IS_ERR(vctrl->ctrl_reg)) - return PTR_ERR(vctrl->ctrl_reg); - ret = of_property_read_u32(np, "ovp-threshold-percent", &pval); if (!ret) { vctrl->ovp_threshold = pval; @@ -315,11 +322,11 @@ static int vctrl_cmp_ctrl_uV(const void *a, const void *b) return at->ctrl - bt->ctrl; } -static int vctrl_init_vtable(struct platform_device *pdev) +static int vctrl_init_vtable(struct platform_device *pdev, + struct regulator *ctrl_reg) { struct vctrl_data *vctrl = platform_get_drvdata(pdev); struct regulator_desc *rdesc = &vctrl->desc; - struct regulator *ctrl_reg = vctrl->ctrl_reg; struct vctrl_voltage_range *vrange_ctrl = &vctrl->vrange.ctrl; int n_voltages; int ctrl_uV; @@ -395,23 +402,19 @@ static int vctrl_init_vtable(struct platform_device *pdev) static int vctrl_enable(struct regulator_dev *rdev) { struct vctrl_data *vctrl = rdev_get_drvdata(rdev); - int ret = regulator_enable(vctrl->ctrl_reg); - if (!ret) - vctrl->enabled = true; + vctrl->enabled = true; - return ret; + return 0; } static int vctrl_disable(struct regulator_dev *rdev) { struct vctrl_data *vctrl = rdev_get_drvdata(rdev); - int ret = regulator_disable(vctrl->ctrl_reg); - if (!ret) - vctrl->enabled = false; + vctrl->enabled = false; - return ret; + return 0; } static int vctrl_is_enabled(struct regulator_dev *rdev) @@ -447,6 +450,7 @@ static int vctrl_probe(struct platform_device *pdev) struct regulator_desc *rdesc; struct regulator_config cfg = { }; struct vctrl_voltage_range *vrange_ctrl; + struct regulator *ctrl_reg; int ctrl_uV; int ret; @@ -461,15 +465,20 @@ static int vctrl_probe(struct platform_device *pdev) if (ret) return ret; + ctrl_reg = devm_regulator_get(&pdev->dev, "ctrl"); + if (IS_ERR(ctrl_reg)) + return PTR_ERR(ctrl_reg); + vrange_ctrl = &vctrl->vrange.ctrl; rdesc = &vctrl->desc; rdesc->name = "vctrl"; rdesc->type = REGULATOR_VOLTAGE; rdesc->owner = THIS_MODULE; + rdesc->supply_name = "ctrl"; - if ((regulator_get_linear_step(vctrl->ctrl_reg) == 1) || - (regulator_count_voltages(vctrl->ctrl_reg) == -EINVAL)) { + if ((regulator_get_linear_step(ctrl_reg) == 1) || + (regulator_count_voltages(ctrl_reg) == -EINVAL)) { rdesc->continuous_voltage_range = true; rdesc->ops = &vctrl_ops_cont; } else { @@ -486,12 +495,12 @@ static int vctrl_probe(struct platform_device *pdev) cfg.init_data = init_data; if (!rdesc->continuous_voltage_range) { - ret = vctrl_init_vtable(pdev); + ret = vctrl_init_vtable(pdev, ctrl_reg); if (ret) return ret; /* Use locked consumer API when not in regulator framework */ - ctrl_uV = regulator_get_voltage(vctrl->ctrl_reg); + ctrl_uV = regulator_get_voltage(ctrl_reg); if (ctrl_uV < 0) { dev_err(&pdev->dev, "failed to get control voltage\n"); return ctrl_uV; @@ -514,6 +523,9 @@ static int vctrl_probe(struct platform_device *pdev) } } + /* Drop ctrl-supply here in favor of regulator core managed supply */ + devm_regulator_put(ctrl_reg); + vctrl->rdev = devm_regulator_register(&pdev->dev, rdesc, &cfg); if (IS_ERR(vctrl->rdev)) { ret = PTR_ERR(vctrl->rdev); From d1d85d645008bc2b1a2d49ac2465565d94b80b76 Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Thu, 26 Aug 2021 17:15:46 +0800 Subject: [PATCH 0543/5344] spi: sprd: Fix the wrong WDG_LOAD_VAL [ Upstream commit 245ca2cc212bb2a078332ec99afbfbb202f44c2d ] Use 50ms as default timeout value and the time clock is 32768HZ. The original value of WDG_LOAD_VAL is not correct, so this patch fixes it. Fixes: ac1775012058 ("spi: sprd: Add the support of restarting the system") Signed-off-by: Chunyan Zhang Link: https://lore.kernel.org/r/20210826091549.2138125-2-zhang.lyra@gmail.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- drivers/spi/spi-sprd-adi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-sprd-adi.c b/drivers/spi/spi-sprd-adi.c index 09f983524d51b..e804a3854c351 100644 --- a/drivers/spi/spi-sprd-adi.c +++ b/drivers/spi/spi-sprd-adi.c @@ -102,7 +102,7 @@ #define HWRST_STATUS_WATCHDOG 0xf0 /* Use default timeout 50 ms that converts to watchdog values */ -#define WDG_LOAD_VAL ((50 * 1000) / 32768) +#define WDG_LOAD_VAL ((50 * 32768) / 1000) #define WDG_LOAD_MASK GENMASK(15, 0) #define WDG_UNLOCK_KEY 0xe551 From 482a7781c93b8025900f07c6b385a7900afc36f6 Mon Sep 17 00:00:00 2001 From: Quanyang Wang Date: Thu, 26 Aug 2021 08:59:30 +0800 Subject: [PATCH 0544/5344] spi: spi-zynq-qspi: use wait_for_completion_timeout to make zynq_qspi_exec_mem_op not interruptible [ Upstream commit 26cfc0dbe43aae60dc03af27077775244f26c167 ] The function wait_for_completion_interruptible_timeout will return -ERESTARTSYS immediately when receiving SIGKILL signal which is sent by "jffs2_gcd_mtd" during umounting jffs2. This will break the SPI memory operation because the data transmitting may begin before the command or address transmitting completes. Use wait_for_completion_timeout to prevent the process from being interruptible. Fixes: 67dca5e580f1 ("spi: spi-mem: Add support for Zynq QSPI controller") Signed-off-by: Quanyang Wang Link: https://lore.kernel.org/r/20210826005930.20572-1-quanyang.wang@windriver.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- drivers/spi/spi-zynq-qspi.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/spi/spi-zynq-qspi.c b/drivers/spi/spi-zynq-qspi.c index 5cf6993ddce57..1ced6eb8b3303 100644 --- a/drivers/spi/spi-zynq-qspi.c +++ b/drivers/spi/spi-zynq-qspi.c @@ -533,7 +533,7 @@ static int zynq_qspi_exec_mem_op(struct spi_mem *mem, zynq_qspi_write_op(xqspi, ZYNQ_QSPI_FIFO_DEPTH, true); zynq_qspi_write(xqspi, ZYNQ_QSPI_IEN_OFFSET, ZYNQ_QSPI_IXR_RXTX_MASK); - if (!wait_for_completion_interruptible_timeout(&xqspi->data_completion, + if (!wait_for_completion_timeout(&xqspi->data_completion, msecs_to_jiffies(1000))) err = -ETIMEDOUT; } @@ -551,7 +551,7 @@ static int zynq_qspi_exec_mem_op(struct spi_mem *mem, zynq_qspi_write_op(xqspi, ZYNQ_QSPI_FIFO_DEPTH, true); zynq_qspi_write(xqspi, ZYNQ_QSPI_IEN_OFFSET, ZYNQ_QSPI_IXR_RXTX_MASK); - if (!wait_for_completion_interruptible_timeout(&xqspi->data_completion, + if (!wait_for_completion_timeout(&xqspi->data_completion, msecs_to_jiffies(1000))) err = -ETIMEDOUT; } @@ -567,7 +567,7 @@ static int zynq_qspi_exec_mem_op(struct spi_mem *mem, zynq_qspi_write_op(xqspi, ZYNQ_QSPI_FIFO_DEPTH, true); zynq_qspi_write(xqspi, ZYNQ_QSPI_IEN_OFFSET, ZYNQ_QSPI_IXR_RXTX_MASK); - if (!wait_for_completion_interruptible_timeout(&xqspi->data_completion, + if (!wait_for_completion_timeout(&xqspi->data_completion, msecs_to_jiffies(1000))) err = -ETIMEDOUT; @@ -591,7 +591,7 @@ static int zynq_qspi_exec_mem_op(struct spi_mem *mem, zynq_qspi_write_op(xqspi, ZYNQ_QSPI_FIFO_DEPTH, true); zynq_qspi_write(xqspi, ZYNQ_QSPI_IEN_OFFSET, ZYNQ_QSPI_IXR_RXTX_MASK); - if (!wait_for_completion_interruptible_timeout(&xqspi->data_completion, + if (!wait_for_completion_timeout(&xqspi->data_completion, msecs_to_jiffies(1000))) err = -ETIMEDOUT; } From 407e86cbae46d8cbff8aaca67e74c0ab9e2ec92d Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 8 Jun 2021 14:38:56 +0000 Subject: [PATCH 0545/5344] drm/panfrost: Fix missing clk_disable_unprepare() on error in panfrost_clk_init() [ Upstream commit f42498705965bd4b026953c1892c686d8b1138e4 ] Fix the missing clk_disable_unprepare() before return from panfrost_clk_init() in the error handling case. Fixes: b681af0bc1cc ("drm: panfrost: add optional bus_clock") Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Reviewed-by: Steven Price Signed-off-by: Steven Price Link: https://patchwork.freedesktop.org/patch/msgid/20210608143856.4154766-1-weiyongjun1@huawei.com Signed-off-by: Sasha Levin --- drivers/gpu/drm/panfrost/panfrost_device.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/panfrost/panfrost_device.c b/drivers/gpu/drm/panfrost/panfrost_device.c index 238fb6d54df47..413bf314a2bc3 100644 --- a/drivers/gpu/drm/panfrost/panfrost_device.c +++ b/drivers/gpu/drm/panfrost/panfrost_device.c @@ -59,7 +59,8 @@ static int panfrost_clk_init(struct panfrost_device *pfdev) if (IS_ERR(pfdev->bus_clock)) { dev_err(pfdev->dev, "get bus_clock failed %ld\n", PTR_ERR(pfdev->bus_clock)); - return PTR_ERR(pfdev->bus_clock); + err = PTR_ERR(pfdev->bus_clock); + goto disable_clock; } if (pfdev->bus_clock) { From c4f3f3e301874fd0adf62febf390c1bddc72b872 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Ha=C5=82asa?= Date: Wed, 16 Jun 2021 07:13:55 +0200 Subject: [PATCH 0546/5344] media: TDA1997x: enable EDID support [ Upstream commit ea3e1c36e38810427485f06c2becc1f29e54521d ] Without this patch, the TDA19971 chip's EDID is inactive. EDID never worked with this driver, it was all tested with HDMI signal sources which don't need EDID support. Signed-off-by: Krzysztof Halasa Fixes: 9ac0038db9a7 ("media: i2c: Add TDA1997x HDMI receiver driver") Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/i2c/tda1997x.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/media/i2c/tda1997x.c b/drivers/media/i2c/tda1997x.c index e43d8327b8103..1088161498df0 100644 --- a/drivers/media/i2c/tda1997x.c +++ b/drivers/media/i2c/tda1997x.c @@ -2233,6 +2233,7 @@ static int tda1997x_core_init(struct v4l2_subdev *sd) /* get initial HDMI status */ state->hdmi_status = io_read(sd, REG_HDMI_FLAGS); + io_write(sd, REG_EDID_ENABLE, EDID_ENABLE_A_EN | EDID_ENABLE_B_EN); return 0; } From e105f8246415151e2b8b32bd46d17713193c6b9d Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 8 Feb 2021 15:38:55 +0100 Subject: [PATCH 0547/5344] soc: rockchip: ROCKCHIP_GRF should not default to y, unconditionally [ Upstream commit 2a1c55d4762dd34a8b0f2e36fb01b7b16b60735b ] Merely enabling CONFIG_COMPILE_TEST should not enable additional code. To fix this, restrict the automatic enabling of ROCKCHIP_GRF to ARCH_ROCKCHIP, and ask the user in case of compile-testing. Fixes: 4c58063d4258f6be ("soc: rockchip: add driver handling grf setup") Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20210208143855.418374-1-geert+renesas@glider.be Signed-off-by: Heiko Stuebner Signed-off-by: Sasha Levin --- drivers/soc/rockchip/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/soc/rockchip/Kconfig b/drivers/soc/rockchip/Kconfig index b71b73bf5fc5c..785990720479c 100644 --- a/drivers/soc/rockchip/Kconfig +++ b/drivers/soc/rockchip/Kconfig @@ -6,8 +6,8 @@ if ARCH_ROCKCHIP || COMPILE_TEST # config ROCKCHIP_GRF - bool - default y + bool "Rockchip General Register Files support" if COMPILE_TEST + default y if ARCH_ROCKCHIP help The General Register Files are a central component providing special additional settings registers for a lot of soc-components. From 55005f40626af82290fd4a4139c7eecc17bcba13 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 10 Jun 2021 21:54:31 +0200 Subject: [PATCH 0548/5344] media: cxd2880-spi: Fix an error handling path [ Upstream commit dcb0145821017e929a733e2271c85c6f82b9c9f8 ] If an error occurs after a successful 'regulator_enable()' call, 'regulator_disable()' must be called. Fix the error handling path of the probe accordingly. Fixes: cb496cd472af ("media: cxd2880-spi: Add optional vcc regulator") Signed-off-by: Christophe JAILLET Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/spi/cxd2880-spi.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/media/spi/cxd2880-spi.c b/drivers/media/spi/cxd2880-spi.c index 4077217777f92..93194f03764d2 100644 --- a/drivers/media/spi/cxd2880-spi.c +++ b/drivers/media/spi/cxd2880-spi.c @@ -524,13 +524,13 @@ cxd2880_spi_probe(struct spi_device *spi) if (IS_ERR(dvb_spi->vcc_supply)) { if (PTR_ERR(dvb_spi->vcc_supply) == -EPROBE_DEFER) { ret = -EPROBE_DEFER; - goto fail_adapter; + goto fail_regulator; } dvb_spi->vcc_supply = NULL; } else { ret = regulator_enable(dvb_spi->vcc_supply); if (ret) - goto fail_adapter; + goto fail_regulator; } dvb_spi->spi = spi; @@ -618,6 +618,9 @@ cxd2880_spi_probe(struct spi_device *spi) fail_attach: dvb_unregister_adapter(&dvb_spi->adapter); fail_adapter: + if (!dvb_spi->vcc_supply) + regulator_disable(dvb_spi->vcc_supply); +fail_regulator: kfree(dvb_spi); return ret; } From 6fd32c2e7622a416c99c365582ecaab4cd2e070b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 14 Jul 2021 21:43:17 +0900 Subject: [PATCH 0549/5344] bpf: Fix a typo of reuseport map in bpf.h. [ Upstream commit f170acda7ffaf0473d06e1e17c12cd9fd63904f5 ] Fix s/BPF_MAP_TYPE_REUSEPORT_ARRAY/BPF_MAP_TYPE_REUSEPORT_SOCKARRAY/ typo in bpf.h. Fixes: 2dbb9b9e6df6 ("bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210714124317.67526-1-kuniyu@amazon.co.jp Signed-off-by: Sasha Levin --- include/uapi/linux/bpf.h | 2 +- tools/include/uapi/linux/bpf.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f41386fc9041f..107eb71d17250 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2291,7 +2291,7 @@ union bpf_attr { * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) * Description * Select a **SO_REUSEPORT** socket from a - * **BPF_MAP_TYPE_REUSEPORT_ARRAY** *map*. + * **BPF_MAP_TYPE_REUSEPORT_SOCKARRAY** *map*. * It checks the selected socket is matching the incoming * request in the socket buffer. * Return diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f41386fc9041f..107eb71d17250 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2291,7 +2291,7 @@ union bpf_attr { * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) * Description * Select a **SO_REUSEPORT** socket from a - * **BPF_MAP_TYPE_REUSEPORT_ARRAY** *map*. + * **BPF_MAP_TYPE_REUSEPORT_SOCKARRAY** *map*. * It checks the selected socket is matching the incoming * request in the socket buffer. * Return From 75192f18c50d782bd955757d7572875d8b8a8296 Mon Sep 17 00:00:00 2001 From: Dylan Hung Date: Thu, 29 Oct 2020 14:27:23 +0800 Subject: [PATCH 0550/5344] ARM: dts: aspeed-g6: Fix HVI3C function-group in pinctrl dtsi [ Upstream commit 8c295b7f3d01359ff4336fcb6e406e6ed37957d6 ] The HVI3C shall be a group of I3C function, not an independent function. Correct the function name from "HVI3C" to "I3C". Signed-off-by: Dylan Hung Reviewed-by: Andrew Jeffery Fixes: f510f04c8c83 ("ARM: dts: aspeed: Add AST2600 pinmux nodes") Link: https://lore.kernel.org/r/20201029062723.20798-1-dylan_hung@aspeedtech.com Signed-off-by: Joel Stanley Signed-off-by: Sasha Levin --- arch/arm/boot/dts/aspeed-g6-pinctrl.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/aspeed-g6-pinctrl.dtsi b/arch/arm/boot/dts/aspeed-g6-pinctrl.dtsi index 5b8bf58e89cb4..996e006e06c25 100644 --- a/arch/arm/boot/dts/aspeed-g6-pinctrl.dtsi +++ b/arch/arm/boot/dts/aspeed-g6-pinctrl.dtsi @@ -208,12 +208,12 @@ }; pinctrl_hvi3c3_default: hvi3c3_default { - function = "HVI3C3"; + function = "I3C3"; groups = "HVI3C3"; }; pinctrl_hvi3c4_default: hvi3c4_default { - function = "HVI3C4"; + function = "I3C4"; groups = "HVI3C4"; }; From f3fab0ddeb15ed9e2123c2e19f91b6fb3b5be5ce Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 1 Jul 2021 12:15:50 +0200 Subject: [PATCH 0551/5344] arm64: dts: renesas: r8a77995: draak: Remove bogus adv7511w properties [ Upstream commit 4ec82a7bb3db8c6005e715c63224c32d458917a2 ] The "max-clock" and "min-vrefresh" properties fail to validate with commit cfe34bb7a770c5d8 ("dt-bindings: drm: bridge: adi,adv7511.txt: convert to yaml"). Drop them, as they are parts of an out-of-tree workaround that is not needed upstream. Fixes: bcf3003438ea4645 ("arm64: dts: renesas: r8a77995: draak: Enable HDMI display output") Signed-off-by: Geert Uytterhoeven Acked-by: Laurent Pinchart Reviewed-by: Ulrich Hecht Link: https://lore.kernel.org/r/975b6686bc423421b147d367fe7fb9a0db99c5af.1625134398.git.geert+renesas@glider.be Signed-off-by: Sasha Levin --- arch/arm64/boot/dts/renesas/r8a77995-draak.dts | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/arm64/boot/dts/renesas/r8a77995-draak.dts b/arch/arm64/boot/dts/renesas/r8a77995-draak.dts index 67634cb01d6b6..cbdd46ed3ca63 100644 --- a/arch/arm64/boot/dts/renesas/r8a77995-draak.dts +++ b/arch/arm64/boot/dts/renesas/r8a77995-draak.dts @@ -277,10 +277,6 @@ interrupt-parent = <&gpio1>; interrupts = <28 IRQ_TYPE_LEVEL_LOW>; - /* Depends on LVDS */ - max-clock = <135000000>; - min-vrefresh = <50>; - adi,input-depth = <8>; adi,input-colorspace = "rgb"; adi,input-clock = "1x"; From 73fd86b6d93a5f7e3910c92023fd3ac5cc1c3806 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Fri, 2 Jul 2021 17:54:15 -0700 Subject: [PATCH 0552/5344] soc: qcom: rpmhpd: Use corner in power_off [ Upstream commit d43b3a989bc8c06fd4bbb69a7500d180db2d68e8 ] rpmhpd_aggregate_corner() takes a corner as parameter, but in rpmhpd_power_off() the code requests the level of the first corner instead. In all (known) current cases the first corner has level 0, so this change should be a nop, but in case that there's a power domain with a non-zero lowest level this makes sure that rpmhpd_power_off() actually requests the lowest level - which is the closest to "power off" we can get. While touching the code, also skip the unnecessary zero-initialization of "ret". Fixes: 279b7e8a62cc ("soc: qcom: rpmhpd: Add RPMh power domain driver") Reviewed-by: Rajendra Nayak Reviewed-by: Stephen Boyd Reviewed-by: Sibi Sankar Tested-by: Sibi Sankar Link: https://lore.kernel.org/r/20210703005416.2668319-2-bjorn.andersson@linaro.org Signed-off-by: Bjorn Andersson Signed-off-by: Sasha Levin --- drivers/soc/qcom/rpmhpd.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/soc/qcom/rpmhpd.c b/drivers/soc/qcom/rpmhpd.c index 51850cc68b701..aa24237a78405 100644 --- a/drivers/soc/qcom/rpmhpd.c +++ b/drivers/soc/qcom/rpmhpd.c @@ -235,12 +235,11 @@ static int rpmhpd_power_on(struct generic_pm_domain *domain) static int rpmhpd_power_off(struct generic_pm_domain *domain) { struct rpmhpd *pd = domain_to_rpmhpd(domain); - int ret = 0; + int ret; mutex_lock(&rpmhpd_lock); - ret = rpmhpd_aggregate_corner(pd, pd->level[0]); - + ret = rpmhpd_aggregate_corner(pd, 0); if (!ret) pd->enabled = false; From ec1af4ef1c20224ef06c2b859b233fbe2e0e19e9 Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Fri, 25 Jun 2021 07:33:27 +0200 Subject: [PATCH 0553/5344] media: dvb-usb: fix uninit-value in dvb_usb_adapter_dvb_init [ Upstream commit c5453769f77ce19a5b03f1f49946fd3f8a374009 ] If dibusb_read_eeprom_byte fails, the mac address is not initialized. And nova_t_read_mac_address does not handle this failure, which leads to the uninit-value in dvb_usb_adapter_dvb_init. Fix this by handling the failure of dibusb_read_eeprom_byte. Reported-by: syzbot+e27b4fd589762b0b9329@syzkaller.appspotmail.com Fixes: 786baecfe78f ("[media] dvb-usb: move it to drivers/media/usb/dvb-usb") Signed-off-by: Dongliang Mu Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/usb/dvb-usb/nova-t-usb2.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/media/usb/dvb-usb/nova-t-usb2.c b/drivers/media/usb/dvb-usb/nova-t-usb2.c index e368935a50894..c16d4f1624952 100644 --- a/drivers/media/usb/dvb-usb/nova-t-usb2.c +++ b/drivers/media/usb/dvb-usb/nova-t-usb2.c @@ -130,7 +130,7 @@ static int nova_t_rc_query(struct dvb_usb_device *d, u32 *event, int *state) static int nova_t_read_mac_address (struct dvb_usb_device *d, u8 mac[6]) { - int i; + int i, ret; u8 b; mac[0] = 0x00; @@ -139,7 +139,9 @@ static int nova_t_read_mac_address (struct dvb_usb_device *d, u8 mac[6]) /* this is a complete guess, but works for my box */ for (i = 136; i < 139; i++) { - dibusb_read_eeprom_byte(d,i, &b); + ret = dibusb_read_eeprom_byte(d, i, &b); + if (ret) + return ret; mac[5 - (i - 136)] = b; } From 34aae55c40e39415ca018ff45379a516f0d20ac4 Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Fri, 25 Jun 2021 07:59:04 +0200 Subject: [PATCH 0554/5344] media: dvb-usb: fix uninit-value in vp702x_read_mac_addr [ Upstream commit 797c061ad715a9a1480eb73f44b6939fbe3209ed ] If vp702x_usb_in_op fails, the mac address is not initialized. And vp702x_read_mac_addr does not handle this failure, which leads to the uninit-value in dvb_usb_adapter_dvb_init. Fix this by handling the failure of vp702x_usb_in_op. Fixes: 786baecfe78f ("[media] dvb-usb: move it to drivers/media/usb/dvb-usb") Signed-off-by: Dongliang Mu Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/usb/dvb-usb/vp702x.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/media/usb/dvb-usb/vp702x.c b/drivers/media/usb/dvb-usb/vp702x.c index 381b5c898a076..b7ee972455e5b 100644 --- a/drivers/media/usb/dvb-usb/vp702x.c +++ b/drivers/media/usb/dvb-usb/vp702x.c @@ -291,16 +291,22 @@ static int vp702x_rc_query(struct dvb_usb_device *d, u32 *event, int *state) static int vp702x_read_mac_addr(struct dvb_usb_device *d,u8 mac[6]) { u8 i, *buf; + int ret; struct vp702x_device_state *st = d->priv; mutex_lock(&st->buf_mutex); buf = st->buf; - for (i = 6; i < 12; i++) - vp702x_usb_in_op(d, READ_EEPROM_REQ, i, 1, &buf[i - 6], 1); + for (i = 6; i < 12; i++) { + ret = vp702x_usb_in_op(d, READ_EEPROM_REQ, i, 1, + &buf[i - 6], 1); + if (ret < 0) + goto err; + } memcpy(mac, buf, 6); +err: mutex_unlock(&st->buf_mutex); - return 0; + return ret; } static int vp702x_frontend_attach(struct dvb_usb_adapter *adap) From d2251fbfbf548d97b5dacabbcdc7840fa64ca9a4 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Sun, 20 Jun 2021 21:45:42 +0200 Subject: [PATCH 0555/5344] media: go7007: remove redundant initialization [ Upstream commit 6f5885a7750545973bf1a942d2f0f129aef0aa06 ] In go7007_alloc() kzalloc() is used for struct go7007 allocation. It means that there is no need in zeroing any members, because kzalloc will take care of it. Removing these reduntant initialization steps increases execution speed a lot: Before: + 86.802 us | go7007_alloc(); After: + 29.595 us | go7007_alloc(); Fixes: 866b8695d67e8 ("Staging: add the go7007 video driver") Signed-off-by: Pavel Skripkin Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/usb/go7007/go7007-driver.c | 26 ------------------------ 1 file changed, 26 deletions(-) diff --git a/drivers/media/usb/go7007/go7007-driver.c b/drivers/media/usb/go7007/go7007-driver.c index 153a0c3e3da64..b9302d77d6c83 100644 --- a/drivers/media/usb/go7007/go7007-driver.c +++ b/drivers/media/usb/go7007/go7007-driver.c @@ -691,49 +691,23 @@ struct go7007 *go7007_alloc(const struct go7007_board_info *board, struct device *dev) { struct go7007 *go; - int i; go = kzalloc(sizeof(struct go7007), GFP_KERNEL); if (go == NULL) return NULL; go->dev = dev; go->board_info = board; - go->board_id = 0; go->tuner_type = -1; - go->channel_number = 0; - go->name[0] = 0; mutex_init(&go->hw_lock); init_waitqueue_head(&go->frame_waitq); spin_lock_init(&go->spinlock); go->status = STATUS_INIT; - memset(&go->i2c_adapter, 0, sizeof(go->i2c_adapter)); - go->i2c_adapter_online = 0; - go->interrupt_available = 0; init_waitqueue_head(&go->interrupt_waitq); - go->input = 0; go7007_update_board(go); - go->encoder_h_halve = 0; - go->encoder_v_halve = 0; - go->encoder_subsample = 0; go->format = V4L2_PIX_FMT_MJPEG; go->bitrate = 1500000; go->fps_scale = 1; - go->pali = 0; go->aspect_ratio = GO7007_RATIO_1_1; - go->gop_size = 0; - go->ipb = 0; - go->closed_gop = 0; - go->repeat_seqhead = 0; - go->seq_header_enable = 0; - go->gop_header_enable = 0; - go->dvd_mode = 0; - go->interlace_coding = 0; - for (i = 0; i < 4; ++i) - go->modet[i].enable = 0; - for (i = 0; i < 1624; ++i) - go->modet_map[i] = 0; - go->audio_deliver = NULL; - go->audio_enabled = 0; return go; } From 7cdb7785872ba6d09e021dab8a327b9678e0614f Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Mon, 19 Jul 2021 16:57:08 +0200 Subject: [PATCH 0556/5344] media: coda: fix frame_mem_ctrl for YUV420 and YVU420 formats [ Upstream commit 44693d74f5653f82cd7ca0fe730eed0f6b83306a ] The frame memory control register value is currently determined before userspace selects the final capture format and never corrected. Update ctx->frame_mem_ctrl in __coda_start_decoding() to fix decoding into YUV420 or YVU420 capture buffers. Reported-by: Andrej Picej Fixes: 497e6b8559a6 ("media: coda: add sequence initialization work") Signed-off-by: Philipp Zabel Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/platform/coda/coda-bit.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/media/platform/coda/coda-bit.c b/drivers/media/platform/coda/coda-bit.c index 00c7bed3dd572..e6b68be09f8f0 100644 --- a/drivers/media/platform/coda/coda-bit.c +++ b/drivers/media/platform/coda/coda-bit.c @@ -2023,17 +2023,25 @@ static int __coda_start_decoding(struct coda_ctx *ctx) u32 src_fourcc, dst_fourcc; int ret; + q_data_src = get_q_data(ctx, V4L2_BUF_TYPE_VIDEO_OUTPUT); + q_data_dst = get_q_data(ctx, V4L2_BUF_TYPE_VIDEO_CAPTURE); + src_fourcc = q_data_src->fourcc; + dst_fourcc = q_data_dst->fourcc; + if (!ctx->initialized) { ret = __coda_decoder_seq_init(ctx); if (ret < 0) return ret; + } else { + ctx->frame_mem_ctrl &= ~(CODA_FRAME_CHROMA_INTERLEAVE | (0x3 << 9) | + CODA9_FRAME_TILED2LINEAR); + if (dst_fourcc == V4L2_PIX_FMT_NV12 || dst_fourcc == V4L2_PIX_FMT_YUYV) + ctx->frame_mem_ctrl |= CODA_FRAME_CHROMA_INTERLEAVE; + if (ctx->tiled_map_type == GDI_TILED_FRAME_MB_RASTER_MAP) + ctx->frame_mem_ctrl |= (0x3 << 9) | + ((ctx->use_vdoa) ? 0 : CODA9_FRAME_TILED2LINEAR); } - q_data_src = get_q_data(ctx, V4L2_BUF_TYPE_VIDEO_OUTPUT); - q_data_dst = get_q_data(ctx, V4L2_BUF_TYPE_VIDEO_CAPTURE); - src_fourcc = q_data_src->fourcc; - dst_fourcc = q_data_dst->fourcc; - coda_write(dev, ctx->parabuf.paddr, CODA_REG_BIT_PARA_BUF_ADDR); ret = coda_alloc_framebuffers(ctx, q_data_dst, src_fourcc); From bfcfc00d1f148b9d244067d4227c35a363a7c0eb Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 25 Jun 2021 18:00:09 +0300 Subject: [PATCH 0557/5344] Bluetooth: sco: prevent information leak in sco_conn_defer_accept() [ Upstream commit 59da0b38bc2ea570ede23a3332ecb3e7574ce6b2 ] Smatch complains that some of these struct members are not initialized leading to a stack information disclosure: net/bluetooth/sco.c:778 sco_conn_defer_accept() warn: check that 'cp.retrans_effort' doesn't leak information This seems like a valid warning. I've added a default case to fix this issue. Fixes: 2f69a82acf6f ("Bluetooth: Use voice setting in deferred SCO connection request") Signed-off-by: Dan Carpenter Signed-off-by: Marcel Holtmann Signed-off-by: Sasha Levin --- net/bluetooth/sco.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index b91d6b440fdf6..335264706aaeb 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -761,6 +761,11 @@ static void sco_conn_defer_accept(struct hci_conn *conn, u16 setting) cp.max_latency = cpu_to_le16(0xffff); cp.retrans_effort = 0xff; break; + default: + /* use CVSD settings as fallback */ + cp.max_latency = cpu_to_le16(0xffff); + cp.retrans_effort = 0xff; + break; } hci_send_cmd(hdev, HCI_OP_ACCEPT_SYNC_CONN_REQ, From dfaf347523d95897bf17cebc2c6dd56b402df2dc Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 12 Jul 2021 13:14:40 +0100 Subject: [PATCH 0558/5344] 6lowpan: iphc: Fix an off-by-one check of array index [ Upstream commit 9af417610b6142e826fd1ee8ba7ff3e9a2133a5a ] The bounds check of id is off-by-one and the comparison should be >= rather >. Currently the WARN_ON_ONCE check does not stop the out of range indexing of &ldev->ctx.table[id] so also add a return path if the bounds are out of range. Addresses-Coverity: ("Illegal address computation"). Fixes: 5609c185f24d ("6lowpan: iphc: add support for stateful compression") Signed-off-by: Colin Ian King Signed-off-by: Marcel Holtmann Signed-off-by: Sasha Levin --- net/6lowpan/debugfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/6lowpan/debugfs.c b/net/6lowpan/debugfs.c index 1c140af06d527..600b9563bfc53 100644 --- a/net/6lowpan/debugfs.c +++ b/net/6lowpan/debugfs.c @@ -170,7 +170,8 @@ static void lowpan_dev_debugfs_ctx_init(struct net_device *dev, struct dentry *root; char buf[32]; - WARN_ON_ONCE(id > LOWPAN_IPHC_CTX_TABLE_SIZE); + if (WARN_ON_ONCE(id >= LOWPAN_IPHC_CTX_TABLE_SIZE)) + return; sprintf(buf, "%d", id); From ad3104c93fc3c393d70c7d6a1586cd8bca9a8843 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 13 Jan 2020 22:39:22 +0100 Subject: [PATCH 0559/5344] netns: protect netns ID lookups with RCU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 2dce224f469f060b9998a5a869151ef83c08ce77 upstream. __peernet2id() can be protected by RCU as it only calls idr_for_each(), which is RCU-safe, and never modifies the nsid table. rtnl_net_dumpid() can also do lockless lookups. It does two nested idr_for_each() calls on nsid tables (one direct call and one indirect call because of rtnl_net_dumpid_one() calling __peernet2id()). The netnsid tables are never updated. Therefore it is safe to not take the nsid_lock and run within an RCU-critical section instead. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman Signed-off-by: Håkon Bugge --- net/core/net_namespace.c | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index c303873496a34..9bf15512601bf 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -211,9 +211,9 @@ static int net_eq_idr(int id, void *net, void *peer) return 0; } -/* Should be called with nsid_lock held. If a new id is assigned, the bool alloc - * is set to true, thus the caller knows that the new id must be notified via - * rtnl. +/* Must be called from RCU-critical section or with nsid_lock held. If + * a new id is assigned, the bool alloc is set to true, thus the + * caller knows that the new id must be notified via rtnl. */ static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc) { @@ -237,7 +237,7 @@ static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc) return NETNSA_NSID_NOT_ASSIGNED; } -/* should be called with nsid_lock held */ +/* Must be called from RCU-critical section or with nsid_lock held */ static int __peernet2id(struct net *net, struct net *peer) { bool no = false; @@ -281,9 +281,10 @@ int peernet2id(struct net *net, struct net *peer) { int id; - spin_lock_bh(&net->nsid_lock); + rcu_read_lock(); id = __peernet2id(net, peer); - spin_unlock_bh(&net->nsid_lock); + rcu_read_unlock(); + return id; } EXPORT_SYMBOL(peernet2id); @@ -962,6 +963,7 @@ struct rtnl_net_dump_cb { int s_idx; }; +/* Runs in RCU-critical section. */ static int rtnl_net_dumpid_one(int id, void *peer, void *data) { struct rtnl_net_dump_cb *net_cb = (struct rtnl_net_dump_cb *)data; @@ -1046,19 +1048,9 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) goto end; } - spin_lock_bh(&net_cb.tgt_net->nsid_lock); - if (net_cb.fillargs.add_ref && - !net_eq(net_cb.ref_net, net_cb.tgt_net) && - !spin_trylock_bh(&net_cb.ref_net->nsid_lock)) { - spin_unlock_bh(&net_cb.tgt_net->nsid_lock); - err = -EAGAIN; - goto end; - } + rcu_read_lock(); idr_for_each(&net_cb.tgt_net->netns_ids, rtnl_net_dumpid_one, &net_cb); - if (net_cb.fillargs.add_ref && - !net_eq(net_cb.ref_net, net_cb.tgt_net)) - spin_unlock_bh(&net_cb.ref_net->nsid_lock); - spin_unlock_bh(&net_cb.tgt_net->nsid_lock); + rcu_read_unlock(); cb->args[0] = net_cb.idx; end: From 246a8e4ba378f5c9f0ec870d1d0d4804be471f9d Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Wed, 21 Jul 2021 01:22:15 +0800 Subject: [PATCH 0560/5344] drm/amdgpu/acp: Make PM domain really work [ Upstream commit aff890288de2d818e4f83ec40c9315e2d735df07 ] Devices created by mfd_add_hotplug_devices() don't really increase the index of its name, so get_mfd_cell_dev() cannot find any device, hence a NULL dev is passed to pm_genpd_add_device(): [ 56.974926] (NULL device *): amdgpu: device acp_audio_dma.0.auto added to pm domain [ 56.974933] (NULL device *): amdgpu: Failed to add dev to genpd [ 56.974941] [drm:amdgpu_device_ip_init [amdgpu]] *ERROR* hw_init of IP block failed -22 [ 56.975810] amdgpu 0000:00:01.0: amdgpu: amdgpu_device_ip_init failed [ 56.975839] amdgpu 0000:00:01.0: amdgpu: Fatal error during GPU init [ 56.977136] ------------[ cut here ]------------ [ 56.977143] kernel BUG at mm/slub.c:4206! [ 56.977158] invalid opcode: 0000 [#1] SMP NOPTI [ 56.977167] CPU: 1 PID: 1648 Comm: modprobe Not tainted 5.12.0-051200rc8-generic #202104182230 [ 56.977175] Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./FM2A68M-HD+, BIOS P5.20 02/13/2019 [ 56.977180] RIP: 0010:kfree+0x3bf/0x410 [ 56.977195] Code: 89 e7 48 d3 e2 f7 da e8 5f 0d 02 00 80 e7 02 75 3e 44 89 ee 4c 89 e7 e8 ef 5f fd ff e9 fa fe ff ff 49 8b 44 24 08 a8 01 75 b7 <0f> 0b 4c 8b 4d b0 48 8b 4d a8 48 89 da 4c 89 e6 41 b8 01 00 00 00 [ 56.977202] RSP: 0018:ffffa48640ff79f0 EFLAGS: 00010246 [ 56.977210] RAX: 0000000000000000 RBX: ffff9286127d5608 RCX: 0000000000000000 [ 56.977215] RDX: 0000000000000000 RSI: ffffffffc099d0fb RDI: ffff9286127d5608 [ 56.977220] RBP: ffffa48640ff7a48 R08: 0000000000000001 R09: 0000000000000001 [ 56.977224] R10: 0000000000000000 R11: ffff9286087d8458 R12: fffff3ae0449f540 [ 56.977229] R13: 0000000000000000 R14: dead000000000122 R15: dead000000000100 [ 56.977234] FS: 00007f9de5929540(0000) GS:ffff928612e80000(0000) knlGS:0000000000000000 [ 56.977240] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 56.977245] CR2: 00007f697dd97160 CR3: 00000001110f0000 CR4: 00000000001506e0 [ 56.977251] Call Trace: [ 56.977261] amdgpu_dm_encoder_destroy+0x1b/0x30 [amdgpu] [ 56.978056] drm_mode_config_cleanup+0x4f/0x2e0 [drm] [ 56.978147] ? kfree+0x3dd/0x410 [ 56.978157] ? drm_managed_release+0xc8/0x100 [drm] [ 56.978232] drm_mode_config_init_release+0xe/0x10 [drm] [ 56.978311] drm_managed_release+0x9d/0x100 [drm] [ 56.978388] devm_drm_dev_init_release+0x4d/0x70 [drm] [ 56.978450] devm_action_release+0x15/0x20 [ 56.978459] release_nodes+0x77/0xc0 [ 56.978469] devres_release_all+0x3f/0x50 [ 56.978477] really_probe+0x245/0x460 [ 56.978485] driver_probe_device+0xe9/0x160 [ 56.978492] device_driver_attach+0xab/0xb0 [ 56.978499] __driver_attach+0x8f/0x150 [ 56.978506] ? device_driver_attach+0xb0/0xb0 [ 56.978513] bus_for_each_dev+0x7e/0xc0 [ 56.978521] driver_attach+0x1e/0x20 [ 56.978528] bus_add_driver+0x135/0x1f0 [ 56.978534] driver_register+0x91/0xf0 [ 56.978540] __pci_register_driver+0x54/0x60 [ 56.978549] amdgpu_init+0x77/0x1000 [amdgpu] [ 56.979246] ? 0xffffffffc0dbc000 [ 56.979254] do_one_initcall+0x48/0x1d0 [ 56.979265] ? kmem_cache_alloc_trace+0x120/0x230 [ 56.979274] ? do_init_module+0x28/0x280 [ 56.979282] do_init_module+0x62/0x280 [ 56.979288] load_module+0x71c/0x7a0 [ 56.979296] __do_sys_finit_module+0xc2/0x120 [ 56.979305] __x64_sys_finit_module+0x1a/0x20 [ 56.979311] do_syscall_64+0x38/0x90 [ 56.979319] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 56.979328] RIP: 0033:0x7f9de54f989d [ 56.979335] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c3 f5 0c 00 f7 d8 64 89 01 48 [ 56.979342] RSP: 002b:00007ffe3c395a28 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 [ 56.979350] RAX: ffffffffffffffda RBX: 0000560df3ef4330 RCX: 00007f9de54f989d [ 56.979355] RDX: 0000000000000000 RSI: 0000560df3a07358 RDI: 000000000000000f [ 56.979360] RBP: 0000000000040000 R08: 0000000000000000 R09: 0000000000000000 [ 56.979365] R10: 000000000000000f R11: 0000000000000246 R12: 0000560df3a07358 [ 56.979369] R13: 0000000000000000 R14: 0000560df3ef4460 R15: 0000560df3ef4330 [ 56.979377] Modules linked in: amdgpu(+) iommu_v2 gpu_sched drm_ttm_helper ttm drm_kms_helper cec rc_core i2c_algo_bit fb_sys_fops syscopyarea sysfillrect sysimgblt nft_counter xt_tcpudp ipt_REJECT nf_reject_ipv4 xt_conntrack iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 iptable_mangle iptable_raw iptable_security ip_set nf_tables libcrc32c nfnetlink ip6_tables iptable_filter bpfilter input_leds binfmt_misc edac_mce_amd kvm_amd ccp kvm snd_hda_codec_realtek snd_hda_codec_generic crct10dif_pclmul snd_hda_codec_hdmi ledtrig_audio ghash_clmulni_intel aesni_intel snd_hda_intel snd_intel_dspcfg snd_seq_midi crypto_simd snd_intel_sdw_acpi cryptd snd_hda_codec snd_seq_midi_event snd_rawmidi snd_hda_core snd_hwdep snd_seq fam15h_power k10temp snd_pcm snd_seq_device snd_timer snd mac_hid soundcore sch_fq_codel nct6775 hwmon_vid drm ip_tables x_tables autofs4 dm_mirror dm_region_hash dm_log hid_generic usbhid hid uas usb_storage r8169 crc32_pclmul realtek ahci xhci_pci i2c_piix4 [ 56.979521] xhci_pci_renesas libahci video [ 56.979541] ---[ end trace cb8f6a346f18da7b ]--- Instead of finding MFD hotplugged device by its name, simply iterate over the child devices to avoid the issue. Squash in unused variable removal (Alex) BugLink: https://bugs.launchpad.net/bugs/1920674 Fixes: 25030321ba28 ("drm/amd: add pm domain for ACP IP sub blocks") Signed-off-by: Kai-Heng Feng Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin --- drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c | 54 ++++++++++++------------- 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c index 82155ac3288a0..64ee44c2fdd1d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c @@ -163,17 +163,28 @@ static int acp_poweron(struct generic_pm_domain *genpd) return 0; } -static struct device *get_mfd_cell_dev(const char *device_name, int r) +static int acp_genpd_add_device(struct device *dev, void *data) { - char auto_dev_name[25]; - struct device *dev; + struct generic_pm_domain *gpd = data; + int ret; - snprintf(auto_dev_name, sizeof(auto_dev_name), - "%s.%d.auto", device_name, r); - dev = bus_find_device_by_name(&platform_bus_type, NULL, auto_dev_name); - dev_info(dev, "device %s added to pm domain\n", auto_dev_name); + ret = pm_genpd_add_device(gpd, dev); + if (ret) + dev_err(dev, "Failed to add dev to genpd %d\n", ret); - return dev; + return ret; +} + +static int acp_genpd_remove_device(struct device *dev, void *data) +{ + int ret; + + ret = pm_genpd_remove_device(dev); + if (ret) + dev_err(dev, "Failed to remove dev from genpd %d\n", ret); + + /* Continue to remove */ + return 0; } /** @@ -184,11 +195,10 @@ static struct device *get_mfd_cell_dev(const char *device_name, int r) */ static int acp_hw_init(void *handle) { - int r, i; + int r; uint64_t acp_base; u32 val = 0; u32 count = 0; - struct device *dev; struct i2s_platform_data *i2s_pdata = NULL; struct amdgpu_device *adev = (struct amdgpu_device *)handle; @@ -344,15 +354,10 @@ static int acp_hw_init(void *handle) if (r) goto failure; - for (i = 0; i < ACP_DEVS ; i++) { - dev = get_mfd_cell_dev(adev->acp.acp_cell[i].name, i); - r = pm_genpd_add_device(&adev->acp.acp_genpd->gpd, dev); - if (r) { - dev_err(dev, "Failed to add dev to genpd\n"); - goto failure; - } - } - + r = device_for_each_child(adev->acp.parent, &adev->acp.acp_genpd->gpd, + acp_genpd_add_device); + if (r) + goto failure; /* Assert Soft reset of ACP */ val = cgs_read_register(adev->acp.cgs_device, mmACP_SOFT_RESET); @@ -413,10 +418,8 @@ static int acp_hw_init(void *handle) */ static int acp_hw_fini(void *handle) { - int i, ret; u32 val = 0; u32 count = 0; - struct device *dev; struct amdgpu_device *adev = (struct amdgpu_device *)handle; /* return early if no ACP */ @@ -461,13 +464,8 @@ static int acp_hw_fini(void *handle) udelay(100); } - for (i = 0; i < ACP_DEVS ; i++) { - dev = get_mfd_cell_dev(adev->acp.acp_cell[i].name, i); - ret = pm_genpd_remove_device(dev); - /* If removal fails, dont giveup and try rest */ - if (ret) - dev_err(dev, "remove dev from genpd failed\n"); - } + device_for_each_child(adev->acp.parent, NULL, + acp_genpd_remove_device); mfd_remove_devices(adev->acp.parent); kfree(adev->acp.acp_res); From a3e00d1500d3e77a3cf2339534ea6ee33ebaec45 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Jul 2021 13:05:41 -0700 Subject: [PATCH 0561/5344] tcp: seq_file: Avoid skipping sk during tcp_seek_last_pos [ Upstream commit 525e2f9fd0229eb10cb460a9e6d978257f24804e ] st->bucket stores the current bucket number. st->offset stores the offset within this bucket that is the sk to be seq_show(). Thus, st->offset only makes sense within the same st->bucket. These two variables are an optimization for the common no-lseek case. When resuming the seq_file iteration (i.e. seq_start()), tcp_seek_last_pos() tries to continue from the st->offset at bucket st->bucket. However, it is possible that the bucket pointed by st->bucket has changed and st->offset may end up skipping the whole st->bucket without finding a sk. In this case, tcp_seek_last_pos() currently continues to satisfy the offset condition in the next (and incorrect) bucket. Instead, regardless of the offset value, the first sk of the next bucket should be returned. Thus, "bucket == st->bucket" check is added to tcp_seek_last_pos(). The chance of hitting this is small and the issue is a decade old, so targeting for the next tree. Fixes: a8b690f98baf ("tcp: Fix slowness in read /proc/net/tcp") Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Reviewed-by: Eric Dumazet Acked-by: Kuniyuki Iwashima Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210701200541.1033917-1-kafai@fb.com Signed-off-by: Sasha Levin --- net/ipv4/tcp_ipv4.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 46708ed228989..fb8263f8d70ba 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2326,6 +2326,7 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos) static void *tcp_seek_last_pos(struct seq_file *seq) { struct tcp_iter_state *st = seq->private; + int bucket = st->bucket; int offset = st->offset; int orig_num = st->num; void *rc = NULL; @@ -2336,7 +2337,7 @@ static void *tcp_seek_last_pos(struct seq_file *seq) break; st->state = TCP_SEQ_STATE_LISTENING; rc = listening_get_next(seq, NULL); - while (offset-- && rc) + while (offset-- && rc && bucket == st->bucket) rc = listening_get_next(seq, rc); if (rc) break; @@ -2347,7 +2348,7 @@ static void *tcp_seek_last_pos(struct seq_file *seq) if (st->bucket > tcp_hashinfo.ehash_mask) break; rc = established_get_first(seq); - while (offset-- && rc) + while (offset-- && rc && bucket == st->bucket) rc = established_get_next(seq, rc); } From 02415a92e4bfaf25aef892ea90f33e38c099980e Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Sun, 11 Jul 2021 23:40:23 +0200 Subject: [PATCH 0562/5344] ARM: dts: meson8: Use a higher default GPU clock frequency [ Upstream commit 44cf630bcb8c5ec78125805c9447dd5766792224 ] We are seeing "imprecise external abort (0x1406)" errors during boot (which then cause the whole board to hang) on Meson8 (but not Meson8m2). These are observed while trying to access the GPU's registers when the MALI clock is running at it's default setting of 24MHz. The 3.10 vendor kernel uses 318.75MHz as "default" GPU frequency. Using that makes the "imprecise external aborts" go away. Add the assigned-clocks and assigned-clock-rates properties to also bump the MALI clock to 318.75MHz before accessing any of it's registers. Fixes: 7d3f6b536e72c9 ("ARM: dts: meson8: add the Mali-450 MP6 GPU") Reported-by: Demetris Ierokipides Signed-off-by: Martin Blumenstingl Reviewed-by: Neil Armstrong Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20210711214023.2163565-1-martin.blumenstingl@googlemail.com Signed-off-by: Sasha Levin --- arch/arm/boot/dts/meson8.dtsi | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm/boot/dts/meson8.dtsi b/arch/arm/boot/dts/meson8.dtsi index 3efe9d41c2bb6..d7c9dbee0f016 100644 --- a/arch/arm/boot/dts/meson8.dtsi +++ b/arch/arm/boot/dts/meson8.dtsi @@ -241,8 +241,13 @@ "pp2", "ppmmu2", "pp4", "ppmmu4", "pp5", "ppmmu5", "pp6", "ppmmu6"; resets = <&reset RESET_MALI>; + clocks = <&clkc CLKID_CLK81>, <&clkc CLKID_MALI>; clock-names = "bus", "core"; + + assigned-clocks = <&clkc CLKID_MALI>; + assigned-clock-rates = <318750000>; + operating-points-v2 = <&gpu_opp_table>; }; }; From bdc707bc27c1e7a5692718719559146f23e81b4e Mon Sep 17 00:00:00 2001 From: Anand Moon Date: Mon, 5 Jul 2021 11:23:53 +0000 Subject: [PATCH 0563/5344] ARM: dts: meson8b: odroidc1: Fix the pwm regulator supply properties [ Upstream commit 876228e9f935f19c7afc7ba394d17e2ec9143b65 ] After enabling CONFIG_REGULATOR_DEBUG=y we observe below debug logs. Changes help link VCCK and VDDEE pwm regulator to 5V regulator supply instead of dummy regulator. [ 7.117140] pwm-regulator regulator-vcck: Looking up pwm-supply from device tree [ 7.117153] pwm-regulator regulator-vcck: Looking up pwm-supply property in node /regulator-vcck failed [ 7.117184] VCCK: supplied by regulator-dummy [ 7.117194] regulator-dummy: could not add device link regulator.8: -ENOENT [ 7.117266] VCCK: 860 <--> 1140 mV at 986 mV, enabled [ 7.118498] VDDEE: will resolve supply early: pwm [ 7.118515] pwm-regulator regulator-vddee: Looking up pwm-supply from device tree [ 7.118526] pwm-regulator regulator-vddee: Looking up pwm-supply property in node /regulator-vddee failed [ 7.118553] VDDEE: supplied by regulator-dummy [ 7.118563] regulator-dummy: could not add device link regulator.9: -ENOENT Fixes: 524d96083b66 ("ARM: dts: meson8b: odroidc1: add the CPU voltage regulator") Fixes: 8bdf38be712d ("ARM: dts: meson8b: odroidc1: add the VDDEE regulator") Tested-by: Martin Blumenstingl Cc: Martin Blumenstingl Signed-off-by: Anand Moon Reviewed-by: Martin Blumenstingl [narmstrong: fixed typo in commit s/observer/observe/] Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20210705112358.3554-2-linux.amoon@gmail.com Signed-off-by: Sasha Levin --- arch/arm/boot/dts/meson8b-odroidc1.dts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/meson8b-odroidc1.dts b/arch/arm/boot/dts/meson8b-odroidc1.dts index 0f9c71137bed5..c413af9a7af8e 100644 --- a/arch/arm/boot/dts/meson8b-odroidc1.dts +++ b/arch/arm/boot/dts/meson8b-odroidc1.dts @@ -130,7 +130,7 @@ regulator-min-microvolt = <860000>; regulator-max-microvolt = <1140000>; - vin-supply = <&p5v0>; + pwm-supply = <&p5v0>; pwms = <&pwm_cd 0 12218 0>; pwm-dutycycle-range = <91 0>; @@ -162,7 +162,7 @@ regulator-min-microvolt = <860000>; regulator-max-microvolt = <1140000>; - vin-supply = <&p5v0>; + pwm-supply = <&p5v0>; pwms = <&pwm_cd 1 12218 0>; pwm-dutycycle-range = <91 0>; From 05e18323d97ec0cc4e810380d1652be76dbf304d Mon Sep 17 00:00:00 2001 From: Anand Moon Date: Mon, 5 Jul 2021 11:23:54 +0000 Subject: [PATCH 0564/5344] ARM: dts: meson8b: mxq: Fix the pwm regulator supply properties [ Upstream commit 632062e540becbbcb067523ec8bcadb1239d9578 ] After enabling CONFIG_REGULATOR_DEBUG=y we observer below debug logs. Changes help link VCCK and VDDEE pwm regulator to 5V regulator supply instead of dummy regulator. Add missing pwm-supply for regulator-vcck regulator node. [ 7.117140] pwm-regulator regulator-vcck: Looking up pwm-supply from device tree [ 7.117153] pwm-regulator regulator-vcck: Looking up pwm-supply property in node /regulator-vcck failed [ 7.117184] VCCK: supplied by regulator-dummy [ 7.117194] regulator-dummy: could not add device link regulator.8: -ENOENT [ 7.117266] VCCK: 860 <--> 1140 mV at 986 mV, enabled [ 7.118498] VDDEE: will resolve supply early: pwm [ 7.118515] pwm-regulator regulator-vddee: Looking up pwm-supply from device tree [ 7.118526] pwm-regulator regulator-vddee: Looking up pwm-supply property in node /regulator-vddee failed [ 7.118553] VDDEE: supplied by regulator-dummy [ 7.118563] regulator-dummy: could not add device link regulator.9: -ENOENT Fixes: dee51cd0d2e8 ("ARM: dts: meson8b: mxq: add the VDDEE regulator") Fixes: d94f60e3dfa0 ("ARM: dts: meson8b: mxq: improve support for the TRONFY MXQ S805") Cc: Martin Blumenstingl Signed-off-by: Anand Moon Reviewed-by: Martin Blumenstingl Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20210705112358.3554-3-linux.amoon@gmail.com Signed-off-by: Sasha Levin --- arch/arm/boot/dts/meson8b-mxq.dts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/meson8b-mxq.dts b/arch/arm/boot/dts/meson8b-mxq.dts index 6e39ad52e42d3..ab8fe55963f7c 100644 --- a/arch/arm/boot/dts/meson8b-mxq.dts +++ b/arch/arm/boot/dts/meson8b-mxq.dts @@ -39,6 +39,8 @@ regulator-min-microvolt = <860000>; regulator-max-microvolt = <1140000>; + pwm-supply = <&vcc_5v>; + pwms = <&pwm_cd 0 1148 0>; pwm-dutycycle-range = <100 0>; @@ -84,7 +86,7 @@ regulator-min-microvolt = <860000>; regulator-max-microvolt = <1140000>; - vin-supply = <&vcc_5v>; + pwm-supply = <&vcc_5v>; pwms = <&pwm_cd 1 1148 0>; pwm-dutycycle-range = <100 0>; From 596ad843d502d42a8f841fe84de595da71b24313 Mon Sep 17 00:00:00 2001 From: Anand Moon Date: Mon, 5 Jul 2021 11:23:55 +0000 Subject: [PATCH 0565/5344] ARM: dts: meson8b: ec100: Fix the pwm regulator supply properties [ Upstream commit 72ccc373b064ae3ac0c5b5f2306069b60ca118df ] After enabling CONFIG_REGULATOR_DEBUG=y we observer below debug logs. Changes help link VCCK and VDDEE pwm regulator to 5V regulator supply instead of dummy regulator. [ 7.117140] pwm-regulator regulator-vcck: Looking up pwm-supply from device tree [ 7.117153] pwm-regulator regulator-vcck: Looking up pwm-supply property in node /regulator-vcck failed [ 7.117184] VCCK: supplied by regulator-dummy [ 7.117194] regulator-dummy: could not add device link regulator.8: -ENOENT [ 7.117266] VCCK: 860 <--> 1140 mV at 986 mV, enabled [ 7.118498] VDDEE: will resolve supply early: pwm [ 7.118515] pwm-regulator regulator-vddee: Looking up pwm-supply from device tree [ 7.118526] pwm-regulator regulator-vddee: Looking up pwm-supply property in node /regulator-vddee failed [ 7.118553] VDDEE: supplied by regulator-dummy [ 7.118563] regulator-dummy: could not add device link regulator.9: -ENOENT Fixes: 087a1d8b4e4c ("ARM: dts: meson8b: ec100: add the VDDEE regulator") Fixes: 3e7db1c1b7a3 ("ARM: dts: meson8b: ec100: improve the description of the regulators") Cc: Martin Blumenstingl Signed-off-by: Anand Moon Reviewed-by: Martin Blumenstingl Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20210705112358.3554-4-linux.amoon@gmail.com Signed-off-by: Sasha Levin --- arch/arm/boot/dts/meson8b-ec100.dts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/meson8b-ec100.dts b/arch/arm/boot/dts/meson8b-ec100.dts index bed1dfef19857..32d1c322dbc65 100644 --- a/arch/arm/boot/dts/meson8b-ec100.dts +++ b/arch/arm/boot/dts/meson8b-ec100.dts @@ -148,7 +148,7 @@ regulator-min-microvolt = <860000>; regulator-max-microvolt = <1140000>; - vin-supply = <&vcc_5v>; + pwm-supply = <&vcc_5v>; pwms = <&pwm_cd 0 1148 0>; pwm-dutycycle-range = <100 0>; @@ -232,7 +232,7 @@ regulator-min-microvolt = <860000>; regulator-max-microvolt = <1140000>; - vin-supply = <&vcc_5v>; + pwm-supply = <&vcc_5v>; pwms = <&pwm_cd 1 1148 0>; pwm-dutycycle-range = <100 0>; From bcb61e8c0219b251dd6f4aba490fc215313a7793 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Thu, 8 Apr 2021 17:20:04 +0300 Subject: [PATCH 0566/5344] net/mlx5e: Prohibit inner indir TIRs in IPoIB [ Upstream commit 9c43f3865c2a03be104f1c1d5e9129c2a2bdba88 ] TIR's rx_hash_field_selector_inner can be enabled only when tunneled_offload_en = 1. tunneled_offload_en is filled according to the tunneled_offload_en field in struct mlx5e_params, which is false in the IPoIB profile. On the other hand, the IPoIB profile passes inner_ttc = true to mlx5e_create_indirect_tirs, which potentially allows the latter function to attempt to create inner indirect TIRs without having tunneled_offload_en set. This commit prohibits this behavior by passing inner_ttc = false to mlx5e_create_indirect_tirs. The latter function won't attempt to create inner indirect TIRs. As inner indirect TIRs are not created in the IPoIB profile (this commit blocks it explicitly, and even before they would have failed to be created), the call to mlx5e_create_inner_ttc_table in mlx5i_create_flow_steering is a no-op and can be removed. Fixes: 46dc933cee82 ("net/mlx5e: Provide explicit directive if to create inner indirect tirs") Fixes: 458821c72bd0 ("net/mlx5e: IPoIB, Add inner TTC table to IPoIB flow steering") Signed-off-by: Maxim Mikityanskiy Signed-off-by: Saeed Mahameed Signed-off-by: Sasha Levin --- .../net/ethernet/mellanox/mlx5/core/en/fs.h | 6 ------ .../net/ethernet/mellanox/mlx5/core/en_fs.c | 10 +++++----- .../ethernet/mellanox/mlx5/core/ipoib/ipoib.c | 18 ++---------------- 3 files changed, 7 insertions(+), 27 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h index d48292ccda294..9239d767443f2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h @@ -234,18 +234,12 @@ struct ttc_params { void mlx5e_set_ttc_basic_params(struct mlx5e_priv *priv, struct ttc_params *ttc_params); void mlx5e_set_ttc_ft_params(struct ttc_params *ttc_params); -void mlx5e_set_inner_ttc_ft_params(struct ttc_params *ttc_params); int mlx5e_create_ttc_table(struct mlx5e_priv *priv, struct ttc_params *params, struct mlx5e_ttc_table *ttc); void mlx5e_destroy_ttc_table(struct mlx5e_priv *priv, struct mlx5e_ttc_table *ttc); -int mlx5e_create_inner_ttc_table(struct mlx5e_priv *priv, struct ttc_params *params, - struct mlx5e_ttc_table *ttc); -void mlx5e_destroy_inner_ttc_table(struct mlx5e_priv *priv, - struct mlx5e_ttc_table *ttc); - void mlx5e_destroy_flow_table(struct mlx5e_flow_table *ft); void mlx5e_enable_cvlan_filter(struct mlx5e_priv *priv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c index 72711fc4bbbbc..024251e276a1c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c @@ -1124,7 +1124,7 @@ void mlx5e_set_ttc_basic_params(struct mlx5e_priv *priv, ttc_params->inner_ttc = &priv->fs.inner_ttc; } -void mlx5e_set_inner_ttc_ft_params(struct ttc_params *ttc_params) +static void mlx5e_set_inner_ttc_ft_params(struct ttc_params *ttc_params) { struct mlx5_flow_table_attr *ft_attr = &ttc_params->ft_attr; @@ -1143,8 +1143,8 @@ void mlx5e_set_ttc_ft_params(struct ttc_params *ttc_params) ft_attr->prio = MLX5E_NIC_PRIO; } -int mlx5e_create_inner_ttc_table(struct mlx5e_priv *priv, struct ttc_params *params, - struct mlx5e_ttc_table *ttc) +static int mlx5e_create_inner_ttc_table(struct mlx5e_priv *priv, struct ttc_params *params, + struct mlx5e_ttc_table *ttc) { struct mlx5e_flow_table *ft = &ttc->ft; int err; @@ -1174,8 +1174,8 @@ int mlx5e_create_inner_ttc_table(struct mlx5e_priv *priv, struct ttc_params *par return err; } -void mlx5e_destroy_inner_ttc_table(struct mlx5e_priv *priv, - struct mlx5e_ttc_table *ttc) +static void mlx5e_destroy_inner_ttc_table(struct mlx5e_priv *priv, + struct mlx5e_ttc_table *ttc) { if (!mlx5e_tunnel_inner_ft_supported(priv->mdev)) return; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c index 0fed2419623d1..1f3d12faa2a5b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c @@ -319,17 +319,6 @@ static int mlx5i_create_flow_steering(struct mlx5e_priv *priv) } mlx5e_set_ttc_basic_params(priv, &ttc_params); - mlx5e_set_inner_ttc_ft_params(&ttc_params); - for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) - ttc_params.indir_tirn[tt] = priv->inner_indir_tir[tt].tirn; - - err = mlx5e_create_inner_ttc_table(priv, &ttc_params, &priv->fs.inner_ttc); - if (err) { - netdev_err(priv->netdev, "Failed to create inner ttc table, err=%d\n", - err); - goto err_destroy_arfs_tables; - } - mlx5e_set_ttc_ft_params(&ttc_params); for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) ttc_params.indir_tirn[tt] = priv->indir_tir[tt].tirn; @@ -338,13 +327,11 @@ static int mlx5i_create_flow_steering(struct mlx5e_priv *priv) if (err) { netdev_err(priv->netdev, "Failed to create ttc table, err=%d\n", err); - goto err_destroy_inner_ttc_table; + goto err_destroy_arfs_tables; } return 0; -err_destroy_inner_ttc_table: - mlx5e_destroy_inner_ttc_table(priv, &priv->fs.inner_ttc); err_destroy_arfs_tables: mlx5e_arfs_destroy_tables(priv); @@ -354,7 +341,6 @@ static int mlx5i_create_flow_steering(struct mlx5e_priv *priv) static void mlx5i_destroy_flow_steering(struct mlx5e_priv *priv) { mlx5e_destroy_ttc_table(priv, &priv->fs.ttc); - mlx5e_destroy_inner_ttc_table(priv, &priv->fs.inner_ttc); mlx5e_arfs_destroy_tables(priv); } @@ -379,7 +365,7 @@ static int mlx5i_init_rx(struct mlx5e_priv *priv) if (err) goto err_destroy_indirect_rqts; - err = mlx5e_create_indirect_tirs(priv, true); + err = mlx5e_create_indirect_tirs(priv, false); if (err) goto err_destroy_direct_rqts; From 63ba2f150985a49090c6ef70ac21cde11a80a1e8 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 20 Jul 2021 10:18:27 -0400 Subject: [PATCH 0567/5344] cgroup/cpuset: Fix a partition bug with hotplug [ Upstream commit 15d428e6fe77fffc3f4fff923336036f5496ef17 ] In cpuset_hotplug_workfn(), the detection of whether the cpu list has been changed is done by comparing the effective cpus of the top cpuset with the cpu_active_mask. However, in the rare case that just all the CPUs in the subparts_cpus are offlined, the detection fails and the partition states are not updated correctly. Fix it by forcing the cpus_updated flag to true in this particular case. Fixes: 4b842da276a8 ("cpuset: Make CPU hotplug work with partition") Signed-off-by: Waiman Long Signed-off-by: Tejun Heo Signed-off-by: Sasha Levin --- kernel/cgroup/cpuset.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index bab6a934862e3..badfa8f153599 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3166,6 +3166,13 @@ static void cpuset_hotplug_workfn(struct work_struct *work) cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus); mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems); + /* + * In the rare case that hotplug removes all the cpus in subparts_cpus, + * we assumed that cpus are updated. + */ + if (!cpus_updated && top_cpuset.nr_subparts_cpus) + cpus_updated = true; + /* synchronize cpus_allowed to cpu_active_mask */ if (cpus_updated) { spin_lock_irq(&callback_lock); From ceb360d235452be121ebc40c75c204190a2afb7f Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sun, 30 May 2021 22:13:45 +0300 Subject: [PATCH 0568/5344] i2c: highlander: add IRQ check [ Upstream commit f16a3bb69aa6baabf8f0aca982c8cf21e2a4f6bc ] The driver is written as if platform_get_irq() returns 0 on errors (while actually it returns a negative error code), blithely passing these error codes to request_irq() (which takes *unsigned* IRQ #) -- which fails with -EINVAL. Add the necessary error check to the pre-existing *if* statement forcing the driver into the polling mode... Fixes: 4ad48e6ab18c ("i2c: Renesas Highlander FPGA SMBus support") Signed-off-by: Sergey Shtylyov Signed-off-by: Wolfram Sang Signed-off-by: Sasha Levin --- drivers/i2c/busses/i2c-highlander.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-highlander.c b/drivers/i2c/busses/i2c-highlander.c index ff340d7ae2e52..6a880c2623808 100644 --- a/drivers/i2c/busses/i2c-highlander.c +++ b/drivers/i2c/busses/i2c-highlander.c @@ -379,7 +379,7 @@ static int highlander_i2c_probe(struct platform_device *pdev) platform_set_drvdata(pdev, dev); dev->irq = platform_get_irq(pdev, 0); - if (iic_force_poll) + if (dev->irq < 0 || iic_force_poll) dev->irq = 0; if (dev->irq) { From 436920cf77fe05124ef9f275bc66d6e8f5f2f663 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Sat, 29 May 2021 14:19:33 +0300 Subject: [PATCH 0569/5344] leds: lt3593: Put fwnode in any case during ->probe() [ Upstream commit 7e1baaaa2407a642ea19b58e214fab9a69cda1d7 ] device_get_next_child_node() bumps a reference counting of a returned variable. We have to balance it whenever we return to the caller. Fixes: 8cd7d6daba93 ("leds: lt3593: Add device tree probing glue") Cc: Daniel Mack Signed-off-by: Andy Shevchenko Signed-off-by: Pavel Machek Signed-off-by: Sasha Levin --- drivers/leds/leds-lt3593.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/leds/leds-lt3593.c b/drivers/leds/leds-lt3593.c index c94995f0daa2a..03ae33093ce63 100644 --- a/drivers/leds/leds-lt3593.c +++ b/drivers/leds/leds-lt3593.c @@ -103,10 +103,9 @@ static int lt3593_led_probe(struct platform_device *pdev) init_data.default_label = ":"; ret = devm_led_classdev_register_ext(dev, &led_data->cdev, &init_data); - if (ret < 0) { - fwnode_handle_put(child); + fwnode_handle_put(child); + if (ret < 0) return ret; - } led_data->cdev.dev->of_node = dev->of_node; platform_set_drvdata(pdev, led_data); From 88402d5b755988d64cf605d5c3d1cfd8e118e907 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 21 Feb 2021 12:52:08 +0100 Subject: [PATCH 0570/5344] leds: trigger: audio: Add an activate callback to ensure the initial brightness is set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 64f67b5240db79eceb0bd57dae8e591fd3103ba0 ] Some 2-in-1s with a detachable (USB) keyboard(dock) have mute-LEDs in the speaker- and/or mic-mute keys on the keyboard. Examples of this are the Lenovo Thinkpad10 tablet (with its USB kbd-dock) and the HP x2 10 series. The detachable nature of these keyboards means that the keyboard and thus the mute LEDs may show up after the user (or userspace restoring old mixer settings) has muted the speaker and/or mic. Current LED-class devices with a default_trigger of "audio-mute" or "audio-micmute" initialize the brightness member of led_classdev with ledtrig_audio_get() before registering the LED. This makes the software state after attaching the keyboard match the actual audio mute state, e.g. cat /sys/class/leds/foo/brightness will show the right value. But before this commit nothing was actually calling the led_classdev's brightness_set[_blocking] callback so the value returned by ledtrig_audio_get() was never actually being sent to the hw, leading to the mute LEDs staying in their default power-on state, after attaching the keyboard, even if ledtrig_audio_get() returned a different state. This could be fixed by having the individual LED drivers call brightness_set[_blocking] themselves after registering the LED, but this really is something which should be done by a led-trigger activate callback. Add an activate callback for this, fixing the issue of the mute LEDs being out of sync after (re)attaching the keyboard. Cc: Takashi Iwai Fixes: faa2541f5b1a ("leds: trigger: Introduce audio mute LED trigger") Reviewed-by: Marek Behún Signed-off-by: Hans de Goede Signed-off-by: Pavel Machek Signed-off-by: Sasha Levin --- drivers/leds/trigger/ledtrig-audio.c | 37 ++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/drivers/leds/trigger/ledtrig-audio.c b/drivers/leds/trigger/ledtrig-audio.c index f76621e88482d..c6b437e6369b8 100644 --- a/drivers/leds/trigger/ledtrig-audio.c +++ b/drivers/leds/trigger/ledtrig-audio.c @@ -6,10 +6,33 @@ #include #include #include +#include "../leds.h" -static struct led_trigger *ledtrig_audio[NUM_AUDIO_LEDS]; static enum led_brightness audio_state[NUM_AUDIO_LEDS]; +static int ledtrig_audio_mute_activate(struct led_classdev *led_cdev) +{ + led_set_brightness_nosleep(led_cdev, audio_state[LED_AUDIO_MUTE]); + return 0; +} + +static int ledtrig_audio_micmute_activate(struct led_classdev *led_cdev) +{ + led_set_brightness_nosleep(led_cdev, audio_state[LED_AUDIO_MICMUTE]); + return 0; +} + +static struct led_trigger ledtrig_audio[NUM_AUDIO_LEDS] = { + [LED_AUDIO_MUTE] = { + .name = "audio-mute", + .activate = ledtrig_audio_mute_activate, + }, + [LED_AUDIO_MICMUTE] = { + .name = "audio-micmute", + .activate = ledtrig_audio_micmute_activate, + }, +}; + enum led_brightness ledtrig_audio_get(enum led_audio type) { return audio_state[type]; @@ -19,24 +42,22 @@ EXPORT_SYMBOL_GPL(ledtrig_audio_get); void ledtrig_audio_set(enum led_audio type, enum led_brightness state) { audio_state[type] = state; - led_trigger_event(ledtrig_audio[type], state); + led_trigger_event(&ledtrig_audio[type], state); } EXPORT_SYMBOL_GPL(ledtrig_audio_set); static int __init ledtrig_audio_init(void) { - led_trigger_register_simple("audio-mute", - &ledtrig_audio[LED_AUDIO_MUTE]); - led_trigger_register_simple("audio-micmute", - &ledtrig_audio[LED_AUDIO_MICMUTE]); + led_trigger_register(&ledtrig_audio[LED_AUDIO_MUTE]); + led_trigger_register(&ledtrig_audio[LED_AUDIO_MICMUTE]); return 0; } module_init(ledtrig_audio_init); static void __exit ledtrig_audio_exit(void) { - led_trigger_unregister_simple(ledtrig_audio[LED_AUDIO_MUTE]); - led_trigger_unregister_simple(ledtrig_audio[LED_AUDIO_MICMUTE]); + led_trigger_unregister(&ledtrig_audio[LED_AUDIO_MUTE]); + led_trigger_unregister(&ledtrig_audio[LED_AUDIO_MICMUTE]); } module_exit(ledtrig_audio_exit); From 232744ec719735eb4e99f158f949a7dcdddb812b Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 9 Jul 2021 14:30:25 +0200 Subject: [PATCH 0571/5344] media: venus: venc: Fix potential null pointer dereference on pointer fmt [ Upstream commit 09ea9719a423fc675d40dd05407165e161ea0c48 ] Currently the call to find_format can potentially return a NULL to fmt and the nullpointer is later dereferenced on the assignment of pixmp->num_planes = fmt->num_planes. Fix this by adding a NULL pointer check and returning NULL for the failure case. Addresses-Coverity: ("Dereference null return") Fixes: aaaa93eda64b ("[media] media: venus: venc: add video encoder files") Signed-off-by: Colin Ian King Signed-off-by: Stanimir Varbanov Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/platform/qcom/venus/venc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/media/platform/qcom/venus/venc.c b/drivers/media/platform/qcom/venus/venc.c index 30028ceb548b1..766ca497f8565 100644 --- a/drivers/media/platform/qcom/venus/venc.c +++ b/drivers/media/platform/qcom/venus/venc.c @@ -308,6 +308,8 @@ venc_try_fmt_common(struct venus_inst *inst, struct v4l2_format *f) else return NULL; fmt = find_format(inst, pixmp->pixelformat, f->type); + if (!fmt) + return NULL; } pixmp->width = clamp(pixmp->width, frame_width_min(inst), From 44c766a77ddc12452e13cae60248cee0507509c9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 29 Jul 2021 17:54:28 +0200 Subject: [PATCH 0572/5344] PCI: PM: Avoid forcing PCI_D0 for wakeup reasons inconsistently [ Upstream commit da9f2150684ea684a7ddd6d7f0e38b2bdf43dcd8 ] It is inconsistent to return PCI_D0 from pci_target_state() instead of the original target state if 'wakeup' is true and the device cannot signal PME from D0. This only happens when the device cannot signal PME from the original target state and any shallower power states (including D0) and that case is effectively equivalent to the one in which PME singaling is not supported at all. Since the original target state is returned in the latter case, make the function do that in the former one too. Link: https://lore.kernel.org/linux-pm/3149540.aeNJFYEL58@kreacher/ Fixes: 666ff6f83e1d ("PCI/PM: Avoid using device_may_wakeup() for runtime PM") Reported-by: Mika Westerberg Reported-by: Utkarsh H Patel Reported-by: Koba Ko Signed-off-by: Rafael J. Wysocki Reviewed-by: Mika Westerberg Tested-by: Mika Westerberg Signed-off-by: Sasha Levin --- drivers/pci/pci.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 1455147158922..53c34fd64763e 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -2359,16 +2359,20 @@ static pci_power_t pci_target_state(struct pci_dev *dev, bool wakeup) if (dev->current_state == PCI_D3cold) target_state = PCI_D3cold; - if (wakeup) { + if (wakeup && dev->pme_support) { + pci_power_t state = target_state; + /* * Find the deepest state from which the device can generate * PME#. */ - if (dev->pme_support) { - while (target_state - && !(dev->pme_support & (1 << target_state))) - target_state--; - } + while (state && !(dev->pme_support & (1 << state))) + state--; + + if (state) + return state; + else if (dev->pme_support & 1) + return PCI_D0; } return target_state; From b01e7eac6823c3ae2001bef28318f6ce9cd601e8 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 29 Jul 2021 16:49:10 +0200 Subject: [PATCH 0573/5344] PCI: PM: Enable PME if it can be signaled from D3cold [ Upstream commit 0e00392a895c95c6d12d42158236c8862a2f43f2 ] PME signaling is only enabled by __pci_enable_wake() if the target device can signal PME from the given target power state (to avoid pointless reconfiguration of the device), but if the hierarchy above the device goes into D3cold, the device itself will end up in D3cold too, so if it can signal PME from D3cold, it should be enabled to do so in __pci_enable_wake(). [Note that if the device does not end up in D3cold and it cannot signal PME from the original target power state, it will not signal PME, so in that case the behavior does not change.] Link: https://lore.kernel.org/linux-pm/3149540.aeNJFYEL58@kreacher/ Fixes: 5bcc2fb4e815 ("PCI PM: Simplify PCI wake-up code") Reported-by: Mika Westerberg Reported-by: Utkarsh H Patel Reported-by: Koba Ko Signed-off-by: Rafael J. Wysocki Reviewed-by: Mika Westerberg Tested-by: Mika Westerberg Signed-off-by: Sasha Levin --- drivers/pci/pci.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 53c34fd64763e..76e4cc3ebe63e 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -2255,7 +2255,14 @@ static int __pci_enable_wake(struct pci_dev *dev, pci_power_t state, bool enable if (enable) { int error; - if (pci_pme_capable(dev, state)) + /* + * Enable PME signaling if the device can signal PME from + * D3cold regardless of whether or not it can signal PME from + * the current target state, because that will allow it to + * signal PME when the hierarchy above it goes into D3cold and + * the device itself ends up in D3cold as a result of that. + */ + if (pci_pme_capable(dev, state) || pci_pme_capable(dev, PCI_D3cold)) pci_pme_active(dev, true); else ret = 1; From c1dcb523bd4faa9f90b800cf9e77880bbdcef8fd Mon Sep 17 00:00:00 2001 From: Stephan Gerhold Date: Mon, 12 Jul 2021 15:57:03 +0200 Subject: [PATCH 0574/5344] soc: qcom: smsm: Fix missed interrupts if state changes while masked [ Upstream commit e3d4571955050736bbf3eda0a9538a09d9fcfce8 ] The SMSM driver detects interrupt edges by tracking the last state it has seen (and has triggered the interrupt handler for). This works fine, but only if the interrupt does not change state while masked. For example, if an interrupt is unmasked while the state is HIGH, the stored last_value for that interrupt might still be LOW. Then, when the remote processor triggers smsm_intr() we assume that nothing has changed, even though the state might have changed from HIGH to LOW. Attempt to fix this by checking the current remote state before unmasking an IRQ. Use atomic operations to avoid the interrupt handler from interfering with the unmask function. This fixes modem crashes in some edge cases with the BAM-DMUX driver. Specifically, the BAM-DMUX interrupt handler is not called for the HIGH -> LOW smsm state transition if the BAM-DMUX driver is loaded (and therefore unmasks the interrupt) after the modem was already started: qcom-q6v5-mss 4080000.remoteproc: fatal error received: a2_task.c:3188: Assert FALSE failed: A2 DL PER deadlock timer expired waiting for Apps ACK Fixes: c97c4090ff72 ("soc: qcom: smsm: Add driver for Qualcomm SMSM") Signed-off-by: Stephan Gerhold Link: https://lore.kernel.org/r/20210712135703.324748-2-stephan@gerhold.net Signed-off-by: Bjorn Andersson Signed-off-by: Sasha Levin --- drivers/soc/qcom/smsm.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/soc/qcom/smsm.c b/drivers/soc/qcom/smsm.c index 70c3c90b997c9..c428d0f78816e 100644 --- a/drivers/soc/qcom/smsm.c +++ b/drivers/soc/qcom/smsm.c @@ -109,7 +109,7 @@ struct smsm_entry { DECLARE_BITMAP(irq_enabled, 32); DECLARE_BITMAP(irq_rising, 32); DECLARE_BITMAP(irq_falling, 32); - u32 last_value; + unsigned long last_value; u32 *remote_state; u32 *subscription; @@ -204,8 +204,7 @@ static irqreturn_t smsm_intr(int irq, void *data) u32 val; val = readl(entry->remote_state); - changed = val ^ entry->last_value; - entry->last_value = val; + changed = val ^ xchg(&entry->last_value, val); for_each_set_bit(i, entry->irq_enabled, 32) { if (!(changed & BIT(i))) @@ -266,6 +265,12 @@ static void smsm_unmask_irq(struct irq_data *irqd) struct qcom_smsm *smsm = entry->smsm; u32 val; + /* Make sure our last cached state is up-to-date */ + if (readl(entry->remote_state) & BIT(irq)) + set_bit(irq, &entry->last_value); + else + clear_bit(irq, &entry->last_value); + set_bit(irq, entry->irq_enabled); if (entry->subscription) { From cba993812e28ee37000a45d01ab3eb2403afab83 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 2 Aug 2021 18:24:44 +0200 Subject: [PATCH 0575/5344] debugfs: Return error during {full/open}_proxy_open() on rmmod MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 112cedc8e600b668688eb809bf11817adec58ddc ] If a kernel module gets unloaded then it printed report about a leak before commit 275678e7a9be ("debugfs: Check module state before warning in {full/open}_proxy_open()"). An additional check was added in this commit to avoid this printing. But it was forgotten that the function must return an error in this case because it was not actually opened. As result, the systems started to crash or to hang when a module was unloaded while something was trying to open a file. Fixes: 275678e7a9be ("debugfs: Check module state before warning in {full/open}_proxy_open()") Cc: Taehee Yoo Reported-by: Mário Lopes Signed-off-by: Sven Eckelmann Link: https://lore.kernel.org/r/20210802162444.7848-1-sven@narfation.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- fs/debugfs/file.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 943637298f650..a32c5c7dcfd89 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -178,8 +178,10 @@ static int open_proxy_open(struct inode *inode, struct file *filp) if (!fops_get(real_fops)) { #ifdef CONFIG_MODULES if (real_fops->owner && - real_fops->owner->state == MODULE_STATE_GOING) + real_fops->owner->state == MODULE_STATE_GOING) { + r = -ENXIO; goto out; + } #endif /* Huh? Module did not clean up after itself at exit? */ @@ -313,8 +315,10 @@ static int full_proxy_open(struct inode *inode, struct file *filp) if (!fops_get(real_fops)) { #ifdef CONFIG_MODULES if (real_fops->owner && - real_fops->owner->state == MODULE_STATE_GOING) + real_fops->owner->state == MODULE_STATE_GOING) { + r = -ENXIO; goto out; + } #endif /* Huh? Module did not cleanup after itself at exit? */ From 64f6bf5fd6a8109f766f90255fa450b7a4b43f88 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 4 Aug 2021 16:09:51 +0100 Subject: [PATCH 0576/5344] Bluetooth: increase BTNAMSIZ to 21 chars to fix potential buffer overflow [ Upstream commit 713baf3dae8f45dc8ada4ed2f5fdcbf94a5c274d ] An earlier commit replaced using batostr to using %pMR sprintf for the construction of session->name. Static analysis detected that this new method can use a total of 21 characters (including the trailing '\0') so we need to increase the BTNAMSIZ from 18 to 21 to fix potential buffer overflows. Addresses-Coverity: ("Out-of-bounds write") Fixes: fcb73338ed53 ("Bluetooth: Use %pMR in sprintf/seq_printf instead of batostr") Signed-off-by: Colin Ian King Signed-off-by: Marcel Holtmann Signed-off-by: Sasha Levin --- net/bluetooth/cmtp/cmtp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bluetooth/cmtp/cmtp.h b/net/bluetooth/cmtp/cmtp.h index c32638dddbf94..f6b9dc4e408f2 100644 --- a/net/bluetooth/cmtp/cmtp.h +++ b/net/bluetooth/cmtp/cmtp.h @@ -26,7 +26,7 @@ #include #include -#define BTNAMSIZ 18 +#define BTNAMSIZ 21 /* CMTP ioctl defines */ #define CMTPCONNADD _IOW('C', 200, int) From 030586f7893d20974669ffa455391842ed1e8bd0 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Tue, 3 Aug 2021 11:27:43 +0100 Subject: [PATCH 0577/5344] PM: EM: Increase energy calculation precision [ Upstream commit 7fcc17d0cb12938d2b3507973a6f93fc9ed2c7a1 ] The Energy Model (EM) provides useful information about device power in each performance state to other subsystems like: Energy Aware Scheduler (EAS). The energy calculation in EAS does arithmetic operation based on the EM em_cpu_energy(). Current implementation of that function uses em_perf_state::cost as a pre-computed cost coefficient equal to: cost = power * max_frequency / frequency. The 'power' is expressed in milli-Watts (or in abstract scale). There are corner cases when the EAS energy calculation for two Performance Domains (PDs) return the same value. The EAS compares these values to choose smaller one. It might happen that this values are equal due to rounding error. In such scenario, we need better resolution, e.g. 1000 times better. To provide this possibility increase the resolution in the em_perf_state::cost for 64-bit architectures. The cost of increasing resolution on 32-bit is pretty high (64-bit division) and is not justified since there are no new 32bit big.LITTLE EAS systems expected which would benefit from this higher resolution. This patch allows to avoid the rounding to milli-Watt errors, which might occur in EAS energy estimation for each PD. The rounding error is common for small tasks which have small utilization value. There are two places in the code where it makes a difference: 1. In the find_energy_efficient_cpu() where we are searching for best_delta. We might suffer there when two PDs return the same result, like in the example below. Scenario: Low utilized system e.g. ~200 sum_util for PD0 and ~220 for PD1. There are quite a few small tasks ~10-15 util. These tasks would suffer for the rounding error. These utilization values are typical when running games on Android. One of our partners has reported 5..10mA less battery drain when running with increased resolution. Some details: We have two PDs: PD0 (big) and PD1 (little) Let's compare w/o patch set ('old') and w/ patch set ('new') We are comparing energy w/ task and w/o task placed in the PDs a) 'old' w/o patch set, PD0 task_util = 13 cost = 480 sum_util_w/o_task = 215 sum_util_w_task = 228 scale_cpu = 1024 energy_w/o_task = 480 * 215 / 1024 = 100.78 => 100 energy_w_task = 480 * 228 / 1024 = 106.87 => 106 energy_diff = 106 - 100 = 6 (this is equal to 'old' PD1's energy_diff in 'c)') b) 'new' w/ patch set, PD0 task_util = 13 cost = 480 * 1000 = 480000 sum_util_w/o_task = 215 sum_util_w_task = 228 energy_w/o_task = 480000 * 215 / 1024 = 100781 energy_w_task = 480000 * 228 / 1024 = 106875 energy_diff = 106875 - 100781 = 6094 (this is not equal to 'new' PD1's energy_diff in 'd)') c) 'old' w/o patch set, PD1 task_util = 13 cost = 160 sum_util_w/o_task = 283 sum_util_w_task = 293 scale_cpu = 355 energy_w/o_task = 160 * 283 / 355 = 127.55 => 127 energy_w_task = 160 * 296 / 355 = 133.41 => 133 energy_diff = 133 - 127 = 6 (this is equal to 'old' PD0's energy_diff in 'a)') d) 'new' w/ patch set, PD1 task_util = 13 cost = 160 * 1000 = 160000 sum_util_w/o_task = 283 sum_util_w_task = 293 scale_cpu = 355 energy_w/o_task = 160000 * 283 / 355 = 127549 energy_w_task = 160000 * 296 / 355 = 133408 energy_diff = 133408 - 127549 = 5859 (this is not equal to 'new' PD0's energy_diff in 'b)') 2. Difference in the 6% energy margin filter at the end of find_energy_efficient_cpu(). With this patch the margin comparison also has better resolution, so it's possible to have better task placement thanks to that. Fixes: 27871f7a8a341ef ("PM: Introduce an Energy Model management framework") Reported-by: CCJ Yeh Reviewed-by: Dietmar Eggemann Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki Signed-off-by: Sasha Levin --- include/linux/energy_model.h | 16 ++++++++++++++++ kernel/power/energy_model.c | 4 +++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 73f8c3cb95888..9ee6ccc18424f 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -42,6 +42,22 @@ struct em_perf_domain { #define EM_CPU_MAX_POWER 0xFFFF +/* + * Increase resolution of energy estimation calculations for 64-bit + * architectures. The extra resolution improves decision made by EAS for the + * task placement when two Performance Domains might provide similar energy + * estimation values (w/o better resolution the values could be equal). + * + * We increase resolution only if we have enough bits to allow this increased + * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit + * are pretty high and the returns do not justify the increased costs. + */ +#ifdef CONFIG_64BIT +#define em_scale_power(p) ((p) * 1000) +#else +#define em_scale_power(p) (p) +#endif + struct em_data_callback { /** * active_power() - Provide power at the next capacity state of a CPU diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 8dac32bd90894..7ef35eb985baf 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -149,7 +149,9 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, /* Compute the cost of each capacity_state. */ fmax = (u64) table[nr_states - 1].frequency; for (i = 0; i < nr_states; i++) { - table[i].cost = div64_u64(fmax * table[i].power, + unsigned long power_res = em_scale_power(table[i].power); + + table[i].cost = div64_u64(fmax * power_res, table[i].frequency); } From 66955c85d9ae44ca93d77adfd4ad55cefb47e1ec Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Mon, 5 Jul 2021 02:05:19 +0300 Subject: [PATCH 0578/5344] drm/msm/dpu: make dpu_hw_ctl_clear_all_blendstages clear necessary LMs [ Upstream commit a41cdb693595ae1904dd793fc15d6954f4295e27 ] dpu_hw_ctl_clear_all_blendstages() clears settings for the few first LMs instead of mixers actually used for the CTL. Change it to clear necessary data, using provided mixer ids. Fixes: 25fdd5933e4c ("drm/msm: Add SDM845 DPU support") Signed-off-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20210704230519.4081467-1-dmitry.baryshkov@linaro.org Signed-off-by: Dmitry Baryshkov Signed-off-by: Rob Clark Signed-off-by: Sasha Levin --- drivers/gpu/drm/msm/disp/dpu1/dpu_hw_ctl.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_ctl.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_ctl.c index 179e8d52cadb4..a08ca7a47400f 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_ctl.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_ctl.c @@ -281,10 +281,12 @@ static void dpu_hw_ctl_clear_all_blendstages(struct dpu_hw_ctl *ctx) int i; for (i = 0; i < ctx->mixer_count; i++) { - DPU_REG_WRITE(c, CTL_LAYER(LM_0 + i), 0); - DPU_REG_WRITE(c, CTL_LAYER_EXT(LM_0 + i), 0); - DPU_REG_WRITE(c, CTL_LAYER_EXT2(LM_0 + i), 0); - DPU_REG_WRITE(c, CTL_LAYER_EXT3(LM_0 + i), 0); + enum dpu_lm mixer_id = ctx->mixer_hw_caps[i].id; + + DPU_REG_WRITE(c, CTL_LAYER(mixer_id), 0); + DPU_REG_WRITE(c, CTL_LAYER_EXT(mixer_id), 0); + DPU_REG_WRITE(c, CTL_LAYER_EXT2(mixer_id), 0); + DPU_REG_WRITE(c, CTL_LAYER_EXT3(mixer_id), 0); } } From 0799a1cdb7c153b846ad2aa35514cfcebf19ab94 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 5 Aug 2021 09:21:10 +0200 Subject: [PATCH 0579/5344] arm64: dts: exynos: correct GIC CPU interfaces address range on Exynos7 [ Upstream commit 01c72cad790cb6cd3ccbe4c1402b6cb6c6bbffd0 ] The GIC-400 CPU interfaces address range is defined as 0x2000-0x3FFF (by ARM). Reported-by: Sam Protsenko Reported-by: Marc Zyngier Signed-off-by: Krzysztof Kozlowski Reviewed-by: Sam Protsenko Reviewed-by: Alim Akhtar Fixes: b9024cbc937d ("arm64: dts: Add initial device tree support for exynos7") Link: https://lore.kernel.org/r/20210805072110.4730-1-krzysztof.kozlowski@canonical.com Signed-off-by: Sasha Levin --- arch/arm64/boot/dts/exynos/exynos7.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/exynos/exynos7.dtsi b/arch/arm64/boot/dts/exynos/exynos7.dtsi index 25549d9552ae2..84f92b44c3235 100644 --- a/arch/arm64/boot/dts/exynos/exynos7.dtsi +++ b/arch/arm64/boot/dts/exynos/exynos7.dtsi @@ -113,7 +113,7 @@ #address-cells = <0>; interrupt-controller; reg = <0x11001000 0x1000>, - <0x11002000 0x1000>, + <0x11002000 0x2000>, <0x11004000 0x2000>, <0x11006000 0x2000>; }; From 21c43aa26d1e7dec00ab8de99d31e68df309f0d0 Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Tue, 3 Aug 2021 21:06:11 +0900 Subject: [PATCH 0580/5344] counter: 104-quad-8: Return error when invalid mode during ceiling_write [ Upstream commit 728246e8f7269ecd35a2c6e6795323e6d8f48db7 ] The 104-QUAD-8 only has two count modes where a ceiling value makes sense: Range Limit and Modulo-N. Outside of these two modes, setting a ceiling value is an invalid operation -- so let's report it as such by returning -EINVAL. Fixes: fc069262261c ("counter: 104-quad-8: Add lock guards - generic interface") Acked-by: Syed Nayyar Waris Signed-off-by: William Breathitt Gray Link: https://lore.kernel.org/r/a2147f022829b66839a1db5530a7fada47856847.1627990337.git.vilhelm.gray@gmail.com Signed-off-by: Jonathan Cameron Signed-off-by: Sasha Levin --- drivers/counter/104-quad-8.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/counter/104-quad-8.c b/drivers/counter/104-quad-8.c index 5c23a9a56921b..f261a57af1c01 100644 --- a/drivers/counter/104-quad-8.c +++ b/drivers/counter/104-quad-8.c @@ -1230,12 +1230,13 @@ static ssize_t quad8_count_ceiling_write(struct counter_device *counter, case 1: case 3: quad8_preset_register_set(priv, count->id, ceiling); - break; + mutex_unlock(&priv->lock); + return len; } mutex_unlock(&priv->lock); - return len; + return -EINVAL; } static ssize_t quad8_count_preset_enable_read(struct counter_device *counter, From 97f759412383d4f603f1fb762c516cfc40193fb6 Mon Sep 17 00:00:00 2001 From: Desmond Cheong Zhi Xi Date: Tue, 10 Aug 2021 12:14:10 +0800 Subject: [PATCH 0581/5344] Bluetooth: fix repeated calls to sco_sock_kill [ Upstream commit e1dee2c1de2b4dd00eb44004a4bda6326ed07b59 ] In commit 4e1a720d0312 ("Bluetooth: avoid killing an already killed socket"), a check was added to sco_sock_kill to skip killing a socket if the SOCK_DEAD flag was set. This was done after a trace for a use-after-free bug showed that the same sock pointer was being killed twice. Unfortunately, this check prevents sco_sock_kill from running on any socket. sco_sock_kill kills a socket only if it's zapped and orphaned, however sock_orphan announces that the socket is dead before detaching it. i.e., orphaned sockets have the SOCK_DEAD flag set. To fix this, we remove the check for SOCK_DEAD, and avoid repeated calls to sco_sock_kill by removing incorrect calls in: 1. sco_sock_timeout. The socket should not be killed on timeout as further processing is expected to be done. For example, sco_sock_connect sets the timer then waits for the socket to be connected or for an error to be returned. 2. sco_conn_del. This function should clean up resources for the connection, but the socket itself should be cleaned up in sco_sock_release. 3. sco_sock_close. Calls to sco_sock_close in sco_sock_cleanup_listen and sco_sock_release are followed by sco_sock_kill. Hence the duplicated call should be removed. Fixes: 4e1a720d0312 ("Bluetooth: avoid killing an already killed socket") Signed-off-by: Desmond Cheong Zhi Xi Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Sasha Levin --- net/bluetooth/sco.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 335264706aaeb..1b7540cb8e5c4 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -84,7 +84,6 @@ static void sco_sock_timeout(struct timer_list *t) sk->sk_state_change(sk); bh_unlock_sock(sk); - sco_sock_kill(sk); sock_put(sk); } @@ -176,7 +175,6 @@ static void sco_conn_del(struct hci_conn *hcon, int err) sco_sock_clear_timer(sk); sco_chan_del(sk, err); bh_unlock_sock(sk); - sco_sock_kill(sk); sock_put(sk); } @@ -393,8 +391,7 @@ static void sco_sock_cleanup_listen(struct sock *parent) */ static void sco_sock_kill(struct sock *sk) { - if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket || - sock_flag(sk, SOCK_DEAD)) + if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket) return; BT_DBG("sk %p state %d", sk, sk->sk_state); @@ -446,7 +443,6 @@ static void sco_sock_close(struct sock *sk) lock_sock(sk); __sco_sock_close(sk); release_sock(sk); - sco_sock_kill(sk); } static void sco_sock_init(struct sock *sk, struct sock *parent) From 712cdce8c233de24882b5646cb2094574a3bef55 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 6 Aug 2021 11:15:13 +0200 Subject: [PATCH 0582/5344] drm/msm/dsi: Fix some reference counted resource leaks [ Upstream commit 6977cc89c87506ff17e6c05f0e37f46752256e82 ] 'of_find_device_by_node()' takes a reference that must be released when not needed anymore. This is expected to be done in 'dsi_destroy()'. However, there are 2 issues in 'dsi_get_phy()'. First, if 'of_find_device_by_node()' succeeds but 'platform_get_drvdata()' returns NULL, 'msm_dsi->phy_dev' will still be NULL, and the reference won't be released in 'dsi_destroy()'. Secondly, as 'of_find_device_by_node()' already takes a reference, there is no need for an additional 'get_device()'. Move the assignment to 'msm_dsi->phy_dev' a few lines above and remove the unneeded 'get_device()' to solve both issues. Fixes: ec31abf6684e ("drm/msm/dsi: Separate PHY to another platform device") Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/f15bc57648a00e7c99f943903468a04639d50596.1628241097.git.christophe.jaillet@wanadoo.fr Signed-off-by: Rob Clark Signed-off-by: Sasha Levin --- drivers/gpu/drm/msm/dsi/dsi.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/msm/dsi/dsi.c b/drivers/gpu/drm/msm/dsi/dsi.c index 55ea4bc2ee9cb..0d37ae5b310c4 100644 --- a/drivers/gpu/drm/msm/dsi/dsi.c +++ b/drivers/gpu/drm/msm/dsi/dsi.c @@ -26,8 +26,10 @@ static int dsi_get_phy(struct msm_dsi *msm_dsi) } phy_pdev = of_find_device_by_node(phy_node); - if (phy_pdev) + if (phy_pdev) { msm_dsi->phy = platform_get_drvdata(phy_pdev); + msm_dsi->phy_dev = &phy_pdev->dev; + } of_node_put(phy_node); @@ -36,8 +38,6 @@ static int dsi_get_phy(struct msm_dsi *msm_dsi) return -EPROBE_DEFER; } - msm_dsi->phy_dev = get_device(&phy_pdev->dev); - return 0; } From f0dbe82a71e345458a117d1a52f86b4e76546c81 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Mon, 9 Aug 2021 23:27:28 +0300 Subject: [PATCH 0583/5344] usb: gadget: udc: at91: add IRQ check [ Upstream commit 50855c31573b02963f0aa2aacfd4ea41c31ae0e0 ] The driver neglects to check the result of platform_get_irq()'s call and blithely passes the negative error codes to devm_request_irq() (which takes *unsigned* IRQ #), causing it to fail with -EINVAL, overriding an original error code. Stop calling devm_request_irq() with the invalid IRQ #s. Fixes: 8b2e76687b39 ("USB: AT91 UDC updates, mostly power management") Signed-off-by: Sergey Shtylyov Acked-by: Felipe Balbi Link: https://lore.kernel.org/r/6654a224-739a-1a80-12f0-76d920f87b6c@omp.ru Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/usb/gadget/udc/at91_udc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/udc/at91_udc.c b/drivers/usb/gadget/udc/at91_udc.c index 194ffb1ed4620..d7714c94b1196 100644 --- a/drivers/usb/gadget/udc/at91_udc.c +++ b/drivers/usb/gadget/udc/at91_udc.c @@ -1878,7 +1878,9 @@ static int at91udc_probe(struct platform_device *pdev) clk_disable(udc->iclk); /* request UDC and maybe VBUS irqs */ - udc->udp_irq = platform_get_irq(pdev, 0); + udc->udp_irq = retval = platform_get_irq(pdev, 0); + if (retval < 0) + goto err_unprepare_iclk; retval = devm_request_irq(dev, udc->udp_irq, at91_udc_irq, 0, driver_name, udc); if (retval) { From 2c19c49d1b6c0b425d19cf40726b3a9bc595a8a1 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Mon, 9 Aug 2021 23:50:18 +0300 Subject: [PATCH 0584/5344] usb: phy: fsl-usb: add IRQ check [ Upstream commit ecc2f30dbb25969908115c81ec23650ed982b004 ] The driver neglects to check the result of platform_get_irq()'s call and blithely passes the negative error codes to request_irq() (which takes *unsigned* IRQ #), causing it to fail with -EINVAL, overriding an original error code. Stop calling request_irq() with the invalid IRQ #s. Fixes: 0807c500a1a6 ("USB: add Freescale USB OTG Transceiver driver") Acked-by: Felipe Balbi Signed-off-by: Sergey Shtylyov Link: https://lore.kernel.org/r/b0a86089-8b8b-122e-fd6d-73e8c2304964@omp.ru Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/usb/phy/phy-fsl-usb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/phy/phy-fsl-usb.c b/drivers/usb/phy/phy-fsl-usb.c index b451f4695f3f0..446c7bf67873c 100644 --- a/drivers/usb/phy/phy-fsl-usb.c +++ b/drivers/usb/phy/phy-fsl-usb.c @@ -873,6 +873,8 @@ int usb_otg_start(struct platform_device *pdev) /* request irq */ p_otg->irq = platform_get_irq(pdev, 0); + if (p_otg->irq < 0) + return p_otg->irq; status = request_irq(p_otg->irq, fsl_otg_isr, IRQF_SHARED, driver_name, p_otg); if (status) { From 9893347e1bb2e5d28c98057dcbae5e34d7c40699 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Mon, 9 Aug 2021 23:53:16 +0300 Subject: [PATCH 0585/5344] usb: phy: twl6030: add IRQ checks [ Upstream commit 0881e22c06e66af0b64773c91c8868ead3d01aa1 ] The driver neglects to check the result of platform_get_irq()'s calls and blithely passes the negative error codes to request_threaded_irq() (which takes *unsigned* IRQ #), causing them both to fail with -EINVAL, overriding an original error code. Stop calling request_threaded_irq() with the invalid IRQ #s. Fixes: c33fad0c3748 ("usb: otg: Adding twl6030-usb transceiver driver for OMAP4430") Acked-by: Felipe Balbi Signed-off-by: Sergey Shtylyov Link: https://lore.kernel.org/r/9507f50b-50f1-6dc4-f57c-3ed4e53a1c25@omp.ru Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/usb/phy/phy-twl6030-usb.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/usb/phy/phy-twl6030-usb.c b/drivers/usb/phy/phy-twl6030-usb.c index 9a7e655d52801..9337c30f0743b 100644 --- a/drivers/usb/phy/phy-twl6030-usb.c +++ b/drivers/usb/phy/phy-twl6030-usb.c @@ -348,6 +348,11 @@ static int twl6030_usb_probe(struct platform_device *pdev) twl->irq2 = platform_get_irq(pdev, 1); twl->linkstat = MUSB_UNKNOWN; + if (twl->irq1 < 0) + return twl->irq1; + if (twl->irq2 < 0) + return twl->irq2; + twl->comparator.set_vbus = twl6030_set_vbus; twl->comparator.start_srp = twl6030_start_srp; From 23978373a53722ae5e4546a328c04a80609a101e Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 11 Aug 2021 17:52:54 +0200 Subject: [PATCH 0586/5344] usb: gadget: udc: renesas_usb3: Fix soc_device_match() abuse MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit cea45a3bd2dd4d9c35581328f571afd32b3c9f48 ] soc_device_match() is intended as a last resort, to handle e.g. quirks that cannot be handled by matching based on a compatible value. As the device nodes for the Renesas USB 3.0 Peripheral Controller on R-Car E3 and RZ/G2E do have SoC-specific compatible values, the latter can and should be used to match against these devices. This also fixes support for the USB 3.0 Peripheral Controller on the R-Car E3e (R8A779M6) SoC, which is a different grading of the R-Car E3 (R8A77990) SoC, using the same SoC-specific compatible value. Fixes: 30025efa8b5e75f5 ("usb: gadget: udc: renesas_usb3: add support for r8a77990") Fixes: 546970fdab1da5fe ("usb: gadget: udc: renesas_usb3: add support for r8a774c0") Reviewed-by: Niklas Söderlund Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/760981fb4cd110d7cbfc9dcffa365e7c8b25c6e5.1628696960.git.geert+renesas@glider.be Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/usb/gadget/udc/renesas_usb3.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/usb/gadget/udc/renesas_usb3.c b/drivers/usb/gadget/udc/renesas_usb3.c index 08a93cf68efff..b6653bc7acc26 100644 --- a/drivers/usb/gadget/udc/renesas_usb3.c +++ b/drivers/usb/gadget/udc/renesas_usb3.c @@ -2692,10 +2692,15 @@ static const struct renesas_usb3_priv renesas_usb3_priv_r8a77990 = { static const struct of_device_id usb3_of_match[] = { { + .compatible = "renesas,r8a774c0-usb3-peri", + .data = &renesas_usb3_priv_r8a77990, + }, { .compatible = "renesas,r8a7795-usb3-peri", .data = &renesas_usb3_priv_gen3, - }, - { + }, { + .compatible = "renesas,r8a77990-usb3-peri", + .data = &renesas_usb3_priv_r8a77990, + }, { .compatible = "renesas,rcar-gen3-usb3-peri", .data = &renesas_usb3_priv_gen3, }, @@ -2704,18 +2709,10 @@ static const struct of_device_id usb3_of_match[] = { MODULE_DEVICE_TABLE(of, usb3_of_match); static const struct soc_device_attribute renesas_usb3_quirks_match[] = { - { - .soc_id = "r8a774c0", - .data = &renesas_usb3_priv_r8a77990, - }, { .soc_id = "r8a7795", .revision = "ES1.*", .data = &renesas_usb3_priv_r8a7795_es1, }, - { - .soc_id = "r8a77990", - .data = &renesas_usb3_priv_r8a77990, - }, { /* sentinel */ }, }; From 70fb8fea10fc33863fc4b8120214286d5a29a175 Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Tue, 10 Aug 2021 12:53:15 +0800 Subject: [PATCH 0587/5344] Bluetooth: Move shutdown callback before flushing tx and rx queue [ Upstream commit 0ea53674d07fb6db2dd7a7ec2fdc85a12eb246c2 ] Commit 0ea9fd001a14 ("Bluetooth: Shutdown controller after workqueues are flushed or cancelled") introduced a regression that makes mtkbtsdio driver stops working: [ 36.593956] Bluetooth: hci0: Firmware already downloaded [ 46.814613] Bluetooth: hci0: Execution of wmt command timed out [ 46.814619] Bluetooth: hci0: Failed to send wmt func ctrl (-110) The shutdown callback depends on the result of hdev->rx_work, so we should call it before flushing rx_work: -> btmtksdio_shutdown() -> mtk_hci_wmt_sync() -> __hci_cmd_send() -> wait for BTMTKSDIO_TX_WAIT_VND_EVT gets cleared -> btmtksdio_recv_event() -> hci_recv_frame() -> queue_work(hdev->workqueue, &hdev->rx_work) -> clears BTMTKSDIO_TX_WAIT_VND_EVT So move the shutdown callback before flushing TX/RX queue to resolve the issue. Reported-and-tested-by: Mattijs Korpershoek Tested-by: Hsin-Yi Wang Cc: Guenter Roeck Fixes: 0ea9fd001a14 ("Bluetooth: Shutdown controller after workqueues are flushed or cancelled") Signed-off-by: Kai-Heng Feng Signed-off-by: Marcel Holtmann Signed-off-by: Sasha Levin --- net/bluetooth/hci_core.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 83a07fca9000f..4e49728003707 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1685,6 +1685,14 @@ int hci_dev_do_close(struct hci_dev *hdev) hci_request_cancel_all(hdev); hci_req_sync_lock(hdev); + if (!hci_dev_test_flag(hdev, HCI_UNREGISTER) && + !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && + test_bit(HCI_UP, &hdev->flags)) { + /* Execute vendor specific shutdown routine */ + if (hdev->shutdown) + hdev->shutdown(hdev); + } + if (!test_and_clear_bit(HCI_UP, &hdev->flags)) { cancel_delayed_work_sync(&hdev->cmd_timer); hci_req_sync_unlock(hdev); From 0584fb716fb9aa56e67ce52ae59879fd54d0447a Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Fri, 13 Aug 2021 23:30:18 +0300 Subject: [PATCH 0588/5344] usb: host: ohci-tmio: add IRQ check [ Upstream commit 4ac5132e8a4300637a2da8f5d6bc7650db735b8a ] The driver neglects to check the result of platform_get_irq()'s call and blithely passes the negative error codes to usb_add_hcd() (which takes *unsigned* IRQ #), causing request_irq() that it calls to fail with -EINVAL, overriding an original error code. Stop calling usb_add_hcd() with the invalid IRQ #s. Fixes: 78c73414f4f6 ("USB: ohci: add support for tmio-ohci cell") Acked-by: Alan Stern Signed-off-by: Sergey Shtylyov Link: https://lore.kernel.org/r/402e1a45-a0a4-0e08-566a-7ca1331506b1@omp.ru Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/usb/host/ohci-tmio.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/host/ohci-tmio.c b/drivers/usb/host/ohci-tmio.c index fb6f5e9ae5c62..fed43c6dd85cc 100644 --- a/drivers/usb/host/ohci-tmio.c +++ b/drivers/usb/host/ohci-tmio.c @@ -202,6 +202,9 @@ static int ohci_hcd_tmio_drv_probe(struct platform_device *dev) if (!cell) return -EINVAL; + if (irq < 0) + return irq; + hcd = usb_create_hcd(&ohci_tmio_hc_driver, &dev->dev, dev_name(&dev->dev)); if (!hcd) { ret = -ENOMEM; From cfb166c1bd51bcf133f934aef49d9a3dc856e68e Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Fri, 13 Aug 2021 23:32:38 +0300 Subject: [PATCH 0589/5344] usb: phy: tahvo: add IRQ check [ Upstream commit 0d45a1373e669880b8beaecc8765f44cb0241e47 ] The driver neglects to check the result of platform_get_irq()'s call and blithely passes the negative error codes to request_threaded_irq() (which takes *unsigned* IRQ #), causing it to fail with -EINVAL, overriding an original error code. Stop calling request_threaded_irq() with the invalid IRQ #s. Fixes: 9ba96ae5074c ("usb: omap1: Tahvo USB transceiver driver") Acked-by: Felipe Balbi Signed-off-by: Sergey Shtylyov Link: https://lore.kernel.org/r/8280d6a4-8e9a-7cfe-1aa9-db586dc9afdf@omp.ru Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/usb/phy/phy-tahvo.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/phy/phy-tahvo.c b/drivers/usb/phy/phy-tahvo.c index baebb1f5a9737..a3e043e3e4aae 100644 --- a/drivers/usb/phy/phy-tahvo.c +++ b/drivers/usb/phy/phy-tahvo.c @@ -393,7 +393,9 @@ static int tahvo_usb_probe(struct platform_device *pdev) dev_set_drvdata(&pdev->dev, tu); - tu->irq = platform_get_irq(pdev, 0); + tu->irq = ret = platform_get_irq(pdev, 0); + if (ret < 0) + return ret; ret = request_threaded_irq(tu->irq, NULL, tahvo_usb_vbus_interrupt, IRQF_ONESHOT, "tahvo-vbus", tu); From fc5b720fa3b4ec6697274b27cca8c3a6cac75af8 Mon Sep 17 00:00:00 2001 From: Chih-Kang Chang Date: Mon, 16 Aug 2021 16:51:28 +0800 Subject: [PATCH 0590/5344] mac80211: Fix insufficient headroom issue for AMSDU [ Upstream commit f50d2ff8f016b79a2ff4acd5943a1eda40c545d4 ] ieee80211_amsdu_realloc_pad() fails to account for extra_tx_headroom, the original reserved headroom might be eaten. Add the necessary extra_tx_headroom. Fixes: 6e0456b54545 ("mac80211: add A-MSDU tx support") Signed-off-by: Chih-Kang Chang Signed-off-by: Ping-Ke Shih Link: https://lore.kernel.org/r/20210816085128.10931-2-pkshih@realtek.com [fix indentation] Signed-off-by: Johannes Berg Signed-off-by: Sasha Levin --- net/mac80211/tx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 538722522ffe9..4dfac7a25e5ad 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -3189,7 +3189,9 @@ static bool ieee80211_amsdu_prepare_head(struct ieee80211_sub_if_data *sdata, if (info->control.flags & IEEE80211_TX_CTRL_AMSDU) return true; - if (!ieee80211_amsdu_realloc_pad(local, skb, sizeof(*amsdu_hdr))) + if (!ieee80211_amsdu_realloc_pad(local, skb, + sizeof(*amsdu_hdr) + + local->hw.extra_tx_headroom)) return false; data = skb_push(skb, sizeof(*amsdu_hdr)); From 006dc56e3772ed257ab1c2cc183922cb90beeca3 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Mon, 26 Jul 2021 09:33:28 -0400 Subject: [PATCH 0591/5344] lockd: Fix invalid lockowner cast after vfs_test_lock [ Upstream commit cd2d644ddba183ec7b451b7c20d5c7cc06fcf0d7 ] After calling vfs_test_lock() the pointer to a conflicting lock can be returned, and that lock is not guarunteed to be owned by nlm. In that case, we cannot cast it to struct nlm_lockowner. Instead return the pid of that conflicting lock. Fixes: 646d73e91b42 ("lockd: Show pid of lockd for remote locks") Signed-off-by: Benjamin Coddington Signed-off-by: Chuck Lever Signed-off-by: Sasha Levin --- fs/lockd/svclock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 61d3cc2283dc8..498cb70c2c0d0 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -634,7 +634,7 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, conflock->caller = "somehost"; /* FIXME */ conflock->len = strlen(conflock->caller); conflock->oh.len = 0; /* don't return OH info */ - conflock->svid = ((struct nlm_lockowner *)lock->fl.fl_owner)->pid; + conflock->svid = lock->fl.fl_pid; conflock->fl.fl_type = lock->fl.fl_type; conflock->fl.fl_start = lock->fl.fl_start; conflock->fl.fl_end = lock->fl.fl_end; From 082cb0355f6eea66ba25a933372aed73b4ce53f7 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 12 Aug 2021 16:41:43 -0400 Subject: [PATCH 0592/5344] nfsd4: Fix forced-expiry locking [ Upstream commit f7104cc1a9159cd0d3e8526cb638ae0301de4b61 ] This should use the network-namespace-wide client_lock, not the per-client cl_lock. You shouldn't see any bugs unless you're actually using the forced-expiry interface introduced by 89c905beccbb. Fixes: 89c905beccbb "nfsd: allow forced expiration of NFSv4 clients" Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever Signed-off-by: Sasha Levin --- fs/nfsd/nfs4state.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 56b33065f5ab6..a48fcd4180c74 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2577,9 +2577,9 @@ static void force_expire_client(struct nfs4_client *clp) struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); bool already_expired; - spin_lock(&clp->cl_lock); + spin_lock(&nn->client_lock); clp->cl_time = 0; - spin_unlock(&clp->cl_lock); + spin_unlock(&nn->client_lock); wait_event(expiry_wq, atomic_read(&clp->cl_rpc_users) == 0); spin_lock(&nn->client_lock); From 0263929b87d6723bf66a7e81b7b990cde85174de Mon Sep 17 00:00:00 2001 From: Nadezda Lutovinova Date: Wed, 18 Aug 2021 17:12:47 +0300 Subject: [PATCH 0593/5344] usb: gadget: mv_u3d: request_irq() after initializing UDC [ Upstream commit 2af0c5ffadaf9d13eca28409d4238b4e672942d3 ] If IRQ occurs between calling request_irq() and mv_u3d_eps_init(), then null pointer dereference occurs since u3d->eps[] wasn't initialized yet but used in mv_u3d_nuke(). The patch puts registration of the interrupt handler after initializing of neccesery data. Found by Linux Driver Verification project (linuxtesting.org). Fixes: 90fccb529d24 ("usb: gadget: Gadget directory cleanup - group UDC drivers") Acked-by: Felipe Balbi Signed-off-by: Nadezda Lutovinova Link: https://lore.kernel.org/r/20210818141247.4794-1-lutovinova@ispras.ru Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/usb/gadget/udc/mv_u3d_core.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/usb/gadget/udc/mv_u3d_core.c b/drivers/usb/gadget/udc/mv_u3d_core.c index 35e02a8d0091a..bdba3f48c0527 100644 --- a/drivers/usb/gadget/udc/mv_u3d_core.c +++ b/drivers/usb/gadget/udc/mv_u3d_core.c @@ -1922,14 +1922,6 @@ static int mv_u3d_probe(struct platform_device *dev) goto err_get_irq; } u3d->irq = r->start; - if (request_irq(u3d->irq, mv_u3d_irq, - IRQF_SHARED, driver_name, u3d)) { - u3d->irq = 0; - dev_err(&dev->dev, "Request irq %d for u3d failed\n", - u3d->irq); - retval = -ENODEV; - goto err_request_irq; - } /* initialize gadget structure */ u3d->gadget.ops = &mv_u3d_ops; /* usb_gadget_ops */ @@ -1942,6 +1934,15 @@ static int mv_u3d_probe(struct platform_device *dev) mv_u3d_eps_init(u3d); + if (request_irq(u3d->irq, mv_u3d_irq, + IRQF_SHARED, driver_name, u3d)) { + u3d->irq = 0; + dev_err(&dev->dev, "Request irq %d for u3d failed\n", + u3d->irq); + retval = -ENODEV; + goto err_request_irq; + } + /* external vbus detection */ if (u3d->vbus) { u3d->clock_gating = 1; @@ -1965,8 +1966,8 @@ static int mv_u3d_probe(struct platform_device *dev) err_unregister: free_irq(u3d->irq, u3d); -err_request_irq: err_get_irq: +err_request_irq: kfree(u3d->status_req); err_alloc_status_req: kfree(u3d->eps); From 2ace7b23546d87e3384c2c05e4d464a4ce0d0d9c Mon Sep 17 00:00:00 2001 From: Xu Yu Date: Wed, 18 Aug 2021 12:47:52 -0700 Subject: [PATCH 0594/5344] mm/swap: consider max pages in iomap_swapfile_add_extent [ Upstream commit 36ca7943ac18aebf8aad4c50829eb2ea5ec847df ] When the max pages (last_page in the swap header + 1) is smaller than the total pages (inode size) of the swapfile, iomap_swapfile_activate overwrites sis->max with total pages. However, frontswap_map is a swap page state bitmap allocated using the initial sis->max page count read from the swap header. If swapfile activation increases sis->max, it's possible for the frontswap code to walk off the end of the bitmap, thereby corrupting kernel memory. [djwong: modify the description a bit; the original paragraph reads: "However, frontswap_map is allocated using max pages. When test and clear the sis offset, which is larger than max pages, of frontswap_map in __frontswap_invalidate_page(), neighbors of frontswap_map may be overwritten, i.e., slab is polluted." Note also that this bug resulted in a behavioral change: activating a swap file that was formatted and later extended results in all pages being activated, not the number of pages recorded in the swap header.] This fixes the issue by considering the limitation of max pages of swap info in iomap_swapfile_add_extent(). To reproduce the case, compile kernel with slub RED ZONE, then run test: $ sudo stress-ng -a 1 -x softlockup,resources -t 72h --metrics --times \ --verify -v -Y /root/tmpdir/stress-ng/stress-statistic-12.yaml \ --log-file /root/tmpdir/stress-ng/stress-logfile-12.txt \ --temp-path /root/tmpdir/stress-ng/ We'll get the error log as below: [ 1151.015141] ============================================================================= [ 1151.016489] BUG kmalloc-16 (Not tainted): Right Redzone overwritten [ 1151.017486] ----------------------------------------------------------------------------- [ 1151.017486] [ 1151.018997] Disabling lock debugging due to kernel taint [ 1151.019873] INFO: 0x0000000084e43932-0x0000000098d17cae @offset=7392. First byte 0x0 instead of 0xcc [ 1151.021303] INFO: Allocated in __do_sys_swapon+0xcf6/0x1170 age=43417 cpu=9 pid=3816 [ 1151.022538] __slab_alloc+0xe/0x20 [ 1151.023069] __kmalloc_node+0xfd/0x4b0 [ 1151.023704] __do_sys_swapon+0xcf6/0x1170 [ 1151.024346] do_syscall_64+0x33/0x40 [ 1151.024925] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 1151.025749] INFO: Freed in put_cred_rcu+0xa1/0xc0 age=43424 cpu=3 pid=2041 [ 1151.026889] kfree+0x276/0x2b0 [ 1151.027405] put_cred_rcu+0xa1/0xc0 [ 1151.027949] rcu_do_batch+0x17d/0x410 [ 1151.028566] rcu_core+0x14e/0x2b0 [ 1151.029084] __do_softirq+0x101/0x29e [ 1151.029645] asm_call_irq_on_stack+0x12/0x20 [ 1151.030381] do_softirq_own_stack+0x37/0x40 [ 1151.031037] do_softirq.part.15+0x2b/0x30 [ 1151.031710] __local_bh_enable_ip+0x4b/0x50 [ 1151.032412] copy_fpstate_to_sigframe+0x111/0x360 [ 1151.033197] __setup_rt_frame+0xce/0x480 [ 1151.033809] arch_do_signal+0x1a3/0x250 [ 1151.034463] exit_to_user_mode_prepare+0xcf/0x110 [ 1151.035242] syscall_exit_to_user_mode+0x27/0x190 [ 1151.035970] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 1151.036795] INFO: Slab 0x000000003b9de4dc objects=44 used=9 fp=0x00000000539e349e flags=0xfffffc0010201 [ 1151.038323] INFO: Object 0x000000004855ba01 @offset=7376 fp=0x0000000000000000 [ 1151.038323] [ 1151.039683] Redzone 000000008d0afd3d: cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc ................ [ 1151.041180] Object 000000004855ba01: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ [ 1151.042714] Redzone 0000000084e43932: 00 00 00 c0 cc cc cc cc ........ [ 1151.044120] Padding 000000000864c042: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZZZZZZZZZ [ 1151.045615] CPU: 5 PID: 3816 Comm: stress-ng Tainted: G B 5.10.50+ #7 [ 1151.046846] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014 [ 1151.048633] Call Trace: [ 1151.049072] dump_stack+0x57/0x6a [ 1151.049585] check_bytes_and_report+0xed/0x110 [ 1151.050320] check_object+0x1eb/0x290 [ 1151.050924] ? __x64_sys_swapoff+0x39a/0x540 [ 1151.051646] free_debug_processing+0x151/0x350 [ 1151.052333] __slab_free+0x21a/0x3a0 [ 1151.052938] ? _cond_resched+0x2d/0x40 [ 1151.053529] ? __vunmap+0x1de/0x220 [ 1151.054139] ? __x64_sys_swapoff+0x39a/0x540 [ 1151.054796] ? kfree+0x276/0x2b0 [ 1151.055307] kfree+0x276/0x2b0 [ 1151.055832] __x64_sys_swapoff+0x39a/0x540 [ 1151.056466] do_syscall_64+0x33/0x40 [ 1151.057084] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 1151.057866] RIP: 0033:0x150340b0ffb7 [ 1151.058481] Code: Unable to access opcode bytes at RIP 0x150340b0ff8d. [ 1151.059537] RSP: 002b:00007fff7f4ee238 EFLAGS: 00000246 ORIG_RAX: 00000000000000a8 [ 1151.060768] RAX: ffffffffffffffda RBX: 00007fff7f4ee66c RCX: 0000150340b0ffb7 [ 1151.061904] RDX: 000000000000000a RSI: 0000000000018094 RDI: 00007fff7f4ee860 [ 1151.063033] RBP: 00007fff7f4ef980 R08: 0000000000000000 R09: 0000150340a672bd [ 1151.064135] R10: 00007fff7f4edca0 R11: 0000000000000246 R12: 0000000000018094 [ 1151.065253] R13: 0000000000000005 R14: 000000000160d930 R15: 00007fff7f4ee66c [ 1151.066413] FIX kmalloc-16: Restoring 0x0000000084e43932-0x0000000098d17cae=0xcc [ 1151.066413] [ 1151.067890] FIX kmalloc-16: Object at 0x000000004855ba01 not freed Fixes: 67482129cdab ("iomap: add a swapfile activation function") Fixes: a45c0eccc564 ("iomap: move the swapfile code into a separate file") Signed-off-by: Gang Deng Signed-off-by: Xu Yu Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Sasha Levin --- fs/iomap/swapfile.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c index bd0cc3dcc9807..2d18246f67266 100644 --- a/fs/iomap/swapfile.c +++ b/fs/iomap/swapfile.c @@ -30,11 +30,16 @@ static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi) { struct iomap *iomap = &isi->iomap; unsigned long nr_pages; + unsigned long max_pages; uint64_t first_ppage; uint64_t first_ppage_reported; uint64_t next_ppage; int error; + if (unlikely(isi->nr_pages >= isi->sis->max)) + return 0; + max_pages = isi->sis->max - isi->nr_pages; + /* * Round the start up and the end down so that the physical * extent aligns to a page boundary. @@ -47,6 +52,7 @@ static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi) if (first_ppage >= next_ppage) return 0; nr_pages = next_ppage - first_ppage; + nr_pages = min(nr_pages, max_pages); /* * Calculate how much swap space we're adding; the first page contains From 71893c11f81a4af91e276192b29da09f3bc75702 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Thu, 19 Aug 2021 18:15:21 +0300 Subject: [PATCH 0595/5344] Bluetooth: add timeout sanity check to hci_inquiry [ Upstream commit f41a4b2b5eb7872109723dab8ae1603bdd9d9ec1 ] Syzbot hit "task hung" bug in hci_req_sync(). The problem was in unreasonable huge inquiry timeout passed from userspace. Fix it by adding sanity check for timeout value to hci_inquiry(). Since hci_inquiry() is the only user of hci_req_sync() with user controlled timeout value, it makes sense to check timeout value in hci_inquiry() and don't touch hci_req_sync(). Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-and-tested-by: syzbot+be2baed593ea56c6a84c@syzkaller.appspotmail.com Signed-off-by: Pavel Skripkin Signed-off-by: Marcel Holtmann Signed-off-by: Sasha Levin --- net/bluetooth/hci_core.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 4e49728003707..bdd330527cfa2 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1297,6 +1297,12 @@ int hci_inquiry(void __user *arg) goto done; } + /* Restrict maximum inquiry length to 60 seconds */ + if (ir.length > 60) { + err = -EINVAL; + goto done; + } + hci_dev_lock(hdev); if (inquiry_cache_age(hdev) > INQUIRY_CACHE_AGE_MAX || inquiry_cache_empty(hdev) || ir.flags & IREQ_CACHE_FLUSH) { From 443fb4759fa786a19a7275a832739ea7ac77a921 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Thu, 12 Aug 2021 23:35:09 +0300 Subject: [PATCH 0596/5344] i2c: iop3xx: fix deferred probing [ Upstream commit a1299505162ad00def3573260c2c68b9c8e8d697 ] When adding the code to handle platform_get_irq*() errors in the commit 489447380a29 ("handle errors returned by platform_get_irq*()"), the actual error code was enforced to be -ENXIO in the driver for some strange reason. This didn't matter much until the deferred probing was introduced -- which requires an actual error code to be propagated upstream from the failure site. While fixing this, also stop overriding the errors from request_irq() to -EIO (done since the pre-git era). Fixes: 489447380a29 ("[PATCH] handle errors returned by platform_get_irq*()") Signed-off-by: Sergey Shtylyov Signed-off-by: Wolfram Sang Signed-off-by: Sasha Levin --- drivers/i2c/busses/i2c-iop3xx.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/i2c/busses/i2c-iop3xx.c b/drivers/i2c/busses/i2c-iop3xx.c index 2f8b8050a2233..899624721c1ea 100644 --- a/drivers/i2c/busses/i2c-iop3xx.c +++ b/drivers/i2c/busses/i2c-iop3xx.c @@ -467,16 +467,14 @@ iop3xx_i2c_probe(struct platform_device *pdev) irq = platform_get_irq(pdev, 0); if (irq < 0) { - ret = -ENXIO; + ret = irq; goto unmap; } ret = request_irq(irq, iop3xx_i2c_irq_handler, 0, pdev->name, adapter_data); - if (ret) { - ret = -EIO; + if (ret) goto unmap; - } memcpy(new_adapter->name, pdev->name, strlen(pdev->name)); new_adapter->owner = THIS_MODULE; From 8d2ebc0e31cddd4badd4dbcfaca59ad4e29d6905 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sun, 4 Jul 2021 17:45:25 +0300 Subject: [PATCH 0597/5344] i2c: s3c2410: fix IRQ check [ Upstream commit d6840a5e370b7ea4fde16ce2caf431bcc87f9a75 ] Iff platform_get_irq() returns 0, the driver's probe() method will return 0 early (as if the method's call was successful). Let's consider IRQ0 valid for simplicity -- devm_request_irq() can always override that decision... Fixes: e0d1ec97853f ("i2c-s3c2410: Change IRQ to be plain integer.") Signed-off-by: Sergey Shtylyov Reviewed-by: Krzysztof Kozlowski Signed-off-by: Wolfram Sang Signed-off-by: Sasha Levin --- drivers/i2c/busses/i2c-s3c2410.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-s3c2410.c b/drivers/i2c/busses/i2c-s3c2410.c index d6322698b2458..e6f927c6f8af9 100644 --- a/drivers/i2c/busses/i2c-s3c2410.c +++ b/drivers/i2c/busses/i2c-s3c2410.c @@ -1141,7 +1141,7 @@ static int s3c24xx_i2c_probe(struct platform_device *pdev) */ if (!(i2c->quirks & QUIRK_POLL)) { i2c->irq = ret = platform_get_irq(pdev, 0); - if (ret <= 0) { + if (ret < 0) { dev_err(&pdev->dev, "cannot find IRQ\n"); clk_unprepare(i2c->clk); return ret; From 365c810d952c346f50aef352d3b4c60962194c68 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 5 Aug 2021 13:37:46 +0300 Subject: [PATCH 0598/5344] rsi: fix error code in rsi_load_9116_firmware() [ Upstream commit d0f8430332a16c7baa80ce2886339182c5d85f37 ] This code returns success if the kmemdup() fails, but obviously it should return -ENOMEM instead. Fixes: e5a1ecc97e5f ("rsi: add firmware loading for 9116 device") Signed-off-by: Dan Carpenter Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210805103746.GA26417@kili Signed-off-by: Sasha Levin --- drivers/net/wireless/rsi/rsi_91x_hal.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/rsi/rsi_91x_hal.c b/drivers/net/wireless/rsi/rsi_91x_hal.c index 0523ccee9b12c..a82412b9f409e 100644 --- a/drivers/net/wireless/rsi/rsi_91x_hal.c +++ b/drivers/net/wireless/rsi/rsi_91x_hal.c @@ -1038,8 +1038,10 @@ static int rsi_load_9116_firmware(struct rsi_hw *adapter) } ta_firmware = kmemdup(fw_entry->data, fw_entry->size, GFP_KERNEL); - if (!ta_firmware) + if (!ta_firmware) { + status = -ENOMEM; goto fail_release_fw; + } fw_p = ta_firmware; instructions_sz = fw_entry->size; rsi_dbg(INFO_ZONE, "FW Length = %d bytes\n", instructions_sz); From 09e93f9252bb8709ac9c0103cde8cafcfdbf74eb Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 16 Aug 2021 21:39:47 +0300 Subject: [PATCH 0599/5344] rsi: fix an error code in rsi_probe() [ Upstream commit 9adcdf6758d7c4c9bdaf22d78eb9fcae260ed113 ] Return -ENODEV instead of success for unsupported devices. Fixes: 54fdb318c111 ("rsi: add new device model for 9116") Signed-off-by: Dan Carpenter Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210816183947.GA2119@kili Signed-off-by: Sasha Levin --- drivers/net/wireless/rsi/rsi_91x_usb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/rsi/rsi_91x_usb.c b/drivers/net/wireless/rsi/rsi_91x_usb.c index a296f4e0d324a..e8aa3d4bda885 100644 --- a/drivers/net/wireless/rsi/rsi_91x_usb.c +++ b/drivers/net/wireless/rsi/rsi_91x_usb.c @@ -806,6 +806,7 @@ static int rsi_probe(struct usb_interface *pfunction, } else { rsi_dbg(ERR_ZONE, "%s: Unsupported RSI device id 0x%x\n", __func__, id->idProduct); + status = -ENODEV; goto err1; } From 6d80a2e4f782f45b1c3187f32753b0e14db9c59e Mon Sep 17 00:00:00 2001 From: Cezary Rojewski Date: Wed, 18 Aug 2021 09:57:33 +0200 Subject: [PATCH 0600/5344] ASoC: Intel: Skylake: Leave data as is when invoking TLV IPCs [ Upstream commit 126b3422adc80f29d2129db7f61e0113a8a526c6 ] Advancing pointer initially fixed issue for some users but caused regression for others. Leave data as it to make it easier for end users to adjust their topology files if needed. Fixes: a8cd7066f042 ("ASoC: Intel: Skylake: Strip T and L from TLV IPCs") Signed-off-by: Cezary Rojewski Tested-by: Lukasz Majczak Link: https://lore.kernel.org/r/20210818075742.1515155-3-cezary.rojewski@intel.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/intel/skylake/skl-topology.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/sound/soc/intel/skylake/skl-topology.c b/sound/soc/intel/skylake/skl-topology.c index 1940b17f27efa..104c73eb9769d 100644 --- a/sound/soc/intel/skylake/skl-topology.c +++ b/sound/soc/intel/skylake/skl-topology.c @@ -1463,12 +1463,6 @@ static int skl_tplg_tlv_control_set(struct snd_kcontrol *kcontrol, struct skl_dev *skl = get_skl_ctx(w->dapm->dev); if (ac->params) { - /* - * Widget data is expected to be stripped of T and L - */ - size -= 2 * sizeof(unsigned int); - data += 2; - if (size > ac->max) return -EINVAL; ac->size = size; From 43a9280f2a31d1ec6a8f17ee02d3c99b78b0778a Mon Sep 17 00:00:00 2001 From: Cezary Rojewski Date: Wed, 18 Aug 2021 09:57:35 +0200 Subject: [PATCH 0601/5344] ASoC: Intel: Skylake: Fix module resource and format selection [ Upstream commit e8b374b649afe756c2470e0e6668022e90bf8518 ] Module configuration may differ between its instances depending on resources required and input and output audio format. Available parameters to select from are stored in module resource and interface (format) lists. These come from topology, together with description of each of pipe's modules. Ignoring index value provided by topology and relying always on 0th entry leads to unexpected module behavior due to under/overbudged resources assigned or impropper format selection. Fix by taking entry at index specified by topology. Fixes: f6fa56e22559 ("ASoC: Intel: Skylake: Parse and update module config structure") Signed-off-by: Cezary Rojewski Tested-by: Lukasz Majczak Link: https://lore.kernel.org/r/20210818075742.1515155-5-cezary.rojewski@intel.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/intel/skylake/skl-topology.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/sound/soc/intel/skylake/skl-topology.c b/sound/soc/intel/skylake/skl-topology.c index 104c73eb9769d..254b796e635d1 100644 --- a/sound/soc/intel/skylake/skl-topology.c +++ b/sound/soc/intel/skylake/skl-topology.c @@ -113,7 +113,7 @@ static int is_skl_dsp_widget_type(struct snd_soc_dapm_widget *w, static void skl_dump_mconfig(struct skl_dev *skl, struct skl_module_cfg *mcfg) { - struct skl_module_iface *iface = &mcfg->module->formats[0]; + struct skl_module_iface *iface = &mcfg->module->formats[mcfg->fmt_idx]; dev_dbg(skl->dev, "Dumping config\n"); dev_dbg(skl->dev, "Input Format:\n"); @@ -195,8 +195,8 @@ static void skl_tplg_update_params_fixup(struct skl_module_cfg *m_cfg, struct skl_module_fmt *in_fmt, *out_fmt; /* Fixups will be applied to pin 0 only */ - in_fmt = &m_cfg->module->formats[0].inputs[0].fmt; - out_fmt = &m_cfg->module->formats[0].outputs[0].fmt; + in_fmt = &m_cfg->module->formats[m_cfg->fmt_idx].inputs[0].fmt; + out_fmt = &m_cfg->module->formats[m_cfg->fmt_idx].outputs[0].fmt; if (params->stream == SNDRV_PCM_STREAM_PLAYBACK) { if (is_fe) { @@ -239,9 +239,9 @@ static void skl_tplg_update_buffer_size(struct skl_dev *skl, /* Since fixups is applied to pin 0 only, ibs, obs needs * change for pin 0 only */ - res = &mcfg->module->resources[0]; - in_fmt = &mcfg->module->formats[0].inputs[0].fmt; - out_fmt = &mcfg->module->formats[0].outputs[0].fmt; + res = &mcfg->module->resources[mcfg->res_idx]; + in_fmt = &mcfg->module->formats[mcfg->fmt_idx].inputs[0].fmt; + out_fmt = &mcfg->module->formats[mcfg->fmt_idx].outputs[0].fmt; if (mcfg->m_type == SKL_MODULE_TYPE_SRCINT) multiplier = 5; @@ -1631,11 +1631,12 @@ int skl_tplg_update_pipe_params(struct device *dev, struct skl_module_cfg *mconfig, struct skl_pipe_params *params) { - struct skl_module_res *res = &mconfig->module->resources[0]; + struct skl_module_res *res; struct skl_dev *skl = get_skl_ctx(dev); struct skl_module_fmt *format = NULL; u8 cfg_idx = mconfig->pipe->cur_config_idx; + res = &mconfig->module->resources[mconfig->res_idx]; skl_tplg_fill_dma_id(mconfig, params); mconfig->fmt_idx = mconfig->mod_cfg[cfg_idx].fmt_idx; mconfig->res_idx = mconfig->mod_cfg[cfg_idx].res_idx; @@ -1644,9 +1645,9 @@ int skl_tplg_update_pipe_params(struct device *dev, return 0; if (params->stream == SNDRV_PCM_STREAM_PLAYBACK) - format = &mconfig->module->formats[0].inputs[0].fmt; + format = &mconfig->module->formats[mconfig->fmt_idx].inputs[0].fmt; else - format = &mconfig->module->formats[0].outputs[0].fmt; + format = &mconfig->module->formats[mconfig->fmt_idx].outputs[0].fmt; /* set the hw_params */ format->s_freq = params->s_freq; From 61465acae42802ba76329dc4f93ff50b463bf22b Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Tue, 10 Aug 2021 11:16:43 +0300 Subject: [PATCH 0602/5344] mmc: dw_mmc: Fix issue with uninitialized dma_slave_config [ Upstream commit c3ff0189d3bc9c03845fe37472c140f0fefd0c79 ] Depending on the DMA driver being used, the struct dma_slave_config may need to be initialized to zero for the unused data. For example, we have three DMA drivers using src_port_window_size and dst_port_window_size. If these are left uninitialized, it can cause DMA failures. For dw_mmc, this is probably not currently an issue but is still good to fix though. Fixes: 3fc7eaef44db ("mmc: dw_mmc: Add external dma interface support") Cc: Shawn Lin Cc: Jaehoon Chung Cc: Peter Ujfalusi Cc: Vinod Koul Signed-off-by: Tony Lindgren Link: https://lore.kernel.org/r/20210810081644.19353-2-tony@atomide.com Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin --- drivers/mmc/host/dw_mmc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c index 6ace82028667b..7b280cb363271 100644 --- a/drivers/mmc/host/dw_mmc.c +++ b/drivers/mmc/host/dw_mmc.c @@ -782,6 +782,7 @@ static int dw_mci_edmac_start_dma(struct dw_mci *host, int ret = 0; /* Set external dma config: burst size, burst width */ + memset(&cfg, 0, sizeof(cfg)); cfg.dst_addr = host->phy_regs + fifo_offset; cfg.src_addr = cfg.dst_addr; cfg.dst_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES; From 2a6d04e622342eed6890c7fda9e1b30bba4ba972 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Tue, 10 Aug 2021 11:16:44 +0300 Subject: [PATCH 0603/5344] mmc: moxart: Fix issue with uninitialized dma_slave_config [ Upstream commit ee5165354d498e5bceb0b386e480ac84c5f8c28c ] Depending on the DMA driver being used, the struct dma_slave_config may need to be initialized to zero for the unused data. For example, we have three DMA drivers using src_port_window_size and dst_port_window_size. If these are left uninitialized, it can cause DMA failures. For moxart, this is probably not currently an issue but is still good to fix though. Fixes: 1b66e94e6b99 ("mmc: moxart: Add MOXA ART SD/MMC driver") Cc: Jonas Jensen Cc: Vinod Koul Cc: Peter Ujfalusi Signed-off-by: Tony Lindgren Link: https://lore.kernel.org/r/20210810081644.19353-3-tony@atomide.com Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin --- drivers/mmc/host/moxart-mmc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mmc/host/moxart-mmc.c b/drivers/mmc/host/moxart-mmc.c index a0670e9cd0127..5553a5643f405 100644 --- a/drivers/mmc/host/moxart-mmc.c +++ b/drivers/mmc/host/moxart-mmc.c @@ -631,6 +631,7 @@ static int moxart_probe(struct platform_device *pdev) host->dma_chan_tx, host->dma_chan_rx); host->have_dma = true; + memset(&cfg, 0, sizeof(cfg)); cfg.src_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES; cfg.dst_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES; From 7de193f6017478c4ec83b4f0b6a98122d606baed Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 20 Aug 2021 09:39:35 -0700 Subject: [PATCH 0604/5344] bpf: Fix possible out of bound write in narrow load handling [ Upstream commit d7af7e497f0308bc97809cc48b58e8e0f13887e1 ] Fix a verifier bug found by smatch static checker in [0]. This problem has never been seen in prod to my best knowledge. Fixing it still seems to be a good idea since it's hard to say for sure whether it's possible or not to have a scenario where a combination of convert_ctx_access() and a narrow load would lead to an out of bound write. When narrow load is handled, one or two new instructions are added to insn_buf array, but before it was only checked that cnt >= ARRAY_SIZE(insn_buf) And it's safe to add a new instruction to insn_buf[cnt++] only once. The second try will lead to out of bound write. And this is what can happen if `shift` is set. Fix it by making sure that if the BPF_RSH instruction has to be added in addition to BPF_AND then there is enough space for two more instructions in insn_buf. The full report [0] is below: kernel/bpf/verifier.c:12304 convert_ctx_accesses() warn: offset 'cnt' incremented past end of array kernel/bpf/verifier.c:12311 convert_ctx_accesses() warn: offset 'cnt' incremented past end of array kernel/bpf/verifier.c 12282 12283 insn->off = off & ~(size_default - 1); 12284 insn->code = BPF_LDX | BPF_MEM | size_code; 12285 } 12286 12287 target_size = 0; 12288 cnt = convert_ctx_access(type, insn, insn_buf, env->prog, 12289 &target_size); 12290 if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) || ^^^^^^^^^^^^^^^^^^^^^^^^^^^ Bounds check. 12291 (ctx_field_size && !target_size)) { 12292 verbose(env, "bpf verifier is misconfigured\n"); 12293 return -EINVAL; 12294 } 12295 12296 if (is_narrower_load && size < target_size) { 12297 u8 shift = bpf_ctx_narrow_access_offset( 12298 off, size, size_default) * 8; 12299 if (ctx_field_size <= 4) { 12300 if (shift) 12301 insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH, ^^^^^ increment beyond end of array 12302 insn->dst_reg, 12303 shift); --> 12304 insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg, ^^^^^ out of bounds write 12305 (1 << size * 8) - 1); 12306 } else { 12307 if (shift) 12308 insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH, 12309 insn->dst_reg, 12310 shift); 12311 insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg, ^^^^^^^^^^^^^^^ Same. 12312 (1ULL << size * 8) - 1); 12313 } 12314 } 12315 12316 new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); 12317 if (!new_prog) 12318 return -ENOMEM; 12319 12320 delta += cnt - 1; 12321 12322 /* keep walking new program and skip insns we just inserted */ 12323 env->prog = new_prog; 12324 insn = new_prog->insnsi + i + delta; 12325 } 12326 12327 return 0; 12328 } [0] https://lore.kernel.org/bpf/20210817050843.GA21456@kili/ v1->v2: - clarify that problem was only seen by static checker but not in prod; Fixes: 46f53a65d2de ("bpf: Allow narrow loads with offset > 0") Reported-by: Dan Carpenter Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210820163935.1902398-1-rdna@fb.com Signed-off-by: Sasha Levin --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index eb95cf0f5609b..e613a8447900a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8963,6 +8963,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) if (is_narrower_load && size < target_size) { u8 shift = bpf_ctx_narrow_access_offset( off, size, size_default) * 8; + if (shift && cnt + 1 >= ARRAY_SIZE(insn_buf)) { + verbose(env, "bpf verifier narrow ctx load misconfigured\n"); + return -EINVAL; + } if (ctx_field_size <= 4) { if (shift) insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH, From e32beca5fee3977d4b5652d171faa6a9c1c66656 Mon Sep 17 00:00:00 2001 From: Len Baker Date: Tue, 17 Aug 2021 12:27:09 +0200 Subject: [PATCH 0605/5344] CIFS: Fix a potencially linear read overflow [ Upstream commit f980d055a0f858d73d9467bb0b570721bbfcdfb8 ] strlcpy() reads the entire source buffer first. This read may exceed the destination size limit. This is both inefficient and can lead to linear read overflows if a source string is not NUL-terminated. Also, the strnlen() call does not avoid the read overflow in the strlcpy function when a not NUL-terminated string is passed. So, replace this block by a call to kstrndup() that avoids this type of overflow and does the same. Fixes: 066ce6899484d ("cifs: rename cifs_strlcpy_to_host and make it use new functions") Signed-off-by: Len Baker Reviewed-by: Paulo Alcantara (SUSE) Reviewed-by: Jeff Layton Signed-off-by: Steve French Signed-off-by: Sasha Levin --- fs/cifs/cifs_unicode.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 9bd03a2310328..171ad8b42107e 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -358,14 +358,9 @@ cifs_strndup_from_utf16(const char *src, const int maxlen, if (!dst) return NULL; cifs_from_utf16(dst, (__le16 *) src, len, maxlen, codepage, - NO_MAP_UNI_RSVD); + NO_MAP_UNI_RSVD); } else { - len = strnlen(src, maxlen); - len++; - dst = kmalloc(len, GFP_KERNEL); - if (!dst) - return NULL; - strlcpy(dst, src, len); + dst = kstrndup(src, maxlen, GFP_KERNEL); } return dst; From 061532d06174b94636100f20d1eac3d47fb9a5ab Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sun, 4 Jul 2021 17:38:45 +0300 Subject: [PATCH 0606/5344] i2c: mt65xx: fix IRQ check [ Upstream commit 58fb7c643d346e2364404554f531cfa6a1a3917c ] Iff platform_get_irq() returns 0, the driver's probe() method will return 0 early (as if the method's call was successful). Let's consider IRQ0 valid for simplicity -- devm_request_irq() can always override that decision... Fixes: ce38815d39ea ("I2C: mediatek: Add driver for MediaTek I2C controller") Signed-off-by: Sergey Shtylyov Reviewed-by: Qii Wang Signed-off-by: Wolfram Sang Signed-off-by: Sasha Levin --- drivers/i2c/busses/i2c-mt65xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-mt65xx.c b/drivers/i2c/busses/i2c-mt65xx.c index e1ef0122ef759..5587e7c549c4f 100644 --- a/drivers/i2c/busses/i2c-mt65xx.c +++ b/drivers/i2c/busses/i2c-mt65xx.c @@ -932,7 +932,7 @@ static int mtk_i2c_probe(struct platform_device *pdev) return PTR_ERR(i2c->pdmabase); irq = platform_get_irq(pdev, 0); - if (irq <= 0) + if (irq < 0) return irq; init_completion(&i2c->msg_complete); From b2337ef82e84d6a70dbf7edadd04e65f2000a89f Mon Sep 17 00:00:00 2001 From: Evgeny Novikov Date: Wed, 25 Aug 2021 20:09:02 +0300 Subject: [PATCH 0607/5344] usb: ehci-orion: Handle errors of clk_prepare_enable() in probe [ Upstream commit 4720f1bf4ee4a784d9ece05420ba33c9222a3004 ] ehci_orion_drv_probe() did not account for possible errors of clk_prepare_enable() that in particular could cause invocation of clk_disable_unprepare() on clocks that were not prepared/enabled yet, e.g. in remove or on handling errors of usb_add_hcd() in probe. Though, there were several patches fixing different issues with clocks in this driver, they did not solve this problem. Add handling of errors of clk_prepare_enable() in ehci_orion_drv_probe() to avoid calls of clk_disable_unprepare() without previous successful invocation of clk_prepare_enable(). Found by Linux Driver Verification project (linuxtesting.org). Fixes: 8c869edaee07 ("ARM: Orion: EHCI: Add support for enabling clocks") Co-developed-by: Kirill Shilimanov Reviewed-by: Andrew Lunn Acked-by: Alan Stern Signed-off-by: Evgeny Novikov Signed-off-by: Kirill Shilimanov Link: https://lore.kernel.org/r/20210825170902.11234-1-novikov@ispras.ru Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/usb/host/ehci-orion.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/usb/host/ehci-orion.c b/drivers/usb/host/ehci-orion.c index a319b1df3011c..3626758b3e2aa 100644 --- a/drivers/usb/host/ehci-orion.c +++ b/drivers/usb/host/ehci-orion.c @@ -264,8 +264,11 @@ static int ehci_orion_drv_probe(struct platform_device *pdev) * the clock does not exists. */ priv->clk = devm_clk_get(&pdev->dev, NULL); - if (!IS_ERR(priv->clk)) - clk_prepare_enable(priv->clk); + if (!IS_ERR(priv->clk)) { + err = clk_prepare_enable(priv->clk); + if (err) + goto err_put_hcd; + } priv->phy = devm_phy_optional_get(&pdev->dev, "usb"); if (IS_ERR(priv->phy)) { @@ -311,6 +314,7 @@ static int ehci_orion_drv_probe(struct platform_device *pdev) err_dis_clk: if (!IS_ERR(priv->clk)) clk_disable_unprepare(priv->clk); +err_put_hcd: usb_put_hcd(hcd); err: dev_err(&pdev->dev, "init %s fail, %d\n", From cc5a906defcfed37aa9c95660be1a60a09bf5a67 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Wed, 18 Aug 2021 21:32:38 +0200 Subject: [PATCH 0608/5344] usb: bdc: Fix an error handling path in 'bdc_probe()' when no suitable DMA config is available [ Upstream commit d2f42e09393c774ab79088d8e3afcc62b3328fc9 ] If no suitable DMA configuration is available, a previous 'bdc_phy_init()' call must be undone by a corresponding 'bdc_phy_exit()' call. Branch to the existing error handling path instead of returning directly. Fixes: cc29d4f67757 ("usb: bdc: Add support for USB phy") Acked-by: Florian Fainelli Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/0c5910979f39225d5d8fe68c9ab1c147c68ddee1.1629314734.git.christophe.jaillet@wanadoo.fr Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/usb/gadget/udc/bdc/bdc_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/udc/bdc/bdc_core.c b/drivers/usb/gadget/udc/bdc/bdc_core.c index 3d33499db50b5..845aead48d85b 100644 --- a/drivers/usb/gadget/udc/bdc/bdc_core.c +++ b/drivers/usb/gadget/udc/bdc/bdc_core.c @@ -565,7 +565,8 @@ static int bdc_probe(struct platform_device *pdev) if (ret) { dev_err(dev, "No suitable DMA config available, abort\n"); - return -ENOTSUPP; + ret = -ENOTSUPP; + goto phycleanup; } dev_dbg(dev, "Using 32-bit address\n"); } From e752b3be6679ce758a01bfb3311f07352746ec87 Mon Sep 17 00:00:00 2001 From: Andy Duan Date: Thu, 19 Aug 2021 10:10:33 +0800 Subject: [PATCH 0609/5344] tty: serial: fsl_lpuart: fix the wrong mapbase value [ Upstream commit d5c38948448abc2bb6b36dbf85a554bf4748885e ] Register offset needs to be applied on mapbase also. dma_tx/rx_request use the physical address of UARTDATA. Register offset is currently only applied to membase (the corresponding virtual addr) but not on mapbase. Fixes: 24b1e5f0e83c ("tty: serial: lpuart: add imx7ulp support") Reviewed-by: Leonard Crestez Signed-off-by: Adriana Reus Signed-off-by: Sherry Sun Signed-off-by: Andy Duan Link: https://lore.kernel.org/r/20210819021033.32606-1-sherry.sun@nxp.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/tty/serial/fsl_lpuart.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/fsl_lpuart.c b/drivers/tty/serial/fsl_lpuart.c index b053345dfd1ae..13e705b53217d 100644 --- a/drivers/tty/serial/fsl_lpuart.c +++ b/drivers/tty/serial/fsl_lpuart.c @@ -2414,7 +2414,7 @@ static int lpuart_probe(struct platform_device *pdev) return PTR_ERR(sport->port.membase); sport->port.membase += sdata->reg_off; - sport->port.mapbase = res->start; + sport->port.mapbase = res->start + sdata->reg_off; sport->port.dev = &pdev->dev; sport->port.type = PORT_LPUART; sport->devtype = sdata->devtype; From a0baff0a1082087420cfcba44b14bbada0dd92b2 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 16 Aug 2021 07:25:10 +0200 Subject: [PATCH 0610/5344] ASoC: wcd9335: Fix a double irq free in the remove function [ Upstream commit 7a6a723e98aa45f393e6add18f7309dfffa1b0e2 ] There is no point in calling 'free_irq()' explicitly for 'WCD9335_IRQ_SLIMBUS' in the remove function. The irqs are requested in 'wcd9335_setup_irqs()' using a resource managed function (i.e. 'devm_request_threaded_irq()'). 'wcd9335_setup_irqs()' requests all what is defined in the 'wcd9335_irqs' structure. This structure has only one entry for 'WCD9335_IRQ_SLIMBUS'. So 'devm_request...irq()' + explicit 'free_irq()' would lead to a double free. Remove the unneeded 'free_irq()' from the remove function. Fixes: 20aedafdf492 ("ASoC: wcd9335: add support to wcd9335 codec") Signed-off-by: Christophe JAILLET Message-Id: <0614d63bc00edd7e81dd367504128f3d84f72efa.1629091028.git.christophe.jaillet@wanadoo.fr> Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/codecs/wcd9335.c | 1 - 1 file changed, 1 deletion(-) diff --git a/sound/soc/codecs/wcd9335.c b/sound/soc/codecs/wcd9335.c index 81906c25e4a87..a31a20dcd6b53 100644 --- a/sound/soc/codecs/wcd9335.c +++ b/sound/soc/codecs/wcd9335.c @@ -4869,7 +4869,6 @@ static void wcd9335_codec_remove(struct snd_soc_component *comp) struct wcd9335_codec *wcd = dev_get_drvdata(comp->dev); wcd_clsh_ctrl_free(wcd->clsh_ctrl); - free_irq(regmap_irq_get_virq(wcd->irq_data, WCD9335_IRQ_SLIMBUS), wcd); } static int wcd9335_codec_set_sysclk(struct snd_soc_component *comp, From debded04831b61c18778b8463dada3a2b79c6806 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 16 Aug 2021 07:25:20 +0200 Subject: [PATCH 0611/5344] ASoC: wcd9335: Fix a memory leak in the error handling path of the probe function [ Upstream commit fc6fc81caa63900cef9ebb8b2e365c3ed5a9effb ] If 'wcd9335_setup_irqs()' fails, me must release the memory allocated in 'wcd_clsh_ctrl_alloc()', as already done in the remove function. Add an error handling path and the missing 'wcd_clsh_ctrl_free()' call. Fixes: 20aedafdf492 ("ASoC: wcd9335: add support to wcd9335 codec") Signed-off-by: Christophe JAILLET Message-Id: <6dc12372f09fabb70bf05941dbe6a1382dc93e43.1629091028.git.christophe.jaillet@wanadoo.fr> Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/codecs/wcd9335.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/sound/soc/codecs/wcd9335.c b/sound/soc/codecs/wcd9335.c index a31a20dcd6b53..5ec63217ad020 100644 --- a/sound/soc/codecs/wcd9335.c +++ b/sound/soc/codecs/wcd9335.c @@ -4844,6 +4844,7 @@ static void wcd9335_codec_init(struct snd_soc_component *component) static int wcd9335_codec_probe(struct snd_soc_component *component) { struct wcd9335_codec *wcd = dev_get_drvdata(component->dev); + int ret; int i; snd_soc_component_init_regmap(component, wcd->regmap); @@ -4861,7 +4862,15 @@ static int wcd9335_codec_probe(struct snd_soc_component *component) for (i = 0; i < NUM_CODEC_DAIS; i++) INIT_LIST_HEAD(&wcd->dai[i].slim_ch_list); - return wcd9335_setup_irqs(wcd); + ret = wcd9335_setup_irqs(wcd); + if (ret) + goto free_clsh_ctrl; + + return 0; + +free_clsh_ctrl: + wcd_clsh_ctrl_free(wcd->clsh_ctrl); + return ret; } static void wcd9335_codec_remove(struct snd_soc_component *comp) From 933cbc0bbee6641cb2d073c37a0f9cc808bb4c68 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 16 Aug 2021 07:25:28 +0200 Subject: [PATCH 0612/5344] ASoC: wcd9335: Disable irq on slave ports in the remove function [ Upstream commit d3efd26af2e044ff2b48d38bb871630282d77e60 ] The probe calls 'wcd9335_setup_irqs()' to enable interrupts on all slave ports. This must be undone in the remove function. Add a 'wcd9335_teardown_irqs()' function that undoes 'wcd9335_setup_irqs()' function, and call it from the remove function. Fixes: 20aedafdf492 ("ASoC: wcd9335: add support to wcd9335 codec") Signed-off-by: Christophe JAILLET Message-Id: <8f761244d79bd4c098af8a482be9121d3a486d1b.1629091028.git.christophe.jaillet@wanadoo.fr> Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/codecs/wcd9335.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/sound/soc/codecs/wcd9335.c b/sound/soc/codecs/wcd9335.c index 5ec63217ad020..016aff97e2fb2 100644 --- a/sound/soc/codecs/wcd9335.c +++ b/sound/soc/codecs/wcd9335.c @@ -4076,6 +4076,16 @@ static int wcd9335_setup_irqs(struct wcd9335_codec *wcd) return ret; } +static void wcd9335_teardown_irqs(struct wcd9335_codec *wcd) +{ + int i; + + /* disable interrupts on all slave ports */ + for (i = 0; i < WCD9335_SLIM_NUM_PORT_REG; i++) + regmap_write(wcd->if_regmap, WCD9335_SLIM_PGD_PORT_INT_EN0 + i, + 0x00); +} + static void wcd9335_cdc_sido_ccl_enable(struct wcd9335_codec *wcd, bool ccl_flag) { @@ -4878,6 +4888,7 @@ static void wcd9335_codec_remove(struct snd_soc_component *comp) struct wcd9335_codec *wcd = dev_get_drvdata(comp->dev); wcd_clsh_ctrl_free(wcd->clsh_ctrl); + wcd9335_teardown_irqs(wcd); } static int wcd9335_codec_set_sysclk(struct snd_soc_component *comp, From 5186e59d9e19f23f35a780ff13ecdd526b2c113a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 13 Aug 2021 14:34:38 +0300 Subject: [PATCH 0613/5344] ath6kl: wmi: fix an error code in ath6kl_wmi_sync_point() [ Upstream commit fd6729ec534cffbbeb3917761e6d1fe6a412d3fe ] This error path is unlikely because of it checked for NULL and returned -ENOMEM earlier in the function. But it should return an error code here as well if we ever do hit it because of a race condition or something. Fixes: bdcd81707973 ("Add ath6kl cleaned up driver") Signed-off-by: Dan Carpenter Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210813113438.GB30697@kili Signed-off-by: Sasha Levin --- drivers/net/wireless/ath/ath6kl/wmi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath6kl/wmi.c b/drivers/net/wireless/ath/ath6kl/wmi.c index c610fe21c85c0..31ffec3a59727 100644 --- a/drivers/net/wireless/ath/ath6kl/wmi.c +++ b/drivers/net/wireless/ath/ath6kl/wmi.c @@ -2510,8 +2510,10 @@ static int ath6kl_wmi_sync_point(struct wmi *wmi, u8 if_idx) goto free_data_skb; for (index = 0; index < num_pri_streams; index++) { - if (WARN_ON(!data_sync_bufs[index].skb)) + if (WARN_ON(!data_sync_bufs[index].skb)) { + ret = -ENOMEM; goto free_data_skb; + } ep_id = ath6kl_ac2_endpoint_id(wmi->parent_dev, data_sync_bufs[index]. From 3d7e5b7590f9ad4a0774435d06f19ec72bff9812 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Tue, 27 Jul 2021 10:52:31 +0800 Subject: [PATCH 0614/5344] bcma: Fix memory leak for internally-handled cores [ Upstream commit b63aed3ff195130fef12e0af590f4838cf0201d8 ] kmemleak reported that dev_name() of internally-handled cores were leaked on driver unbinding. Let's use device_initialize() to take refcounts for them and put_device() to properly free the related stuff. While looking at it, there's another potential issue for those which should be *registered* into driver core. If device_register() failed, we put device once and freed bcma_device structures. In bcma_unregister_cores(), they're treated as unregistered and we hit both UAF and double-free. That smells not good and has also been fixed now. Fixes: ab54bc8460b5 ("bcma: fill core details for every device") Signed-off-by: Zenghui Yu Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210727025232.663-2-yuzenghui@huawei.com Signed-off-by: Sasha Levin --- drivers/bcma/main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/bcma/main.c b/drivers/bcma/main.c index 6535614a7dc13..1df2b5801c3bc 100644 --- a/drivers/bcma/main.c +++ b/drivers/bcma/main.c @@ -236,6 +236,7 @@ EXPORT_SYMBOL(bcma_core_irq); void bcma_prepare_core(struct bcma_bus *bus, struct bcma_device *core) { + device_initialize(&core->dev); core->dev.release = bcma_release_core_dev; core->dev.bus = &bcma_bus_type; dev_set_name(&core->dev, "bcma%d:%d", bus->num, core->core_index); @@ -277,11 +278,10 @@ static void bcma_register_core(struct bcma_bus *bus, struct bcma_device *core) { int err; - err = device_register(&core->dev); + err = device_add(&core->dev); if (err) { bcma_err(bus, "Could not register dev for core 0x%03X\n", core->id.id); - put_device(&core->dev); return; } core->dev_registered = true; @@ -372,7 +372,7 @@ void bcma_unregister_cores(struct bcma_bus *bus) /* Now noone uses internally-handled cores, we can free them */ list_for_each_entry_safe(core, tmp, &bus->cores, list) { list_del(&core->list); - kfree(core); + put_device(&core->dev); } } From 2c3c86709a4e7fb75dd0d230e0fc6dfc41adafa7 Mon Sep 17 00:00:00 2001 From: Ahmad Fatoum Date: Tue, 17 Aug 2021 08:35:22 +0200 Subject: [PATCH 0615/5344] brcmfmac: pcie: fix oops on failure to resume and reprobe [ Upstream commit d745ca4f2c4ae9f1bd8cf7d8ac6e22d739bffd19 ] When resuming from suspend, brcmf_pcie_pm_leave_D3 will first attempt a hot resume and then fall back to removing the PCI device and then reprobing. If this probe fails, the kernel will oops, because brcmf_err, which is called to report the failure will dereference the stale bus pointer. Open code and use the default bus-less brcmf_err to avoid this. Fixes: 8602e62441ab ("brcmfmac: pass bus to the __brcmf_err() in pcie.c") Signed-off-by: Ahmad Fatoum Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210817063521.22450-1-a.fatoum@pengutronix.de Signed-off-by: Sasha Levin --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c index bda042138e967..e6001f0a81a3a 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c @@ -2073,7 +2073,7 @@ static int brcmf_pcie_pm_leave_D3(struct device *dev) err = brcmf_pcie_probe(pdev, NULL); if (err) - brcmf_err(bus, "probe after resume failed, err=%d\n", err); + __brcmf_err(NULL, __func__, "probe after resume failed, err=%d\n", err); return err; } From 5db78ebb46d0256caa7e24bb7c99a0c9bf24a8bc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 29 Aug 2021 15:16:14 -0700 Subject: [PATCH 0616/5344] ipv6: make exception cache less predictible [ Upstream commit a00df2caffed3883c341d5685f830434312e4a43 ] Even after commit 4785305c05b2 ("ipv6: use siphash in rt6_exception_hash()"), an attacker can still use brute force to learn some secrets from a victim linux host. One way to defeat these attacks is to make the max depth of the hash table bucket a random value. Before this patch, each bucket of the hash table used to store exceptions could contain 6 items under attack. After the patch, each bucket would contains a random number of items, between 6 and 10. The attacker can no longer infer secrets. This is slightly increasing memory size used by the hash table, we do not expect this to be a problem. Following patch is dealing with the same issue in IPv4. Fixes: 35732d01fe31 ("ipv6: introduce a hash table to store dst cache") Signed-off-by: Eric Dumazet Reported-by: Keyu Man Cc: Wei Wang Cc: Martin KaFai Lau Reviewed-by: David Ahern Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/ipv6/route.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d6fc22f7d7a67..575bd0f1b0089 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1667,6 +1667,7 @@ static int rt6_insert_exception(struct rt6_info *nrt, struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; struct fib6_nh *nh = res->nh; + int max_depth; int err = 0; spin_lock_bh(&rt6_exception_lock); @@ -1721,7 +1722,9 @@ static int rt6_insert_exception(struct rt6_info *nrt, bucket->depth++; net->ipv6.rt6_stats->fib_rt_cache++; - if (bucket->depth > FIB6_MAX_DEPTH) + /* Randomize max depth to avoid some side channels attacks. */ + max_depth = FIB6_MAX_DEPTH + prandom_u32_max(FIB6_MAX_DEPTH); + while (bucket->depth > max_depth) rt6_exception_remove_oldest(bucket); out: From 69788e1a56af62764bb6e65da19760c9bc1d4d24 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 29 Aug 2021 15:16:15 -0700 Subject: [PATCH 0617/5344] ipv4: make exception cache less predictible [ Upstream commit 67d6d681e15b578c1725bad8ad079e05d1c48a8e ] Even after commit 6457378fe796 ("ipv4: use siphash instead of Jenkins in fnhe_hashfun()"), an attacker can still use brute force to learn some secrets from a victim linux host. One way to defeat these attacks is to make the max depth of the hash table bucket a random value. Before this patch, each bucket of the hash table used to store exceptions could contain 6 items under attack. After the patch, each bucket would contains a random number of items, between 6 and 10. The attacker can no longer infer secrets. This is slightly increasing memory size used by the hash table, by 50% in average, we do not expect this to be a problem. This patch is more complex than the prior one (IPv6 equivalent), because IPv4 was reusing the oldest entry. Since we need to be able to evict more than one entry per update_or_create_fnhe() call, I had to replace fnhe_oldest() with fnhe_remove_oldest(). Also note that we will queue extra kfree_rcu() calls under stress, which hopefully wont be a too big issue. Fixes: 4895c771c7f0 ("ipv4: Add FIB nexthop exceptions.") Signed-off-by: Eric Dumazet Reported-by: Keyu Man Cc: Willy Tarreau Signed-off-by: David S. Miller Reviewed-by: David Ahern Tested-by: David Ahern Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/ipv4/route.c | 46 ++++++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 0e976848d4bb9..03f35764a40ea 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -610,18 +610,25 @@ static void fnhe_flush_routes(struct fib_nh_exception *fnhe) } } -static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash) +static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash) { - struct fib_nh_exception *fnhe, *oldest; + struct fib_nh_exception __rcu **fnhe_p, **oldest_p; + struct fib_nh_exception *fnhe, *oldest = NULL; - oldest = rcu_dereference(hash->chain); - for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe; - fnhe = rcu_dereference(fnhe->fnhe_next)) { - if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) + for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) { + fnhe = rcu_dereference_protected(*fnhe_p, + lockdep_is_held(&fnhe_lock)); + if (!fnhe) + break; + if (!oldest || + time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) { oldest = fnhe; + oldest_p = fnhe_p; + } } fnhe_flush_routes(oldest); - return oldest; + *oldest_p = oldest->fnhe_next; + kfree_rcu(oldest, rcu); } static inline u32 fnhe_hashfun(__be32 daddr) @@ -700,16 +707,21 @@ static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr, if (rt) fill_route_from_fnhe(rt, fnhe); } else { - if (depth > FNHE_RECLAIM_DEPTH) - fnhe = fnhe_oldest(hash); - else { - fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC); - if (!fnhe) - goto out_unlock; - - fnhe->fnhe_next = hash->chain; - rcu_assign_pointer(hash->chain, fnhe); + /* Randomize max depth to avoid some side channels attacks. */ + int max_depth = FNHE_RECLAIM_DEPTH + + prandom_u32_max(FNHE_RECLAIM_DEPTH); + + while (depth > max_depth) { + fnhe_remove_oldest(hash); + depth--; } + + fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC); + if (!fnhe) + goto out_unlock; + + fnhe->fnhe_next = hash->chain; + fnhe->fnhe_genid = genid; fnhe->fnhe_daddr = daddr; fnhe->fnhe_gw = gw; @@ -717,6 +729,8 @@ static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr, fnhe->fnhe_mtu_locked = lock; fnhe->fnhe_expires = max(1UL, expires); + rcu_assign_pointer(hash->chain, fnhe); + /* Exception created; mark the cached routes for the nexthop * stale, so anyone caching it rechecks if this exception * applies to them. From fd6c47c3f0e7c262f3643353dac06f83217b41c0 Mon Sep 17 00:00:00 2001 From: Xiyu Yang Date: Sun, 29 Aug 2021 23:58:01 +0800 Subject: [PATCH 0618/5344] net: sched: Fix qdisc_rate_table refcount leak when get tcf_block failed [ Upstream commit c66070125837900163b81a03063ddd657a7e9bfb ] The reference counting issue happens in one exception handling path of cbq_change_class(). When failing to get tcf_block, the function forgets to decrease the refcount of "rtab" increased by qdisc_put_rtab(), causing a refcount leak. Fix this issue by jumping to "failure" label when get tcf_block failed. Fixes: 6529eaba33f0 ("net: sched: introduce tcf block infractructure") Signed-off-by: Xiyu Yang Reviewed-by: Cong Wang Link: https://lore.kernel.org/r/1630252681-71588-1-git-send-email-xiyuyang19@fudan.edu.cn Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/sched/sch_cbq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 39b427dc75128..e5972889cd81c 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1614,7 +1614,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack); if (err) { kfree(cl); - return err; + goto failure; } if (tca[TCA_RATE]) { From aba37ee22122c049abc2b84d4125936a58668b12 Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Sat, 28 Aug 2021 16:23:15 +0200 Subject: [PATCH 0619/5344] net: qualcomm: fix QCA7000 checksum handling [ Upstream commit 429205da6c834447a57279af128bdd56ccd5225e ] Based on tests the QCA7000 doesn't support checksum offloading. So assume ip_summed is CHECKSUM_NONE and let the kernel take care of the checksum handling. This fixes data transfer issues in noisy environments. Reported-by: Michael Heimpold Fixes: 291ab06ecf67 ("net: qualcomm: new Ethernet over SPI driver for QCA7000") Signed-off-by: Stefan Wahren Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/qualcomm/qca_spi.c | 2 +- drivers/net/ethernet/qualcomm/qca_uart.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/qualcomm/qca_spi.c b/drivers/net/ethernet/qualcomm/qca_spi.c index baac016f3ec0b..15591ad5fe4ea 100644 --- a/drivers/net/ethernet/qualcomm/qca_spi.c +++ b/drivers/net/ethernet/qualcomm/qca_spi.c @@ -434,7 +434,7 @@ qcaspi_receive(struct qcaspi *qca) skb_put(qca->rx_skb, retcode); qca->rx_skb->protocol = eth_type_trans( qca->rx_skb, qca->rx_skb->dev); - qca->rx_skb->ip_summed = CHECKSUM_UNNECESSARY; + skb_checksum_none_assert(qca->rx_skb); netif_rx_ni(qca->rx_skb); qca->rx_skb = netdev_alloc_skb_ip_align(net_dev, net_dev->mtu + VLAN_ETH_HLEN); diff --git a/drivers/net/ethernet/qualcomm/qca_uart.c b/drivers/net/ethernet/qualcomm/qca_uart.c index 0981068504fa7..ade70f5df496c 100644 --- a/drivers/net/ethernet/qualcomm/qca_uart.c +++ b/drivers/net/ethernet/qualcomm/qca_uart.c @@ -107,7 +107,7 @@ qca_tty_receive(struct serdev_device *serdev, const unsigned char *data, skb_put(qca->rx_skb, retcode); qca->rx_skb->protocol = eth_type_trans( qca->rx_skb, qca->rx_skb->dev); - qca->rx_skb->ip_summed = CHECKSUM_UNNECESSARY; + skb_checksum_none_assert(qca->rx_skb); netif_rx_ni(qca->rx_skb); qca->rx_skb = netdev_alloc_skb_ip_align(netdev, netdev->mtu + From 6296177a4ea30b611734cc9bc96e09caffacd278 Mon Sep 17 00:00:00 2001 From: Subbaraya Sundeep Date: Mon, 30 Aug 2021 23:30:43 +0530 Subject: [PATCH 0620/5344] octeontx2-af: Fix loop in free and unmap counter [ Upstream commit 6537e96d743b89294b397b4865c6c061abae31b0 ] When the given counter does not belong to the entry then code ends up in infinite loop because the loop cursor, entry is not getting updated further. This patch fixes that by updating entry for every iteration. Fixes: a958dd59f9ce ("octeontx2-af: Map or unmap NPC MCAM entry and counter") Signed-off-by: Subbaraya Sundeep Signed-off-by: Sunil Goutham Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c index d82a519a0cd9a..f9f246c82c974 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c @@ -2013,10 +2013,11 @@ int rvu_mbox_handler_npc_mcam_unmap_counter(struct rvu *rvu, index = find_next_bit(mcam->bmap, mcam->bmap_entries, entry); if (index >= mcam->bmap_entries) break; + entry = index + 1; + if (mcam->entry2cntr_map[index] != req->cntr) continue; - entry = index + 1; npc_unmap_mcam_entry_and_cntr(rvu, mcam, blkaddr, index, req->cntr); } From 452e3de13095437afe90406093658c40867c5b75 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Aug 2021 19:02:10 -0700 Subject: [PATCH 0621/5344] ipv4: fix endianness issue in inet_rtm_getroute_build_skb() [ Upstream commit 92548b0ee220e000d81c27ac9a80e0ede895a881 ] The UDP length field should be in network order. This removes the following sparse error: net/ipv4/route.c:3173:27: warning: incorrect type in assignment (different base types) net/ipv4/route.c:3173:27: expected restricted __be16 [usertype] len net/ipv4/route.c:3173:27: got unsigned long Fixes: 404eb77ea766 ("ipv4: support sport, dport and ip_proto in RTM_GETROUTE") Signed-off-by: Eric Dumazet Cc: Roopa Prabhu Cc: David Ahern Reviewed-by: David Ahern Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 03f35764a40ea..539492998864e 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3004,7 +3004,7 @@ static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst, udph = skb_put_zero(skb, sizeof(struct udphdr)); udph->source = sport; udph->dest = dport; - udph->len = sizeof(struct udphdr); + udph->len = htons(sizeof(struct udphdr)); udph->check = 0; break; } From 2de04eae3d1484e7597fb7a172761ca82a36ed45 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 7 Sep 2021 16:16:58 +0300 Subject: [PATCH 0622/5344] bpf: Introduce BPF nospec instruction for mitigating Spectre v4 commit f5e81d1117501546b7be050c5fbafa6efd2c722c upstream. In case of JITs, each of the JIT backends compiles the BPF nospec instruction /either/ to a machine instruction which emits a speculation barrier /or/ to /no/ machine instruction in case the underlying architecture is not affected by Speculative Store Bypass or has different mitigations in place already. This covers both x86 and (implicitly) arm64: In case of x86, we use 'lfence' instruction for mitigation. In case of arm64, we rely on the firmware mitigation as controlled via the ssbd kernel parameter. Whenever the mitigation is enabled, it works for all of the kernel code with no need to provide any additional instructions here (hence only comment in arm64 JIT). Other archs can follow as needed. The BPF nospec instruction is specifically targeting Spectre v4 since i) we don't use a serialization barrier for the Spectre v1 case, and ii) mitigation instructions for v1 and v4 might be different on some archs. The BPF nospec is required for a future commit, where the BPF verifier does annotate intermediate BPF programs with speculation barriers. Co-developed-by: Piotr Krysiuk Co-developed-by: Benedict Schlueter Signed-off-by: Daniel Borkmann Signed-off-by: Piotr Krysiuk Signed-off-by: Benedict Schlueter Acked-by: Alexei Starovoitov Signed-off-by: Sasha Levin [OP: - adjusted context for 5.4 - apply riscv changes to /arch/riscv/net/bpf_jit_comp.c] Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman --- arch/arm/net/bpf_jit_32.c | 3 +++ arch/arm64/net/bpf_jit_comp.c | 13 +++++++++++++ arch/mips/net/ebpf_jit.c | 3 +++ arch/powerpc/net/bpf_jit_comp64.c | 6 ++++++ arch/riscv/net/bpf_jit_comp.c | 4 ++++ arch/s390/net/bpf_jit_comp.c | 5 +++++ arch/sparc/net/bpf_jit_comp_64.c | 3 +++ arch/x86/net/bpf_jit_comp.c | 7 +++++++ arch/x86/net/bpf_jit_comp32.c | 6 ++++++ include/linux/filter.h | 15 +++++++++++++++ kernel/bpf/core.c | 18 +++++++++++++++++- kernel/bpf/disasm.c | 16 +++++++++------- 12 files changed, 91 insertions(+), 8 deletions(-) diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 7216653424fd6..b51a8c7b01114 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -1602,6 +1602,9 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) rn = arm_bpf_get_reg32(src_lo, tmp2[1], ctx); emit_ldx_r(dst, rn, off, ctx, BPF_SIZE(code)); break; + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + break; /* ST: *(size *)(dst + off) = imm */ case BPF_ST | BPF_MEM | BPF_W: case BPF_ST | BPF_MEM | BPF_H: diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 945e5f690edec..afc7d41347f73 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -701,6 +701,19 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, } break; + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + /* + * Nothing required here. + * + * In case of arm64, we rely on the firmware mitigation of + * Speculative Store Bypass as controlled via the ssbd kernel + * parameter. Whenever the mitigation is enabled, it works + * for all of the kernel code with no need to provide any + * additional instructions. + */ + break; + /* ST: *(size *)(dst + off) = imm */ case BPF_ST | BPF_MEM | BPF_W: case BPF_ST | BPF_MEM | BPF_H: diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c index 561154cbcc401..b31b91e57c341 100644 --- a/arch/mips/net/ebpf_jit.c +++ b/arch/mips/net/ebpf_jit.c @@ -1355,6 +1355,9 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, } break; + case BPF_ST | BPF_NOSPEC: /* speculation barrier */ + break; + case BPF_ST | BPF_B | BPF_MEM: case BPF_ST | BPF_H | BPF_MEM: case BPF_ST | BPF_W | BPF_MEM: diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index be3517ef0574d..20bfd753bcba6 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -644,6 +644,12 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, } break; + /* + * BPF_ST NOSPEC (speculation barrier) + */ + case BPF_ST | BPF_NOSPEC: + break; + /* * BPF_ST(X) */ diff --git a/arch/riscv/net/bpf_jit_comp.c b/arch/riscv/net/bpf_jit_comp.c index e2279fed8f564..0eefe6193253b 100644 --- a/arch/riscv/net/bpf_jit_comp.c +++ b/arch/riscv/net/bpf_jit_comp.c @@ -1313,6 +1313,10 @@ static int emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, emit(rv_ld(rd, 0, RV_REG_T1), ctx); break; + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + break; + /* ST: *(size *)(dst + off) = imm */ case BPF_ST | BPF_MEM | BPF_B: emit_imm(RV_REG_T1, imm, ctx); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index e160f4650f8e4..3e6612d8b921c 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -913,6 +913,11 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, break; } break; + /* + * BPF_NOSPEC (speculation barrier) + */ + case BPF_ST | BPF_NOSPEC: + break; /* * BPF_ST(X) */ diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index 3364e2a009899..fef734473c0f3 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -1287,6 +1287,9 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) return 1; break; } + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + break; /* ST: *(size *)(dst + off) = imm */ case BPF_ST | BPF_MEM | BPF_W: case BPF_ST | BPF_MEM | BPF_H: diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 6e884f17634fe..55f62dca28aa2 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -728,6 +728,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, } break; + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + if (boot_cpu_has(X86_FEATURE_XMM2)) + /* Emit 'lfence' */ + EMIT3(0x0F, 0xAE, 0xE8); + break; + /* ST: *(u8*)(dst_reg + off) = imm */ case BPF_ST | BPF_MEM | BPF_B: if (is_ereg(dst_reg)) diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index 0fcba32077c87..2914f900034e0 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -1705,6 +1705,12 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, i++; break; } + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + if (boot_cpu_has(X86_FEATURE_XMM2)) + /* Emit 'lfence' */ + EMIT3(0x0F, 0xAE, 0xE8); + break; /* ST: *(u8*)(dst_reg + off) = imm */ case BPF_ST | BPF_MEM | BPF_H: case BPF_ST | BPF_MEM | BPF_B: diff --git a/include/linux/filter.h b/include/linux/filter.h index a68a7db6eab93..c5473bf40f619 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -68,6 +68,11 @@ struct ctl_table_header; /* unused opcode to mark call to interpreter with arguments */ #define BPF_CALL_ARGS 0xe0 +/* unused opcode to mark speculation barrier for mitigating + * Speculative Store Bypass + */ +#define BPF_NOSPEC 0xc0 + /* As per nm, we expose JITed images as text (code) section for * kallsyms. That way, tools like perf can find it to match * addresses. @@ -368,6 +373,16 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) .off = 0, \ .imm = 0 }) +/* Speculation barrier */ + +#define BPF_ST_NOSPEC() \ + ((struct bpf_insn) { \ + .code = BPF_ST | BPF_NOSPEC, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = 0 }) + /* Internal classic blocks for direct assignment */ #define __BPF_STMT(CODE, K) \ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 323913ba13b38..d9a3d995bd966 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -31,6 +31,7 @@ #include #include +#include #include /* Registers */ @@ -1310,6 +1311,7 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) /* Non-UAPI available opcodes. */ [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS, [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, + [BPF_ST | BPF_NOSPEC] = &&ST_NOSPEC, }; #undef BPF_INSN_3_LBL #undef BPF_INSN_2_LBL @@ -1550,7 +1552,21 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) COND_JMP(s, JSGE, >=) COND_JMP(s, JSLE, <=) #undef COND_JMP - /* STX and ST and LDX*/ + /* ST, STX and LDX*/ + ST_NOSPEC: + /* Speculation barrier for mitigating Speculative Store Bypass. + * In case of arm64, we rely on the firmware mitigation as + * controlled via the ssbd kernel parameter. Whenever the + * mitigation is enabled, it works for all of the kernel code + * with no need to provide any additional instructions here. + * In case of x86, we use 'lfence' insn for mitigation. We + * reuse preexisting logic from Spectre v1 mitigation that + * happens to produce the required code on x86 for v4 as well. + */ +#ifdef CONFIG_X86 + barrier_nospec(); +#endif + CONT; #define LDST(SIZEOP, SIZE) \ STX_MEM_##SIZEOP: \ *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \ diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index b44d8c447afd1..ff1dd7d45b58a 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -162,15 +162,17 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, else verbose(cbs->private_data, "BUG_%02x\n", insn->code); } else if (class == BPF_ST) { - if (BPF_MODE(insn->code) != BPF_MEM) { + if (BPF_MODE(insn->code) == BPF_MEM) { + verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->imm); + } else if (BPF_MODE(insn->code) == 0xc0 /* BPF_NOSPEC, no UAPI */) { + verbose(cbs->private_data, "(%02x) nospec\n", insn->code); + } else { verbose(cbs->private_data, "BUG_st_%02x\n", insn->code); - return; } - verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->dst_reg, - insn->off, insn->imm); } else if (class == BPF_LDX) { if (BPF_MODE(insn->code) != BPF_MEM) { verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code); From 4c903a5bd425ce05229f282c63a13f43d8078d82 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 7 Sep 2021 16:16:59 +0300 Subject: [PATCH 0623/5344] bpf: Fix leakage due to insufficient speculative store bypass mitigation commit 2039f26f3aca5b0e419b98f65dd36481337b86ee upstream. Spectre v4 gadgets make use of memory disambiguation, which is a set of techniques that execute memory access instructions, that is, loads and stores, out of program order; Intel's optimization manual, section 2.4.4.5: A load instruction micro-op may depend on a preceding store. Many microarchitectures block loads until all preceding store addresses are known. The memory disambiguator predicts which loads will not depend on any previous stores. When the disambiguator predicts that a load does not have such a dependency, the load takes its data from the L1 data cache. Eventually, the prediction is verified. If an actual conflict is detected, the load and all succeeding instructions are re-executed. af86ca4e3088 ("bpf: Prevent memory disambiguation attack") tried to mitigate this attack by sanitizing the memory locations through preemptive "fast" (low latency) stores of zero prior to the actual "slow" (high latency) store of a pointer value such that upon dependency misprediction the CPU then speculatively executes the load of the pointer value and retrieves the zero value instead of the attacker controlled scalar value previously stored at that location, meaning, subsequent access in the speculative domain is then redirected to the "zero page". The sanitized preemptive store of zero prior to the actual "slow" store is done through a simple ST instruction based on r10 (frame pointer) with relative offset to the stack location that the verifier has been tracking on the original used register for STX, which does not have to be r10. Thus, there are no memory dependencies for this store, since it's only using r10 and immediate constant of zero; hence af86ca4e3088 /assumed/ a low latency operation. However, a recent attack demonstrated that this mitigation is not sufficient since the preemptive store of zero could also be turned into a "slow" store and is thus bypassed as well: [...] // r2 = oob address (e.g. scalar) // r7 = pointer to map value 31: (7b) *(u64 *)(r10 -16) = r2 // r9 will remain "fast" register, r10 will become "slow" register below 32: (bf) r9 = r10 // JIT maps BPF reg to x86 reg: // r9 -> r15 (callee saved) // r10 -> rbp // train store forward prediction to break dependency link between both r9 // and r10 by evicting them from the predictor's LRU table. 33: (61) r0 = *(u32 *)(r7 +24576) 34: (63) *(u32 *)(r7 +29696) = r0 35: (61) r0 = *(u32 *)(r7 +24580) 36: (63) *(u32 *)(r7 +29700) = r0 37: (61) r0 = *(u32 *)(r7 +24584) 38: (63) *(u32 *)(r7 +29704) = r0 39: (61) r0 = *(u32 *)(r7 +24588) 40: (63) *(u32 *)(r7 +29708) = r0 [...] 543: (61) r0 = *(u32 *)(r7 +25596) 544: (63) *(u32 *)(r7 +30716) = r0 // prepare call to bpf_ringbuf_output() helper. the latter will cause rbp // to spill to stack memory while r13/r14/r15 (all callee saved regs) remain // in hardware registers. rbp becomes slow due to push/pop latency. below is // disasm of bpf_ringbuf_output() helper for better visual context: // // ffffffff8117ee20: 41 54 push r12 // ffffffff8117ee22: 55 push rbp // ffffffff8117ee23: 53 push rbx // ffffffff8117ee24: 48 f7 c1 fc ff ff ff test rcx,0xfffffffffffffffc // ffffffff8117ee2b: 0f 85 af 00 00 00 jne ffffffff8117eee0 <-- jump taken // [...] // ffffffff8117eee0: 49 c7 c4 ea ff ff ff mov r12,0xffffffffffffffea // ffffffff8117eee7: 5b pop rbx // ffffffff8117eee8: 5d pop rbp // ffffffff8117eee9: 4c 89 e0 mov rax,r12 // ffffffff8117eeec: 41 5c pop r12 // ffffffff8117eeee: c3 ret 545: (18) r1 = map[id:4] 547: (bf) r2 = r7 548: (b7) r3 = 0 549: (b7) r4 = 4 550: (85) call bpf_ringbuf_output#194288 // instruction 551 inserted by verifier \ 551: (7a) *(u64 *)(r10 -16) = 0 | /both/ are now slow stores here // storing map value pointer r7 at fp-16 | since value of r10 is "slow". 552: (7b) *(u64 *)(r10 -16) = r7 / // following "fast" read to the same memory location, but due to dependency // misprediction it will speculatively execute before insn 551/552 completes. 553: (79) r2 = *(u64 *)(r9 -16) // in speculative domain contains attacker controlled r2. in non-speculative // domain this contains r7, and thus accesses r7 +0 below. 554: (71) r3 = *(u8 *)(r2 +0) // leak r3 As can be seen, the current speculative store bypass mitigation which the verifier inserts at line 551 is insufficient since /both/, the write of the zero sanitation as well as the map value pointer are a high latency instruction due to prior memory access via push/pop of r10 (rbp) in contrast to the low latency read in line 553 as r9 (r15) which stays in hardware registers. Thus, architecturally, fp-16 is r7, however, microarchitecturally, fp-16 can still be r2. Initial thoughts to address this issue was to track spilled pointer loads from stack and enforce their load via LDX through r10 as well so that /both/ the preemptive store of zero /as well as/ the load use the /same/ register such that a dependency is created between the store and load. However, this option is not sufficient either since it can be bypassed as well under speculation. An updated attack with pointer spill/fills now _all_ based on r10 would look as follows: [...] // r2 = oob address (e.g. scalar) // r7 = pointer to map value [...] // longer store forward prediction training sequence than before. 2062: (61) r0 = *(u32 *)(r7 +25588) 2063: (63) *(u32 *)(r7 +30708) = r0 2064: (61) r0 = *(u32 *)(r7 +25592) 2065: (63) *(u32 *)(r7 +30712) = r0 2066: (61) r0 = *(u32 *)(r7 +25596) 2067: (63) *(u32 *)(r7 +30716) = r0 // store the speculative load address (scalar) this time after the store // forward prediction training. 2068: (7b) *(u64 *)(r10 -16) = r2 // preoccupy the CPU store port by running sequence of dummy stores. 2069: (63) *(u32 *)(r7 +29696) = r0 2070: (63) *(u32 *)(r7 +29700) = r0 2071: (63) *(u32 *)(r7 +29704) = r0 2072: (63) *(u32 *)(r7 +29708) = r0 2073: (63) *(u32 *)(r7 +29712) = r0 2074: (63) *(u32 *)(r7 +29716) = r0 2075: (63) *(u32 *)(r7 +29720) = r0 2076: (63) *(u32 *)(r7 +29724) = r0 2077: (63) *(u32 *)(r7 +29728) = r0 2078: (63) *(u32 *)(r7 +29732) = r0 2079: (63) *(u32 *)(r7 +29736) = r0 2080: (63) *(u32 *)(r7 +29740) = r0 2081: (63) *(u32 *)(r7 +29744) = r0 2082: (63) *(u32 *)(r7 +29748) = r0 2083: (63) *(u32 *)(r7 +29752) = r0 2084: (63) *(u32 *)(r7 +29756) = r0 2085: (63) *(u32 *)(r7 +29760) = r0 2086: (63) *(u32 *)(r7 +29764) = r0 2087: (63) *(u32 *)(r7 +29768) = r0 2088: (63) *(u32 *)(r7 +29772) = r0 2089: (63) *(u32 *)(r7 +29776) = r0 2090: (63) *(u32 *)(r7 +29780) = r0 2091: (63) *(u32 *)(r7 +29784) = r0 2092: (63) *(u32 *)(r7 +29788) = r0 2093: (63) *(u32 *)(r7 +29792) = r0 2094: (63) *(u32 *)(r7 +29796) = r0 2095: (63) *(u32 *)(r7 +29800) = r0 2096: (63) *(u32 *)(r7 +29804) = r0 2097: (63) *(u32 *)(r7 +29808) = r0 2098: (63) *(u32 *)(r7 +29812) = r0 // overwrite scalar with dummy pointer; same as before, also including the // sanitation store with 0 from the current mitigation by the verifier. 2099: (7a) *(u64 *)(r10 -16) = 0 | /both/ are now slow stores here 2100: (7b) *(u64 *)(r10 -16) = r7 | since store unit is still busy. // load from stack intended to bypass stores. 2101: (79) r2 = *(u64 *)(r10 -16) 2102: (71) r3 = *(u8 *)(r2 +0) // leak r3 [...] Looking at the CPU microarchitecture, the scheduler might issue loads (such as seen in line 2101) before stores (line 2099,2100) because the load execution units become available while the store execution unit is still busy with the sequence of dummy stores (line 2069-2098). And so the load may use the prior stored scalar from r2 at address r10 -16 for speculation. The updated attack may work less reliable on CPU microarchitectures where loads and stores share execution resources. This concludes that the sanitizing with zero stores from af86ca4e3088 ("bpf: Prevent memory disambiguation attack") is insufficient. Moreover, the detection of stack reuse from af86ca4e3088 where previously data (STACK_MISC) has been written to a given stack slot where a pointer value is now to be stored does not have sufficient coverage as precondition for the mitigation either; for several reasons outlined as follows: 1) Stack content from prior program runs could still be preserved and is therefore not "random", best example is to split a speculative store bypass attack between tail calls, program A would prepare and store the oob address at a given stack slot and then tail call into program B which does the "slow" store of a pointer to the stack with subsequent "fast" read. From program B PoV such stack slot type is STACK_INVALID, and therefore also must be subject to mitigation. 2) The STACK_SPILL must not be coupled to register_is_const(&stack->spilled_ptr) condition, for example, the previous content of that memory location could also be a pointer to map or map value. Without the fix, a speculative store bypass is not mitigated in such precondition and can then lead to a type confusion in the speculative domain leaking kernel memory near these pointer types. While brainstorming on various alternative mitigation possibilities, we also stumbled upon a retrospective from Chrome developers [0]: [...] For variant 4, we implemented a mitigation to zero the unused memory of the heap prior to allocation, which cost about 1% when done concurrently and 4% for scavenging. Variant 4 defeats everything we could think of. We explored more mitigations for variant 4 but the threat proved to be more pervasive and dangerous than we anticipated. For example, stack slots used by the register allocator in the optimizing compiler could be subject to type confusion, leading to pointer crafting. Mitigating type confusion for stack slots alone would have required a complete redesign of the backend of the optimizing compiler, perhaps man years of work, without a guarantee of completeness. [...] >From BPF side, the problem space is reduced, however, options are rather limited. One idea that has been explored was to xor-obfuscate pointer spills to the BPF stack: [...] // preoccupy the CPU store port by running sequence of dummy stores. [...] 2106: (63) *(u32 *)(r7 +29796) = r0 2107: (63) *(u32 *)(r7 +29800) = r0 2108: (63) *(u32 *)(r7 +29804) = r0 2109: (63) *(u32 *)(r7 +29808) = r0 2110: (63) *(u32 *)(r7 +29812) = r0 // overwrite scalar with dummy pointer; xored with random 'secret' value // of 943576462 before store ... 2111: (b4) w11 = 943576462 2112: (af) r11 ^= r7 2113: (7b) *(u64 *)(r10 -16) = r11 2114: (79) r11 = *(u64 *)(r10 -16) 2115: (b4) w2 = 943576462 2116: (af) r2 ^= r11 // ... and restored with the same 'secret' value with the help of AX reg. 2117: (71) r3 = *(u8 *)(r2 +0) [...] While the above would not prevent speculation, it would make data leakage infeasible by directing it to random locations. In order to be effective and prevent type confusion under speculation, such random secret would have to be regenerated for each store. The additional complexity involved for a tracking mechanism that prevents jumps such that restoring spilled pointers would not get corrupted is not worth the gain for unprivileged. Hence, the fix in here eventually opted for emitting a non-public BPF_ST | BPF_NOSPEC instruction which the x86 JIT translates into a lfence opcode. Inserting the latter in between the store and load instruction is one of the mitigations options [1]. The x86 instruction manual notes: [...] An LFENCE that follows an instruction that stores to memory might complete before the data being stored have become globally visible. [...] The latter meaning that the preceding store instruction finished execution and the store is at minimum guaranteed to be in the CPU's store queue, but it's not guaranteed to be in that CPU's L1 cache at that point (globally visible). The latter would only be guaranteed via sfence. So the load which is guaranteed to execute after the lfence for that local CPU would have to rely on store-to-load forwarding. [2], in section 2.3 on store buffers says: [...] For every store operation that is added to the ROB, an entry is allocated in the store buffer. This entry requires both the virtual and physical address of the target. Only if there is no free entry in the store buffer, the frontend stalls until there is an empty slot available in the store buffer again. Otherwise, the CPU can immediately continue adding subsequent instructions to the ROB and execute them out of order. On Intel CPUs, the store buffer has up to 56 entries. [...] One small upside on the fix is that it lifts constraints from af86ca4e3088 where the sanitize_stack_off relative to r10 must be the same when coming from different paths. The BPF_ST | BPF_NOSPEC gets emitted after a BPF_STX or BPF_ST instruction. This happens either when we store a pointer or data value to the BPF stack for the first time, or upon later pointer spills. The former needs to be enforced since otherwise stale stack data could be leaked under speculation as outlined earlier. For non-x86 JITs the BPF_ST | BPF_NOSPEC mapping is currently optimized away, but others could emit a speculation barrier as well if necessary. For real-world unprivileged programs e.g. generated by LLVM, pointer spill/fill is only generated upon register pressure and LLVM only tries to do that for pointers which are not used often. The program main impact will be the initial BPF_ST | BPF_NOSPEC sanitation for the STACK_INVALID case when the first write to a stack slot occurs e.g. upon map lookup. In future we might refine ways to mitigate the latter cost. [0] https://arxiv.org/pdf/1902.05178.pdf [1] https://msrc-blog.microsoft.com/2018/05/21/analysis-and-mitigation-of-speculative-store-bypass-cve-2018-3639/ [2] https://arxiv.org/pdf/1905.05725.pdf Fixes: af86ca4e3088 ("bpf: Prevent memory disambiguation attack") Fixes: f7cf25b2026d ("bpf: track spill/fill of constants") Co-developed-by: Piotr Krysiuk Co-developed-by: Benedict Schlueter Signed-off-by: Daniel Borkmann Signed-off-by: Piotr Krysiuk Signed-off-by: Benedict Schlueter Acked-by: Alexei Starovoitov Signed-off-by: Sasha Levin [OP: - apply check_stack_write_fixed_off() changes in check_stack_write() - replace env->bypass_spec_v4 -> env->allow_ptr_leaks] Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman --- include/linux/bpf_verifier.h | 2 +- kernel/bpf/verifier.c | 87 +++++++++++++----------------------- 2 files changed, 33 insertions(+), 56 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 22f070085971b..4292e8e42c120 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -301,8 +301,8 @@ struct bpf_insn_aux_data { }; }; int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ - int sanitize_stack_off; /* stack slot to be cleared */ bool seen; /* this insn was processed by the verifier */ + bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */ bool zext_dst; /* this insn zero extends dst reg */ u8 alu_state; /* used in combination with alu_limit */ bool prune_point; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e613a8447900a..2d0db4399e058 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1920,6 +1920,19 @@ static int check_stack_write(struct bpf_verifier_env *env, cur = env->cur_state->frame[env->cur_state->curframe]; if (value_regno >= 0) reg = &cur->regs[value_regno]; + if (!env->allow_ptr_leaks) { + bool sanitize = reg && is_spillable_regtype(reg->type); + + for (i = 0; i < size; i++) { + if (state->stack[spi].slot_type[i] == STACK_INVALID) { + sanitize = true; + break; + } + } + + if (sanitize) + env->insn_aux_data[insn_idx].sanitize_stack_spill = true; + } if (reg && size == BPF_REG_SIZE && register_is_const(reg) && !register_is_null(reg) && env->allow_ptr_leaks) { @@ -1942,47 +1955,10 @@ static int check_stack_write(struct bpf_verifier_env *env, verbose(env, "invalid size of register spill\n"); return -EACCES; } - if (state != cur && reg->type == PTR_TO_STACK) { verbose(env, "cannot spill pointers to stack into stack frame of the caller\n"); return -EINVAL; } - - if (!env->allow_ptr_leaks) { - bool sanitize = false; - - if (state->stack[spi].slot_type[0] == STACK_SPILL && - register_is_const(&state->stack[spi].spilled_ptr)) - sanitize = true; - for (i = 0; i < BPF_REG_SIZE; i++) - if (state->stack[spi].slot_type[i] == STACK_MISC) { - sanitize = true; - break; - } - if (sanitize) { - int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off; - int soff = (-spi - 1) * BPF_REG_SIZE; - - /* detected reuse of integer stack slot with a pointer - * which means either llvm is reusing stack slot or - * an attacker is trying to exploit CVE-2018-3639 - * (speculative store bypass) - * Have to sanitize that slot with preemptive - * store of zero. - */ - if (*poff && *poff != soff) { - /* disallow programs where single insn stores - * into two different stack slots, since verifier - * cannot sanitize them - */ - verbose(env, - "insn %d cannot access two stack slots fp%d and fp%d", - insn_idx, *poff, soff); - return -EINVAL; - } - *poff = soff; - } - } save_register_state(state, spi, reg); } else { u8 type = STACK_MISC; @@ -8860,35 +8836,33 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++, insn++) { bpf_convert_ctx_access_t convert_ctx_access; + bool ctx_access; if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) || insn->code == (BPF_LDX | BPF_MEM | BPF_H) || insn->code == (BPF_LDX | BPF_MEM | BPF_W) || - insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) + insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) { type = BPF_READ; - else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) || - insn->code == (BPF_STX | BPF_MEM | BPF_H) || - insn->code == (BPF_STX | BPF_MEM | BPF_W) || - insn->code == (BPF_STX | BPF_MEM | BPF_DW)) + ctx_access = true; + } else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) || + insn->code == (BPF_STX | BPF_MEM | BPF_H) || + insn->code == (BPF_STX | BPF_MEM | BPF_W) || + insn->code == (BPF_STX | BPF_MEM | BPF_DW) || + insn->code == (BPF_ST | BPF_MEM | BPF_B) || + insn->code == (BPF_ST | BPF_MEM | BPF_H) || + insn->code == (BPF_ST | BPF_MEM | BPF_W) || + insn->code == (BPF_ST | BPF_MEM | BPF_DW)) { type = BPF_WRITE; - else + ctx_access = BPF_CLASS(insn->code) == BPF_STX; + } else { continue; + } if (type == BPF_WRITE && - env->insn_aux_data[i + delta].sanitize_stack_off) { + env->insn_aux_data[i + delta].sanitize_stack_spill) { struct bpf_insn patch[] = { - /* Sanitize suspicious stack slot with zero. - * There are no memory dependencies for this store, - * since it's only using frame pointer and immediate - * constant of zero - */ - BPF_ST_MEM(BPF_DW, BPF_REG_FP, - env->insn_aux_data[i + delta].sanitize_stack_off, - 0), - /* the original STX instruction will immediately - * overwrite the same stack slot with appropriate value - */ *insn, + BPF_ST_NOSPEC(), }; cnt = ARRAY_SIZE(patch); @@ -8902,6 +8876,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) continue; } + if (!ctx_access) + continue; + switch (env->insn_aux_data[i + delta].ptr_type) { case PTR_TO_CTX: if (!ops->convert_ctx_access) From dddddbe2b05246a7c055e9fe5a4f09cc88808e98 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Tue, 7 Sep 2021 16:17:00 +0300 Subject: [PATCH 0624/5344] bpf: verifier: Allocate idmap scratch in verifier env commit c9e73e3d2b1eb1ea7ff068e05007eec3bd8ef1c9 upstream. func_states_equal makes a very short lived allocation for idmap, probably because it's too large to fit on the stack. However the function is called quite often, leading to a lot of alloc / free churn. Replace the temporary allocation with dedicated scratch space in struct bpf_verifier_env. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Edward Cree Link: https://lore.kernel.org/bpf/20210429134656.122225-4-lmb@cloudflare.com Signed-off-by: Greg Kroah-Hartman [OP: adjusted context for 5.4] Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman --- include/linux/bpf_verifier.h | 8 +++++++ kernel/bpf/verifier.c | 46 ++++++++++++------------------------ 2 files changed, 23 insertions(+), 31 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 4292e8e42c120..d5a7798a4cbff 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -194,6 +194,13 @@ struct bpf_idx_pair { u32 idx; }; +struct bpf_id_pair { + u32 old; + u32 cur; +}; + +/* Maximum number of register states that can exist at once */ +#define BPF_ID_MAP_SIZE (MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) #define MAX_CALL_FRAMES 8 struct bpf_verifier_state { /* call stack tracking */ @@ -370,6 +377,7 @@ struct bpf_verifier_env { const struct bpf_line_info *prev_linfo; struct bpf_verifier_log log; struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1]; + struct bpf_id_pair idmap_scratch[BPF_ID_MAP_SIZE]; struct { int *insn_state; int *insn_stack; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2d0db4399e058..eb234ffa33040 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6982,13 +6982,6 @@ static bool range_within(struct bpf_reg_state *old, old->smax_value >= cur->smax_value; } -/* Maximum number of register states that can exist at once */ -#define ID_MAP_SIZE (MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) -struct idpair { - u32 old; - u32 cur; -}; - /* If in the old state two registers had the same id, then they need to have * the same id in the new state as well. But that id could be different from * the old state, so we need to track the mapping from old to new ids. @@ -6999,11 +6992,11 @@ struct idpair { * So we look through our idmap to see if this old id has been seen before. If * so, we require the new id to match; otherwise, we add the id pair to the map. */ -static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap) +static bool check_ids(u32 old_id, u32 cur_id, struct bpf_id_pair *idmap) { unsigned int i; - for (i = 0; i < ID_MAP_SIZE; i++) { + for (i = 0; i < BPF_ID_MAP_SIZE; i++) { if (!idmap[i].old) { /* Reached an empty slot; haven't seen this id before */ idmap[i].old = old_id; @@ -7116,7 +7109,7 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn, /* Returns true if (rold safe implies rcur safe) */ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, - struct idpair *idmap) + struct bpf_id_pair *idmap) { bool equal; @@ -7233,7 +7226,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, static bool stacksafe(struct bpf_func_state *old, struct bpf_func_state *cur, - struct idpair *idmap) + struct bpf_id_pair *idmap) { int i, spi; @@ -7330,32 +7323,23 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur) * whereas register type in current state is meaningful, it means that * the current state will reach 'bpf_exit' instruction safely */ -static bool func_states_equal(struct bpf_func_state *old, +static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old, struct bpf_func_state *cur) { - struct idpair *idmap; - bool ret = false; int i; - idmap = kcalloc(ID_MAP_SIZE, sizeof(struct idpair), GFP_KERNEL); - /* If we failed to allocate the idmap, just say it's not safe */ - if (!idmap) - return false; - - for (i = 0; i < MAX_BPF_REG; i++) { - if (!regsafe(&old->regs[i], &cur->regs[i], idmap)) - goto out_free; - } + memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch)); + for (i = 0; i < MAX_BPF_REG; i++) + if (!regsafe(&old->regs[i], &cur->regs[i], env->idmap_scratch)) + return false; - if (!stacksafe(old, cur, idmap)) - goto out_free; + if (!stacksafe(old, cur, env->idmap_scratch)) + return false; if (!refsafe(old, cur)) - goto out_free; - ret = true; -out_free: - kfree(idmap); - return ret; + return false; + + return true; } static bool states_equal(struct bpf_verifier_env *env, @@ -7382,7 +7366,7 @@ static bool states_equal(struct bpf_verifier_env *env, for (i = 0; i <= old->curframe; i++) { if (old->frame[i]->callsite != cur->frame[i]->callsite) return false; - if (!func_states_equal(old->frame[i], cur->frame[i])) + if (!func_states_equal(env, old->frame[i], cur->frame[i])) return false; } return true; From ef3d1ec661fec1fd04cfd95298b7a1a422935f9d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 7 Sep 2021 16:17:01 +0300 Subject: [PATCH 0625/5344] bpf: Fix pointer arithmetic mask tightening under state pruning commit e042aa532c84d18ff13291d00620502ce7a38dda upstream. In 7fedb63a8307 ("bpf: Tighten speculative pointer arithmetic mask") we narrowed the offset mask for unprivileged pointer arithmetic in order to mitigate a corner case where in the speculative domain it is possible to advance, for example, the map value pointer by up to value_size-1 out-of- bounds in order to leak kernel memory via side-channel to user space. The verifier's state pruning for scalars leaves one corner case open where in the first verification path R_x holds an unknown scalar with an aux->alu_limit of e.g. 7, and in a second verification path that same register R_x, here denoted as R_x', holds an unknown scalar which has tighter bounds and would thus satisfy range_within(R_x, R_x') as well as tnum_in(R_x, R_x') for state pruning, yielding an aux->alu_limit of 3: Given the second path fits the register constraints for pruning, the final generated mask from aux->alu_limit will remain at 7. While technically not wrong for the non-speculative domain, it would however be possible to craft similar cases where the mask would be too wide as in 7fedb63a8307. One way to fix it is to detect the presence of unknown scalar map pointer arithmetic and force a deeper search on unknown scalars to ensure that we do not run into a masking mismatch. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Greg Kroah-Hartman [OP: adjusted context in include/linux/bpf_verifier.h for 5.4] Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 27 +++++++++++++++++---------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index d5a7798a4cbff..ee10a9f06b97c 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -371,6 +371,7 @@ struct bpf_verifier_env { struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ u32 used_map_cnt; /* number of used maps */ u32 id_gen; /* used to generate unique reg IDs */ + bool explore_alu_limits; bool allow_ptr_leaks; bool seen_direct_write; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index eb234ffa33040..78c283802e048 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4451,6 +4451,12 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0; alu_state |= ptr_is_dst_reg ? BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST; + + /* Limit pruning on unknown scalars to enable deep search for + * potential masking differences from other program paths. + */ + if (!off_is_imm) + env->explore_alu_limits = true; } err = update_alu_sanitation_state(aux, alu_state, alu_limit); @@ -7108,8 +7114,8 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn, } /* Returns true if (rold safe implies rcur safe) */ -static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, - struct bpf_id_pair *idmap) +static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, + struct bpf_reg_state *rcur, struct bpf_id_pair *idmap) { bool equal; @@ -7135,6 +7141,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; switch (rold->type) { case SCALAR_VALUE: + if (env->explore_alu_limits) + return false; if (rcur->type == SCALAR_VALUE) { if (!rold->precise && !rcur->precise) return true; @@ -7224,9 +7232,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; } -static bool stacksafe(struct bpf_func_state *old, - struct bpf_func_state *cur, - struct bpf_id_pair *idmap) +static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, + struct bpf_func_state *cur, struct bpf_id_pair *idmap) { int i, spi; @@ -7271,9 +7278,8 @@ static bool stacksafe(struct bpf_func_state *old, continue; if (old->stack[spi].slot_type[0] != STACK_SPILL) continue; - if (!regsafe(&old->stack[spi].spilled_ptr, - &cur->stack[spi].spilled_ptr, - idmap)) + if (!regsafe(env, &old->stack[spi].spilled_ptr, + &cur->stack[spi].spilled_ptr, idmap)) /* when explored and current stack slot are both storing * spilled registers, check that stored pointers types * are the same as well. @@ -7330,10 +7336,11 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch)); for (i = 0; i < MAX_BPF_REG; i++) - if (!regsafe(&old->regs[i], &cur->regs[i], env->idmap_scratch)) + if (!regsafe(env, &old->regs[i], &cur->regs[i], + env->idmap_scratch)) return false; - if (!stacksafe(old, cur, env->idmap_scratch)) + if (!stacksafe(env, old, cur, env->idmap_scratch)) return false; if (!refsafe(old, cur)) From 2983657c74f1730ba0a81ef241f948b9e4248907 Mon Sep 17 00:00:00 2001 From: Nguyen Dinh Phi Date: Mon, 23 Aug 2021 08:06:41 +0800 Subject: [PATCH 0626/5344] tty: Fix data race between tiocsti() and flush_to_ldisc() commit bb2853a6a421a052268eee00fd5d3f6b3504b2b1 upstream. The ops->receive_buf() may be accessed concurrently from these two functions. If the driver flushes data to the line discipline receive_buf() method while tiocsti() is waiting for the ops->receive_buf() to finish its work, the data race will happen. For example: tty_ioctl |tty_ldisc_receive_buf ->tioctsi | ->tty_port_default_receive_buf | ->tty_ldisc_receive_buf ->hci_uart_tty_receive | ->hci_uart_tty_receive ->h4_recv | ->h4_recv In this case, the h4 receive buffer will be overwritten by the latecomer, and we will lost the data. Hence, change tioctsi() function to use the exclusive lock interface from tty_buffer to avoid the data race. Reported-by: syzbot+97388eb9d31b997fe1d0@syzkaller.appspotmail.com Reviewed-by: Jiri Slaby Signed-off-by: Nguyen Dinh Phi Link: https://lore.kernel.org/r/20210823000641.2082292-1-phind.uet@gmail.com Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index cee7514c3aaf2..ddfe873b5fccb 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -2176,8 +2176,6 @@ static int tty_fasync(int fd, struct file *filp, int on) * Locking: * Called functions take tty_ldiscs_lock * current->signal->tty check is safe without locks - * - * FIXME: may race normal receive processing */ static int tiocsti(struct tty_struct *tty, char __user *p) @@ -2193,8 +2191,10 @@ static int tiocsti(struct tty_struct *tty, char __user *p) ld = tty_ldisc_ref_wait(tty); if (!ld) return -EIO; + tty_buffer_lock_exclusive(tty->port); if (ld->ops->receive_buf) ld->ops->receive_buf(tty, &ch, &mbz, 1); + tty_buffer_unlock_exclusive(tty->port); tty_ldisc_deref(ld); return 0; } From c16d14f4867a796b3811eb5e4c913d8061d3414a Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Fri, 27 Aug 2021 14:54:29 +0200 Subject: [PATCH 0627/5344] KVM: s390: index kvm->arch.idle_mask by vcpu_idx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit a3e03bc1368c1bc16e19b001fc96dc7430573cc8 upstream. While in practice vcpu->vcpu_idx == vcpu->vcp_id is often true, it may not always be, and we must not rely on this. Reason is that KVM decides the vcpu_idx, userspace decides the vcpu_id, thus the two might not match. Currently kvm->arch.idle_mask is indexed by vcpu_id, which implies that code like for_each_set_bit(vcpu_id, kvm->arch.idle_mask, online_vcpus) { vcpu = kvm_get_vcpu(kvm, vcpu_id); do_stuff(vcpu); } is not legit. Reason is that kvm_get_vcpu expects an vcpu_idx, not an vcpu_id. The trouble is, we do actually use kvm->arch.idle_mask like this. To fix this problem we have two options. Either use kvm_get_vcpu_by_id(vcpu_id), which would loop to find the right vcpu_id, or switch to indexing via vcpu_idx. The latter is preferable for obvious reasons. Let us make switch from indexing kvm->arch.idle_mask by vcpu_id to indexing it by vcpu_idx. To keep gisa_int.kicked_mask indexed by the same index as idle_mask lets make the same change for it as well. Fixes: 1ee0bc559dc3 ("KVM: s390: get rid of local_int array") Signed-off-by: Halil Pasic Reviewed-by: Christian Bornträger Reviewed-by: Claudio Imbrenda Cc: # 3.15+ Link: https://lore.kernel.org/r/20210827125429.1912577-1-pasic@linux.ibm.com Signed-off-by: Christian Borntraeger Signed-off-by: Greg Kroah-Hartman --- arch/s390/include/asm/kvm_host.h | 1 + arch/s390/kvm/interrupt.c | 12 ++++++------ arch/s390/kvm/kvm-s390.c | 2 +- arch/s390/kvm/kvm-s390.h | 2 +- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 0fe5600a037e4..4d59d11e6813d 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -873,6 +873,7 @@ struct kvm_arch{ atomic64_t cmma_dirty_pages; /* subset of available cpu features enabled by user space */ DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); + /* indexed by vcpu_idx */ DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS); struct kvm_s390_gisa_interrupt gisa_int; }; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 62388a678b91a..fa9483aa4f575 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -408,13 +408,13 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu) static void __set_cpu_idle(struct kvm_vcpu *vcpu) { kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT); - set_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); + set_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask); } static void __unset_cpu_idle(struct kvm_vcpu *vcpu) { kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT); - clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); + clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask); } static void __reset_intercept_indicators(struct kvm_vcpu *vcpu) @@ -2984,18 +2984,18 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len) static void __airqs_kick_single_vcpu(struct kvm *kvm, u8 deliverable_mask) { - int vcpu_id, online_vcpus = atomic_read(&kvm->online_vcpus); + int vcpu_idx, online_vcpus = atomic_read(&kvm->online_vcpus); struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; struct kvm_vcpu *vcpu; - for_each_set_bit(vcpu_id, kvm->arch.idle_mask, online_vcpus) { - vcpu = kvm_get_vcpu(kvm, vcpu_id); + for_each_set_bit(vcpu_idx, kvm->arch.idle_mask, online_vcpus) { + vcpu = kvm_get_vcpu(kvm, vcpu_idx); if (psw_ioint_disabled(vcpu)) continue; deliverable_mask &= (u8)(vcpu->arch.sie_block->gcr[6] >> 24); if (deliverable_mask) { /* lately kicked but not yet running */ - if (test_and_set_bit(vcpu_id, gi->kicked_mask)) + if (test_and_set_bit(vcpu_idx, gi->kicked_mask)) return; kvm_s390_vcpu_wakeup(vcpu); return; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index b88cb948734f6..04967831e0be1 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -3726,7 +3726,7 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu) kvm_s390_patch_guest_per_regs(vcpu); } - clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.gisa_int.kicked_mask); + clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.gisa_int.kicked_mask); vcpu->arch.sie_block->icptcode = 0; cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 6d9448dbd052b..63d94a5253a8f 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -67,7 +67,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu) static inline int is_vcpu_idle(struct kvm_vcpu *vcpu) { - return test_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); + return test_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask); } static inline int kvm_is_ucontrol(struct kvm *kvm) From 3d97490402ade9b4659d66eaff9bf3f5dcf292d4 Mon Sep 17 00:00:00 2001 From: Zelin Deng Date: Wed, 28 Apr 2021 10:22:01 +0800 Subject: [PATCH 0628/5344] KVM: x86: Update vCPU's hv_clock before back to guest when tsc_offset is adjusted commit d9130a2dfdd4b21736c91b818f87dbc0ccd1e757 upstream. When MSR_IA32_TSC_ADJUST is written by guest due to TSC ADJUST feature especially there's a big tsc warp (like a new vCPU is hot-added into VM which has been up for a long time), tsc_offset is added by a large value then go back to guest. This causes system time jump as tsc_timestamp is not adjusted in the meantime and pvclock monotonic character. To fix this, just notify kvm to update vCPU's guest time before back to guest. Cc: stable@vger.kernel.org Signed-off-by: Zelin Deng Signed-off-by: Paolo Bonzini Message-Id: <1619576521-81399-2-git-send-email-zelin.deng@linux.alibaba.com> Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/x86.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a816763f97d5f..c9ca600258a09 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2827,6 +2827,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated) { s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; adjust_tsc_offset_guest(vcpu, adj); + /* Before back to guest, tsc_timestamp must be adjusted + * as well, otherwise guest's percpu pvclock time could jump. + */ + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); } vcpu->arch.ia32_tsc_adjust_msr = data; } From a73cd1692add054dbe10196b1d7a7c30ab42df65 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 10 Aug 2021 07:45:26 -0700 Subject: [PATCH 0629/5344] KVM: nVMX: Unconditionally clear nested.pi_pending on nested VM-Enter commit f7782bb8d818d8f47c26b22079db10599922787a upstream. Clear nested.pi_pending on nested VM-Enter even if L2 will run without posted interrupts enabled. If nested.pi_pending is left set from a previous L2, vmx_complete_nested_posted_interrupt() will pick up the stale flag and exit to userspace with an "internal emulation error" due the new L2 not having a valid nested.pi_desc. Arguably, vmx_complete_nested_posted_interrupt() should first check for posted interrupts being enabled, but it's also completely reasonable that KVM wouldn't screw up a fundamental flag. Not to mention that the mere existence of nested.pi_pending is a long-standing bug as KVM shouldn't move the posted interrupt out of the IRR until it's actually processed, e.g. KVM effectively drops an interrupt when it performs a nested VM-Exit with a "pending" posted interrupt. Fixing the mess is a future problem. Prior to vmx_complete_nested_posted_interrupt() interpreting a null PI descriptor as an error, this was a benign bug as the null PI descriptor effectively served as a check on PI not being enabled. Even then, the new flow did not become problematic until KVM started checking the result of kvm_check_nested_events(). Fixes: 705699a13994 ("KVM: nVMX: Enable nested posted interrupt processing") Fixes: 966eefb89657 ("KVM: nVMX: Disable vmcs02 posted interrupts if vmcs12 PID isn't mappable") Fixes: 47d3530f86c0 ("KVM: x86: Exit to userspace when kvm_check_nested_events fails") Cc: stable@vger.kernel.org Cc: Jim Mattson Signed-off-by: Sean Christopherson Message-Id: <20210810144526.2662272-1-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx/nested.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 023bd3e1aa0d2..3041015b05f71 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2057,12 +2057,11 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) ~PIN_BASED_VMX_PREEMPTION_TIMER); /* Posted interrupts setting is only taken from vmcs12. */ - if (nested_cpu_has_posted_intr(vmcs12)) { + vmx->nested.pi_pending = false; + if (nested_cpu_has_posted_intr(vmcs12)) vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; - vmx->nested.pi_pending = false; - } else { + else exec_control &= ~PIN_BASED_POSTED_INTR; - } pin_controls_set(vmx, exec_control); /* From 1d636027a439625e848085533cec47567bee3c85 Mon Sep 17 00:00:00 2001 From: Austin Kim Date: Tue, 29 Jun 2021 14:50:50 +0100 Subject: [PATCH 0630/5344] IMA: remove -Wmissing-prototypes warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit a32ad90426a9c8eb3915eed26e08ce133bd9e0da upstream. With W=1 build, the compiler throws warning message as below: security/integrity/ima/ima_mok.c:24:12: warning: no previous prototype for ‘ima_mok_init’ [-Wmissing-prototypes] __init int ima_mok_init(void) Silence the warning by adding static keyword to ima_mok_init(). Signed-off-by: Austin Kim Fixes: 41c89b64d718 ("IMA: create machine owner and blacklist keyrings") Cc: stable@vger.kernel.org Signed-off-by: Mimi Zohar Signed-off-by: Greg Kroah-Hartman --- security/integrity/ima/ima_mok.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/integrity/ima/ima_mok.c b/security/integrity/ima/ima_mok.c index 1e5c019161738..95cc31525c573 100644 --- a/security/integrity/ima/ima_mok.c +++ b/security/integrity/ima/ima_mok.c @@ -21,7 +21,7 @@ struct key *ima_blacklist_keyring; /* * Allocate the IMA blacklist keyring */ -__init int ima_mok_init(void) +static __init int ima_mok_init(void) { struct key_restriction *restriction; From d1ca507611ac3b3545759b96c474d2964ef48c30 Mon Sep 17 00:00:00 2001 From: THOBY Simon Date: Mon, 16 Aug 2021 08:10:59 +0000 Subject: [PATCH 0631/5344] IMA: remove the dependency on CRYPTO_MD5 commit 8510505d55e194d3f6c9644c9f9d12c4f6b0395a upstream. MD5 is a weak digest algorithm that shouldn't be used for cryptographic operation. It hinders the efficiency of a patch set that aims to limit the digests allowed for the extended file attribute namely security.ima. MD5 is no longer a requirement for IMA, nor should it be used there. The sole place where we still use the MD5 algorithm inside IMA is setting the ima_hash algorithm to MD5, if the user supplies 'ima_hash=md5' parameter on the command line. With commit ab60368ab6a4 ("ima: Fallback to the builtin hash algorithm"), setting "ima_hash=md5" fails gracefully when CRYPTO_MD5 is not set: ima: Can not allocate md5 (reason: -2) ima: Allocating md5 failed, going to use default hash algorithm sha256 Remove the CRYPTO_MD5 dependency for IMA. Signed-off-by: THOBY Simon Reviewed-by: Lakshmi Ramasubramanian [zohar@linux.ibm.com: include commit number in patch description for stable.] Cc: stable@vger.kernel.org # 4.17 Signed-off-by: Mimi Zohar Signed-off-by: Greg Kroah-Hartman --- security/integrity/ima/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/security/integrity/ima/Kconfig b/security/integrity/ima/Kconfig index d2054bec49094..748f3ee27b23d 100644 --- a/security/integrity/ima/Kconfig +++ b/security/integrity/ima/Kconfig @@ -6,7 +6,6 @@ config IMA select SECURITYFS select CRYPTO select CRYPTO_HMAC - select CRYPTO_MD5 select CRYPTO_SHA1 select CRYPTO_HASH_INFO select TCG_TPM if HAS_IOMEM && !UML From 1b46cad633093b3552c59b52f3a5a12e41f174cb Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 8 Sep 2021 19:27:49 +0900 Subject: [PATCH 0632/5344] fbmem: don't allow too huge resolutions commit 8c28051cdcbe9dfcec6bd0a4709d67a09df6edae upstream. syzbot is reporting page fault at vga16fb_fillrect() [1], for vga16fb_check_var() is failing to detect multiplication overflow. if (vxres * vyres > maxmem) { vyres = maxmem / vxres; if (vyres < yres) return -ENOMEM; } Since no module would accept too huge resolutions where multiplication overflow happens, let's reject in the common path. Link: https://syzkaller.appspot.com/bug?extid=04168c8063cfdde1db5e [1] Reported-by: syzbot Debugged-by: Randy Dunlap Signed-off-by: Tetsuo Handa Reviewed-by: Geert Uytterhoeven Cc: stable@vger.kernel.org Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/185175d6-227a-7b55-433d-b070929b262c@i-love.sakura.ne.jp Signed-off-by: Greg Kroah-Hartman --- drivers/video/fbdev/core/fbmem.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index d87de5d467189..03b1bf994cc90 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -957,6 +957,7 @@ fb_set_var(struct fb_info *info, struct fb_var_screeninfo *var) struct fb_var_screeninfo old_var; struct fb_videomode mode; struct fb_event event; + u32 unused; if (var->activate & FB_ACTIVATE_INV_MODE) { struct fb_videomode mode1, mode2; @@ -1003,6 +1004,11 @@ fb_set_var(struct fb_info *info, struct fb_var_screeninfo *var) if (var->xres < 8 || var->yres < 8) return -EINVAL; + /* Too huge resolution causes multiplication overflow. */ + if (check_mul_overflow(var->xres, var->yres, &unused) || + check_mul_overflow(var->xres_virtual, var->yres_virtual, &unused)) + return -EINVAL; + ret = info->fbops->fb_check_var(var, info); if (ret) From a8a5ecfe37f8217cc4e892031273ff114ae6345c Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Thu, 22 Jul 2021 15:46:23 +0100 Subject: [PATCH 0633/5344] backlight: pwm_bl: Improve bootloader/kernel device handover commit 79fad92f2e596f5a8dd085788a24f540263ef887 upstream. Currently there are (at least) two problems in the way pwm_bl starts managing the enable_gpio pin. Both occur when the backlight is initially off and the driver finds the pin not already in output mode and, as a result, unconditionally switches it to output-mode and asserts the signal. Problem 1: This could cause the backlight to flicker since, at this stage in driver initialisation, we have no idea what the PWM and regulator are doing (an unconfigured PWM could easily "rest" at 100% duty cycle). Problem 2: This will cause us not to correctly honour the post_pwm_on_delay (which also risks flickers). Fix this by moving the code to configure the GPIO output mode until after we have examines the handover state. That allows us to initialize enable_gpio to off if the backlight is currently off and on if the backlight is on. Cc: stable@vger.kernel.org Reported-by: Marek Vasut Signed-off-by: Daniel Thompson Acked-by: Marek Vasut Tested-by: Marek Vasut Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman --- drivers/video/backlight/pwm_bl.c | 54 +++++++++++++++++--------------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/drivers/video/backlight/pwm_bl.c b/drivers/video/backlight/pwm_bl.c index 746eebc411dfa..047f80ee37e81 100644 --- a/drivers/video/backlight/pwm_bl.c +++ b/drivers/video/backlight/pwm_bl.c @@ -415,6 +415,33 @@ static bool pwm_backlight_is_linear(struct platform_pwm_backlight_data *data) static int pwm_backlight_initial_power_state(const struct pwm_bl_data *pb) { struct device_node *node = pb->dev->of_node; + bool active = true; + + /* + * If the enable GPIO is present, observable (either as input + * or output) and off then the backlight is not currently active. + * */ + if (pb->enable_gpio && gpiod_get_value_cansleep(pb->enable_gpio) == 0) + active = false; + + if (!regulator_is_enabled(pb->power_supply)) + active = false; + + if (!pwm_is_enabled(pb->pwm)) + active = false; + + /* + * Synchronize the enable_gpio with the observed state of the + * hardware. + */ + if (pb->enable_gpio) + gpiod_direction_output(pb->enable_gpio, active); + + /* + * Do not change pb->enabled here! pb->enabled essentially + * tells us if we own one of the regulator's use counts and + * right now we do not. + */ /* Not booted with device tree or no phandle link to the node */ if (!node || !node->phandle) @@ -426,20 +453,7 @@ static int pwm_backlight_initial_power_state(const struct pwm_bl_data *pb) * assume that another driver will enable the backlight at the * appropriate time. Therefore, if it is disabled, keep it so. */ - - /* if the enable GPIO is disabled, do not enable the backlight */ - if (pb->enable_gpio && gpiod_get_value_cansleep(pb->enable_gpio) == 0) - return FB_BLANK_POWERDOWN; - - /* The regulator is disabled, do not enable the backlight */ - if (!regulator_is_enabled(pb->power_supply)) - return FB_BLANK_POWERDOWN; - - /* The PWM is disabled, keep it like this */ - if (!pwm_is_enabled(pb->pwm)) - return FB_BLANK_POWERDOWN; - - return FB_BLANK_UNBLANK; + return active ? FB_BLANK_UNBLANK: FB_BLANK_POWERDOWN; } static int pwm_backlight_probe(struct platform_device *pdev) @@ -508,18 +522,6 @@ static int pwm_backlight_probe(struct platform_device *pdev) pb->enable_gpio = gpio_to_desc(data->enable_gpio); } - /* - * If the GPIO is not known to be already configured as output, that - * is, if gpiod_get_direction returns either 1 or -EINVAL, change the - * direction to output and set the GPIO as active. - * Do not force the GPIO to active when it was already output as it - * could cause backlight flickering or we would enable the backlight too - * early. Leave the decision of the initial backlight state for later. - */ - if (pb->enable_gpio && - gpiod_get_direction(pb->enable_gpio) != 0) - gpiod_direction_output(pb->enable_gpio, 1); - pb->power_supply = devm_regulator_get(&pdev->dev, "power"); if (IS_ERR(pb->power_supply)) { ret = PTR_ERR(pb->power_supply); From 6b153233f3171c5c620bf3019d749aec2df46061 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sun, 15 Aug 2021 01:55:14 +0200 Subject: [PATCH 0634/5344] clk: kirkwood: Fix a clocking boot regression commit aaedb9e00e5400220a8871180d23a83e67f29f63 upstream. Since a few kernel releases the Pogoplug 4 has crashed like this during boot: Unable to handle kernel NULL pointer dereference at virtual address 00000002 (...) [] (strlen) from [] (kstrdup+0x1c/0x4c) [] (kstrdup) from [] (__clk_register+0x44/0x37c) [] (__clk_register) from [] (clk_hw_register+0x20/0x44) [] (clk_hw_register) from [] (__clk_hw_register_mux+0x198/0x1e4) [] (__clk_hw_register_mux) from [] (clk_register_mux_table+0x5c/0x6c) [] (clk_register_mux_table) from [] (kirkwood_clk_muxing_setup.constprop.0+0x13c/0x1ac) [] (kirkwood_clk_muxing_setup.constprop.0) from [] (of_clk_init+0x12c/0x214) [] (of_clk_init) from [] (time_init+0x20/0x2c) [] (time_init) from [] (start_kernel+0x3dc/0x56c) [] (start_kernel) from [<00000000>] (0x0) Code: e3130020 1afffffb e12fff1e c08a1078 (e5d03000) This is because the "powersave" mux clock 0 was provided in an unterminated array, which is required by the loop in the driver: /* Count, allocate, and register clock muxes */ for (n = 0; desc[n].name;) n++; Here n will go out of bounds and then call clk_register_mux() on random memory contents after the mux clock. Fix this by terminating the array with a blank entry. Fixes: 105299381d87 ("cpufreq: kirkwood: use the powersave multiplexer") Cc: stable@vger.kernel.org Cc: Andrew Lunn Cc: Chris Packham Cc: Gregory CLEMENT Cc: Sebastian Hesselbarth Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20210814235514.403426-1-linus.walleij@linaro.org Reviewed-by: Andrew Lunn Signed-off-by: Stephen Boyd Signed-off-by: Greg Kroah-Hartman --- drivers/clk/mvebu/kirkwood.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/clk/mvebu/kirkwood.c b/drivers/clk/mvebu/kirkwood.c index 47680237d0beb..8bc893df47364 100644 --- a/drivers/clk/mvebu/kirkwood.c +++ b/drivers/clk/mvebu/kirkwood.c @@ -265,6 +265,7 @@ static const char *powersave_parents[] = { static const struct clk_muxing_soc_desc kirkwood_mux_desc[] __initconst = { { "powersave", powersave_parents, ARRAY_SIZE(powersave_parents), 11, 1, 0 }, + { } }; static struct clk *clk_muxing_get_src( From 0abc6644efc0bef3f6bc52cf801880e068e5bfac Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 15 Sep 2021 09:47:41 +0200 Subject: [PATCH 0635/5344] Linux 5.4.146 Link: https://lore.kernel.org/r/20210913131047.974309396@linuxfoundation.org Tested-by: Florian Fainelli Tested-by: Shuah Khan Tested-by: Jon Hunter Tested-by: Linux Kernel Functional Testing Tested-by: Guenter Roeck Tested-by: Sudip Mukherjee Tested-by: Hulk Robot Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index c32a36c8ffc90..48d0c03acfc55 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 PATCHLEVEL = 4 -SUBLEVEL = 145 +SUBLEVEL = 146 EXTRAVERSION = NAME = Kleptomaniac Octopus From 49d14a24bedf8a98d2570ec2027baf4f0b57975c Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 15 Sep 2021 21:16:27 -0400 Subject: [PATCH 0636/5344] Revert "Bluetooth: Move shutdown callback before flushing tx and rx queue" This reverts commit abbcd61d091f69ec98013dc0ae9c992e152fc303. Botched backport, dropping to reword for next release. Reported-by: Guenter Roeck Signed-off-by: Sasha Levin --- net/bluetooth/hci_core.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index bdd330527cfa2..c50e3e8afbd34 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1691,14 +1691,6 @@ int hci_dev_do_close(struct hci_dev *hdev) hci_request_cancel_all(hdev); hci_req_sync_lock(hdev); - if (!hci_dev_test_flag(hdev, HCI_UNREGISTER) && - !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && - test_bit(HCI_UP, &hdev->flags)) { - /* Execute vendor specific shutdown routine */ - if (hdev->shutdown) - hdev->shutdown(hdev); - } - if (!test_and_clear_bit(HCI_UP, &hdev->flags)) { cancel_delayed_work_sync(&hdev->cmd_timer); hci_req_sync_unlock(hdev); From ce02b7a40b1d19db1568d6bfd23c529269f42c89 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 16 Sep 2021 10:17:11 +0200 Subject: [PATCH 0637/5344] Revert "block: nbd: add sanity check for first_minor" This reverts commit b3fa499d72a0db612f12645265a36751955c0037 which is commit b1a811633f7321cf1ae2bb76a66805b7720e44c9 upstream. The backport of this is reported to be causing some problems, so revert this for now until they are worked out. Link: https://lore.kernel.org/r/CACPK8XfUWoOHr-0RwRoYoskia4fbAbZ7DYf5wWBnv6qUnGq18w@mail.gmail.com Reported-by: Joel Stanley Cc: Christoph Hellwig Cc: Pavel Skripkin Cc: Jens Axboe Cc: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/block/nbd.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index bc83c75596d47..bdc750c5d6dda 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1786,17 +1786,7 @@ static int nbd_dev_add(int index) refcount_set(&nbd->refs, 1); INIT_LIST_HEAD(&nbd->list); disk->major = NBD_MAJOR; - - /* Too big first_minor can cause duplicate creation of - * sysfs files/links, since first_minor will be truncated to - * byte in __device_add_disk(). - */ disk->first_minor = index << part_shift; - if (disk->first_minor > 0xff) { - err = -EINVAL; - goto out_free_idr; - } - disk->fops = &nbd_fops; disk->private_data = nbd; sprintf(disk->disk_name, "nbd%d", index); From 26832dd34d14550dc0bc2f6da05b5b89db46c75f Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 16 Sep 2021 10:51:42 +0200 Subject: [PATCH 0638/5344] Revert "posix-cpu-timers: Force next expiration recalc after itimer reset" This reverts commit c322a963d522e9a4273e18c9d7bd6fd40a25160f which is commit 406dd42bd1ba0c01babf9cde169bb319e52f6147 upstream. It is reported to cause regressions. A proposed fix has been posted, but it is not in a released kernel yet. So just revert this from the stable release so that the bug is fixed. If it's really needed we can add it back in in a future release. Link: https://lore.kernel.org/r/87ilz1pwaq.fsf@wylie.me.uk Reported-by: "Alan J. Wylie" Cc: Linus Torvalds Cc: Frederic Weisbecker Cc: Thomas Gleixner Cc: Peter Zijlstra (Intel) Cc: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- kernel/time/posix-cpu-timers.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 30e061b210b7c..eacb0ca301932 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1201,6 +1201,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid, } } + if (!*newval) + return; *newval += now; } From 6ebb6c0d11d274d489e8056e810219c845083732 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 16 Sep 2021 11:12:30 +0200 Subject: [PATCH 0639/5344] Revert "time: Handle negative seconds correctly in timespec64_to_ns()" This reverts commit 7a25a0a94c8b49759582ac6141c06af4f3e8ae8f which is commit 39ff83f2f6cc5cc1458dfcea9697f96338210beb upstream. Arnd reports that this needs more review before being merged into all of the trees. Link: https://lore.kernel.org/r/CAK8P3a0z5jE=Z3Ps5bFTCFT7CHZR1JQ8VhdntDJAfsUxSPCcEw@mail.gmail.com Reported-by: Arnd Bergmann Cc: Lukas Hannen Cc: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- include/linux/time64.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/include/linux/time64.h b/include/linux/time64.h index f6059c505986b..5eab3f2635186 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -33,9 +33,7 @@ struct itimerspec64 { #define TIME64_MIN (-TIME64_MAX - 1) #define KTIME_MAX ((s64)~((u64)1 << 63)) -#define KTIME_MIN (-KTIME_MAX - 1) #define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) -#define KTIME_SEC_MIN (KTIME_MIN / NSEC_PER_SEC) /* * Limits for settimeofday(): @@ -134,13 +132,10 @@ static inline bool timespec64_valid_settod(const struct timespec64 *ts) */ static inline s64 timespec64_to_ns(const struct timespec64 *ts) { - /* Prevent multiplication overflow / underflow */ - if (ts->tv_sec >= KTIME_SEC_MAX) + /* Prevent multiplication overflow */ + if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX) return KTIME_MAX; - if (ts->tv_sec <= KTIME_SEC_MIN) - return KTIME_MIN; - return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec; } From 8030d68af79506706cd040548fccefd44534a3ba Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 16 Sep 2021 12:56:14 +0200 Subject: [PATCH 0640/5344] Linux 5.4.147 Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 48d0c03acfc55..98227dae34947 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 PATCHLEVEL = 4 -SUBLEVEL = 146 +SUBLEVEL = 147 EXTRAVERSION = NAME = Kleptomaniac Octopus From 232585aa25bc600486a11d3955178e33cda941b0 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Sun, 8 Aug 2021 19:00:30 +0300 Subject: [PATCH 0641/5344] rtc: tps65910: Correct driver module alias commit 8d448fa0a8bb1c8d94eef7647edffe9ac81a281e upstream. The TPS65910 RTC driver module doesn't auto-load because of the wrong module alias that doesn't match the device name, fix it. Cc: stable@vger.kernel.org Reported-by: Anton Bambura Tested-by: Anton Bambura Signed-off-by: Dmitry Osipenko Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20210808160030.8556-1-digetx@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/rtc/rtc-tps65910.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-tps65910.c b/drivers/rtc/rtc-tps65910.c index 2c0467a9e7179..8d1b1fda62dd1 100644 --- a/drivers/rtc/rtc-tps65910.c +++ b/drivers/rtc/rtc-tps65910.c @@ -460,6 +460,6 @@ static struct platform_driver tps65910_rtc_driver = { }; module_platform_driver(tps65910_rtc_driver); -MODULE_ALIAS("platform:rtc-tps65910"); +MODULE_ALIAS("platform:tps65910-rtc"); MODULE_AUTHOR("Venu Byravarasu "); MODULE_LICENSE("GPL"); From d1fc87e69ef205f0599bfb58e66e4abb98d9f054 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Jul 2021 14:47:17 -0400 Subject: [PATCH 0642/5344] btrfs: wake up async_delalloc_pages waiters after submit commit ac98141d140444fe93e26471d3074c603b70e2ca upstream. We use the async_delalloc_pages mechanism to make sure that we've completed our async work before trying to continue our delalloc flushing. The reason for this is we need to see any ordered extents that were created by our delalloc flushing. However we're waking up before we do the submit work, which is before we create the ordered extents. This is a pretty wide race window where we could potentially think there are no ordered extents and thus exit shrink_delalloc prematurely. Fix this by waking us up after we've done the work to create ordered extents. CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/inode.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 33b8fedab6c67..b859ed50cf46c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1200,11 +1200,6 @@ static noinline void async_cow_submit(struct btrfs_work *work) nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >> PAGE_SHIFT; - /* atomic_sub_return implies a barrier */ - if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) < - 5 * SZ_1M) - cond_wake_up_nomb(&fs_info->async_submit_wait); - /* * ->inode could be NULL if async_chunk_start has failed to compress, * in which case we don't have anything to submit, yet we need to @@ -1213,6 +1208,11 @@ static noinline void async_cow_submit(struct btrfs_work *work) */ if (async_chunk->inode) submit_compressed_extents(async_chunk); + + /* atomic_sub_return implies a barrier */ + if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) < + 5 * SZ_1M) + cond_wake_up_nomb(&fs_info->async_submit_wait); } static noinline void async_cow_free(struct btrfs_work *work) From 8335871b5203e56e5fd1920cc791deb24c4cc0a6 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Wed, 11 Aug 2021 11:05:18 +0000 Subject: [PATCH 0643/5344] blk-zoned: allow zone management send operations without CAP_SYS_ADMIN commit ead3b768bb51259e3a5f2287ff5fc9041eb6f450 upstream. Zone management send operations (BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE) should be allowed under the same permissions as write(). (write() does not require CAP_SYS_ADMIN). Additionally, other ioctls like BLKSECDISCARD and BLKZEROOUT only check if the fd was successfully opened with FMODE_WRITE. (They do not require CAP_SYS_ADMIN). Currently, zone management send operations require both CAP_SYS_ADMIN and that the fd was successfully opened with FMODE_WRITE. Remove the CAP_SYS_ADMIN requirement, so that zone management send operations match the access control requirement of write(), BLKSECDISCARD and BLKZEROOUT. Fixes: 3ed05a987e0f ("blk-zoned: implement ioctls") Signed-off-by: Niklas Cassel Reviewed-by: Damien Le Moal Reviewed-by: Aravind Ramesh Reviewed-by: Adam Manzanares Reviewed-by: Himanshu Madhani Reviewed-by: Johannes Thumshirn Cc: stable@vger.kernel.org # v4.10+ Link: https://lore.kernel.org/r/20210811110505.29649-2-Niklas.Cassel@wdc.com Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- block/blk-zoned.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index b17c094cb977c..0a760845691e4 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -374,9 +374,6 @@ int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode, if (!blk_queue_is_zoned(q)) return -ENOTTY; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (!(mode & FMODE_WRITE)) return -EBADF; From 4498164fd4d87695c95c54b9fd9fbec81d4241b0 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Wed, 11 Aug 2021 11:05:19 +0000 Subject: [PATCH 0644/5344] blk-zoned: allow BLKREPORTZONE without CAP_SYS_ADMIN commit 4d643b66089591b4769bcdb6fd1bfeff2fe301b8 upstream. A user space process should not need the CAP_SYS_ADMIN capability set in order to perform a BLKREPORTZONE ioctl. Getting the zone report is required in order to get the write pointer. Neither read() nor write() requires CAP_SYS_ADMIN, so it is reasonable that a user space process that can read/write from/to the device, also can get the write pointer. (Since e.g. writes have to be at the write pointer.) Fixes: 3ed05a987e0f ("blk-zoned: implement ioctls") Signed-off-by: Niklas Cassel Reviewed-by: Damien Le Moal Reviewed-by: Aravind Ramesh Reviewed-by: Adam Manzanares Reviewed-by: Himanshu Madhani Reviewed-by: Johannes Thumshirn Cc: stable@vger.kernel.org # v4.10+ Link: https://lore.kernel.org/r/20210811110505.29649-3-Niklas.Cassel@wdc.com Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- block/blk-zoned.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 0a760845691e4..a85d0a06a6ff2 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -316,9 +316,6 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, if (!blk_queue_is_zoned(q)) return -ENOTTY; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report))) return -EFAULT; From d75bdc144cf67d7fbf8492f20bd754a092f3377c Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Fri, 13 Aug 2021 13:51:58 +0530 Subject: [PATCH 0645/5344] powerpc/perf/hv-gpci: Fix counter value parsing commit f9addd85fbfacf0d155e83dbee8696d6df5ed0c7 upstream. H_GetPerformanceCounterInfo (0xF080) hcall returns the counter data in the result buffer. Result buffer has specific format defined in the PAPR specification. One of the fields is counter offset and width of the counter data returned. Counter data are returned in a unsigned char array in big endian byte order. To get the final counter data, the values must be left shifted byte at a time. But commit 220a0c609ad17 ("powerpc/perf: Add support for the hv gpci (get performance counter info) interface") made the shifting bitwise and also assumed little endian order. Because of that, hcall counters values are reported incorrectly. In particular this can lead to counters go backwards which messes up the counter prev vs now calculation and leads to huge counter value reporting: #: perf stat -e hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ -C 0 -I 1000 time counts unit events 1.000078854 18,446,744,073,709,535,232 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ 2.000213293 0 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ 3.000320107 0 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ 4.000428392 0 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ 5.000537864 0 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ 6.000649087 0 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ 7.000760312 0 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ 8.000865218 16,448 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ 9.000978985 18,446,744,073,709,535,232 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ 10.001088891 16,384 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ 11.001201435 0 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ 12.001307937 18,446,744,073,709,535,232 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ Fix the shifting logic to correct match the format, ie. read bytes in big endian order. Fixes: e4f226b1580b ("powerpc/perf/hv-gpci: Increase request buffer size") Cc: stable@vger.kernel.org # v4.6+ Reported-by: Nageswara R Sastry Signed-off-by: Kajol Jain Tested-by: Nageswara R Sastry Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210813082158.429023-1-kjain@linux.ibm.com Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/perf/hv-gpci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/perf/hv-gpci.c b/arch/powerpc/perf/hv-gpci.c index 6884d16ec19b9..732cfc53e260d 100644 --- a/arch/powerpc/perf/hv-gpci.c +++ b/arch/powerpc/perf/hv-gpci.c @@ -164,7 +164,7 @@ static unsigned long single_gpci_request(u32 req, u32 starting_index, */ count = 0; for (i = offset; i < offset + length; i++) - count |= arg->bytes[i] << (i - offset); + count |= (u64)(arg->bytes[i]) << ((length - 1 - (i - offset)) * 8); *value = count; out: From 5404c02758a108ced9eabc43df4e0ff49be3563c Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 30 Jul 2021 11:26:21 +0200 Subject: [PATCH 0646/5344] xen: fix setting of max_pfn in shared_info commit 4b511d5bfa74b1926daefd1694205c7f1bcf677f upstream. Xen PV guests are specifying the highest used PFN via the max_pfn field in shared_info. This value is used by the Xen tools when saving or migrating the guest. Unfortunately this field is misnamed, as in reality it is specifying the number of pages (including any memory holes) of the guest, so it is the highest used PFN + 1. Renaming isn't possible, as this is a public Xen hypervisor interface which needs to be kept stable. The kernel will set the value correctly initially at boot time, but when adding more pages (e.g. due to memory hotplug or ballooning) a real PFN number is stored in max_pfn. This is done when expanding the p2m array, and the PFN stored there is even possibly wrong, as it should be the last possible PFN of the just added P2M frame, and not one which led to the P2M expansion. Fix that by setting shared_info->max_pfn to the last possible PFN + 1. Fixes: 98dd166ea3a3c3 ("x86/xen/p2m: hint at the last populated P2M entry") Cc: stable@vger.kernel.org Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich Link: https://lore.kernel.org/r/20210730092622.9973-2-jgross@suse.com Signed-off-by: Juergen Gross Signed-off-by: Greg Kroah-Hartman --- arch/x86/xen/p2m.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 12fcb3858303a..8b1e40ec58f65 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -622,8 +622,8 @@ int xen_alloc_p2m_entry(unsigned long pfn) } /* Expanded the p2m? */ - if (pfn > xen_p2m_last_pfn) { - xen_p2m_last_pfn = pfn; + if (pfn >= xen_p2m_last_pfn) { + xen_p2m_last_pfn = ALIGN(pfn + 1, P2M_PER_PAGE); HYPERVISOR_shared_info->arch.max_pfn = xen_p2m_last_pfn; } From 267eb31cb0be0bcec5e720eefc9de92c6a28a83d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 15 Oct 2020 20:11:31 -0700 Subject: [PATCH 0647/5344] include/linux/list.h: add a macro to test if entry is pointing to the head commit e130816164e244b692921de49771eeb28205152d upstream. Add a macro to test if entry is pointing to the head of the list which is useful in cases like: list_for_each_entry(pos, &head, member) { if (cond) break; } if (list_entry_is_head(pos, &head, member)) return -ERRNO; that allows to avoid additional variable to be added to track if loop has not been stopped in the middle. While here, convert list_for_each_entry*() family of macros to use a new one. Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Reviewed-by: Cezary Rojewski Link: https://lkml.kernel.org/r/20200929134342.51489-1-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/list.h | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/include/linux/list.h b/include/linux/list.h index f471c6d56c58e..231ff089f7d1c 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -585,6 +585,15 @@ static inline void list_splice_tail_init(struct list_head *list, pos != (head); \ pos = n, n = pos->prev) +/** + * list_entry_is_head - test if the entry points to the head of the list + * @pos: the type * to cursor + * @head: the head for your list. + * @member: the name of the list_head within the struct. + */ +#define list_entry_is_head(pos, head, member) \ + (&pos->member == (head)) + /** * list_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop cursor. @@ -593,7 +602,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry(pos, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** @@ -604,7 +613,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry_reverse(pos, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** @@ -629,7 +638,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry_continue(pos, head, member) \ for (pos = list_next_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** @@ -643,7 +652,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry_continue_reverse(pos, head, member) \ for (pos = list_prev_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** @@ -655,7 +664,7 @@ static inline void list_splice_tail_init(struct list_head *list, * Iterate over list of given type, continuing from current position. */ #define list_for_each_entry_from(pos, head, member) \ - for (; &pos->member != (head); \ + for (; !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** @@ -668,7 +677,7 @@ static inline void list_splice_tail_init(struct list_head *list, * Iterate backwards over list of given type, continuing from current position. */ #define list_for_each_entry_from_reverse(pos, head, member) \ - for (; &pos->member != (head); \ + for (; !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** @@ -681,7 +690,7 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_for_each_entry_safe(pos, n, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member), \ n = list_next_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** @@ -697,7 +706,7 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_for_each_entry_safe_continue(pos, n, head, member) \ for (pos = list_next_entry(pos, member), \ n = list_next_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** @@ -712,7 +721,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry_safe_from(pos, n, head, member) \ for (n = list_next_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** @@ -728,7 +737,7 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_for_each_entry_safe_reverse(pos, n, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member), \ n = list_prev_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = n, n = list_prev_entry(n, member)) /** From 6206f233b5fc339aa1b4190392447a89df05e321 Mon Sep 17 00:00:00 2001 From: Harshvardhan Jha Date: Tue, 27 Jul 2021 05:37:10 +0530 Subject: [PATCH 0648/5344] 9p/xen: Fix end of loop tests for list_for_each_entry commit 732b33d0dbf17e9483f0b50385bf606f724f50a2 upstream. This patch addresses the following problems: - priv can never be NULL, so this part of the check is useless - if the loop ran through the whole list, priv->client is invalid and it is more appropriate and sufficient to check for the end of list_for_each_entry loop condition. Link: http://lkml.kernel.org/r/20210727000709.225032-1-harshvardhan.jha@oracle.com Signed-off-by: Harshvardhan Jha Reviewed-by: Stefano Stabellini Tested-by: Stefano Stabellini Cc: Signed-off-by: Dominique Martinet Signed-off-by: Greg Kroah-Hartman --- net/9p/trans_xen.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index 3963eb11c3fbd..44e6c74ed4288 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -138,7 +138,7 @@ static bool p9_xen_write_todo(struct xen_9pfs_dataring *ring, RING_IDX size) static int p9_xen_request(struct p9_client *client, struct p9_req_t *p9_req) { - struct xen_9pfs_front_priv *priv = NULL; + struct xen_9pfs_front_priv *priv; RING_IDX cons, prod, masked_cons, masked_prod; unsigned long flags; u32 size = p9_req->tc.size; @@ -151,7 +151,7 @@ static int p9_xen_request(struct p9_client *client, struct p9_req_t *p9_req) break; } read_unlock(&xen_9pfs_lock); - if (!priv || priv->client != client) + if (list_entry_is_head(priv, &xen_9pfs_devs, list)) return -EINVAL; num = p9_req->tc.tag % priv->num_rings; From 4f13965b75f325e0899bcc1a27bf24553be2733b Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Fri, 30 Jul 2021 13:51:54 +0200 Subject: [PATCH 0649/5344] tools/thermal/tmon: Add cross compiling support commit b5f7912bb604b47a0fe024560488a7556dce8ee7 upstream. Default to prefixed pkg-config when crosscompiling, this matches what other parts of the tools/ directory already do. [dlezcano] : Reworked description Signed-off-by: Rolf Eike Beer Cc: stable@vger.kernel.org Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/31302992.qZodDJZGDc@devpool47 Signed-off-by: Greg Kroah-Hartman --- tools/thermal/tmon/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/thermal/tmon/Makefile b/tools/thermal/tmon/Makefile index 59e417ec3e134..25d7f8f37cfd6 100644 --- a/tools/thermal/tmon/Makefile +++ b/tools/thermal/tmon/Makefile @@ -10,7 +10,7 @@ override CFLAGS+= $(call cc-option,-O3,-O1) ${WARNFLAGS} # Add "-fstack-protector" only if toolchain supports it. override CFLAGS+= $(call cc-option,-fstack-protector-strong) CC?= $(CROSS_COMPILE)gcc -PKG_CONFIG?= pkg-config +PKG_CONFIG?= $(CROSS_COMPILE)pkg-config override CFLAGS+=-D VERSION=\"$(VERSION)\" LDFLAGS+= From 7fea5adf6b57ff86556059beeeae6009aa6c41a2 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 25 Jul 2021 19:08:30 +0100 Subject: [PATCH 0650/5344] pinctrl: stmfx: Fix hazardous u8[] to unsigned long cast commit 1b73e588f47397dee6e4bdfd953e0306c60b5fe5 upstream. Casting a small array of u8 to an unsigned long is *never* OK: - it does funny thing when the array size is less than that of a long, as it accesses random places in the stack - it makes everything even more fun with a BE kernel Fix this by building the unsigned long used as a bitmap byte by byte, in a way that works across endianess and has no undefined behaviours. An extra BUILD_BUG_ON() catches the unlikely case where the array would be larger than a single unsigned long. Fixes: 1490d9f841b1 ("pinctrl: Add STMFX GPIO expander Pinctrl/GPIO driver") Signed-off-by: Marc Zyngier Cc: stable@vger.kernel.org Cc: Amelie Delaunay Cc: Linus Walleij Cc: Maxime Coquelin Cc: Alexandre Torgue Link: https://lore.kernel.org/r/20210725180830.250218-1-maz@kernel.org Signed-off-by: Linus Walleij Signed-off-by: Greg Kroah-Hartman --- drivers/pinctrl/pinctrl-stmfx.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/pinctrl/pinctrl-stmfx.c b/drivers/pinctrl/pinctrl-stmfx.c index ccdf0bb214149..835c14bb315bc 100644 --- a/drivers/pinctrl/pinctrl-stmfx.c +++ b/drivers/pinctrl/pinctrl-stmfx.c @@ -540,7 +540,7 @@ static irqreturn_t stmfx_pinctrl_irq_thread_fn(int irq, void *dev_id) u8 pending[NR_GPIO_REGS]; u8 src[NR_GPIO_REGS] = {0, 0, 0}; unsigned long n, status; - int ret; + int i, ret; ret = regmap_bulk_read(pctl->stmfx->map, STMFX_REG_IRQ_GPI_PENDING, &pending, NR_GPIO_REGS); @@ -550,7 +550,9 @@ static irqreturn_t stmfx_pinctrl_irq_thread_fn(int irq, void *dev_id) regmap_bulk_write(pctl->stmfx->map, STMFX_REG_IRQ_GPI_SRC, src, NR_GPIO_REGS); - status = *(unsigned long *)pending; + BUILD_BUG_ON(NR_GPIO_REGS > sizeof(status)); + for (i = 0, status = 0; i < NR_GPIO_REGS; i++) + status |= (unsigned long)pending[i] << (i * 8); for_each_set_bit(n, &status, gc->ngpio) { handle_nested_irq(irq_find_mapping(gc->irq.domain, n)); stmfx_pinctrl_irq_toggle_trigger(pctl, n); From 4dea8e915154547ecc3184d2898e8e6056e23001 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Sat, 17 Jul 2021 18:48:34 +0100 Subject: [PATCH 0651/5344] pinctrl: ingenic: Fix incorrect pull up/down info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit d5e931403942b3af39212960c2592b5ba741b2bf upstream. Fix the pull up/down info for both the JZ4760 and JZ4770 SoCs, as the previous values sometimes contradicted what's written in the programming manual. Fixes: b5c23aa46537 ("pinctrl: add a pinctrl driver for the Ingenic jz47xx SoCs") Cc: # v4.12 Signed-off-by: Paul Cercueil Tested-by: 周琰杰 (Zhou Yanjie) Link: https://lore.kernel.org/r/20210717174836.14776-1-paul@crapouillou.net Signed-off-by: Linus Walleij Signed-off-by: Greg Kroah-Hartman --- drivers/pinctrl/pinctrl-ingenic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/pinctrl/pinctrl-ingenic.c b/drivers/pinctrl/pinctrl-ingenic.c index 91596eee0bda1..ba078a7098468 100644 --- a/drivers/pinctrl/pinctrl-ingenic.c +++ b/drivers/pinctrl/pinctrl-ingenic.c @@ -348,7 +348,7 @@ static const struct ingenic_chip_info jz4725b_chip_info = { }; static const u32 jz4760_pull_ups[6] = { - 0xffffffff, 0xfffcf3ff, 0xffffffff, 0xffffcfff, 0xfffffb7c, 0xfffff00f, + 0xffffffff, 0xfffcf3ff, 0xffffffff, 0xffffcfff, 0xfffffb7c, 0x0000000f, }; static const u32 jz4760_pull_downs[6] = { @@ -611,11 +611,11 @@ static const struct ingenic_chip_info jz4760b_chip_info = { }; static const u32 jz4770_pull_ups[6] = { - 0x3fffffff, 0xfff0030c, 0xffffffff, 0xffff4fff, 0xfffffb7c, 0xffa7f00f, + 0x3fffffff, 0xfff0f3fc, 0xffffffff, 0xffff4fff, 0xfffffb7c, 0x0024f00f, }; static const u32 jz4770_pull_downs[6] = { - 0x00000000, 0x000f0c03, 0x00000000, 0x0000b000, 0x00000483, 0x00580ff0, + 0x00000000, 0x000f0c03, 0x00000000, 0x0000b000, 0x00000483, 0x005b0ff0, }; static int jz4770_uart0_data_pins[] = { 0xa0, 0xa3, }; From 07af3cc39395638cbf6a7a6e7811c0d690baf52a Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Tue, 29 Jun 2021 21:02:49 +0530 Subject: [PATCH 0652/5344] soc: qcom: aoss: Fix the out of bound usage of cooling_devs commit a89f355e469dcda129c2522be4fdba00c1c74c83 upstream. In "qmp_cooling_devices_register", the count value is initially QMP_NUM_COOLING_RESOURCES, which is 2. Based on the initial count value, the memory for cooling_devs is allocated. Then while calling the "qmp_cooling_device_add" function, count value is post-incremented for each child node. This makes the out of bound access to the cooling_dev array. Fix it by passing the QMP_NUM_COOLING_RESOURCES definition to devm_kzalloc() and initializing the count to 0. While at it, let's also free the memory allocated to cooling_dev if no cooling device is found in DT and during unroll phase. Cc: stable@vger.kernel.org # 5.4 Fixes: 05589b30b21a ("soc: qcom: Extend AOSS QMP driver to support resources that are used to wake up the SoC.") Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20210629153249.73428-1-manivannan.sadhasivam@linaro.org Signed-off-by: Bjorn Andersson Signed-off-by: Greg Kroah-Hartman --- drivers/soc/qcom/qcom_aoss.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/soc/qcom/qcom_aoss.c b/drivers/soc/qcom/qcom_aoss.c index 33a27e6c6d67d..45c5aa712edac 100644 --- a/drivers/soc/qcom/qcom_aoss.c +++ b/drivers/soc/qcom/qcom_aoss.c @@ -472,12 +472,12 @@ static int qmp_cooling_device_add(struct qmp *qmp, static int qmp_cooling_devices_register(struct qmp *qmp) { struct device_node *np, *child; - int count = QMP_NUM_COOLING_RESOURCES; + int count = 0; int ret; np = qmp->dev->of_node; - qmp->cooling_devs = devm_kcalloc(qmp->dev, count, + qmp->cooling_devs = devm_kcalloc(qmp->dev, QMP_NUM_COOLING_RESOURCES, sizeof(*qmp->cooling_devs), GFP_KERNEL); @@ -493,12 +493,16 @@ static int qmp_cooling_devices_register(struct qmp *qmp) goto unroll; } + if (!count) + devm_kfree(qmp->dev, qmp->cooling_devs); + return 0; unroll: while (--count >= 0) thermal_cooling_device_unregister (qmp->cooling_devs[count].cdev); + devm_kfree(qmp->dev, qmp->cooling_devs); return ret; } From 20b13189861f8c9f8d09de841791f2099a0c8efb Mon Sep 17 00:00:00 2001 From: Iwona Winiarska Date: Wed, 4 Aug 2021 01:48:18 +0200 Subject: [PATCH 0653/5344] soc: aspeed: lpc-ctrl: Fix boundary check for mmap commit b49a0e69a7b1a68c8d3f64097d06dabb770fec96 upstream. The check mixes pages (vm_pgoff) with bytes (vm_start, vm_end) on one side of the comparison, and uses resource address (rather than just the resource size) on the other side of the comparison. This can allow malicious userspace to easily bypass the boundary check and map pages that are located outside memory-region reserved by the driver. Fixes: 6c4e97678501 ("drivers/misc: Add Aspeed LPC control driver") Cc: stable@vger.kernel.org Signed-off-by: Iwona Winiarska Reviewed-by: Andrew Jeffery Tested-by: Andrew Jeffery Reviewed-by: Joel Stanley Signed-off-by: Joel Stanley Signed-off-by: Greg Kroah-Hartman --- drivers/soc/aspeed/aspeed-lpc-ctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/soc/aspeed/aspeed-lpc-ctrl.c b/drivers/soc/aspeed/aspeed-lpc-ctrl.c index 01ed21e8bfee5..040c7dc1d4792 100644 --- a/drivers/soc/aspeed/aspeed-lpc-ctrl.c +++ b/drivers/soc/aspeed/aspeed-lpc-ctrl.c @@ -46,7 +46,7 @@ static int aspeed_lpc_ctrl_mmap(struct file *file, struct vm_area_struct *vma) unsigned long vsize = vma->vm_end - vma->vm_start; pgprot_t prot = vma->vm_page_prot; - if (vma->vm_pgoff + vsize > lpc_ctrl->mem_base + lpc_ctrl->mem_size) + if (vma->vm_pgoff + vma_pages(vma) > lpc_ctrl->mem_size >> PAGE_SHIFT) return -EINVAL; /* ast2400/2500 AHB accesses are not cache coherent */ From 17b94b919be08c67c1f90d2d87d04e5ab3a72de0 Mon Sep 17 00:00:00 2001 From: Iwona Winiarska Date: Wed, 4 Aug 2021 01:48:19 +0200 Subject: [PATCH 0654/5344] soc: aspeed: p2a-ctrl: Fix boundary check for mmap commit 8b07e990fb254fcbaa919616ac77f981cb48c73d upstream. The check mixes pages (vm_pgoff) with bytes (vm_start, vm_end) on one side of the comparison, and uses resource address (rather than just the resource size) on the other side of the comparison. This can allow malicious userspace to easily bypass the boundary check and map pages that are located outside memory-region reserved by the driver. Fixes: 01c60dcea9f7 ("drivers/misc: Add Aspeed P2A control driver") Cc: stable@vger.kernel.org Signed-off-by: Iwona Winiarska Reviewed-by: Andrew Jeffery Tested-by: Andrew Jeffery Reviewed-by: Joel Stanley Signed-off-by: Joel Stanley Signed-off-by: Greg Kroah-Hartman --- drivers/soc/aspeed/aspeed-p2a-ctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/soc/aspeed/aspeed-p2a-ctrl.c b/drivers/soc/aspeed/aspeed-p2a-ctrl.c index b60fbeaffcbd0..20b5fb2a207cc 100644 --- a/drivers/soc/aspeed/aspeed-p2a-ctrl.c +++ b/drivers/soc/aspeed/aspeed-p2a-ctrl.c @@ -110,7 +110,7 @@ static int aspeed_p2a_mmap(struct file *file, struct vm_area_struct *vma) vsize = vma->vm_end - vma->vm_start; prot = vma->vm_page_prot; - if (vma->vm_pgoff + vsize > ctrl->mem_base + ctrl->mem_size) + if (vma->vm_pgoff + vma_pages(vma) > ctrl->mem_size >> PAGE_SHIFT) return -EINVAL; /* ast2400/2500 AHB accesses are not cache coherent */ From 37fe72bd19774ce0fa3ca6ae9f0917565dd17f7d Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Aug 2021 11:12:53 +0100 Subject: [PATCH 0655/5344] arm64: head: avoid over-mapping in map_memory commit 90268574a3e8a6b883bd802d702a2738577e1006 upstream. The `compute_indices` and `populate_entries` macros operate on inclusive bounds, and thus the `map_memory` macro which uses them also operates on inclusive bounds. We pass `_end` and `_idmap_text_end` to `map_memory`, but these are exclusive bounds, and if one of these is sufficiently aligned (as a result of kernel configuration, physical placement, and KASLR), then: * In `compute_indices`, the computed `iend` will be in the page/block *after* the final byte of the intended mapping. * In `populate_entries`, an unnecessary entry will be created at the end of each level of table. At the leaf level, this entry will map up to SWAPPER_BLOCK_SIZE bytes of physical addresses that we did not intend to map. As we may map up to SWAPPER_BLOCK_SIZE bytes more than intended, we may violate the boot protocol and map physical address past the 2MiB-aligned end address we are permitted to map. As we map these with Normal memory attributes, this may result in further problems depending on what these physical addresses correspond to. The final entry at each level may require an additional table at that level. As EARLY_ENTRIES() calculates an inclusive bound, we allocate enough memory for this. Avoid the extraneous mapping by having map_memory convert the exclusive end address to an inclusive end address by subtracting one, and do likewise in EARLY_ENTRIES() when calculating the number of required tables. For clarity, comments are updated to more clearly document which boundaries the macros operate on. For consistency with the other macros, the comments in map_memory are also updated to describe `vstart` and `vend` as virtual addresses. Fixes: 0370b31e4845 ("arm64: Extend early page table code to allow for larger kernels") Cc: # 4.16.x Signed-off-by: Mark Rutland Cc: Anshuman Khandual Cc: Ard Biesheuvel Cc: Steve Capper Cc: Will Deacon Acked-by: Will Deacon Link: https://lore.kernel.org/r/20210823101253.55567-1-mark.rutland@arm.com Signed-off-by: Catalin Marinas Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/kernel-pgtable.h | 4 ++-- arch/arm64/kernel/head.S | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 817efd95d539f..9679b74a20817 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -65,8 +65,8 @@ #define EARLY_KASLR (0) #endif -#define EARLY_ENTRIES(vstart, vend, shift) (((vend) >> (shift)) \ - - ((vstart) >> (shift)) + 1 + EARLY_KASLR) +#define EARLY_ENTRIES(vstart, vend, shift) \ + ((((vend) - 1) >> (shift)) - ((vstart) >> (shift)) + 1 + EARLY_KASLR) #define EARLY_PGDS(vstart, vend) (EARLY_ENTRIES(vstart, vend, PGDIR_SHIFT)) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index a2e0b37549433..2f784d3b4b390 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -194,7 +194,7 @@ ENDPROC(preserve_boot_args) * to be composed of multiple pages. (This effectively scales the end index). * * vstart: virtual address of start of range - * vend: virtual address of end of range + * vend: virtual address of end of range - we map [vstart, vend] * shift: shift used to transform virtual address into index * ptrs: number of entries in page table * istart: index in table corresponding to vstart @@ -231,17 +231,18 @@ ENDPROC(preserve_boot_args) * * tbl: location of page table * rtbl: address to be used for first level page table entry (typically tbl + PAGE_SIZE) - * vstart: start address to map - * vend: end address to map - we map [vstart, vend] + * vstart: virtual address of start of range + * vend: virtual address of end of range - we map [vstart, vend - 1] * flags: flags to use to map last level entries * phys: physical address corresponding to vstart - physical memory is contiguous * pgds: the number of pgd entries * * Temporaries: istart, iend, tmp, count, sv - these need to be different registers - * Preserves: vstart, vend, flags - * Corrupts: tbl, rtbl, istart, iend, tmp, count, sv + * Preserves: vstart, flags + * Corrupts: tbl, rtbl, vend, istart, iend, tmp, count, sv */ .macro map_memory, tbl, rtbl, vstart, vend, flags, phys, pgds, istart, iend, tmp, count, sv + sub \vend, \vend, #1 add \rtbl, \tbl, #PAGE_SIZE mov \sv, \rtbl mov \count, #0 From 9f910f094fffbd7f7435134a909ff4e11e89df0d Mon Sep 17 00:00:00 2001 From: zhenwei pi Date: Thu, 19 Aug 2021 20:37:10 +0800 Subject: [PATCH 0656/5344] crypto: public_key: fix overflow during implicit conversion commit f985911b7bc75d5c98ed24d8aaa8b94c590f7c6a upstream. Hit kernel warning like this, it can be reproduced by verifying 256 bytes datafile by keyctl command, run script: RAWDATA=rawdata SIGDATA=sigdata modprobe pkcs8_key_parser rm -rf *.der *.pem *.pfx rm -rf $RAWDATA dd if=/dev/random of=$RAWDATA bs=256 count=1 openssl req -nodes -x509 -newkey rsa:4096 -keyout key.pem -out cert.pem \ -subj "/C=CN/ST=GD/L=SZ/O=vihoo/OU=dev/CN=xx.com/emailAddress=yy@xx.com" KEY_ID=`openssl pkcs8 -in key.pem -topk8 -nocrypt -outform DER | keyctl \ padd asymmetric 123 @s` keyctl pkey_sign $KEY_ID 0 $RAWDATA enc=pkcs1 hash=sha1 > $SIGDATA keyctl pkey_verify $KEY_ID 0 $RAWDATA $SIGDATA enc=pkcs1 hash=sha1 Then the kernel reports: WARNING: CPU: 5 PID: 344556 at crypto/rsa-pkcs1pad.c:540 pkcs1pad_verify+0x160/0x190 ... Call Trace: public_key_verify_signature+0x282/0x380 ? software_key_query+0x12d/0x180 ? keyctl_pkey_params_get+0xd6/0x130 asymmetric_key_verify_signature+0x66/0x80 keyctl_pkey_verify+0xa5/0x100 do_syscall_64+0x35/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae The reason of this issue, in function 'asymmetric_key_verify_signature': '.digest_size(u8) = params->in_len(u32)' leads overflow of an u8 value, so use u32 instead of u8 for digest_size field. And reorder struct public_key_signature, it saves 8 bytes on a 64-bit machine. Cc: stable@vger.kernel.org Signed-off-by: zhenwei pi Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen Signed-off-by: Greg Kroah-Hartman --- include/crypto/public_key.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/crypto/public_key.h b/include/crypto/public_key.h index 0588ef3bc6ff6..48722f2b8543b 100644 --- a/include/crypto/public_key.h +++ b/include/crypto/public_key.h @@ -38,9 +38,9 @@ extern void public_key_free(struct public_key *key); struct public_key_signature { struct asymmetric_key_id *auth_ids[2]; u8 *s; /* Signature */ - u32 s_size; /* Number of bytes in signature */ u8 *digest; - u8 digest_size; /* Number of bytes in digest */ + u32 s_size; /* Number of bytes in signature */ + u32 digest_size; /* Number of bytes in digest */ const char *pkey_algo; const char *hash_algo; const char *encoding; From 03b5c30ba7905298a2107fd59eab31140f664827 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 11 Aug 2021 12:36:57 +0900 Subject: [PATCH 0657/5344] block: bfq: fix bfq_set_next_ioprio_data() commit a680dd72ec336b81511e3bff48efac6dbfa563e7 upstream. For a request that has a priority level equal to or larger than IOPRIO_BE_NR, bfq_set_next_ioprio_data() prints a critical warning but defaults to setting the request new_ioprio field to IOPRIO_BE_NR. This is not consistent with the warning and the allowed values for priority levels. Fix this by setting the request new_ioprio field to IOPRIO_BE_NR - 1, the lowest priority level allowed. Cc: Fixes: aee69d78dec0 ("block, bfq: introduce the BFQ-v0 I/O scheduler as an extra scheduler") Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210811033702.368488-2-damien.lemoal@wdc.com Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- block/bfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index efb190b4d955a..c5da1955b1c94 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5007,7 +5007,7 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) if (bfqq->new_ioprio >= IOPRIO_BE_NR) { pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", bfqq->new_ioprio); - bfqq->new_ioprio = IOPRIO_BE_NR; + bfqq->new_ioprio = IOPRIO_BE_NR - 1; } bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); From 12e3d2d2680ef7a9ff6698087cc4d56e1e7419f0 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 16 Aug 2021 10:27:14 +0200 Subject: [PATCH 0658/5344] power: supply: max17042: handle fails of reading status register commit 54784ffa5b267f57161eb8fbb811499f22a0a0bf upstream. Reading status register can fail in the interrupt handler. In such case, the regmap_read() will not store anything useful under passed 'val' variable and random stack value will be used to determine type of interrupt. Handle the regmap_read() failure to avoid handling interrupt type and triggering changed power supply event based on random stack value. Fixes: 39e7213edc4f ("max17042_battery: Support regmap to access device's registers") Cc: Signed-off-by: Krzysztof Kozlowski Reviewed-by: Hans de Goede Signed-off-by: Sebastian Reichel Signed-off-by: Greg Kroah-Hartman --- drivers/power/supply/max17042_battery.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/power/supply/max17042_battery.c b/drivers/power/supply/max17042_battery.c index ab4740c3bf573..f8f8207a1895e 100644 --- a/drivers/power/supply/max17042_battery.c +++ b/drivers/power/supply/max17042_battery.c @@ -842,8 +842,12 @@ static irqreturn_t max17042_thread_handler(int id, void *dev) { struct max17042_chip *chip = dev; u32 val; + int ret; + + ret = regmap_read(chip->regmap, MAX17042_STATUS, &val); + if (ret) + return IRQ_HANDLED; - regmap_read(chip->regmap, MAX17042_STATUS, &val); if ((val & STATUS_INTR_SOCMIN_BIT) || (val & STATUS_INTR_SOCMAX_BIT)) { dev_info(&chip->client->dev, "SOC threshold INTR\n"); From ebf753d9e932801e58df73adc2ce1246a31b3bfd Mon Sep 17 00:00:00 2001 From: Arne Welzel Date: Sat, 14 Aug 2021 00:40:38 +0200 Subject: [PATCH 0659/5344] dm crypt: Avoid percpu_counter spinlock contention in crypt_page_alloc() commit 528b16bfc3ae5f11638e71b3b63a81f9999df727 upstream. On systems with many cores using dm-crypt, heavy spinlock contention in percpu_counter_compare() can be observed when the page allocation limit for a given device is reached or close to be reached. This is due to percpu_counter_compare() taking a spinlock to compute an exact result on potentially many CPUs at the same time. Switch to non-exact comparison of allocated and allowed pages by using the value returned by percpu_counter_read_positive() to avoid taking the percpu_counter spinlock. This may over/under estimate the actual number of allocated pages by at most (batch-1) * num_online_cpus(). Currently, batch is bounded by 32. The system on which this issue was first observed has 256 CPUs and 512GB of RAM. With a 4k page size, this change may over/under estimate by 31MB. With ~10G (2%) allowed dm-crypt allocations, this seems an acceptable error. Certainly preferred over running into the spinlock contention. This behavior was reproduced on an EC2 c5.24xlarge instance with 96 CPUs and 192GB RAM as follows, but can be provoked on systems with less CPUs as well. * Disable swap * Tune vm settings to promote regular writeback $ echo 50 > /proc/sys/vm/dirty_expire_centisecs $ echo 25 > /proc/sys/vm/dirty_writeback_centisecs $ echo $((128 * 1024 * 1024)) > /proc/sys/vm/dirty_background_bytes * Create 8 dmcrypt devices based on files on a tmpfs * Create and mount an ext4 filesystem on each crypt devices * Run stress-ng --hdd 8 within one of above filesystems Total %system usage collected from sysstat goes to ~35%. Write throughput on the underlying loop device is ~2GB/s. perf profiling an individual kworker kcryptd thread shows the following profile, indicating spinlock contention in percpu_counter_compare(): 99.98% 0.00% kworker/u193:46 [kernel.kallsyms] [k] ret_from_fork | --ret_from_fork kthread worker_thread | --99.92%--process_one_work | |--80.52%--kcryptd_crypt | | | |--62.58%--mempool_alloc | | | | | --62.24%--crypt_page_alloc | | | | | --61.51%--__percpu_counter_compare | | | | | --61.34%--__percpu_counter_sum | | | | | |--58.68%--_raw_spin_lock_irqsave | | | | | | | --58.30%--native_queued_spin_lock_slowpath | | | | | --0.69%--cpumask_next | | | | | --0.51%--_find_next_bit | | | |--10.61%--crypt_convert | | | | | |--6.05%--xts_crypt ... After applying this patch and running the same test, %system usage is lowered to ~7% and write throughput on the loop device increases to ~2.7GB/s. perf report shows mempool_alloc() as ~8% rather than ~62% in the profile and not hitting the percpu_counter() spinlock anymore. |--8.15%--mempool_alloc | | | |--3.93%--crypt_page_alloc | | | | | --3.75%--__alloc_pages | | | | | --3.62%--get_page_from_freelist | | | | | --3.22%--rmqueue_bulk | | | | | --2.59%--_raw_spin_lock | | | | | --2.57%--native_queued_spin_lock_slowpath | | | --3.05%--_raw_spin_lock_irqsave | | | --2.49%--native_queued_spin_lock_slowpath Suggested-by: DJ Gregor Reviewed-by: Mikulas Patocka Signed-off-by: Arne Welzel Fixes: 5059353df86e ("dm crypt: limit the number of allocated pages") Cc: stable@vger.kernel.org Signed-off-by: Mike Snitzer Signed-off-by: Greg Kroah-Hartman --- drivers/md/dm-crypt.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index d85648b9c247a..571c04e70343a 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2092,7 +2092,12 @@ static void *crypt_page_alloc(gfp_t gfp_mask, void *pool_data) struct crypt_config *cc = pool_data; struct page *page; - if (unlikely(percpu_counter_compare(&cc->n_allocated_pages, dm_crypt_pages_per_client) >= 0) && + /* + * Note, percpu_counter_read_positive() may over (and under) estimate + * the current usage by at most (batch - 1) * num_online_cpus() pages, + * but avoids potential spinlock contention of an exact result. + */ + if (unlikely(percpu_counter_read_positive(&cc->n_allocated_pages) >= dm_crypt_pages_per_client) && likely(gfp_mask & __GFP_NORETRY)) return NULL; From e8c16782c79e30c154ab3468fb9ed6e46a3514ea Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Wed, 18 Aug 2021 20:48:45 +0800 Subject: [PATCH 0660/5344] VMCI: fix NULL pointer dereference when unmapping queue pair commit a30dc6cf0dc51419021550152e435736aaef8799 upstream. I got a NULL pointer dereference report when doing fuzz test: Call Trace: qp_release_pages+0xae/0x130 qp_host_unregister_user_memory.isra.25+0x2d/0x80 vmci_qp_broker_unmap+0x191/0x320 ? vmci_host_do_alloc_queuepair.isra.9+0x1c0/0x1c0 vmci_host_unlocked_ioctl+0x59f/0xd50 ? do_vfs_ioctl+0x14b/0xa10 ? tomoyo_file_ioctl+0x28/0x30 ? vmci_host_do_alloc_queuepair.isra.9+0x1c0/0x1c0 __x64_sys_ioctl+0xea/0x120 do_syscall_64+0x34/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae When a queue pair is created by the following call, it will not register the user memory if the page_store is NULL, and the entry->state will be set to VMCIQPB_CREATED_NO_MEM. vmci_host_unlocked_ioctl vmci_host_do_alloc_queuepair vmci_qp_broker_alloc qp_broker_alloc qp_broker_create // set entry->state = VMCIQPB_CREATED_NO_MEM; When unmapping this queue pair, qp_host_unregister_user_memory() will be called to unregister the non-existent user memory, which will result in a null pointer reference. It will also change VMCIQPB_CREATED_NO_MEM to VMCIQPB_CREATED_MEM, which should not be present in this operation. Only when the qp broker has mem, it can unregister the user memory when unmapping the qp broker. Only when the qp broker has no mem, it can register the user memory when mapping the qp broker. Fixes: 06164d2b72aa ("VMCI: queue pairs implementation.") Cc: stable Reported-by: Hulk Robot Reviewed-by: Jorgen Hansen Signed-off-by: Wang Hai Link: https://lore.kernel.org/r/20210818124845.488312-1-wanghai38@huawei.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/vmw_vmci/vmci_queue_pair.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/misc/vmw_vmci/vmci_queue_pair.c b/drivers/misc/vmw_vmci/vmci_queue_pair.c index c2338750313c4..a49782dd903cd 100644 --- a/drivers/misc/vmw_vmci/vmci_queue_pair.c +++ b/drivers/misc/vmw_vmci/vmci_queue_pair.c @@ -2238,7 +2238,8 @@ int vmci_qp_broker_map(struct vmci_handle handle, result = VMCI_SUCCESS; - if (context_id != VMCI_HOST_CONTEXT_ID) { + if (context_id != VMCI_HOST_CONTEXT_ID && + !QPBROKERSTATE_HAS_MEM(entry)) { struct vmci_qp_page_store page_store; page_store.pages = guest_mem; @@ -2345,7 +2346,8 @@ int vmci_qp_broker_unmap(struct vmci_handle handle, goto out; } - if (context_id != VMCI_HOST_CONTEXT_ID) { + if (context_id != VMCI_HOST_CONTEXT_ID && + QPBROKERSTATE_HAS_MEM(entry)) { qp_acquire_queue_mutex(entry->produce_q); result = qp_save_headers(entry); if (result < VMCI_SUCCESS) From 4c33a5734fd73d13065f1be8267d0807828e9f88 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 17 Jun 2021 14:33:29 +0200 Subject: [PATCH 0661/5344] media: uvc: don't do DMA on stack commit 1a10d7fdb6d0e235e9d230916244cc2769d3f170 upstream. As warned by smatch: drivers/media/usb/uvc/uvc_v4l2.c:911 uvc_ioctl_g_input() error: doing dma on the stack (&i) drivers/media/usb/uvc/uvc_v4l2.c:943 uvc_ioctl_s_input() error: doing dma on the stack (&i) those two functions call uvc_query_ctrl passing a pointer to a data at the DMA stack. those are used to send URBs via usb_control_msg(). Using DMA stack is not supported and should not work anymore on modern Linux versions. So, use a kmalloc'ed buffer. Cc: stable@vger.kernel.org # Kernel 4.9 and upper Reviewed-by: Laurent Pinchart Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/usb/uvc/uvc_v4l2.c | 34 +++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/drivers/media/usb/uvc/uvc_v4l2.c b/drivers/media/usb/uvc/uvc_v4l2.c index 7d60dd3b0bd85..db7f8f8ee2f9f 100644 --- a/drivers/media/usb/uvc/uvc_v4l2.c +++ b/drivers/media/usb/uvc/uvc_v4l2.c @@ -894,8 +894,8 @@ static int uvc_ioctl_g_input(struct file *file, void *fh, unsigned int *input) { struct uvc_fh *handle = fh; struct uvc_video_chain *chain = handle->chain; + u8 *buf; int ret; - u8 i; if (chain->selector == NULL || (chain->dev->quirks & UVC_QUIRK_IGNORE_SELECTOR_UNIT)) { @@ -903,22 +903,27 @@ static int uvc_ioctl_g_input(struct file *file, void *fh, unsigned int *input) return 0; } + buf = kmalloc(1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + ret = uvc_query_ctrl(chain->dev, UVC_GET_CUR, chain->selector->id, chain->dev->intfnum, UVC_SU_INPUT_SELECT_CONTROL, - &i, 1); - if (ret < 0) - return ret; + buf, 1); + if (!ret) + *input = *buf - 1; - *input = i - 1; - return 0; + kfree(buf); + + return ret; } static int uvc_ioctl_s_input(struct file *file, void *fh, unsigned int input) { struct uvc_fh *handle = fh; struct uvc_video_chain *chain = handle->chain; + u8 *buf; int ret; - u32 i; ret = uvc_acquire_privileges(handle); if (ret < 0) @@ -934,10 +939,17 @@ static int uvc_ioctl_s_input(struct file *file, void *fh, unsigned int input) if (input >= chain->selector->bNrInPins) return -EINVAL; - i = input + 1; - return uvc_query_ctrl(chain->dev, UVC_SET_CUR, chain->selector->id, - chain->dev->intfnum, UVC_SU_INPUT_SELECT_CONTROL, - &i, 1); + buf = kmalloc(1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + *buf = input + 1; + ret = uvc_query_ctrl(chain->dev, UVC_SET_CUR, chain->selector->id, + chain->dev->intfnum, UVC_SU_INPUT_SELECT_CONTROL, + buf, 1); + kfree(buf); + + return ret; } static int uvc_ioctl_queryctrl(struct file *file, void *fh, From c8b8694ee7cad9575b2aceb530c9f23e7efad465 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sat, 3 Jul 2021 15:37:17 +0200 Subject: [PATCH 0662/5344] media: rc-loopback: return number of emitters rather than error commit 6b7f554be8c92319d7e6df92fd247ebb9beb4a45 upstream. The LIRC_SET_TRANSMITTER_MASK ioctl should return the number of emitters if an invalid list was set. Cc: stable@vger.kernel.org Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/rc/rc-loopback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/rc/rc-loopback.c b/drivers/media/rc/rc-loopback.c index ef8b83b707df0..13ab7312fa3b5 100644 --- a/drivers/media/rc/rc-loopback.c +++ b/drivers/media/rc/rc-loopback.c @@ -42,7 +42,7 @@ static int loop_set_tx_mask(struct rc_dev *dev, u32 mask) if ((mask & (RXMASK_REGULAR | RXMASK_LEARNING)) != mask) { dprintk("invalid tx mask: %u\n", mask); - return -EINVAL; + return 2; } dprintk("setting tx mask: %u\n", mask); From 5ab54ddb32a65aef91befc2ac8b08212e299e5b2 Mon Sep 17 00:00:00 2001 From: Robin Gong Date: Wed, 14 Jul 2021 18:20:43 +0800 Subject: [PATCH 0663/5344] Revert "dmaengine: imx-sdma: refine to load context only once" commit 8592f02464d52776c5cfae4627c6413b0ae7602d upstream. This reverts commit ad0d92d7ba6aecbe2705907c38ff8d8be4da1e9c, because in spi-imx case, burst length may be changed dynamically. Fixes: ad0d92d7ba6a ("dmaengine: imx-sdma: refine to load context only once") Cc: Signed-off-by: Robin Gong Acked-by: Sascha Hauer Tested-by: Richard Leitner Signed-off-by: Shawn Guo Signed-off-by: Greg Kroah-Hartman --- drivers/dma/imx-sdma.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c index 67736c801f3ca..d1220899735e3 100644 --- a/drivers/dma/imx-sdma.c +++ b/drivers/dma/imx-sdma.c @@ -377,7 +377,6 @@ struct sdma_channel { unsigned long watermark_level; u32 shp_addr, per_addr; enum dma_status status; - bool context_loaded; struct imx_dma_data data; struct work_struct terminate_worker; }; @@ -988,9 +987,6 @@ static int sdma_load_context(struct sdma_channel *sdmac) int ret; unsigned long flags; - if (sdmac->context_loaded) - return 0; - if (sdmac->direction == DMA_DEV_TO_MEM) load_address = sdmac->pc_from_device; else if (sdmac->direction == DMA_DEV_TO_DEV) @@ -1033,8 +1029,6 @@ static int sdma_load_context(struct sdma_channel *sdmac) spin_unlock_irqrestore(&sdma->channel_0_lock, flags); - sdmac->context_loaded = true; - return ret; } @@ -1074,7 +1068,6 @@ static void sdma_channel_terminate_work(struct work_struct *work) sdmac->desc = NULL; spin_unlock_irqrestore(&sdmac->vc.lock, flags); vchan_dma_desc_free_list(&sdmac->vc, &head); - sdmac->context_loaded = false; } static int sdma_disable_channel_async(struct dma_chan *chan) @@ -1335,7 +1328,6 @@ static void sdma_free_chan_resources(struct dma_chan *chan) sdmac->event_id0 = 0; sdmac->event_id1 = 0; - sdmac->context_loaded = false; sdma_set_channel_priority(sdmac, 0); From 34c9916604ffbd91cf9b3e801b2052104cae34af Mon Sep 17 00:00:00 2001 From: Robin Gong Date: Wed, 14 Jul 2021 18:20:44 +0800 Subject: [PATCH 0664/5344] dmaengine: imx-sdma: remove duplicated sdma_load_context commit e555a03b112838883fdd8185d613c35d043732f2 upstream. Since sdma_transfer_init() will do sdma_load_context before any sdma transfer, no need once more in sdma_config_channel(). Fixes: ad0d92d7ba6a ("dmaengine: imx-sdma: refine to load context only once") Cc: Signed-off-by: Robin Gong Acked-by: Vinod Koul Tested-by: Richard Leitner Signed-off-by: Shawn Guo Signed-off-by: Greg Kroah-Hartman --- drivers/dma/imx-sdma.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c index d1220899735e3..cc70da05db4b5 100644 --- a/drivers/dma/imx-sdma.c +++ b/drivers/dma/imx-sdma.c @@ -1134,7 +1134,6 @@ static void sdma_set_watermarklevel_for_p2p(struct sdma_channel *sdmac) static int sdma_config_channel(struct dma_chan *chan) { struct sdma_channel *sdmac = to_sdma_chan(chan); - int ret; sdma_disable_channel(chan); @@ -1174,9 +1173,7 @@ static int sdma_config_channel(struct dma_chan *chan) sdmac->watermark_level = 0; /* FIXME: M3_BASE_ADDRESS */ } - ret = sdma_load_context(sdmac); - - return ret; + return 0; } static int sdma_set_channel_priority(struct sdma_channel *sdmac, From 039ddfef7226a243da48e7d89d1cfddcfbb64cb9 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 23 Aug 2021 11:52:20 +0200 Subject: [PATCH 0665/5344] libata: add ATA_HORKAGE_NO_NCQ_TRIM for Samsung 860 and 870 SSDs commit 8a6430ab9c9c87cb64c512e505e8690bbaee190b upstream. Commit ca6bfcb2f6d9 ("libata: Enable queued TRIM for Samsung SSD 860") limited the existing ATA_HORKAGE_NO_NCQ_TRIM quirk from "Samsung SSD 8*", covering all Samsung 800 series SSDs, to only apply to "Samsung SSD 840*" and "Samsung SSD 850*" series based on information from Samsung. But there is a large number of users which is still reporting issues with the Samsung 860 and 870 SSDs combined with Intel, ASmedia or Marvell SATA controllers and all reporters also report these problems going away when disabling queued trims. Note that with AMD SATA controllers users are reporting even worse issues and only completely disabling NCQ helps there, this will be addressed in a separate patch. Fixes: ca6bfcb2f6d9 ("libata: Enable queued TRIM for Samsung SSD 860") BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=203475 Cc: stable@vger.kernel.org Cc: Kate Hsuan Signed-off-by: Hans de Goede Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210823095220.30157-1-hdegoede@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- drivers/ata/libata-core.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 7788af0ca1090..5c354c7aff946 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4556,6 +4556,10 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { ATA_HORKAGE_ZERO_AFTER_TRIM, }, { "Samsung SSD 850*", NULL, ATA_HORKAGE_NO_NCQ_TRIM | ATA_HORKAGE_ZERO_AFTER_TRIM, }, + { "Samsung SSD 860*", NULL, ATA_HORKAGE_NO_NCQ_TRIM | + ATA_HORKAGE_ZERO_AFTER_TRIM, }, + { "Samsung SSD 870*", NULL, ATA_HORKAGE_NO_NCQ_TRIM | + ATA_HORKAGE_ZERO_AFTER_TRIM, }, { "FCCT*M500*", NULL, ATA_HORKAGE_NO_NCQ_TRIM | ATA_HORKAGE_ZERO_AFTER_TRIM, }, From 7905bc3474216a642517c5b3b85e14aa97cf0015 Mon Sep 17 00:00:00 2001 From: David Heidelberg Date: Mon, 9 Aug 2021 19:07:30 +0100 Subject: [PATCH 0666/5344] ARM: 9105/1: atags_to_fdt: don't warn about stack size commit b30d0289de72c62516df03fdad8d53f552c69839 upstream. The merge_fdt_bootargs() function by definition consumes more than 1024 bytes of stack because it has a 1024 byte command line on the stack, meaning that we always get a warning when building this file: arch/arm/boot/compressed/atags_to_fdt.c: In function 'merge_fdt_bootargs': arch/arm/boot/compressed/atags_to_fdt.c:98:1: warning: the frame size of 1032 bytes is larger than 1024 bytes [-Wframe-larger-than=] However, as this is the decompressor and we know that it has a very shallow call chain, and we do not actually risk overflowing the kernel stack at runtime here. This just shuts up the warning by disabling the warning flag for this file. Tested on Nexus 7 2012 builds. Acked-by: Nicolas Pitre Signed-off-by: David Heidelberg Signed-off-by: Arnd Bergmann Cc: Signed-off-by: Russell King (Oracle) Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/compressed/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile index f0b3a9281d69b..fb6cb24bde5c9 100644 --- a/arch/arm/boot/compressed/Makefile +++ b/arch/arm/boot/compressed/Makefile @@ -90,6 +90,8 @@ $(addprefix $(obj)/,$(libfdt_objs) atags_to_fdt.o): \ $(addprefix $(obj)/,$(libfdt_hdrs)) ifeq ($(CONFIG_ARM_ATAG_DTB_COMPAT),y) +CFLAGS_REMOVE_atags_to_fdt.o += -Wframe-larger-than=${CONFIG_FRAME_WARN} +CFLAGS_atags_to_fdt.o += -Wframe-larger-than=1280 OBJS += $(libfdt_objs) atags_to_fdt.o endif From 5d93158f5bd30864efa5ba2ba84d61e43ea39f94 Mon Sep 17 00:00:00 2001 From: Stuart Hayes Date: Thu, 13 May 2021 03:03:14 +0530 Subject: [PATCH 0667/5344] PCI/portdrv: Enable Bandwidth Notification only if port supports it commit 00823dcbdd415c868390feaca16f0265101efab4 upstream. Previously we assumed that all Root Ports and Switch Downstream Ports supported Link Bandwidth Notification. Per spec, this is only required for Ports supporting Links wider than x1 and/or multiple Link speeds (PCIe r5.0, sec 7.5.3.6). Because we assumed all Ports supported it, we tried to set up a Bandwidth Notification IRQ, which failed for devices that don't support IRQs at all, which meant pcieport didn't attach to the Port at all. Check the Link Bandwidth Notification Capability bit and enable the service only when the Port supports it. [bhelgaas: commit log] Fixes: e8303bb7a75c ("PCI/LINK: Report degraded links via link bandwidth notification") Link: https://lore.kernel.org/r/20210512213314.7778-1-stuart.w.hayes@gmail.com Signed-off-by: Stuart Hayes Signed-off-by: Bjorn Helgaas Reviewed-by: Lukas Wunner Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/pci/pcie/portdrv_core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 50a9522ab07df..3779b264dbec3 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -260,8 +260,13 @@ static int get_port_device_capability(struct pci_dev *dev) services |= PCIE_PORT_SERVICE_DPC; if (pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM || - pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT) - services |= PCIE_PORT_SERVICE_BWNOTIF; + pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT) { + u32 linkcap; + + pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &linkcap); + if (linkcap & PCI_EXP_LNKCAP_LBNC) + services |= PCIE_PORT_SERVICE_BWNOTIF; + } return services; } From a441057087e0155eb02de1ae7394a6c5852318d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Thu, 24 Jun 2021 19:14:18 +0200 Subject: [PATCH 0668/5344] PCI: Restrict ASMedia ASM1062 SATA Max Payload Size Supported MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit b12d93e9958e028856cbcb061b6e64728ca07755 upstream. The ASMedia ASM1062 SATA controller advertises Max_Payload_Size_Supported of 512, but in fact it cannot handle incoming TLPs with payload size of 512. We discovered this issue on PCIe controllers capable of MPS = 512 (Aardvark and DesignWare), where the issue presents itself as an External Abort. Bjorn Helgaas says: Probably ASM1062 reports a Malformed TLP error when it receives a data payload of 512 bytes, and Aardvark, DesignWare, etc convert this to an arm64 External Abort. [1] To avoid this problem, limit the ASM1062 Max Payload Size Supported to 256 bytes, so we set the Max Payload Size of devices that may send TLPs to the ASM1062 to 256 or less. [1] https://lore.kernel.org/linux-pci/20210601170907.GA1949035@bjorn-Precision-5520/ BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=212695 Link: https://lore.kernel.org/r/20210624171418.27194-2-kabel@kernel.org Reported-by: Rötti Signed-off-by: Marek Behún Signed-off-by: Bjorn Helgaas Reviewed-by: Krzysztof Wilczyński Reviewed-by: Pali Rohár Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/pci/quirks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 97c343d31f989..391316a2bfb59 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3252,6 +3252,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SOLARFLARE, PCI_DEVICE_ID_SOLARFLARE_SFC4000A_1, fixup_mpss_256); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SOLARFLARE, PCI_DEVICE_ID_SOLARFLARE_SFC4000B, fixup_mpss_256); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_ASMEDIA, 0x0612, fixup_mpss_256); /* * Intel 5000 and 5100 Memory controllers have an erratum with read completion From 2ca623a2a2cd9220589f19fa5d308955a21fa75b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Thu, 29 Jul 2021 23:37:54 +0000 Subject: [PATCH 0669/5344] PCI: Return ~0 data on pciconfig_read() CAP_SYS_ADMIN failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit a8bd29bd49c4156ea0ec5a97812333e2aeef44e7 upstream. The pciconfig_read() syscall reads PCI configuration space using hardware-dependent config accessors. If the read fails on PCI, most accessors don't return an error; they pretend the read was successful and got ~0 data from the device, so the syscall returns success with ~0 data in the buffer. When the accessor does return an error, pciconfig_read() normally fills the user's buffer with ~0 and returns an error in errno. But after e4585da22ad0 ("pci syscall.c: Switch to refcounting API"), we don't fill the buffer with ~0 for the EPERM "user lacks CAP_SYS_ADMIN" error. Userspace may rely on the ~0 data to detect errors, but after e4585da22ad0, that would not detect CAP_SYS_ADMIN errors. Restore the original behaviour of filling the buffer with ~0 when the CAP_SYS_ADMIN check fails. [bhelgaas: commit log, fold in Nathan's fix https://lore.kernel.org/r/20210803200836.500658-1-nathan@kernel.org] Fixes: e4585da22ad0 ("pci syscall.c: Switch to refcounting API") Link: https://lore.kernel.org/r/20210729233755.1509616-1-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/pci/syscall.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/pci/syscall.c b/drivers/pci/syscall.c index 8b003c890b87b..c9f03418e71e0 100644 --- a/drivers/pci/syscall.c +++ b/drivers/pci/syscall.c @@ -22,8 +22,10 @@ SYSCALL_DEFINE5(pciconfig_read, unsigned long, bus, unsigned long, dfn, long err; int cfg_ret; + err = -EPERM; + dev = NULL; if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + goto error; err = -ENODEV; dev = pci_get_domain_bus_and_slot(0, bus, dfn); From e1ea49a55c21962239059eaa603b448b1e56500f Mon Sep 17 00:00:00 2001 From: Hyun Kwon Date: Fri, 25 Jun 2021 12:48:23 +0200 Subject: [PATCH 0670/5344] PCI: xilinx-nwl: Enable the clock through CCF commit de0a01f5296651d3a539f2d23d0db8f359483696 upstream. Enable PCIe reference clock. There is no remove function that's why this should be enough for simple operation. Normally this clock is enabled by default by firmware but there are usecases where this clock should be enabled by driver itself. It is also good that PCIe clock is recorded in a clock framework. Link: https://lore.kernel.org/r/ee6997a08fab582b1c6de05f8be184f3fe8d5357.1624618100.git.michal.simek@xilinx.com Fixes: ab597d35ef11 ("PCI: xilinx-nwl: Add support for Xilinx NWL PCIe Host Controller") Signed-off-by: Hyun Kwon Signed-off-by: Bharat Kumar Gogada Signed-off-by: Michal Simek Signed-off-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pcie-xilinx-nwl.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/pci/controller/pcie-xilinx-nwl.c b/drivers/pci/controller/pcie-xilinx-nwl.c index 45c0f344ccd16..11b046b20b92a 100644 --- a/drivers/pci/controller/pcie-xilinx-nwl.c +++ b/drivers/pci/controller/pcie-xilinx-nwl.c @@ -6,6 +6,7 @@ * (C) Copyright 2014 - 2015, Xilinx, Inc. */ +#include #include #include #include @@ -169,6 +170,7 @@ struct nwl_pcie { u8 root_busno; struct nwl_msi msi; struct irq_domain *legacy_irq_domain; + struct clk *clk; raw_spinlock_t leg_mask_lock; }; @@ -839,6 +841,16 @@ static int nwl_pcie_probe(struct platform_device *pdev) return err; } + pcie->clk = devm_clk_get(dev, NULL); + if (IS_ERR(pcie->clk)) + return PTR_ERR(pcie->clk); + + err = clk_prepare_enable(pcie->clk); + if (err) { + dev_err(dev, "can't enable PCIe ref clock\n"); + return err; + } + err = nwl_pcie_bridge_init(pcie); if (err) { dev_err(dev, "HW Initialization failed\n"); From 49957ede1743af439b7d158d79d42a5b4584d906 Mon Sep 17 00:00:00 2001 From: Evan Wang Date: Thu, 22 Jul 2021 16:40:38 +0200 Subject: [PATCH 0671/5344] PCI: aardvark: Fix checking for PIO status MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit fcb461e2bc8b83b7eaca20cb2221e8b940f2189c upstream. There is an issue that when PCIe switch is connected to an Armada 3700 board, there will be lots of warnings about PIO errors when reading the config space. According to Aardvark PIO read and write sequence in HW specification, the current way to check PIO status has the following issues: 1) For PIO read operation, it reports the error message, which should be avoided according to HW specification. 2) For PIO read and write operations, it only checks PIO operation complete status, which is not enough, and error status should also be checked. This patch aligns the code with Aardvark PIO read and write sequence in HW specification on PIO status check and fix the warnings when reading config space. [pali: Fix CRS handling when CRSSVE is not enabled] Link: https://lore.kernel.org/r/20210722144041.12661-2-pali@kernel.org Tested-by: Victor Gu Signed-off-by: Evan Wang Signed-off-by: Pali Rohár Signed-off-by: Lorenzo Pieralisi Reviewed-by: Victor Gu Reviewed-by: Marek Behún Cc: stable@vger.kernel.org # b1bd5714472c ("PCI: aardvark: Indicate error in 'val' when config read fails") Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-aardvark.c | 62 +++++++++++++++++++++++---- 1 file changed, 54 insertions(+), 8 deletions(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 0a2902569f140..13000d7904c70 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -62,6 +62,7 @@ #define PIO_COMPLETION_STATUS_CRS 2 #define PIO_COMPLETION_STATUS_CA 4 #define PIO_NON_POSTED_REQ BIT(10) +#define PIO_ERR_STATUS BIT(11) #define PIO_ADDR_LS (PIO_BASE_ADDR + 0x8) #define PIO_ADDR_MS (PIO_BASE_ADDR + 0xc) #define PIO_WR_DATA (PIO_BASE_ADDR + 0x10) @@ -363,7 +364,7 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) advk_writel(pcie, reg, PCIE_CORE_CMD_STATUS_REG); } -static void advk_pcie_check_pio_status(struct advk_pcie *pcie) +static int advk_pcie_check_pio_status(struct advk_pcie *pcie, u32 *val) { struct device *dev = &pcie->pdev->dev; u32 reg; @@ -374,14 +375,49 @@ static void advk_pcie_check_pio_status(struct advk_pcie *pcie) status = (reg & PIO_COMPLETION_STATUS_MASK) >> PIO_COMPLETION_STATUS_SHIFT; - if (!status) - return; - + /* + * According to HW spec, the PIO status check sequence as below: + * 1) even if COMPLETION_STATUS(bit9:7) indicates successful, + * it still needs to check Error Status(bit11), only when this bit + * indicates no error happen, the operation is successful. + * 2) value Unsupported Request(1) of COMPLETION_STATUS(bit9:7) only + * means a PIO write error, and for PIO read it is successful with + * a read value of 0xFFFFFFFF. + * 3) value Completion Retry Status(CRS) of COMPLETION_STATUS(bit9:7) + * only means a PIO write error, and for PIO read it is successful + * with a read value of 0xFFFF0001. + * 4) value Completer Abort (CA) of COMPLETION_STATUS(bit9:7) means + * error for both PIO read and PIO write operation. + * 5) other errors are indicated as 'unknown'. + */ switch (status) { + case PIO_COMPLETION_STATUS_OK: + if (reg & PIO_ERR_STATUS) { + strcomp_status = "COMP_ERR"; + break; + } + /* Get the read result */ + if (val) + *val = advk_readl(pcie, PIO_RD_DATA); + /* No error */ + strcomp_status = NULL; + break; case PIO_COMPLETION_STATUS_UR: strcomp_status = "UR"; break; case PIO_COMPLETION_STATUS_CRS: + /* PCIe r4.0, sec 2.3.2, says: + * If CRS Software Visibility is not enabled, the Root Complex + * must re-issue the Configuration Request as a new Request. + * A Root Complex implementation may choose to limit the number + * of Configuration Request/CRS Completion Status loops before + * determining that something is wrong with the target of the + * Request and taking appropriate action, e.g., complete the + * Request to the host as a failed transaction. + * + * To simplify implementation do not re-issue the Configuration + * Request and complete the Request as a failed transaction. + */ strcomp_status = "CRS"; break; case PIO_COMPLETION_STATUS_CA: @@ -392,6 +428,9 @@ static void advk_pcie_check_pio_status(struct advk_pcie *pcie) break; } + if (!strcomp_status) + return 0; + if (reg & PIO_NON_POSTED_REQ) str_posted = "Non-posted"; else @@ -399,6 +438,8 @@ static void advk_pcie_check_pio_status(struct advk_pcie *pcie) dev_err(dev, "%s PIO Response Status: %s, %#x @ %#x\n", str_posted, strcomp_status, reg, advk_readl(pcie, PIO_ADDR_LS)); + + return -EFAULT; } static int advk_pcie_wait_pio(struct advk_pcie *pcie) @@ -625,10 +666,13 @@ static int advk_pcie_rd_conf(struct pci_bus *bus, u32 devfn, if (ret < 0) return PCIBIOS_SET_FAILED; - advk_pcie_check_pio_status(pcie); + /* Check PIO status and get the read result */ + ret = advk_pcie_check_pio_status(pcie, val); + if (ret < 0) { + *val = 0xffffffff; + return PCIBIOS_SET_FAILED; + } - /* Get the read result */ - *val = advk_readl(pcie, PIO_RD_DATA); if (size == 1) *val = (*val >> (8 * (where & 3))) & 0xff; else if (size == 2) @@ -692,7 +736,9 @@ static int advk_pcie_wr_conf(struct pci_bus *bus, u32 devfn, if (ret < 0) return PCIBIOS_SET_FAILED; - advk_pcie_check_pio_status(pcie); + ret = advk_pcie_check_pio_status(pcie, NULL); + if (ret < 0) + return PCIBIOS_SET_FAILED; return PCIBIOS_SUCCESSFUL; } From c660206ebeb7750c0efd2f595327722ca36ad243 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 22 Jul 2021 16:40:39 +0200 Subject: [PATCH 0672/5344] PCI: aardvark: Increase polling delay to 1.5s while waiting for PIO response MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 02bcec3ea5591720114f586960490b04b093a09e upstream. Measurements in different conditions showed that aardvark hardware PIO response can take up to 1.44s. Increase wait timeout from 1ms to 1.5s to ensure that we do not miss responses from hardware. After 1.44s hardware returns errors (e.g. Completer abort). The previous two patches fixed checking for PIO status, so now we can use it to also catch errors which are reported by hardware after 1.44s. After applying this patch, kernel can detect and print PIO errors to dmesg: [ 6.879999] advk-pcie d0070000.pcie: Non-posted PIO Response Status: CA, 0xe00 @ 0x100004 [ 6.896436] advk-pcie d0070000.pcie: Posted PIO Response Status: COMP_ERR, 0x804 @ 0x100004 [ 6.913049] advk-pcie d0070000.pcie: Posted PIO Response Status: COMP_ERR, 0x804 @ 0x100010 [ 6.929663] advk-pcie d0070000.pcie: Non-posted PIO Response Status: CA, 0xe00 @ 0x100010 [ 6.953558] advk-pcie d0070000.pcie: Posted PIO Response Status: COMP_ERR, 0x804 @ 0x100014 [ 6.970170] advk-pcie d0070000.pcie: Non-posted PIO Response Status: CA, 0xe00 @ 0x100014 [ 6.994328] advk-pcie d0070000.pcie: Posted PIO Response Status: COMP_ERR, 0x804 @ 0x100004 Without this patch kernel prints only a generic error to dmesg: [ 5.246847] advk-pcie d0070000.pcie: config read/write timed out Link: https://lore.kernel.org/r/20210722144041.12661-3-pali@kernel.org Signed-off-by: Pali Rohár Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marek Behún Cc: stable@vger.kernel.org # 7fbcb5da811b ("PCI: aardvark: Don't rely on jiffies while holding spinlock") Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-aardvark.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 13000d7904c70..99ae0a9008204 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -177,7 +177,7 @@ (PCIE_CONF_BUS(bus) | PCIE_CONF_DEV(PCI_SLOT(devfn)) | \ PCIE_CONF_FUNC(PCI_FUNC(devfn)) | PCIE_CONF_REG(where)) -#define PIO_RETRY_CNT 500 +#define PIO_RETRY_CNT 750000 /* 1.5 s */ #define PIO_RETRY_DELAY 2 /* 2 us*/ #define LINK_WAIT_MAX_RETRIES 10 From e9083b238a2c21f04d3aea6002f1547810a78ad6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Fri, 20 Aug 2021 17:50:20 +0200 Subject: [PATCH 0673/5344] PCI: aardvark: Fix masking and unmasking legacy INTx interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit d212dcee27c1f89517181047e5485fcbba4a25c2 upstream. irq_mask and irq_unmask callbacks need to be properly guarded by raw spin locks as masking/unmasking procedure needs atomic read-modify-write operation on hardware register. Link: https://lore.kernel.org/r/20210820155020.3000-1-pali@kernel.org Reported-by: Marc Zyngier Signed-off-by: Pali Rohár Signed-off-by: Lorenzo Pieralisi Acked-by: Marc Zyngier Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-aardvark.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 99ae0a9008204..0538348ed843f 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -194,6 +194,7 @@ struct advk_pcie { struct list_head resources; struct irq_domain *irq_domain; struct irq_chip irq_chip; + raw_spinlock_t irq_lock; struct irq_domain *msi_domain; struct irq_domain *msi_inner_domain; struct irq_chip msi_bottom_irq_chip; @@ -812,22 +813,28 @@ static void advk_pcie_irq_mask(struct irq_data *d) { struct advk_pcie *pcie = d->domain->host_data; irq_hw_number_t hwirq = irqd_to_hwirq(d); + unsigned long flags; u32 mask; + raw_spin_lock_irqsave(&pcie->irq_lock, flags); mask = advk_readl(pcie, PCIE_ISR1_MASK_REG); mask |= PCIE_ISR1_INTX_ASSERT(hwirq); advk_writel(pcie, mask, PCIE_ISR1_MASK_REG); + raw_spin_unlock_irqrestore(&pcie->irq_lock, flags); } static void advk_pcie_irq_unmask(struct irq_data *d) { struct advk_pcie *pcie = d->domain->host_data; irq_hw_number_t hwirq = irqd_to_hwirq(d); + unsigned long flags; u32 mask; + raw_spin_lock_irqsave(&pcie->irq_lock, flags); mask = advk_readl(pcie, PCIE_ISR1_MASK_REG); mask &= ~PCIE_ISR1_INTX_ASSERT(hwirq); advk_writel(pcie, mask, PCIE_ISR1_MASK_REG); + raw_spin_unlock_irqrestore(&pcie->irq_lock, flags); } static int advk_pcie_irq_map(struct irq_domain *h, @@ -911,6 +918,8 @@ static int advk_pcie_init_irq_domain(struct advk_pcie *pcie) struct irq_chip *irq_chip; int ret = 0; + raw_spin_lock_init(&pcie->irq_lock); + pcie_intc_node = of_get_next_child(node, NULL); if (!pcie_intc_node) { dev_err(dev, "No PCIe Intc node found\n"); From cc33e5599d4b2fafa247d35ba1ceb543b37c2650 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Tue, 29 Jun 2021 11:25:50 -0700 Subject: [PATCH 0674/5344] HID: input: do not report stylus battery state as "full" [ Upstream commit f4abaa9eebde334045ed6ac4e564d050f1df3013 ] The power supply states of discharging, charging, full, etc, represent state of charging, not the capacity level of the battery (for which we have a separate property). Current HID usage tables to not allow for expressing charging state of the batteries found in generic styli, so we should simply assume that the battery is discharging even if current capacity is at 100% when battery strength reporting is done via HID interface. In fact, we were doing just that before commit 581c4484769e. This change helps UIs to not mis-represent fully charged batteries in styli as being charging/topping-off. Fixes: 581c4484769e ("HID: input: map digitizer battery usage") Reported-by: Kenneth Albanowski Signed-off-by: Dmitry Torokhov Signed-off-by: Jiri Kosina Signed-off-by: Sasha Levin --- drivers/hid/hid-input.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index 6d551ae251c0a..ea4c97f5b0736 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -415,8 +415,6 @@ static int hidinput_get_battery_property(struct power_supply *psy, if (dev->battery_status == HID_BATTERY_UNKNOWN) val->intval = POWER_SUPPLY_STATUS_UNKNOWN; - else if (dev->battery_capacity == 100) - val->intval = POWER_SUPPLY_STATUS_FULL; else val->intval = POWER_SUPPLY_STATUS_DISCHARGING; break; From c3d3f8bb367c4ddb86266dd0e6be437ea24d8cf5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 19 Jul 2021 16:46:47 +0800 Subject: [PATCH 0675/5344] f2fs: quota: fix potential deadlock [ Upstream commit 9de71ede81e6d1a111fdd868b2d78d459fa77f80 ] xfstest generic/587 reports a deadlock issue as below: ====================================================== WARNING: possible circular locking dependency detected 5.14.0-rc1 #69 Not tainted ------------------------------------------------------ repquota/8606 is trying to acquire lock: ffff888022ac9320 (&sb->s_type->i_mutex_key#18){+.+.}-{3:3}, at: f2fs_quota_sync+0x207/0x300 [f2fs] but task is already holding lock: ffff8880084bcde8 (&sbi->quota_sem){.+.+}-{3:3}, at: f2fs_quota_sync+0x59/0x300 [f2fs] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (&sbi->quota_sem){.+.+}-{3:3}: __lock_acquire+0x648/0x10b0 lock_acquire+0x128/0x470 down_read+0x3b/0x2a0 f2fs_quota_sync+0x59/0x300 [f2fs] f2fs_quota_on+0x48/0x100 [f2fs] do_quotactl+0x5e3/0xb30 __x64_sys_quotactl+0x23a/0x4e0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae -> #1 (&sbi->cp_rwsem){++++}-{3:3}: __lock_acquire+0x648/0x10b0 lock_acquire+0x128/0x470 down_read+0x3b/0x2a0 f2fs_unlink+0x353/0x670 [f2fs] vfs_unlink+0x1c7/0x380 do_unlinkat+0x413/0x4b0 __x64_sys_unlinkat+0x50/0xb0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae -> #0 (&sb->s_type->i_mutex_key#18){+.+.}-{3:3}: check_prev_add+0xdc/0xb30 validate_chain+0xa67/0xb20 __lock_acquire+0x648/0x10b0 lock_acquire+0x128/0x470 down_write+0x39/0xc0 f2fs_quota_sync+0x207/0x300 [f2fs] do_quotactl+0xaff/0xb30 __x64_sys_quotactl+0x23a/0x4e0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae other info that might help us debug this: Chain exists of: &sb->s_type->i_mutex_key#18 --> &sbi->cp_rwsem --> &sbi->quota_sem Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&sbi->quota_sem); lock(&sbi->cp_rwsem); lock(&sbi->quota_sem); lock(&sb->s_type->i_mutex_key#18); *** DEADLOCK *** 3 locks held by repquota/8606: #0: ffff88801efac0e0 (&type->s_umount_key#53){++++}-{3:3}, at: user_get_super+0xd9/0x190 #1: ffff8880084bc380 (&sbi->cp_rwsem){++++}-{3:3}, at: f2fs_quota_sync+0x3e/0x300 [f2fs] #2: ffff8880084bcde8 (&sbi->quota_sem){.+.+}-{3:3}, at: f2fs_quota_sync+0x59/0x300 [f2fs] stack backtrace: CPU: 6 PID: 8606 Comm: repquota Not tainted 5.14.0-rc1 #69 Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 Call Trace: dump_stack_lvl+0xce/0x134 dump_stack+0x17/0x20 print_circular_bug.isra.0.cold+0x239/0x253 check_noncircular+0x1be/0x1f0 check_prev_add+0xdc/0xb30 validate_chain+0xa67/0xb20 __lock_acquire+0x648/0x10b0 lock_acquire+0x128/0x470 down_write+0x39/0xc0 f2fs_quota_sync+0x207/0x300 [f2fs] do_quotactl+0xaff/0xb30 __x64_sys_quotactl+0x23a/0x4e0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f883b0b4efe The root cause is ABBA deadlock of inode lock and cp_rwsem, reorder locks in f2fs_quota_sync() as below to fix this issue: - lock inode - lock cp_rwsem - lock quota_sem Fixes: db6ec53b7e03 ("f2fs: add a rw_sem to cover quota flag changes") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- fs/f2fs/super.c | 84 ++++++++++++++++++++++++++++--------------------- 1 file changed, 48 insertions(+), 36 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6d904dc9bd199..41bf656658ba8 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1994,6 +1994,33 @@ static int f2fs_enable_quotas(struct super_block *sb) return 0; } +static int f2fs_quota_sync_file(struct f2fs_sb_info *sbi, int type) +{ + struct quota_info *dqopt = sb_dqopt(sbi->sb); + struct address_space *mapping = dqopt->files[type]->i_mapping; + int ret = 0; + + ret = dquot_writeback_dquots(sbi->sb, type); + if (ret) + goto out; + + ret = filemap_fdatawrite(mapping); + if (ret) + goto out; + + /* if we are using journalled quota */ + if (is_journalled_quota(sbi)) + goto out; + + ret = filemap_fdatawait(mapping); + + truncate_inode_pages(&dqopt->files[type]->i_data, 0); +out: + if (ret) + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + return ret; +} + int f2fs_quota_sync(struct super_block *sb, int type) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -2001,57 +2028,42 @@ int f2fs_quota_sync(struct super_block *sb, int type) int cnt; int ret; - /* - * do_quotactl - * f2fs_quota_sync - * down_read(quota_sem) - * dquot_writeback_dquots() - * f2fs_dquot_commit - * block_operation - * down_read(quota_sem) - */ - f2fs_lock_op(sbi); - - down_read(&sbi->quota_sem); - ret = dquot_writeback_dquots(sb, type); - if (ret) - goto out; - /* * Now when everything is written we can discard the pagecache so * that userspace sees the changes. */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - struct address_space *mapping; if (type != -1 && cnt != type) continue; - if (!sb_has_quota_active(sb, cnt)) - continue; - mapping = dqopt->files[cnt]->i_mapping; + if (!sb_has_quota_active(sb, type)) + return 0; - ret = filemap_fdatawrite(mapping); - if (ret) - goto out; + inode_lock(dqopt->files[cnt]); - /* if we are using journalled quota */ - if (is_journalled_quota(sbi)) - continue; + /* + * do_quotactl + * f2fs_quota_sync + * down_read(quota_sem) + * dquot_writeback_dquots() + * f2fs_dquot_commit + * block_operation + * down_read(quota_sem) + */ + f2fs_lock_op(sbi); + down_read(&sbi->quota_sem); - ret = filemap_fdatawait(mapping); - if (ret) - set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); + ret = f2fs_quota_sync_file(sbi, cnt); + + up_read(&sbi->quota_sem); + f2fs_unlock_op(sbi); - inode_lock(dqopt->files[cnt]); - truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); inode_unlock(dqopt->files[cnt]); + + if (ret) + break; } -out: - if (ret) - set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); - up_read(&sbi->quota_sem); - f2fs_unlock_op(sbi); return ret; } From 0e9e47863f817a20b00a436c083073606eb2e19b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 24 Jul 2021 09:20:10 +0200 Subject: [PATCH 0676/5344] scsi: bsg: Remove support for SCSI_IOCTL_SEND_COMMAND [ Upstream commit beec64d0c9749afedf51c3c10cf52de1d9a89cc0 ] SCSI_IOCTL_SEND_COMMAND has been deprecated longer than bsg exists and has been warning for just as long. More importantly it harcodes SCSI CDBs and thus will do the wrong thing on non-SCSI bsg nodes. Link: https://lore.kernel.org/r/20210724072033.1284840-2-hch@lst.de Fixes: aa387cc89567 ("block: add bsg helper library") Reviewed-by: Bart Van Assche Acked-by: Jens Axboe Signed-off-by: Christoph Hellwig Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- block/bsg.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/block/bsg.c b/block/bsg.c index 0d012efef5274..c8b9714e69232 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -371,10 +371,13 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case SG_GET_RESERVED_SIZE: case SG_SET_RESERVED_SIZE: case SG_EMULATED_HOST: - case SCSI_IOCTL_SEND_COMMAND: return scsi_cmd_ioctl(bd->queue, NULL, file->f_mode, cmd, uarg); case SG_IO: return bsg_sg_io(bd->queue, file->f_mode, uarg); + case SCSI_IOCTL_SEND_COMMAND: + pr_warn_ratelimited("%s: calling unsupported SCSI_IOCTL_SEND_COMMAND\n", + current->comm); + return -EINVAL; default: return -ENOTTY; } From 86e372c4fd47798820deb0637e88913fcb285621 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Thu, 15 Jul 2021 12:04:45 -0400 Subject: [PATCH 0677/5344] IB/hfi1: Adjust pkey entry in index 0 [ Upstream commit 62004871e1fa7f9a60797595c03477af5b5ec36f ] It is possible for the primary IPoIB network device associated with any RDMA device to fail to join certain multicast groups preventing IPv6 neighbor discovery and possibly other network ULPs from working correctly. The IPv4 broadcast group is not affected as the IPoIB network device handles joining that multicast group directly. This is because the primary IPoIB network device uses the pkey at ndex 0 in the associated RDMA device's pkey table. Anytime the pkey value of index 0 changes, the primary IPoIB network device automatically modifies it's broadcast address (i.e. /sys/class/net/[ib0]/broadcast), since the broadcast address includes the pkey value, and then bounces carrier. This includes initial pkey assignment, such as when the pkey at index 0 transitions from the opa default of invalid (0x0000) to some value such as the OPA default pkey for Virtual Fabric 0: 0x8001 or when the fabric manager is restarted with a configuration change causing the pkey at index 0 to change. Many network ULPs are not sensitive to the carrier bounce and are not expecting the broadcast address to change including the linux IPv6 stack. This problem does not affect IPoIB child network devices as their pkey value is constant for all time. To mitigate this issue, change the default pkey in at index 0 to 0x8001 to cover the predominant case and avoid issues as ipoib comes up and the FM sweeps. At some point, ipoib multicast support should automatically fix non-broadcast addresses as it does with the primary broadcast address. Fixes: 7724105686e7 ("IB/hfi1: add driver files") Link: https://lore.kernel.org/r/20210715160445.142451.47651.stgit@awfm-01.cornelisnetworks.com Suggested-by: Josh Collier Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe Signed-off-by: Sasha Levin --- drivers/infiniband/hw/hfi1/init.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index fbff6b2f00e71..1256dbd5b2ef0 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -664,12 +664,7 @@ void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, ppd->pkeys[default_pkey_idx] = DEFAULT_P_KEY; ppd->part_enforce |= HFI1_PART_ENFORCE_IN; - - if (loopback) { - dd_dev_err(dd, "Faking data partition 0x8001 in idx %u\n", - !default_pkey_idx); - ppd->pkeys[!default_pkey_idx] = 0x8001; - } + ppd->pkeys[0] = 0x8001; INIT_WORK(&ppd->link_vc_work, handle_verify_cap); INIT_WORK(&ppd->link_up_work, handle_link_up); From 75aacb73e762cd6d3d438ac0b4712b13470e6af5 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 23 Jul 2021 17:08:55 +0300 Subject: [PATCH 0678/5344] RDMA/iwcm: Release resources if iw_cm module initialization fails [ Upstream commit e677b72a0647249370f2635862bf0241c86f66ad ] The failure during iw_cm module initialization partially left the system with unreleased memory and other resources. Rewrite the module init/exit routines in such way that netlink commands will be opened only after successful initialization. Fixes: b493d91d333e ("iwcm: common code for port mapper") Link: https://lore.kernel.org/r/b01239f99cb1a3e6d2b0694c242d89e6410bcd93.1627048781.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe Signed-off-by: Sasha Levin --- drivers/infiniband/core/iwcm.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index da8adadf47559..75b6da00065a3 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -1187,29 +1187,34 @@ static int __init iw_cm_init(void) ret = iwpm_init(RDMA_NL_IWCM); if (ret) - pr_err("iw_cm: couldn't init iwpm\n"); - else - rdma_nl_register(RDMA_NL_IWCM, iwcm_nl_cb_table); + return ret; + iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", 0); if (!iwcm_wq) - return -ENOMEM; + goto err_alloc; iwcm_ctl_table_hdr = register_net_sysctl(&init_net, "net/iw_cm", iwcm_ctl_table); if (!iwcm_ctl_table_hdr) { pr_err("iw_cm: couldn't register sysctl paths\n"); - destroy_workqueue(iwcm_wq); - return -ENOMEM; + goto err_sysctl; } + rdma_nl_register(RDMA_NL_IWCM, iwcm_nl_cb_table); return 0; + +err_sysctl: + destroy_workqueue(iwcm_wq); +err_alloc: + iwpm_exit(RDMA_NL_IWCM); + return -ENOMEM; } static void __exit iw_cm_cleanup(void) { + rdma_nl_unregister(RDMA_NL_IWCM); unregister_net_sysctl_table(iwcm_ctl_table_hdr); destroy_workqueue(iwcm_wq); - rdma_nl_unregister(RDMA_NL_IWCM); iwpm_exit(RDMA_NL_IWCM); } From 61fdf319240969e146a2bce2e4bf25ae72f07825 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 28 Jul 2021 16:04:12 +0300 Subject: [PATCH 0679/5344] docs: Fix infiniband uverbs minor number [ Upstream commit 8d7e415d55610d503fdb8815344846b72d194a40 ] Starting from the beginning of infiniband subsystem, the uverbs char devices start from 192 as a minor number, see commit bc38a6abdd5a ("[PATCH] IB uverbs: core implementation"). This patch updates the admin guide documentation to reflect it. Fixes: 9d85025b0418 ("docs-rst: create an user's manual book") Link: https://lore.kernel.org/r/bad03e6bcde45550c01e12908a6fe7dfa4770703.1627477347.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe Signed-off-by: Sasha Levin --- Documentation/admin-guide/devices.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/devices.txt b/Documentation/admin-guide/devices.txt index 1c5d2281efc97..771d9e7ae082b 100644 --- a/Documentation/admin-guide/devices.txt +++ b/Documentation/admin-guide/devices.txt @@ -3002,10 +3002,10 @@ 65 = /dev/infiniband/issm1 Second InfiniBand IsSM device ... 127 = /dev/infiniband/issm63 63rd InfiniBand IsSM device - 128 = /dev/infiniband/uverbs0 First InfiniBand verbs device - 129 = /dev/infiniband/uverbs1 Second InfiniBand verbs device + 192 = /dev/infiniband/uverbs0 First InfiniBand verbs device + 193 = /dev/infiniband/uverbs1 Second InfiniBand verbs device ... - 159 = /dev/infiniband/uverbs31 31st InfiniBand verbs device + 223 = /dev/infiniband/uverbs31 31st InfiniBand verbs device 232 char Biometric Devices 0 = /dev/biometric/sensor0/fingerprint first fingerprint sensor on first device From 6385fb22eaf02d41bd45d50339643fb164a81dce Mon Sep 17 00:00:00 2001 From: Jaehyoung Choi Date: Fri, 30 Jul 2021 22:29:05 +0300 Subject: [PATCH 0680/5344] pinctrl: samsung: Fix pinctrl bank pin count [ Upstream commit 70115558ab02fe8d28a6634350b3491a542aaa02 ] Commit 1abd18d1a51a ("pinctrl: samsung: Register pinctrl before GPIO") changes the order of GPIO and pinctrl registration: now pinctrl is registered before GPIO. That means gpio_chip->ngpio is not set when samsung_pinctrl_register() called, and one cannot rely on that value anymore. Use `pin_bank->nr_pins' instead of `pin_bank->gpio_chip.ngpio' to fix mentioned inconsistency. Fixes: 1abd18d1a51a ("pinctrl: samsung: Register pinctrl before GPIO") Signed-off-by: Jaehyoung Choi Signed-off-by: Sam Protsenko Link: https://lore.kernel.org/r/20210730192905.7173-1-semen.protsenko@linaro.org Signed-off-by: Krzysztof Kozlowski Signed-off-by: Sasha Levin --- drivers/pinctrl/samsung/pinctrl-samsung.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/samsung/pinctrl-samsung.c b/drivers/pinctrl/samsung/pinctrl-samsung.c index f26574ef234ab..601fffeba39fe 100644 --- a/drivers/pinctrl/samsung/pinctrl-samsung.c +++ b/drivers/pinctrl/samsung/pinctrl-samsung.c @@ -918,7 +918,7 @@ static int samsung_pinctrl_register(struct platform_device *pdev, pin_bank->grange.pin_base = drvdata->pin_base + pin_bank->pin_base; pin_bank->grange.base = pin_bank->grange.pin_base; - pin_bank->grange.npins = pin_bank->gpio_chip.ngpio; + pin_bank->grange.npins = pin_bank->nr_pins; pin_bank->grange.gc = &pin_bank->gpio_chip; pinctrl_add_gpio_range(drvdata->pctl_dev, &pin_bank->grange); } From ebbae9de793bb4d32ddc5df1027ddef42984bda8 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 16 Jul 2021 15:39:12 -0300 Subject: [PATCH 0681/5344] vfio: Use config not menuconfig for VFIO_NOIOMMU [ Upstream commit 26c22cfde5dd6e63f25c48458b0185dcb0fbb2fd ] VFIO_NOIOMMU is supposed to be an element in the VFIO menu, not start a new menu. Correct this copy-paste mistake. Fixes: 03a76b60f8ba ("vfio: Include No-IOMMU mode") Signed-off-by: Jason Gunthorpe Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/0-v1-3f0b685c3679+478-vfio_menuconfig_jgg@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Sasha Levin --- drivers/vfio/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index 503ed2f3fbb5e..65743de8aad11 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -29,7 +29,7 @@ menuconfig VFIO If you don't know what to do here, say N. -menuconfig VFIO_NOIOMMU +config VFIO_NOIOMMU bool "VFIO No-IOMMU support" depends on VFIO help From a76b5f7129ea098c39895775c4c21a667af60c4d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 23 Jul 2021 14:39:45 +0300 Subject: [PATCH 0682/5344] RDMA/efa: Remove double QP type assignment [ Upstream commit f9193d266347fe9bed5c173e7a1bf96268142a79 ] The QP type is set by the IB/core and shouldn't be set in the driver. Fixes: 40909f664d27 ("RDMA/efa: Add EFA verbs implementation") Link: https://lore.kernel.org/r/838c40134c1590167b888ca06ad51071139ff2ae.1627040189.git.leonro@nvidia.com Acked-by: Gal Pressman Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe Signed-off-by: Sasha Levin --- drivers/infiniband/hw/efa/efa_verbs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 4edae89e8e3ca..17f1e59ab12ee 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -745,7 +745,6 @@ struct ib_qp *efa_create_qp(struct ib_pd *ibpd, rq_entry_inserted = true; qp->qp_handle = create_qp_resp.qp_handle; qp->ibqp.qp_num = create_qp_resp.qp_num; - qp->ibqp.qp_type = init_attr->qp_type; qp->max_send_wr = init_attr->cap.max_send_wr; qp->max_recv_wr = init_attr->cap.max_recv_wr; qp->max_send_sge = init_attr->cap.max_send_sge; From a961aac73027cd1eca4123f17d67e883ed78cd1c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 1 Nov 2019 17:53:23 +0800 Subject: [PATCH 0683/5344] f2fs: show f2fs instance in printk_ratelimited [ Upstream commit c45d6002ff7a322022560e9b19ad867b01fec77f ] As Eric mentioned, bare printk{,_ratelimited} won't show which filesystem instance these message is coming from, this patch tries to show fs instance with sb->s_id field in all places we missed before. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/data.c | 9 +++++---- fs/f2fs/dir.c | 7 ++++--- fs/f2fs/f2fs.h | 24 +++++++++++++----------- fs/f2fs/file.c | 2 +- fs/f2fs/gc.c | 2 +- fs/f2fs/inode.c | 2 +- fs/f2fs/node.c | 2 +- fs/f2fs/segment.c | 9 +++++---- 9 files changed, 32 insertions(+), 27 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index a57219c51c01a..f7d27cbbeb860 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -583,7 +583,7 @@ int f2fs_acquire_orphan_inode(struct f2fs_sb_info *sbi) if (time_to_inject(sbi, FAULT_ORPHAN)) { spin_unlock(&im->ino_lock); - f2fs_show_injection_info(FAULT_ORPHAN); + f2fs_show_injection_info(sbi, FAULT_ORPHAN); return -ENOSPC; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 64ee2a064e339..df2c361feade6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -167,9 +167,10 @@ static bool f2fs_bio_post_read_required(struct bio *bio) static void f2fs_read_end_io(struct bio *bio) { - if (time_to_inject(F2FS_P_SB(bio_first_page_all(bio)), - FAULT_READ_IO)) { - f2fs_show_injection_info(FAULT_READ_IO); + struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio)); + + if (time_to_inject(sbi, FAULT_READ_IO)) { + f2fs_show_injection_info(sbi, FAULT_READ_IO); bio->bi_status = BLK_STS_IOERR; } @@ -191,7 +192,7 @@ static void f2fs_write_end_io(struct bio *bio) struct bvec_iter_all iter_all; if (time_to_inject(sbi, FAULT_WRITE_IO)) { - f2fs_show_injection_info(FAULT_WRITE_IO); + f2fs_show_injection_info(sbi, FAULT_WRITE_IO); bio->bi_status = BLK_STS_IOERR; } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 78d041f9775a4..7b3fbfe68f8cf 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -618,7 +618,7 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, start: if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) { - f2fs_show_injection_info(FAULT_DIR_DEPTH); + f2fs_show_injection_info(F2FS_I_SB(dir), FAULT_DIR_DEPTH); return -ENOSPC; } @@ -909,8 +909,9 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, bit_pos++; ctx->pos = start_pos + bit_pos; printk_ratelimited( - "%s, invalid namelen(0), ino:%u, run fsck to fix.", - KERN_WARNING, le32_to_cpu(de->ino)); + "%sF2FS-fs (%s): invalid namelen(0), ino:%u, run fsck to fix.", + KERN_WARNING, sbi->sb->s_id, + le32_to_cpu(de->ino)); set_sbi_flag(sbi, SBI_NEED_FSCK); continue; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4ca3c2a0a0f5b..031a17bf52a24 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1374,9 +1374,10 @@ struct f2fs_private_dio { }; #ifdef CONFIG_F2FS_FAULT_INJECTION -#define f2fs_show_injection_info(type) \ - printk_ratelimited("%sF2FS-fs : inject %s in %s of %pS\n", \ - KERN_INFO, f2fs_fault_name[type], \ +#define f2fs_show_injection_info(sbi, type) \ + printk_ratelimited("%sF2FS-fs (%s) : inject %s in %s of %pS\n", \ + KERN_INFO, sbi->sb->s_id, \ + f2fs_fault_name[type], \ __func__, __builtin_return_address(0)) static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) { @@ -1396,7 +1397,7 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) return false; } #else -#define f2fs_show_injection_info(type) do { } while (0) +#define f2fs_show_injection_info(sbi, type) do { } while (0) static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) { return false; @@ -1781,7 +1782,7 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, return ret; if (time_to_inject(sbi, FAULT_BLOCK)) { - f2fs_show_injection_info(FAULT_BLOCK); + f2fs_show_injection_info(sbi, FAULT_BLOCK); release = *count; goto release_quota; } @@ -2033,7 +2034,7 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, } if (time_to_inject(sbi, FAULT_BLOCK)) { - f2fs_show_injection_info(FAULT_BLOCK); + f2fs_show_injection_info(sbi, FAULT_BLOCK); goto enospc; } @@ -2148,7 +2149,8 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, return page; if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) { - f2fs_show_injection_info(FAULT_PAGE_ALLOC); + f2fs_show_injection_info(F2FS_M_SB(mapping), + FAULT_PAGE_ALLOC); return NULL; } } @@ -2163,7 +2165,7 @@ static inline struct page *f2fs_pagecache_get_page( int fgp_flags, gfp_t gfp_mask) { if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET)) { - f2fs_show_injection_info(FAULT_PAGE_GET); + f2fs_show_injection_info(F2FS_M_SB(mapping), FAULT_PAGE_GET); return NULL; } @@ -2232,7 +2234,7 @@ static inline struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, return bio; } if (time_to_inject(sbi, FAULT_ALLOC_BIO)) { - f2fs_show_injection_info(FAULT_ALLOC_BIO); + f2fs_show_injection_info(sbi, FAULT_ALLOC_BIO); return NULL; } @@ -2797,7 +2799,7 @@ static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { if (time_to_inject(sbi, FAULT_KMALLOC)) { - f2fs_show_injection_info(FAULT_KMALLOC); + f2fs_show_injection_info(sbi, FAULT_KMALLOC); return NULL; } @@ -2814,7 +2816,7 @@ static inline void *f2fs_kvmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { if (time_to_inject(sbi, FAULT_KVMALLOC)) { - f2fs_show_injection_info(FAULT_KVMALLOC); + f2fs_show_injection_info(sbi, FAULT_KVMALLOC); return NULL; } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 6e58b2e62b189..f98dce4d07b30 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -682,7 +682,7 @@ int f2fs_truncate(struct inode *inode) trace_f2fs_truncate(inode); if (time_to_inject(F2FS_I_SB(inode), FAULT_TRUNCATE)) { - f2fs_show_injection_info(FAULT_TRUNCATE); + f2fs_show_injection_info(F2FS_I_SB(inode), FAULT_TRUNCATE); return -EIO; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index a78aa5480454f..6b13a1a892066 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -54,7 +54,7 @@ static int gc_thread_func(void *data) } if (time_to_inject(sbi, FAULT_CHECKPOINT)) { - f2fs_show_injection_info(FAULT_CHECKPOINT); + f2fs_show_injection_info(sbi, FAULT_CHECKPOINT); f2fs_stop_checkpoint(sbi, false); } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 386ad54c13c3a..502bd491336a8 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -681,7 +681,7 @@ void f2fs_evict_inode(struct inode *inode) err = f2fs_truncate(inode); if (time_to_inject(sbi, FAULT_EVICT_INODE)) { - f2fs_show_injection_info(FAULT_EVICT_INODE); + f2fs_show_injection_info(sbi, FAULT_EVICT_INODE); err = -EIO; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 48bb5d3c709db..4cb182c20eedd 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2406,7 +2406,7 @@ bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct free_nid *i = NULL; retry: if (time_to_inject(sbi, FAULT_ALLOC_NID)) { - f2fs_show_injection_info(FAULT_ALLOC_NID); + f2fs_show_injection_info(sbi, FAULT_ALLOC_NID); return false; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5ba677f85533c..78c54bb7898df 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -489,7 +489,7 @@ int f2fs_commit_inmem_pages(struct inode *inode) void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { if (time_to_inject(sbi, FAULT_CHECKPOINT)) { - f2fs_show_injection_info(FAULT_CHECKPOINT); + f2fs_show_injection_info(sbi, FAULT_CHECKPOINT); f2fs_stop_checkpoint(sbi, false); } @@ -1017,8 +1017,9 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi, if (dc->error) printk_ratelimited( - "%sF2FS-fs: Issue discard(%u, %u, %u) failed, ret: %d", - KERN_INFO, dc->lstart, dc->start, dc->len, dc->error); + "%sF2FS-fs (%s): Issue discard(%u, %u, %u) failed, ret: %d", + KERN_INFO, sbi->sb->s_id, + dc->lstart, dc->start, dc->len, dc->error); __detach_discard_cmd(dcc, dc); } @@ -1158,7 +1159,7 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, dc->len += len; if (time_to_inject(sbi, FAULT_DISCARD)) { - f2fs_show_injection_info(FAULT_DISCARD); + f2fs_show_injection_info(sbi, FAULT_DISCARD); err = -EIO; goto submit; } From aedcb5453ef05464ad2ab91c07b01365b746b728 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Wed, 4 Aug 2021 11:29:46 +0800 Subject: [PATCH 0684/5344] f2fs: reduce the scope of setting fsck tag when de->name_len is zero [ Upstream commit d4bf15a7ce172d186d400d606adf4f34a59130d6 ] I recently found a case where de->name_len is 0 in f2fs_fill_dentries() easily reproduced, and finally set the fsck flag. Thread A Thread B - f2fs_readdir - f2fs_read_inline_dir - ctx->pos = d.max - f2fs_add_dentry - f2fs_add_inline_entry - do_convert_inline_dir - f2fs_add_regular_entry - f2fs_readdir - f2fs_fill_dentries - set_sbi_flag(sbi, SBI_NEED_FSCK) Process A opens the folder, and has been reading without closing it. During this period, Process B created a file under the folder (occupying multiple f2fs_dir_entry, exceeding the d.max of the inline dir). After creation, process A uses the d.max of inline dir to read it again, and it will read that de->name_len is 0. And Chao pointed out that w/o inline conversion, the race condition still can happen as below: dir_entry1: A dir_entry2: B dir_entry3: C free slot: _ ctx->pos: ^ Thread A is traversing directory, ctx-pos moves to below position after readdir() by thread A: AAAABBBB___ ^ Then thread B delete dir_entry2, and create dir_entry3. Thread A calls readdir() to lookup dirents starting from middle of new dirent slots as below: AAAACCCCCC_ ^ In these scenarios, the file system is not damaged, and it's hard to avoid it. But we can bypass tagging FSCK flag if: a) bit_pos (:= ctx->pos % d->max) is non-zero and b) before bit_pos moves to first valid dir_entry. Fixes: ddf06b753a85 ("f2fs: fix to trigger fsck if dirent.name_len is zero") Signed-off-by: Yangtao Li [Chao: clean up description] Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- fs/f2fs/dir.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 7b3fbfe68f8cf..99c4a868d73b0 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -892,6 +892,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode); struct blk_plug plug; bool readdir_ra = sbi->readdir_ra == 1; + bool found_valid_dirent = false; int err = 0; bit_pos = ((unsigned long)ctx->pos % d->max); @@ -906,13 +907,15 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, de = &d->dentry[bit_pos]; if (de->name_len == 0) { + if (found_valid_dirent || !bit_pos) { + printk_ratelimited( + "%sF2FS-fs (%s): invalid namelen(0), ino:%u, run fsck to fix.", + KERN_WARNING, sbi->sb->s_id, + le32_to_cpu(de->ino)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } bit_pos++; ctx->pos = start_pos + bit_pos; - printk_ratelimited( - "%sF2FS-fs (%s): invalid namelen(0), ino:%u, run fsck to fix.", - KERN_WARNING, sbi->sb->s_id, - le32_to_cpu(de->ino)); - set_sbi_flag(sbi, SBI_NEED_FSCK); continue; } @@ -955,6 +958,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, f2fs_ra_node_page(sbi, le32_to_cpu(de->ino)); ctx->pos = start_pos + bit_pos; + found_valid_dirent = true; } out: if (readdir_ra) From 5d04ac0b8fe2e6d509ca63084d80d960dd014790 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 18 Jul 2021 19:33:09 -0700 Subject: [PATCH 0685/5344] openrisc: don't printk() unconditionally [ Upstream commit 946e1052cdcc7e585ee5d1e72528ca49fb295243 ] Don't call printk() when CONFIG_PRINTK is not set. Fixes the following build errors: or1k-linux-ld: arch/openrisc/kernel/entry.o: in function `_external_irq_handler': (.text+0x804): undefined reference to `printk' (.text+0x804): relocation truncated to fit: R_OR1K_INSN_REL_26 against undefined symbol `printk' Fixes: 9d02a4283e9c ("OpenRISC: Boot code") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Jonas Bonn Cc: Stefan Kristiansson Cc: Stafford Horne Cc: openrisc@lists.librecores.org Signed-off-by: Stafford Horne Signed-off-by: Sasha Levin --- arch/openrisc/kernel/entry.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/openrisc/kernel/entry.S b/arch/openrisc/kernel/entry.S index c6481cfc5220f..6b27cf4a0d786 100644 --- a/arch/openrisc/kernel/entry.S +++ b/arch/openrisc/kernel/entry.S @@ -547,6 +547,7 @@ EXCEPTION_ENTRY(_external_irq_handler) l.bnf 1f // ext irq enabled, all ok. l.nop +#ifdef CONFIG_PRINTK l.addi r1,r1,-0x8 l.movhi r3,hi(42f) l.ori r3,r3,lo(42f) @@ -560,6 +561,7 @@ EXCEPTION_ENTRY(_external_irq_handler) .string "\n\rESR interrupt bug: in _external_irq_handler (ESR %x)\n\r" .align 4 .previous +#endif l.ori r4,r4,SPR_SR_IEE // fix the bug // l.sw PT_SR(r1),r4 From 33417e68042b2c1fe08d6b13d5a018be4acb4ce2 Mon Sep 17 00:00:00 2001 From: Anthony Iliopoulos Date: Thu, 22 Jul 2021 16:10:55 +0200 Subject: [PATCH 0686/5344] dma-debug: fix debugfs initialization order [ Upstream commit 173735c346c412d9f084825ecb04f24ada0e2986 ] Due to link order, dma_debug_init is called before debugfs has a chance to initialize (via debugfs_init which also happens in the core initcall stage), so the directories for dma-debug are never created. Decouple dma_debug_fs_init from dma_debug_init and defer its init until core_initcall_sync (after debugfs has been initialized) while letting dma-debug initialization occur as soon as possible to catch any early mappings, as suggested in [1]. [1] https://lore.kernel.org/linux-iommu/YIgGa6yF%2Fadg8OSN@kroah.com/ Fixes: 15b28bbcd567 ("dma-debug: move initialization to common code") Signed-off-by: Anthony Iliopoulos Signed-off-by: Christoph Hellwig Signed-off-by: Sasha Levin --- kernel/dma/debug.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index cb6425e52bf7a..01e893cf9b9f7 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -846,7 +846,7 @@ static int dump_show(struct seq_file *seq, void *v) } DEFINE_SHOW_ATTRIBUTE(dump); -static void dma_debug_fs_init(void) +static int __init dma_debug_fs_init(void) { struct dentry *dentry = debugfs_create_dir("dma-api", NULL); @@ -859,7 +859,10 @@ static void dma_debug_fs_init(void) debugfs_create_u32("nr_total_entries", 0444, dentry, &nr_total_entries); debugfs_create_file("driver_filter", 0644, dentry, NULL, &filter_fops); debugfs_create_file("dump", 0444, dentry, NULL, &dump_fops); + + return 0; } +core_initcall_sync(dma_debug_fs_init); static int device_dma_allocations(struct device *dev, struct dma_debug_entry **out_entry) { @@ -944,8 +947,6 @@ static int dma_debug_init(void) spin_lock_init(&dma_entry_hash[i].lock); } - dma_debug_fs_init(); - nr_pages = DIV_ROUND_UP(nr_prealloc_entries, DMA_DEBUG_DYNAMIC_ENTRIES); for (i = 0; i < nr_pages; ++i) dma_debug_create_entries(GFP_KERNEL); From 7b51e70cda491b91e62f6b5f7bf15db16446e413 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 26 Jul 2021 07:59:23 -0400 Subject: [PATCH 0687/5344] SUNRPC: Fix potential memory corruption [ Upstream commit c2dc3e5fad13aca5d7bdf4bcb52b1a1d707c8555 ] We really should not call rpc_wake_up_queued_task_set_status() with xprt->snd_task as an argument unless we are certain that is actually an rpc_task. Fixes: 0445f92c5d53 ("SUNRPC: Fix disconnection races") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker Signed-off-by: Sasha Levin --- include/linux/sunrpc/xprt.h | 1 + net/sunrpc/xprt.c | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index d7ef5b97174ce..3c6c4b1dbf1a4 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -419,6 +419,7 @@ void xprt_unlock_connect(struct rpc_xprt *, void *); #define XPRT_CONGESTED (9) #define XPRT_CWND_WAIT (10) #define XPRT_WRITE_SPACE (11) +#define XPRT_SND_IS_COOKIE (12) static inline void xprt_set_connected(struct rpc_xprt *xprt) { diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 639837b3a5d90..3653898f465ff 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -729,9 +729,9 @@ void xprt_force_disconnect(struct rpc_xprt *xprt) /* Try to schedule an autoclose RPC call */ if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) queue_work(xprtiod_workqueue, &xprt->task_cleanup); - else if (xprt->snd_task) + else if (xprt->snd_task && !test_bit(XPRT_SND_IS_COOKIE, &xprt->state)) rpc_wake_up_queued_task_set_status(&xprt->pending, - xprt->snd_task, -ENOTCONN); + xprt->snd_task, -ENOTCONN); spin_unlock(&xprt->transport_lock); } EXPORT_SYMBOL_GPL(xprt_force_disconnect); @@ -820,6 +820,7 @@ bool xprt_lock_connect(struct rpc_xprt *xprt, goto out; if (xprt->snd_task != task) goto out; + set_bit(XPRT_SND_IS_COOKIE, &xprt->state); xprt->snd_task = cookie; ret = true; out: @@ -835,6 +836,7 @@ void xprt_unlock_connect(struct rpc_xprt *xprt, void *cookie) if (!test_bit(XPRT_LOCKED, &xprt->state)) goto out; xprt->snd_task =NULL; + clear_bit(XPRT_SND_IS_COOKIE, &xprt->state); xprt->ops->release_xprt(xprt, NULL); xprt_schedule_autodisconnect(xprt); out: From eeeb40f9b026f5187f12320ee09061fd08614be4 Mon Sep 17 00:00:00 2001 From: Wei Li Date: Thu, 15 Jul 2021 11:26:25 +0800 Subject: [PATCH 0688/5344] scsi: fdomain: Fix error return code in fdomain_probe() [ Upstream commit 632c4ae6da1d629eddf9da1e692d7617c568c256 ] If request_region() fails the return value is not set. Return -EBUSY on error. Link: https://lore.kernel.org/r/20210715032625.1395495-1-liwei391@huawei.com Fixes: 8674a8aa2c39 ("scsi: fdomain: Add PCMCIA support") Reported-by: Hulk Robot Signed-off-by: Wei Li Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/pcmcia/fdomain_cs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/pcmcia/fdomain_cs.c b/drivers/scsi/pcmcia/fdomain_cs.c index e42acf314d068..33df6a9ba9b5f 100644 --- a/drivers/scsi/pcmcia/fdomain_cs.c +++ b/drivers/scsi/pcmcia/fdomain_cs.c @@ -45,8 +45,10 @@ static int fdomain_probe(struct pcmcia_device *link) goto fail_disable; if (!request_region(link->resource[0]->start, FDOMAIN_REGION_SIZE, - "fdomain_cs")) + "fdomain_cs")) { + ret = -EBUSY; goto fail_disable; + } sh = fdomain_create(link->resource[0]->start, link->irq, 7, &link->dev); if (!sh) { From 25cd2dc9102e34db8e155ab354699719ad8ec04d Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Thu, 22 Jul 2021 11:39:29 +0800 Subject: [PATCH 0689/5344] pinctrl: single: Fix error return code in pcs_parse_bits_in_pinctrl_entry() [ Upstream commit d789a490d32fdf0465275e3607f8a3bc87d3f3ba ] Fix to return -ENOTSUPP instead of 0 when PCS_HAS_PINCONF is true, which is the same as that returned in pcs_parse_pinconf(). Fixes: 4e7e8017a80e ("pinctrl: pinctrl-single: enhance to configure multiple pins of different modules") Reported-by: Hulk Robot Signed-off-by: Zhen Lei Link: https://lore.kernel.org/r/20210722033930.4034-2-thunder.leizhen@huawei.com Signed-off-by: Linus Walleij Signed-off-by: Sasha Levin --- drivers/pinctrl/pinctrl-single.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pinctrl/pinctrl-single.c b/drivers/pinctrl/pinctrl-single.c index a9d511982780c..fb1c8965cb991 100644 --- a/drivers/pinctrl/pinctrl-single.c +++ b/drivers/pinctrl/pinctrl-single.c @@ -1201,6 +1201,7 @@ static int pcs_parse_bits_in_pinctrl_entry(struct pcs_device *pcs, if (PCS_HAS_PINCONF) { dev_err(pcs->dev, "pinconf not supported\n"); + res = -ENOTSUPP; goto free_pingroups; } From 6a840b07a7715d4a3fabc46da8599e851643b903 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 10 Aug 2021 11:46:13 +0300 Subject: [PATCH 0690/5344] scsi: smartpqi: Fix an error code in pqi_get_raid_map() [ Upstream commit d1f6581a6796c4e9fd8a4a24e8b77463d18f0df1 ] Return -EINVAL on failure instead of success. Link: https://lore.kernel.org/r/20210810084613.GB23810@kili Fixes: a91aaae0243b ("scsi: smartpqi: allow for larger raid maps") Signed-off-by: Dan Carpenter Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/smartpqi/smartpqi_init.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/smartpqi/smartpqi_init.c b/drivers/scsi/smartpqi/smartpqi_init.c index 9bc451004184f..80ff00025c03d 100644 --- a/drivers/scsi/smartpqi/smartpqi_init.c +++ b/drivers/scsi/smartpqi/smartpqi_init.c @@ -1192,6 +1192,7 @@ static int pqi_get_raid_map(struct pqi_ctrl_info *ctrl_info, "Requested %d bytes, received %d bytes", raid_map_size, get_unaligned_le32(&raid_map->structure_size)); + rc = -EINVAL; goto error; } } From e1e5b3046c3959bc918e60b6e9db55ac4d06d4c5 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 10 Aug 2021 11:47:53 +0300 Subject: [PATCH 0691/5344] scsi: qedi: Fix error codes in qedi_alloc_global_queues() [ Upstream commit 4dbe57d46d54a847875fa33e7d05877bb341585e ] This function had some left over code that returned 1 on error instead negative error codes. Convert everything to use negative error codes. The caller treats all non-zero returns the same so this does not affect run time. A couple places set "rc" instead of "status" so those error paths ended up returning success by mistake. Get rid of the "rc" variable and use "status" everywhere. Remove the bogus "status = 0" initialization, as a future proofing measure so the compiler will warn about uninitialized error codes. Link: https://lore.kernel.org/r/20210810084753.GD23810@kili Fixes: ace7f46ba5fd ("scsi: qedi: Add QLogic FastLinQ offload iSCSI driver framework.") Acked-by: Manish Rangankar Signed-off-by: Dan Carpenter Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/qedi/qedi_main.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/scsi/qedi/qedi_main.c b/drivers/scsi/qedi/qedi_main.c index 1ec42c5f0b2a0..92c4a367b7bd7 100644 --- a/drivers/scsi/qedi/qedi_main.c +++ b/drivers/scsi/qedi/qedi_main.c @@ -1553,7 +1553,7 @@ static int qedi_alloc_global_queues(struct qedi_ctx *qedi) { u32 *list; int i; - int status = 0, rc; + int status; u32 *pbl; dma_addr_t page; int num_pages; @@ -1564,14 +1564,14 @@ static int qedi_alloc_global_queues(struct qedi_ctx *qedi) */ if (!qedi->num_queues) { QEDI_ERR(&qedi->dbg_ctx, "No MSI-X vectors available!\n"); - return 1; + return -ENOMEM; } /* Make sure we allocated the PBL that will contain the physical * addresses of our queues */ if (!qedi->p_cpuq) { - status = 1; + status = -EINVAL; goto mem_alloc_failure; } @@ -1586,13 +1586,13 @@ static int qedi_alloc_global_queues(struct qedi_ctx *qedi) "qedi->global_queues=%p.\n", qedi->global_queues); /* Allocate DMA coherent buffers for BDQ */ - rc = qedi_alloc_bdq(qedi); - if (rc) + status = qedi_alloc_bdq(qedi); + if (status) goto mem_alloc_failure; /* Allocate DMA coherent buffers for NVM_ISCSI_CFG */ - rc = qedi_alloc_nvm_iscsi_cfg(qedi); - if (rc) + status = qedi_alloc_nvm_iscsi_cfg(qedi); + if (status) goto mem_alloc_failure; /* Allocate a CQ and an associated PBL for each MSI-X From dee1f9a1a097e7ed6ee0926f650e8c1c43d28611 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 10 Aug 2021 11:51:49 +0300 Subject: [PATCH 0692/5344] scsi: qedf: Fix error codes in qedf_alloc_global_queues() [ Upstream commit ccc89737aa6b9f248cf1623014038beb6c2b7f56 ] This driver has some left over "return 1" on failure style code mixed with "return negative error codes" style code. The caller doesn't care so we should just convert everything to return negative error codes. Then there was a problem that there were two variables used to store error codes which just resulted in confusion. If qedf_alloc_bdq() returned a negative error code, we accidentally returned success instead of propagating the error code. So get rid of the "rc" variable and use "status" every where. Also remove the "status = 0" initialization so that these sorts of bugs will be detected by the compiler in the future. Link: https://lore.kernel.org/r/20210810085023.GA23998@kili Fixes: 61d8658b4a43 ("scsi: qedf: Add QLogic FastLinQ offload FCoE driver framework.") Acked-by: Manish Rangankar Signed-off-by: Dan Carpenter Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/qedf/qedf_main.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/qedf/qedf_main.c b/drivers/scsi/qedf/qedf_main.c index 7a6306f8483ec..c95e04cc64240 100644 --- a/drivers/scsi/qedf/qedf_main.c +++ b/drivers/scsi/qedf/qedf_main.c @@ -2894,7 +2894,7 @@ static int qedf_alloc_global_queues(struct qedf_ctx *qedf) { u32 *list; int i; - int status = 0, rc; + int status; u32 *pbl; dma_addr_t page; int num_pages; @@ -2906,7 +2906,7 @@ static int qedf_alloc_global_queues(struct qedf_ctx *qedf) */ if (!qedf->num_queues) { QEDF_ERR(&(qedf->dbg_ctx), "No MSI-X vectors available!\n"); - return 1; + return -ENOMEM; } /* @@ -2914,7 +2914,7 @@ static int qedf_alloc_global_queues(struct qedf_ctx *qedf) * addresses of our queues */ if (!qedf->p_cpuq) { - status = 1; + status = -EINVAL; QEDF_ERR(&qedf->dbg_ctx, "p_cpuq is NULL.\n"); goto mem_alloc_failure; } @@ -2930,8 +2930,8 @@ static int qedf_alloc_global_queues(struct qedf_ctx *qedf) "qedf->global_queues=%p.\n", qedf->global_queues); /* Allocate DMA coherent buffers for BDQ */ - rc = qedf_alloc_bdq(qedf); - if (rc) { + status = qedf_alloc_bdq(qedf); + if (status) { QEDF_ERR(&qedf->dbg_ctx, "Unable to allocate bdq.\n"); goto mem_alloc_failure; } From cfbaf145e41bd02bfd39d9e407d1deeafe1fb258 Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Tue, 17 Aug 2021 14:24:06 +0930 Subject: [PATCH 0693/5344] powerpc/config: Renable MTD_PHYSMAP_OF [ Upstream commit d0e28a6145c3455b69991245e7f6147eb914b34a ] CONFIG_MTD_PHYSMAP_OF is not longer enabled as it depends on MTD_PHYSMAP which is not enabled. This is a regression from commit 642b1e8dbed7 ("mtd: maps: Merge physmap_of.c into physmap-core.c"), which added the extra dependency. Add CONFIG_MTD_PHYSMAP=y so this stays in the config, as Christophe said it is useful for build coverage. Fixes: 642b1e8dbed7 ("mtd: maps: Merge physmap_of.c into physmap-core.c") Signed-off-by: Joel Stanley Acked-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210817045407.2445664-3-joel@jms.id.au Signed-off-by: Sasha Levin --- arch/powerpc/configs/mpc885_ads_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/configs/mpc885_ads_defconfig b/arch/powerpc/configs/mpc885_ads_defconfig index 285d506c5a769..2f5e06309f096 100644 --- a/arch/powerpc/configs/mpc885_ads_defconfig +++ b/arch/powerpc/configs/mpc885_ads_defconfig @@ -39,6 +39,7 @@ CONFIG_MTD_CFI_GEOMETRY=y # CONFIG_MTD_CFI_I2 is not set CONFIG_MTD_CFI_I4=y CONFIG_MTD_CFI_AMDSTD=y +CONFIG_MTD_PHYSMAP=y CONFIG_MTD_PHYSMAP_OF=y # CONFIG_BLK_DEV is not set CONFIG_NETDEVICES=y From 4861ab67051996d739cb72e24827832e113f79c0 Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Fri, 27 Mar 2020 15:19:52 +0100 Subject: [PATCH 0694/5344] scsi: target: avoid per-loop XCOPY buffer allocations [ Upstream commit 0ad08996da05b6b735d4963dceab7d2a4043607c ] The main target_xcopy_do_work() loop unnecessarily allocates an I/O buffer with each synchronous READ / WRITE pair. This commit significantly reduces allocations by reusing the XCOPY I/O buffer when possible. Link: https://lore.kernel.org/r/20200327141954.955-4-ddiss@suse.de Reviewed-by: Christoph Hellwig Signed-off-by: David Disseldorp Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/target/target_core_xcopy.c | 96 ++++++++++-------------------- drivers/target/target_core_xcopy.h | 1 + 2 files changed, 31 insertions(+), 66 deletions(-) diff --git a/drivers/target/target_core_xcopy.c b/drivers/target/target_core_xcopy.c index 596ad3edec9c0..48fabece76443 100644 --- a/drivers/target/target_core_xcopy.c +++ b/drivers/target/target_core_xcopy.c @@ -533,7 +533,6 @@ void target_xcopy_release_pt(void) * @cdb: SCSI CDB to be copied into @xpt_cmd. * @remote_port: If false, use the LUN through which the XCOPY command has * been received. If true, use @se_dev->xcopy_lun. - * @alloc_mem: Whether or not to allocate an SGL list. * * Set up a SCSI command (READ or WRITE) that will be used to execute an * XCOPY command. @@ -543,12 +542,9 @@ static int target_xcopy_setup_pt_cmd( struct xcopy_op *xop, struct se_device *se_dev, unsigned char *cdb, - bool remote_port, - bool alloc_mem) + bool remote_port) { struct se_cmd *cmd = &xpt_cmd->se_cmd; - sense_reason_t sense_rc; - int ret = 0, rc; /* * Setup LUN+port to honor reservations based upon xop->op_origin for @@ -564,46 +560,17 @@ static int target_xcopy_setup_pt_cmd( cmd->se_cmd_flags |= SCF_SE_LUN_CMD; cmd->tag = 0; - sense_rc = target_setup_cmd_from_cdb(cmd, cdb); - if (sense_rc) { - ret = -EINVAL; - goto out; - } + if (target_setup_cmd_from_cdb(cmd, cdb)) + return -EINVAL; - if (alloc_mem) { - rc = target_alloc_sgl(&cmd->t_data_sg, &cmd->t_data_nents, - cmd->data_length, false, false); - if (rc < 0) { - ret = rc; - goto out; - } - /* - * Set this bit so that transport_free_pages() allows the - * caller to release SGLs + physical memory allocated by - * transport_generic_get_mem().. - */ - cmd->se_cmd_flags |= SCF_PASSTHROUGH_SG_TO_MEM_NOALLOC; - } else { - /* - * Here the previously allocated SGLs for the internal READ - * are mapped zero-copy to the internal WRITE. - */ - sense_rc = transport_generic_map_mem_to_cmd(cmd, - xop->xop_data_sg, xop->xop_data_nents, - NULL, 0); - if (sense_rc) { - ret = -EINVAL; - goto out; - } + if (transport_generic_map_mem_to_cmd(cmd, xop->xop_data_sg, + xop->xop_data_nents, NULL, 0)) + return -EINVAL; - pr_debug("Setup PASSTHROUGH_NOALLOC t_data_sg: %p t_data_nents:" - " %u\n", cmd->t_data_sg, cmd->t_data_nents); - } + pr_debug("Setup PASSTHROUGH_NOALLOC t_data_sg: %p t_data_nents:" + " %u\n", cmd->t_data_sg, cmd->t_data_nents); return 0; - -out: - return ret; } static int target_xcopy_issue_pt_cmd(struct xcopy_pt_cmd *xpt_cmd) @@ -660,15 +627,13 @@ static int target_xcopy_read_source( xop->src_pt_cmd = xpt_cmd; rc = target_xcopy_setup_pt_cmd(xpt_cmd, xop, src_dev, &cdb[0], - remote_port, true); + remote_port); if (rc < 0) { ec_cmd->scsi_status = xpt_cmd->se_cmd.scsi_status; transport_generic_free_cmd(se_cmd, 0); return rc; } - xop->xop_data_sg = se_cmd->t_data_sg; - xop->xop_data_nents = se_cmd->t_data_nents; pr_debug("XCOPY-READ: Saved xop->xop_data_sg: %p, num: %u for READ" " memory\n", xop->xop_data_sg, xop->xop_data_nents); @@ -678,12 +643,6 @@ static int target_xcopy_read_source( transport_generic_free_cmd(se_cmd, 0); return rc; } - /* - * Clear off the allocated t_data_sg, that has been saved for - * zero-copy WRITE submission reuse in struct xcopy_op.. - */ - se_cmd->t_data_sg = NULL; - se_cmd->t_data_nents = 0; return 0; } @@ -722,19 +681,9 @@ static int target_xcopy_write_destination( xop->dst_pt_cmd = xpt_cmd; rc = target_xcopy_setup_pt_cmd(xpt_cmd, xop, dst_dev, &cdb[0], - remote_port, false); + remote_port); if (rc < 0) { - struct se_cmd *src_cmd = &xop->src_pt_cmd->se_cmd; ec_cmd->scsi_status = xpt_cmd->se_cmd.scsi_status; - /* - * If the failure happened before the t_mem_list hand-off in - * target_xcopy_setup_pt_cmd(), Reset memory + clear flag so that - * core releases this memory on error during X-COPY WRITE I/O. - */ - src_cmd->se_cmd_flags &= ~SCF_PASSTHROUGH_SG_TO_MEM_NOALLOC; - src_cmd->t_data_sg = xop->xop_data_sg; - src_cmd->t_data_nents = xop->xop_data_nents; - transport_generic_free_cmd(se_cmd, 0); return rc; } @@ -742,7 +691,6 @@ static int target_xcopy_write_destination( rc = target_xcopy_issue_pt_cmd(xpt_cmd); if (rc < 0) { ec_cmd->scsi_status = xpt_cmd->se_cmd.scsi_status; - se_cmd->se_cmd_flags &= ~SCF_PASSTHROUGH_SG_TO_MEM_NOALLOC; transport_generic_free_cmd(se_cmd, 0); return rc; } @@ -758,7 +706,7 @@ static void target_xcopy_do_work(struct work_struct *work) sector_t src_lba, dst_lba, end_lba; unsigned int max_sectors; int rc = 0; - unsigned short nolb, cur_nolb, max_nolb, copied_nolb = 0; + unsigned short nolb, max_nolb, copied_nolb = 0; if (target_parse_xcopy_cmd(xop) != TCM_NO_SENSE) goto err_free; @@ -788,7 +736,23 @@ static void target_xcopy_do_work(struct work_struct *work) (unsigned long long)src_lba, (unsigned long long)dst_lba); while (src_lba < end_lba) { - cur_nolb = min(nolb, max_nolb); + unsigned short cur_nolb = min(nolb, max_nolb); + u32 cur_bytes = cur_nolb * src_dev->dev_attrib.block_size; + + if (cur_bytes != xop->xop_data_bytes) { + /* + * (Re)allocate a buffer large enough to hold the XCOPY + * I/O size, which can be reused each read / write loop. + */ + target_free_sgl(xop->xop_data_sg, xop->xop_data_nents); + rc = target_alloc_sgl(&xop->xop_data_sg, + &xop->xop_data_nents, + cur_bytes, + false, false); + if (rc < 0) + goto out; + xop->xop_data_bytes = cur_bytes; + } pr_debug("target_xcopy_do_work: Calling read src_dev: %p src_lba: %llu," " cur_nolb: %hu\n", src_dev, (unsigned long long)src_lba, cur_nolb); @@ -819,12 +783,11 @@ static void target_xcopy_do_work(struct work_struct *work) nolb -= cur_nolb; transport_generic_free_cmd(&xop->src_pt_cmd->se_cmd, 0); - xop->dst_pt_cmd->se_cmd.se_cmd_flags &= ~SCF_PASSTHROUGH_SG_TO_MEM_NOALLOC; - transport_generic_free_cmd(&xop->dst_pt_cmd->se_cmd, 0); } xcopy_pt_undepend_remotedev(xop); + target_free_sgl(xop->xop_data_sg, xop->xop_data_nents); kfree(xop); pr_debug("target_xcopy_do_work: Final src_lba: %llu, dst_lba: %llu\n", @@ -838,6 +801,7 @@ static void target_xcopy_do_work(struct work_struct *work) out: xcopy_pt_undepend_remotedev(xop); + target_free_sgl(xop->xop_data_sg, xop->xop_data_nents); err_free: kfree(xop); diff --git a/drivers/target/target_core_xcopy.h b/drivers/target/target_core_xcopy.h index 974bc1e19ff2b..a1805a14eea07 100644 --- a/drivers/target/target_core_xcopy.h +++ b/drivers/target/target_core_xcopy.h @@ -41,6 +41,7 @@ struct xcopy_op { struct xcopy_pt_cmd *src_pt_cmd; struct xcopy_pt_cmd *dst_pt_cmd; + u32 xop_data_bytes; u32 xop_data_nents; struct scatterlist *xop_data_sg; struct work_struct xop_work; From b3f512fcf821bceaa99eac44105c622421a71764 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 12 Aug 2021 02:00:43 +1000 Subject: [PATCH 0695/5344] KVM: PPC: Book3S HV Nested: Reflect guest PMU in-use to L0 when guest SPRs are live [ Upstream commit 1782663897945a5cf28e564ba5eed730098e9aa4 ] After the L1 saves its PMU SPRs but before loading the L2's PMU SPRs, switch the pmcregs_in_use field in the L1 lppaca to the value advertised by the L2 in its VPA. On the way out of the L2, set it back after saving the L2 PMU registers (if they were in-use). This transfers the PMU liveness indication between the L1 and L2 at the points where the registers are not live. This fixes the nested HV bug for which a workaround was added to the L0 HV by commit 63279eeb7f93a ("KVM: PPC: Book3S HV: Always save guest pmu for guest capable of nesting"), which explains the problem in detail. That workaround is no longer required for guests that include this bug fix. Fixes: 360cae313702 ("KVM: PPC: Book3S HV: Nested guest entry via hypercall") Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20210811160134.904987-10-npiggin@gmail.com Signed-off-by: Sasha Levin --- arch/powerpc/include/asm/pmc.h | 7 +++++++ arch/powerpc/kvm/book3s_hv.c | 20 ++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/arch/powerpc/include/asm/pmc.h b/arch/powerpc/include/asm/pmc.h index c6bbe9778d3cd..3c09109e708ef 100644 --- a/arch/powerpc/include/asm/pmc.h +++ b/arch/powerpc/include/asm/pmc.h @@ -34,6 +34,13 @@ static inline void ppc_set_pmu_inuse(int inuse) #endif } +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +static inline int ppc_get_pmu_inuse(void) +{ + return get_paca()->pmcregs_in_use; +} +#endif + extern void power4_enable_pmcs(void); #else /* CONFIG_PPC64 */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 6f48e68f98b98..e51cf193a9633 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -58,6 +58,7 @@ #include #include #include +#include #include #include #include @@ -3559,6 +3560,18 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true); +#ifdef CONFIG_PPC_PSERIES + if (kvmhv_on_pseries()) { + barrier(); + if (vcpu->arch.vpa.pinned_addr) { + struct lppaca *lp = vcpu->arch.vpa.pinned_addr; + get_lppaca()->pmcregs_in_use = lp->pmcregs_in_use; + } else { + get_lppaca()->pmcregs_in_use = 1; + } + barrier(); + } +#endif kvmhv_load_guest_pmu(vcpu); msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX); @@ -3693,6 +3706,13 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, save_pmu |= nesting_enabled(vcpu->kvm); kvmhv_save_guest_pmu(vcpu, save_pmu); +#ifdef CONFIG_PPC_PSERIES + if (kvmhv_on_pseries()) { + barrier(); + get_lppaca()->pmcregs_in_use = ppc_get_pmu_inuse(); + barrier(); + } +#endif vc->entry_exit_map = 0x101; vc->in_guest = 0; From 4a42eb3704d30de5c3d9a0ab330aa77750fce82e Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 26 Aug 2021 16:08:22 +0200 Subject: [PATCH 0696/5344] platform/x86: dell-smbios-wmi: Add missing kfree in error-exit from run_smbios_call [ Upstream commit 0487d4fc42d7f31a56cfd9e2237f9ebd889e6112 ] As pointed out be Kees Cook if we return -EIO because the obj->type != ACPI_TYPE_BUFFER, then we must kfree the output buffer before the return. Fixes: 1a258e670434 ("platform/x86: dell-smbios-wmi: Add new WMI dispatcher driver") Reported-by: Kees Cook Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20210826140822.71198-1-hdegoede@redhat.com Signed-off-by: Sasha Levin --- drivers/platform/x86/dell-smbios-wmi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/dell-smbios-wmi.c b/drivers/platform/x86/dell-smbios-wmi.c index c97bd4a452422..5821e9d9a4ce4 100644 --- a/drivers/platform/x86/dell-smbios-wmi.c +++ b/drivers/platform/x86/dell-smbios-wmi.c @@ -69,6 +69,7 @@ static int run_smbios_call(struct wmi_device *wdev) if (obj->type == ACPI_TYPE_INTEGER) dev_dbg(&wdev->dev, "SMBIOS call failed: %llu\n", obj->integer.value); + kfree(output.pointer); return -EIO; } memcpy(&priv->buf->std, obj->buffer.pointer, obj->buffer.length); From 40433a0fa0bef8a41278df9c3ded13de8d3672a1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 17 Jun 2021 14:21:00 +0100 Subject: [PATCH 0697/5344] fscache: Fix cookie key hashing [ Upstream commit 35b72573e977ed6b18b094136a4fa3e0ffb13603 ] The current hash algorithm used for hashing cookie keys is really bad, producing almost no dispersion (after a test kernel build, ~30000 files were split over just 18 out of the 32768 hash buckets). Borrow the full_name_hash() hash function into fscache to do the hashing for cookie keys and, in the future, volume keys. I don't want to use full_name_hash() as-is because I want the hash value to be consistent across arches and over time as the hash value produced may get used on disk. I can also optimise parts of it away as the key will always be a padded array of aligned 32-bit words. Fixes: ec0328e46d6e ("fscache: Maintain a catalogue of allocated cookies") Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/162431201844.2908479.8293647220901514696.stgit@warthog.procyon.org.uk/ Signed-off-by: Sasha Levin --- fs/fscache/cookie.c | 14 +------------- fs/fscache/internal.h | 2 ++ fs/fscache/main.c | 39 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 13 deletions(-) diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 0ce39658a6200..44a426c8ea01e 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -74,10 +74,8 @@ void fscache_free_cookie(struct fscache_cookie *cookie) static int fscache_set_key(struct fscache_cookie *cookie, const void *index_key, size_t index_key_len) { - unsigned long long h; u32 *buf; int bufs; - int i; bufs = DIV_ROUND_UP(index_key_len, sizeof(*buf)); @@ -91,17 +89,7 @@ static int fscache_set_key(struct fscache_cookie *cookie, } memcpy(buf, index_key, index_key_len); - - /* Calculate a hash and combine this with the length in the first word - * or first half word - */ - h = (unsigned long)cookie->parent; - h += index_key_len + cookie->type; - - for (i = 0; i < bufs; i++) - h += buf[i]; - - cookie->key_hash = h ^ (h >> 32); + cookie->key_hash = fscache_hash(0, buf, bufs); return 0; } diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 9616af3768e11..d09d4e69c818e 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -97,6 +97,8 @@ extern struct workqueue_struct *fscache_object_wq; extern struct workqueue_struct *fscache_op_wq; DECLARE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); +extern unsigned int fscache_hash(unsigned int salt, unsigned int *data, unsigned int n); + static inline bool fscache_object_congested(void) { return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq); diff --git a/fs/fscache/main.c b/fs/fscache/main.c index 59c2494efda34..3aa3756c71761 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -94,6 +94,45 @@ static struct ctl_table fscache_sysctls_root[] = { }; #endif +/* + * Mixing scores (in bits) for (7,20): + * Input delta: 1-bit 2-bit + * 1 round: 330.3 9201.6 + * 2 rounds: 1246.4 25475.4 + * 3 rounds: 1907.1 31295.1 + * 4 rounds: 2042.3 31718.6 + * Perfect: 2048 31744 + * (32*64) (32*31/2 * 64) + */ +#define HASH_MIX(x, y, a) \ + ( x ^= (a), \ + y ^= x, x = rol32(x, 7),\ + x += y, y = rol32(y,20),\ + y *= 9 ) + +static inline unsigned int fold_hash(unsigned long x, unsigned long y) +{ + /* Use arch-optimized multiply if one exists */ + return __hash_32(y ^ __hash_32(x)); +} + +/* + * Generate a hash. This is derived from full_name_hash(), but we want to be + * sure it is arch independent and that it doesn't change as bits of the + * computed hash value might appear on disk. The caller also guarantees that + * the hashed data will be a series of aligned 32-bit words. + */ +unsigned int fscache_hash(unsigned int salt, unsigned int *data, unsigned int n) +{ + unsigned int a, x = 0, y = salt; + + for (; n; n--) { + a = *data++; + HASH_MIX(x, y, a); + } + return fold_hash(x, y); +} + /* * initialise the fs caching module */ From 3d625db7535644e7c43489c60279f2d9c6836a6c Mon Sep 17 00:00:00 2001 From: Codrin Ciubotariu Date: Fri, 31 Jan 2020 13:58:16 +0200 Subject: [PATCH 0698/5344] clk: at91: sam9x60: Don't use audio PLL [ Upstream commit 5bf7f4a249387a6062b9a14c8a77e7ba2fd6a53b ] On sam9x60, there is not audio PLL and so I2S and classD have to use one of the best matching parents for their generated clock. Fixes: 01e2113de9a5 ("clk: at91: add sam9x60 pmc driver") Signed-off-by: Codrin Ciubotariu Link: https://lkml.kernel.org/r/20200131115816.12483-1-codrin.ciubotariu@microchip.com Signed-off-by: Stephen Boyd Signed-off-by: Sasha Levin --- drivers/clk/at91/sam9x60.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/clk/at91/sam9x60.c b/drivers/clk/at91/sam9x60.c index e3f4c8f20223a..bee1120e7041c 100644 --- a/drivers/clk/at91/sam9x60.c +++ b/drivers/clk/at91/sam9x60.c @@ -124,7 +124,6 @@ static const struct { char *n; u8 id; struct clk_range r; - bool pll; } sam9x60_gck[] = { { .n = "flex0_gclk", .id = 5, }, { .n = "flex1_gclk", .id = 6, }, @@ -144,11 +143,9 @@ static const struct { { .n = "sdmmc1_gclk", .id = 26, .r = { .min = 0, .max = 105000000 }, }, { .n = "flex11_gclk", .id = 32, }, { .n = "flex12_gclk", .id = 33, }, - { .n = "i2s_gclk", .id = 34, .r = { .min = 0, .max = 105000000 }, - .pll = true, }, + { .n = "i2s_gclk", .id = 34, .r = { .min = 0, .max = 105000000 }, }, { .n = "pit64b_gclk", .id = 37, }, - { .n = "classd_gclk", .id = 42, .r = { .min = 0, .max = 100000000 }, - .pll = true, }, + { .n = "classd_gclk", .id = 42, .r = { .min = 0, .max = 100000000 }, }, { .n = "tcb1_gclk", .id = 45, }, { .n = "dbgu_gclk", .id = 47, }, }; @@ -285,7 +282,7 @@ static void __init sam9x60_pmc_setup(struct device_node *np) sam9x60_gck[i].n, parent_names, 6, sam9x60_gck[i].id, - sam9x60_gck[i].pll, + false, &sam9x60_gck[i].r); if (IS_ERR(hw)) goto err_free; From 5bf1392ba6359c66ff383c42a099dfee8c4f4040 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Wed, 22 Jul 2020 10:38:18 +0300 Subject: [PATCH 0699/5344] clk: at91: clk-generated: pass the id of changeable parent at registration [ Upstream commit 64c9247b9e87e96e41cea545eb64727cee10c55c ] Pass the ID of changeable parent at registration. This will allow the scalability of this clock driver with regards to the changeable parent ID for versions of this IP where changeable parent is not the last one in the parents list (e.g. SAMA7G5). With this the clock flags are set to zero in case we have no changeable parent. Also in clk_generated_best_diff() the *best_diff variable is check against tmp_diff variable using ">=" operator instead of ">" so that in case the requested frequency could be obtained using fix parents + gck dividers but the clock also supports changeable parent to be able to force the usage of the changeable parent. Signed-off-by: Claudiu Beznea Link: https://lore.kernel.org/r/1595403506-8209-11-git-send-email-claudiu.beznea@microchip.com Signed-off-by: Stephen Boyd Signed-off-by: Sasha Levin --- drivers/clk/at91/clk-generated.c | 26 ++++++++++++++------------ drivers/clk/at91/dt-compat.c | 8 +++++--- drivers/clk/at91/pmc.h | 4 ++-- drivers/clk/at91/sam9x60.c | 3 +-- drivers/clk/at91/sama5d2.c | 31 +++++++++++++++---------------- 5 files changed, 37 insertions(+), 35 deletions(-) diff --git a/drivers/clk/at91/clk-generated.c b/drivers/clk/at91/clk-generated.c index 44a46dcc0518b..273d0447d7273 100644 --- a/drivers/clk/at91/clk-generated.c +++ b/drivers/clk/at91/clk-generated.c @@ -18,8 +18,6 @@ #define GENERATED_MAX_DIV 255 -#define GCK_INDEX_DT_AUDIO_PLL 5 - struct clk_generated { struct clk_hw hw; struct regmap *regmap; @@ -29,7 +27,7 @@ struct clk_generated { u32 gckdiv; const struct clk_pcr_layout *layout; u8 parent_id; - bool audio_pll_allowed; + int chg_pid; }; #define to_clk_generated(hw) \ @@ -109,7 +107,7 @@ static void clk_generated_best_diff(struct clk_rate_request *req, tmp_rate = parent_rate / div; tmp_diff = abs(req->rate - tmp_rate); - if (*best_diff < 0 || *best_diff > tmp_diff) { + if (*best_diff < 0 || *best_diff >= tmp_diff) { *best_rate = tmp_rate; *best_diff = tmp_diff; req->best_parent_rate = parent_rate; @@ -129,7 +127,10 @@ static int clk_generated_determine_rate(struct clk_hw *hw, int i; u32 div; - for (i = 0; i < clk_hw_get_num_parents(hw) - 1; i++) { + for (i = 0; i < clk_hw_get_num_parents(hw); i++) { + if (gck->chg_pid == i) + continue; + parent = clk_hw_get_parent_by_index(hw, i); if (!parent) continue; @@ -161,10 +162,10 @@ static int clk_generated_determine_rate(struct clk_hw *hw, * that the only clks able to modify gck rate are those of audio IPs. */ - if (!gck->audio_pll_allowed) + if (gck->chg_pid < 0) goto end; - parent = clk_hw_get_parent_by_index(hw, GCK_INDEX_DT_AUDIO_PLL); + parent = clk_hw_get_parent_by_index(hw, gck->chg_pid); if (!parent) goto end; @@ -271,8 +272,8 @@ struct clk_hw * __init at91_clk_register_generated(struct regmap *regmap, spinlock_t *lock, const struct clk_pcr_layout *layout, const char *name, const char **parent_names, - u8 num_parents, u8 id, bool pll_audio, - const struct clk_range *range) + u8 num_parents, u8 id, + const struct clk_range *range, int chg_pid) { struct clk_generated *gck; struct clk_init_data init; @@ -287,15 +288,16 @@ at91_clk_register_generated(struct regmap *regmap, spinlock_t *lock, init.ops = &generated_ops; init.parent_names = parent_names; init.num_parents = num_parents; - init.flags = CLK_SET_RATE_GATE | CLK_SET_PARENT_GATE | - CLK_SET_RATE_PARENT; + init.flags = CLK_SET_RATE_GATE | CLK_SET_PARENT_GATE; + if (chg_pid >= 0) + init.flags |= CLK_SET_RATE_PARENT; gck->id = id; gck->hw.init = &init; gck->regmap = regmap; gck->lock = lock; gck->range = *range; - gck->audio_pll_allowed = pll_audio; + gck->chg_pid = chg_pid; gck->layout = layout; clk_generated_startup(gck); diff --git a/drivers/clk/at91/dt-compat.c b/drivers/clk/at91/dt-compat.c index aa1754eac59ff..8a652c44c25ab 100644 --- a/drivers/clk/at91/dt-compat.c +++ b/drivers/clk/at91/dt-compat.c @@ -22,6 +22,8 @@ #define SYSTEM_MAX_ID 31 +#define GCK_INDEX_DT_AUDIO_PLL 5 + #ifdef CONFIG_HAVE_AT91_AUDIO_PLL static void __init of_sama5d2_clk_audio_pll_frac_setup(struct device_node *np) { @@ -135,7 +137,7 @@ static void __init of_sama5d2_clk_generated_setup(struct device_node *np) return; for_each_child_of_node(np, gcknp) { - bool pll_audio = false; + int chg_pid = INT_MIN; if (of_property_read_u32(gcknp, "reg", &id)) continue; @@ -152,12 +154,12 @@ static void __init of_sama5d2_clk_generated_setup(struct device_node *np) if (of_device_is_compatible(np, "atmel,sama5d2-clk-generated") && (id == GCK_ID_I2S0 || id == GCK_ID_I2S1 || id == GCK_ID_CLASSD)) - pll_audio = true; + chg_pid = GCK_INDEX_DT_AUDIO_PLL; hw = at91_clk_register_generated(regmap, &pmc_pcr_lock, &dt_pcr_layout, name, parent_names, num_parents, - id, pll_audio, &range); + id, &range, chg_pid); if (IS_ERR(hw)) continue; diff --git a/drivers/clk/at91/pmc.h b/drivers/clk/at91/pmc.h index 9b8db9cdcda53..8a88ad2360742 100644 --- a/drivers/clk/at91/pmc.h +++ b/drivers/clk/at91/pmc.h @@ -118,8 +118,8 @@ struct clk_hw * __init at91_clk_register_generated(struct regmap *regmap, spinlock_t *lock, const struct clk_pcr_layout *layout, const char *name, const char **parent_names, - u8 num_parents, u8 id, bool pll_audio, - const struct clk_range *range); + u8 num_parents, u8 id, + const struct clk_range *range, int chg_pid); struct clk_hw * __init at91_clk_register_h32mx(struct regmap *regmap, const char *name, diff --git a/drivers/clk/at91/sam9x60.c b/drivers/clk/at91/sam9x60.c index bee1120e7041c..39923899478f9 100644 --- a/drivers/clk/at91/sam9x60.c +++ b/drivers/clk/at91/sam9x60.c @@ -282,8 +282,7 @@ static void __init sam9x60_pmc_setup(struct device_node *np) sam9x60_gck[i].n, parent_names, 6, sam9x60_gck[i].id, - false, - &sam9x60_gck[i].r); + &sam9x60_gck[i].r, INT_MIN); if (IS_ERR(hw)) goto err_free; diff --git a/drivers/clk/at91/sama5d2.c b/drivers/clk/at91/sama5d2.c index ff7e3f727082e..d3c4bceb032d1 100644 --- a/drivers/clk/at91/sama5d2.c +++ b/drivers/clk/at91/sama5d2.c @@ -115,21 +115,20 @@ static const struct { char *n; u8 id; struct clk_range r; - bool pll; + int chg_pid; } sama5d2_gck[] = { - { .n = "sdmmc0_gclk", .id = 31, }, - { .n = "sdmmc1_gclk", .id = 32, }, - { .n = "tcb0_gclk", .id = 35, .r = { .min = 0, .max = 83000000 }, }, - { .n = "tcb1_gclk", .id = 36, .r = { .min = 0, .max = 83000000 }, }, - { .n = "pwm_gclk", .id = 38, .r = { .min = 0, .max = 83000000 }, }, - { .n = "isc_gclk", .id = 46, }, - { .n = "pdmic_gclk", .id = 48, }, - { .n = "i2s0_gclk", .id = 54, .pll = true }, - { .n = "i2s1_gclk", .id = 55, .pll = true }, - { .n = "can0_gclk", .id = 56, .r = { .min = 0, .max = 80000000 }, }, - { .n = "can1_gclk", .id = 57, .r = { .min = 0, .max = 80000000 }, }, - { .n = "classd_gclk", .id = 59, .r = { .min = 0, .max = 100000000 }, - .pll = true }, + { .n = "sdmmc0_gclk", .id = 31, .chg_pid = INT_MIN, }, + { .n = "sdmmc1_gclk", .id = 32, .chg_pid = INT_MIN, }, + { .n = "tcb0_gclk", .id = 35, .chg_pid = INT_MIN, .r = { .min = 0, .max = 83000000 }, }, + { .n = "tcb1_gclk", .id = 36, .chg_pid = INT_MIN, .r = { .min = 0, .max = 83000000 }, }, + { .n = "pwm_gclk", .id = 38, .chg_pid = INT_MIN, .r = { .min = 0, .max = 83000000 }, }, + { .n = "isc_gclk", .id = 46, .chg_pid = INT_MIN, }, + { .n = "pdmic_gclk", .id = 48, .chg_pid = INT_MIN, }, + { .n = "i2s0_gclk", .id = 54, .chg_pid = 5, }, + { .n = "i2s1_gclk", .id = 55, .chg_pid = 5, }, + { .n = "can0_gclk", .id = 56, .chg_pid = INT_MIN, .r = { .min = 0, .max = 80000000 }, }, + { .n = "can1_gclk", .id = 57, .chg_pid = INT_MIN, .r = { .min = 0, .max = 80000000 }, }, + { .n = "classd_gclk", .id = 59, .chg_pid = 5, .r = { .min = 0, .max = 100000000 }, }, }; static const struct clk_programmable_layout sama5d2_programmable_layout = { @@ -317,8 +316,8 @@ static void __init sama5d2_pmc_setup(struct device_node *np) sama5d2_gck[i].n, parent_names, 6, sama5d2_gck[i].id, - sama5d2_gck[i].pll, - &sama5d2_gck[i].r); + &sama5d2_gck[i].r, + sama5d2_gck[i].chg_pid); if (IS_ERR(hw)) goto err_free; From ddc3aa36e79dfd56bc9533adf0b45c690be939e2 Mon Sep 17 00:00:00 2001 From: Codrin Ciubotariu Date: Wed, 7 Jul 2021 16:12:13 +0300 Subject: [PATCH 0700/5344] clk: at91: clk-generated: Limit the requested rate to our range [ Upstream commit af7651e67b9d5f7e63ea23b118e3672ac662244a ] On clk_generated_determine_rate(), the requested rate could be outside of clk's range. Limit the rate to the clock's range to not return an error. Fixes: df70aeef6083 ("clk: at91: add generated clock driver") Signed-off-by: Codrin Ciubotariu Link: https://lore.kernel.org/r/20210707131213.3283509-1-codrin.ciubotariu@microchip.com Acked-by: Nicolas Ferre Signed-off-by: Stephen Boyd Signed-off-by: Sasha Levin --- drivers/clk/at91/clk-generated.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/clk/at91/clk-generated.c b/drivers/clk/at91/clk-generated.c index 273d0447d7273..d7fe1303f79dc 100644 --- a/drivers/clk/at91/clk-generated.c +++ b/drivers/clk/at91/clk-generated.c @@ -127,6 +127,12 @@ static int clk_generated_determine_rate(struct clk_hw *hw, int i; u32 div; + /* do not look for a rate that is outside of our range */ + if (gck->range.max && req->rate > gck->range.max) + req->rate = gck->range.max; + if (gck->range.min && req->rate < gck->range.min) + req->rate = gck->range.min; + for (i = 0; i < clk_hw_get_num_parents(hw); i++) { if (gck->chg_pid == i) continue; From 869c2920106a10afe35d81a3bc80f3e5163c3bda Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 27 Aug 2021 14:07:06 +1000 Subject: [PATCH 0701/5344] KVM: PPC: Fix clearing never mapped TCEs in realmode [ Upstream commit 1d78dfde33a02da1d816279c2e3452978b7abd39 ] Since commit e1a1ef84cd07 ("KVM: PPC: Book3S: Allocate guest TCEs on demand too"), pages for TCE tables for KVM guests are allocated only when needed. This allows skipping any update when clearing TCEs. This works mostly fine as TCE updates are handled when the MMU is enabled. The realmode handlers fail with H_TOO_HARD when pages are not yet allocated, except when clearing a TCE in which case KVM prints a warning and proceeds to dereference a NULL pointer, which crashes the host OS. This has not been caught so far as the change in commit e1a1ef84cd07 is reasonably new, and POWER9 runs mostly radix which does not use realmode handlers. With hash, the default TCE table is memset() by QEMU when the machine is reset which triggers page faults and the KVM TCE device's kvm_spapr_tce_fault() handles those with MMU on. And the huge DMA windows are not cleared by VMs which instead successfully create a DMA window big enough to map the VM memory 1:1 and then VMs just map everything without clearing. This started crashing now as commit 381ceda88c4c ("powerpc/pseries/iommu: Make use of DDW for indirect mapping") added a mode when a dymanic DMA window not big enough to map the VM memory 1:1 but it is used anyway, and the VM now is the first (i.e. not QEMU) to clear a just created table. Note that upstream QEMU needs to be modified to trigger the VM to trigger the host OS crash. This replaces WARN_ON_ONCE_RM() with a check and return, and adds another warning if TCE is not being cleared. Fixes: e1a1ef84cd07 ("KVM: PPC: Book3S: Allocate guest TCEs on demand too") Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210827040706.517652-1-aik@ozlabs.ru Signed-off-by: Sasha Levin --- arch/powerpc/kvm/book3s_64_vio_hv.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index ab6eeb8e753e5..35fd67b4ceb41 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -177,10 +177,13 @@ static void kvmppc_rm_tce_put(struct kvmppc_spapr_tce_table *stt, idx -= stt->offset; page = stt->pages[idx / TCES_PER_PAGE]; /* - * page must not be NULL in real mode, - * kvmppc_rm_ioba_validate() must have taken care of this. + * kvmppc_rm_ioba_validate() allows pages not be allocated if TCE is + * being cleared, otherwise it returns H_TOO_HARD and we skip this. */ - WARN_ON_ONCE_RM(!page); + if (!page) { + WARN_ON_ONCE_RM(tce != 0); + return; + } tbl = kvmppc_page_address(page); tbl[idx % TCES_PER_PAGE] = tce; From 9eb09e4d0b6e2e42aaef781fd35013034fdc13e2 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 24 Aug 2021 08:12:08 +0800 Subject: [PATCH 0702/5344] f2fs: fix to account missing .skipped_gc_rwsem [ Upstream commit ad126ebddecbf696e0cf214ff56c7b170fa9f0f7 ] There is a missing place we forgot to account .skipped_gc_rwsem, fix it. Fixes: 6f8d4455060d ("f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- fs/f2fs/gc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 6b13a1a892066..4b6c36208f552 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1095,8 +1095,10 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, int err; if (S_ISREG(inode->i_mode)) { - if (!down_write_trylock(&fi->i_gc_rwsem[READ])) + if (!down_write_trylock(&fi->i_gc_rwsem[READ])) { + sbi->skipped_gc_rwsem++; continue; + } if (!down_write_trylock( &fi->i_gc_rwsem[WRITE])) { sbi->skipped_gc_rwsem++; From 48ed11d8d492e98fcfba436fede1c7facc13fb8b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 26 Aug 2021 10:03:15 +0800 Subject: [PATCH 0703/5344] f2fs: fix unexpected ENOENT comes from f2fs_map_blocks() [ Upstream commit adf9ea89c719c1d23794e363f631e376b3ff8cbc ] In below path, it will return ENOENT if filesystem is shutdown: - f2fs_map_blocks - f2fs_get_dnode_of_data - f2fs_get_node_page - __get_node_page - read_node_page - is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) return -ENOENT - force return value from ENOENT to 0 It should be fine for read case, since it indicates a hole condition, and caller could use .m_next_pgofs to skip the hole and continue the lookup. However it may cause confusing for write case, since leaving a hole there, and said nothing was wrong doesn't help. There is at least one case from dax_iomap_actor() will complain that, so fix this in prior to supporting dax in f2fs. xfstest generic/388 reports below warning: ubuntu godown: xfstests-induced forced shutdown of /mnt/scratch_f2fs: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 485833 at fs/dax.c:1127 dax_iomap_actor+0x339/0x370 Call Trace: iomap_apply+0x1c4/0x7b0 ? dax_iomap_rw+0x1c0/0x1c0 dax_iomap_rw+0xad/0x1c0 ? dax_iomap_rw+0x1c0/0x1c0 f2fs_file_write_iter+0x5ab/0x970 [f2fs] do_iter_readv_writev+0x273/0x2e0 do_iter_write+0xab/0x1f0 vfs_iter_write+0x21/0x40 iter_file_splice_write+0x287/0x540 do_splice+0x37c/0xa60 __x64_sys_splice+0x15f/0x3a0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae ubuntu godown: xfstests-induced forced shutdown of /mnt/scratch_f2fs: ------------[ cut here ]------------ RIP: 0010:dax_iomap_pte_fault.isra.0+0x72e/0x14a0 Call Trace: dax_iomap_fault+0x44/0x70 f2fs_dax_huge_fault+0x155/0x400 [f2fs] f2fs_dax_fault+0x18/0x30 [f2fs] __do_fault+0x4e/0x120 do_fault+0x3cf/0x7a0 __handle_mm_fault+0xa8c/0xf20 ? find_held_lock+0x39/0xd0 handle_mm_fault+0x1b6/0x480 do_user_addr_fault+0x320/0xcd0 ? rcu_read_lock_sched_held+0x67/0xc0 exc_page_fault+0x77/0x3f0 ? asm_exc_page_fault+0x8/0x30 asm_exc_page_fault+0x1e/0x30 Fixes: 83a3bfdb5a8a ("f2fs: indicate shutdown f2fs to allow unmount successfully") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- fs/f2fs/data.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index df2c361feade6..1679f9c0b63b3 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1191,7 +1191,21 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, if (err) { if (flag == F2FS_GET_BLOCK_BMAP) map->m_pblk = 0; + if (err == -ENOENT) { + /* + * There is one exceptional case that read_node_page() + * may return -ENOENT due to filesystem has been + * shutdown or cp_error, so force to convert error + * number to EIO for such case. + */ + if (map->m_may_create && + (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) || + f2fs_cp_error(sbi))) { + err = -EIO; + goto unlock_out; + } + err = 0; if (map->m_next_pgofs) *map->m_next_pgofs = From 02bd7401994b0d20003bed3ac5c516294fe6741d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 25 Aug 2021 19:34:19 +0800 Subject: [PATCH 0704/5344] f2fs: fix to unmap pages from userspace process in punch_hole() [ Upstream commit c8dc3047c48540183744f959412d44b08c5435e1 ] We need to unmap pages from userspace process before removing pagecache in punch_hole() like we did in f2fs_setattr(). Similar change: commit 5e44f8c374dc ("ext4: hole-punch use truncate_pagecache_range") Fixes: fbfa2cc58d53 ("f2fs: add file operations") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- fs/f2fs/file.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f98dce4d07b30..516007bb1ced1 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -981,7 +981,6 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) } if (pg_start < pg_end) { - struct address_space *mapping = inode->i_mapping; loff_t blk_start, blk_end; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -993,8 +992,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); down_write(&F2FS_I(inode)->i_mmap_sem); - truncate_inode_pages_range(mapping, blk_start, - blk_end - 1); + truncate_pagecache_range(inode, blk_start, blk_end - 1); f2fs_lock_op(sbi); ret = f2fs_truncate_hole(inode, pg_start, pg_end); From e1a27d88d5ee2e9f60cc85bdcfa085be803a2595 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Thu, 2 Sep 2021 09:19:51 +0200 Subject: [PATCH 0705/5344] MIPS: Malta: fix alignment of the devicetree buffer [ Upstream commit bea6a94a279bcbe6b2cde348782b28baf12255a5 ] Starting with following patch MIPS Malta is not able to boot: | commit 79edff12060fe7772af08607eff50c0e2486c5ba | Author: Rob Herring | scripts/dtc: Update to upstream version v1.6.0-51-g183df9e9c2b9 The reason is the alignment test added to the fdt_ro_probe_(). To fix this issue, we need to make sure that fdt_buf is aligned. Since the dtc patch was designed to uncover potential issue, I handle initial MIPS Malta patch as initial bug. Fixes: e81a8c7dabac ("MIPS: Malta: Setup RAM regions via DT") Signed-off-by: Oleksij Rempel Signed-off-by: Thomas Bogendoerfer Signed-off-by: Sasha Levin --- arch/mips/mti-malta/malta-dtshim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/mti-malta/malta-dtshim.c b/arch/mips/mti-malta/malta-dtshim.c index 98a063093b69a..0be28adff5572 100644 --- a/arch/mips/mti-malta/malta-dtshim.c +++ b/arch/mips/mti-malta/malta-dtshim.c @@ -22,7 +22,7 @@ #define ROCIT_CONFIG_GEN1_MEMMAP_SHIFT 8 #define ROCIT_CONFIG_GEN1_MEMMAP_MASK (0xf << 8) -static unsigned char fdt_buf[16 << 10] __initdata; +static unsigned char fdt_buf[16 << 10] __initdata __aligned(8); /* determined physical memory size, not overridden by command line args */ extern unsigned long physical_memsize; From da372d13ab2041c1f5b710d463189ef972ee18e3 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 19 Aug 2021 09:01:14 +0900 Subject: [PATCH 0706/5344] kbuild: Fix 'no symbols' warning when CONFIG_TRIM_UNUSD_KSYMS=y [ Upstream commit 52d83df682c82055961531853c066f4f16e234ea ] When CONFIG_TRIM_UNUSED_KSYMS is enabled, I see some warnings like this: nm: arch/x86/entry/vdso/vdso32/note.o: no symbols $NM (both GNU nm and llvm-nm) warns when no symbol is found in the object. Suppress the stderr. Fangrui Song mentioned binutils>=2.37 `nm -q` can be used to suppress "no symbols" [1], and llvm-nm>=13.0.0 supports -q as well. We cannot use it for now, but note it as a TODO. [1]: https://sourceware.org/bugzilla/show_bug.cgi?id=27408 Fixes: bbda5ec671d3 ("kbuild: simplify dependency generation for CONFIG_TRIM_UNUSED_KSYMS") Signed-off-by: Masahiro Yamada Reviewed-by: Nathan Chancellor Signed-off-by: Sasha Levin --- scripts/gen_ksymdeps.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/scripts/gen_ksymdeps.sh b/scripts/gen_ksymdeps.sh index 1324986e1362c..725e8c9c1b53f 100755 --- a/scripts/gen_ksymdeps.sh +++ b/scripts/gen_ksymdeps.sh @@ -4,7 +4,13 @@ set -e # List of exported symbols -ksyms=$($NM $1 | sed -n 's/.*__ksym_marker_\(.*\)/\1/p' | tr A-Z a-z) +# +# If the object has no symbol, $NM warns 'no symbols'. +# Suppress the stderr. +# TODO: +# Use -q instead of 2>/dev/null when we upgrade the minimum version of +# binutils to 2.37, llvm to 13.0.0. +ksyms=$($NM $1 2>/dev/null | sed -n 's/.*__ksym_marker_\(.*\)/\1/p' | tr A-Z a-z) if [ -z "$ksyms" ]; then exit 0 From e490b0706b6a3f89f07c271196d49da844354dcd Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 2 Sep 2021 14:58:59 -0700 Subject: [PATCH 0707/5344] userfaultfd: prevent concurrent API initialization [ Upstream commit 22e5fe2a2a279d9a6fcbdfb4dffe73821bef1c90 ] userfaultfd assumes that the enabled features are set once and never changed after UFFDIO_API ioctl succeeded. However, currently, UFFDIO_API can be called concurrently from two different threads, succeed on both threads and leave userfaultfd's features in non-deterministic state. Theoretically, other uffd operations (ioctl's and page-faults) can be dispatched while adversely affected by such changes of features. Moreover, the writes to ctx->state and ctx->features are not ordered, which can - theoretically, again - let userfaultfd_ioctl() think that userfaultfd API completed, while the features are still not initialized. To avoid races, it is arguably best to get rid of ctx->state. Since there are only 2 states, record the API initialization in ctx->features as the uppermost bit and remove ctx->state. Link: https://lkml.kernel.org/r/20210808020724.1022515-3-namit@vmware.com Fixes: 9cd75c3cd4c3d ("userfaultfd: non-cooperative: add ability to report non-PF events from uffd descriptor") Signed-off-by: Nadav Amit Cc: Alexander Viro Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Jens Axboe Cc: Mike Rapoport Cc: Peter Xu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- fs/userfaultfd.c | 93 +++++++++++++++++++++++------------------------- 1 file changed, 45 insertions(+), 48 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 911025f3ac147..85d0db9358140 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -32,11 +32,6 @@ int sysctl_unprivileged_userfaultfd __read_mostly = 1; static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; -enum userfaultfd_state { - UFFD_STATE_WAIT_API, - UFFD_STATE_RUNNING, -}; - /* * Start with fault_pending_wqh and fault_wqh so they're more likely * to be in the same cacheline. @@ -68,8 +63,6 @@ struct userfaultfd_ctx { unsigned int flags; /* features requested from the userspace */ unsigned int features; - /* state machine */ - enum userfaultfd_state state; /* released */ bool released; /* memory mappings are changing because of non-cooperative event */ @@ -103,6 +96,14 @@ struct userfaultfd_wake_range { unsigned long len; }; +/* internal indication that UFFD_API ioctl was successfully executed */ +#define UFFD_FEATURE_INITIALIZED (1u << 31) + +static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx) +{ + return ctx->features & UFFD_FEATURE_INITIALIZED; +} + static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, int wake_flags, void *key) { @@ -699,7 +700,6 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) refcount_set(&ctx->refcount, 1); ctx->flags = octx->flags; - ctx->state = UFFD_STATE_RUNNING; ctx->features = octx->features; ctx->released = false; ctx->mmap_changing = false; @@ -980,38 +980,33 @@ static __poll_t userfaultfd_poll(struct file *file, poll_table *wait) poll_wait(file, &ctx->fd_wqh, wait); - switch (ctx->state) { - case UFFD_STATE_WAIT_API: + if (!userfaultfd_is_initialized(ctx)) return EPOLLERR; - case UFFD_STATE_RUNNING: - /* - * poll() never guarantees that read won't block. - * userfaults can be waken before they're read(). - */ - if (unlikely(!(file->f_flags & O_NONBLOCK))) - return EPOLLERR; - /* - * lockless access to see if there are pending faults - * __pollwait last action is the add_wait_queue but - * the spin_unlock would allow the waitqueue_active to - * pass above the actual list_add inside - * add_wait_queue critical section. So use a full - * memory barrier to serialize the list_add write of - * add_wait_queue() with the waitqueue_active read - * below. - */ - ret = 0; - smp_mb(); - if (waitqueue_active(&ctx->fault_pending_wqh)) - ret = EPOLLIN; - else if (waitqueue_active(&ctx->event_wqh)) - ret = EPOLLIN; - - return ret; - default: - WARN_ON_ONCE(1); + + /* + * poll() never guarantees that read won't block. + * userfaults can be waken before they're read(). + */ + if (unlikely(!(file->f_flags & O_NONBLOCK))) return EPOLLERR; - } + /* + * lockless access to see if there are pending faults + * __pollwait last action is the add_wait_queue but + * the spin_unlock would allow the waitqueue_active to + * pass above the actual list_add inside + * add_wait_queue critical section. So use a full + * memory barrier to serialize the list_add write of + * add_wait_queue() with the waitqueue_active read + * below. + */ + ret = 0; + smp_mb(); + if (waitqueue_active(&ctx->fault_pending_wqh)) + ret = EPOLLIN; + else if (waitqueue_active(&ctx->event_wqh)) + ret = EPOLLIN; + + return ret; } static const struct file_operations userfaultfd_fops; @@ -1205,7 +1200,7 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf, struct uffd_msg msg; int no_wait = file->f_flags & O_NONBLOCK; - if (ctx->state == UFFD_STATE_WAIT_API) + if (!userfaultfd_is_initialized(ctx)) return -EINVAL; for (;;) { @@ -1807,9 +1802,10 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, static inline unsigned int uffd_ctx_features(__u64 user_features) { /* - * For the current set of features the bits just coincide + * For the current set of features the bits just coincide. Set + * UFFD_FEATURE_INITIALIZED to mark the features as enabled. */ - return (unsigned int)user_features; + return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED; } /* @@ -1822,12 +1818,10 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, { struct uffdio_api uffdio_api; void __user *buf = (void __user *)arg; + unsigned int ctx_features; int ret; __u64 features; - ret = -EINVAL; - if (ctx->state != UFFD_STATE_WAIT_API) - goto out; ret = -EFAULT; if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api))) goto out; @@ -1844,9 +1838,13 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, ret = -EFAULT; if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) goto out; - ctx->state = UFFD_STATE_RUNNING; + /* only enable the requested features for this uffd context */ - ctx->features = uffd_ctx_features(features); + ctx_features = uffd_ctx_features(features); + ret = -EINVAL; + if (cmpxchg(&ctx->features, 0, ctx_features) != 0) + goto err_out; + ret = 0; out: return ret; @@ -1863,7 +1861,7 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd, int ret = -EINVAL; struct userfaultfd_ctx *ctx = file->private_data; - if (cmd != UFFDIO_API && ctx->state == UFFD_STATE_WAIT_API) + if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx)) return -EINVAL; switch(cmd) { @@ -1964,7 +1962,6 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) refcount_set(&ctx->refcount, 1); ctx->flags = flags; ctx->features = 0; - ctx->state = UFFD_STATE_WAIT_API; ctx->released = false; ctx->mmap_changing = false; ctx->mm = current->mm; From a80ff9c0075eb0f0f0cdae63c7616f6ee5b280d3 Mon Sep 17 00:00:00 2001 From: Luben Tuikov Date: Fri, 26 Mar 2021 16:40:22 -0400 Subject: [PATCH 0708/5344] drm/amdgpu: Fix amdgpu_ras_eeprom_init() [ Upstream commit dce4400e6516d18313d23de45b5be8a18980b00e ] No need to account for the 2 bytes of EEPROM address--this is now well abstracted away by the fixes the the lower layers. Cc: Andrey Grodzovsky Cc: Alexander Deucher Signed-off-by: Luben Tuikov Acked-by: Alexander Deucher Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 8a32b5c93778b..bd7ae3e130b6f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -138,7 +138,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) return ret; } - __decode_table_header_from_buff(hdr, &buff[2]); + __decode_table_header_from_buff(hdr, buff); if (hdr->header == EEPROM_TABLE_HDR_VAL) { control->num_recs = (hdr->tbl_size - EEPROM_TABLE_HEADER_SIZE) / From f5cb1f79d98a46cc5b91f618701b683da0fe2829 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 7 Jul 2021 14:47:52 -0700 Subject: [PATCH 0709/5344] ASoC: atmel: ATMEL drivers don't need HAS_DMA [ Upstream commit 6c5c659dfe3f02e08054a6c20019e3886618b512 ] On a config (such as arch/sh/) which does not set HAS_DMA when MMU is not set, several ATMEL ASoC drivers select symbols that cause kconfig warnings. There is one "depends on HAS_DMA" which is no longer needed. Dropping it eliminates the kconfig warnings and still builds with no problems reported. Fix the following kconfig warnings: WARNING: unmet direct dependencies detected for SND_ATMEL_SOC_PDC Depends on [n]: SOUND [=m] && !UML && SND [=m] && SND_SOC [=m] && SND_ATMEL_SOC [=m] && HAS_DMA [=n] Selected by [m]: - SND_ATMEL_SOC_SSC [=m] && SOUND [=m] && !UML && SND [=m] && SND_SOC [=m] && SND_ATMEL_SOC [=m] - SND_ATMEL_SOC_SSC_PDC [=m] && SOUND [=m] && !UML && SND [=m] && SND_SOC [=m] && SND_ATMEL_SOC [=m] && ATMEL_SSC [=m] WARNING: unmet direct dependencies detected for SND_ATMEL_SOC_SSC_PDC Depends on [n]: SOUND [=m] && !UML && SND [=m] && SND_SOC [=m] && SND_ATMEL_SOC [=m] && ATMEL_SSC [=m] && HAS_DMA [=n] Selected by [m]: - SND_AT91_SOC_SAM9G20_WM8731 [=m] && SOUND [=m] && !UML && SND [=m] && SND_SOC [=m] && SND_ATMEL_SOC [=m] && (ARCH_AT91 || COMPILE_TEST [=y]) && ATMEL_SSC [=m] && SND_SOC_I2C_AND_SPI [=m] WARNING: unmet direct dependencies detected for SND_ATMEL_SOC_SSC Depends on [n]: SOUND [=m] && !UML && SND [=m] && SND_SOC [=m] && SND_ATMEL_SOC [=m] && HAS_DMA [=n] Selected by [m]: - SND_ATMEL_SOC_SSC_DMA [=m] && SOUND [=m] && !UML && SND [=m] && SND_SOC [=m] && SND_ATMEL_SOC [=m] && ATMEL_SSC [=m] WARNING: unmet direct dependencies detected for SND_ATMEL_SOC_SSC_DMA Depends on [n]: SOUND [=m] && !UML && SND [=m] && SND_SOC [=m] && SND_ATMEL_SOC [=m] && ATMEL_SSC [=m] && HAS_DMA [=n] Selected by [m]: - SND_ATMEL_SOC_WM8904 [=m] && SOUND [=m] && !UML && SND [=m] && SND_SOC [=m] && SND_ATMEL_SOC [=m] && (ARCH_AT91 || COMPILE_TEST [=y]) && ATMEL_SSC [=m] && I2C [=m] - SND_AT91_SOC_SAM9X5_WM8731 [=m] && SOUND [=m] && !UML && SND [=m] && SND_SOC [=m] && SND_ATMEL_SOC [=m] && (ARCH_AT91 || COMPILE_TEST [=y]) && ATMEL_SSC [=m] && SND_SOC_I2C_AND_SPI [=m] Signed-off-by: Randy Dunlap Reviewed-by: Codrin Ciubotariu Acked-by: Alexandre Belloni Link: https://lore.kernel.org/r/20210707214752.3831-1-rdunlap@infradead.org Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/atmel/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/sound/soc/atmel/Kconfig b/sound/soc/atmel/Kconfig index 71f2d42188c46..51e75b7819682 100644 --- a/sound/soc/atmel/Kconfig +++ b/sound/soc/atmel/Kconfig @@ -11,7 +11,6 @@ if SND_ATMEL_SOC config SND_ATMEL_SOC_PDC bool - depends on HAS_DMA config SND_ATMEL_SOC_DMA bool From 7d552544e59571a7b9fa985bf7bbc8bdffdaff66 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 17 Jun 2021 13:28:57 +0200 Subject: [PATCH 0710/5344] media: dib8000: rewrite the init prbs logic [ Upstream commit 8db11aebdb8f93f46a8513c22c9bd52fa23263aa ] The logic at dib8000_get_init_prbs() has a few issues: 1. the tables used there has an extra unused value at the beginning; 2. the dprintk() message doesn't write the right value when transmission mode is not 8K; 3. the array overflow validation is done by the callers. Rewrite the code to fix such issues. This should also shut up those smatch warnings: drivers/media/dvb-frontends/dib8000.c:2125 dib8000_get_init_prbs() error: buffer overflow 'lut_prbs_8k' 14 <= 14 drivers/media/dvb-frontends/dib8000.c:2129 dib8000_get_init_prbs() error: buffer overflow 'lut_prbs_2k' 14 <= 14 drivers/media/dvb-frontends/dib8000.c:2131 dib8000_get_init_prbs() error: buffer overflow 'lut_prbs_4k' 14 <= 14 drivers/media/dvb-frontends/dib8000.c:2134 dib8000_get_init_prbs() error: buffer overflow 'lut_prbs_8k' 14 <= 14 Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/dvb-frontends/dib8000.c | 58 +++++++++++++++++++-------- 1 file changed, 41 insertions(+), 17 deletions(-) diff --git a/drivers/media/dvb-frontends/dib8000.c b/drivers/media/dvb-frontends/dib8000.c index 082796534b0ae..bb02354a48b81 100644 --- a/drivers/media/dvb-frontends/dib8000.c +++ b/drivers/media/dvb-frontends/dib8000.c @@ -2107,32 +2107,55 @@ static void dib8000_load_ana_fe_coefs(struct dib8000_state *state, const s16 *an dib8000_write_word(state, 117 + mode, ana_fe[mode]); } -static const u16 lut_prbs_2k[14] = { - 0, 0x423, 0x009, 0x5C7, 0x7A6, 0x3D8, 0x527, 0x7FF, 0x79B, 0x3D6, 0x3A2, 0x53B, 0x2F4, 0x213 +static const u16 lut_prbs_2k[13] = { + 0x423, 0x009, 0x5C7, + 0x7A6, 0x3D8, 0x527, + 0x7FF, 0x79B, 0x3D6, + 0x3A2, 0x53B, 0x2F4, + 0x213 }; -static const u16 lut_prbs_4k[14] = { - 0, 0x208, 0x0C3, 0x7B9, 0x423, 0x5C7, 0x3D8, 0x7FF, 0x3D6, 0x53B, 0x213, 0x029, 0x0D0, 0x48E + +static const u16 lut_prbs_4k[13] = { + 0x208, 0x0C3, 0x7B9, + 0x423, 0x5C7, 0x3D8, + 0x7FF, 0x3D6, 0x53B, + 0x213, 0x029, 0x0D0, + 0x48E }; -static const u16 lut_prbs_8k[14] = { - 0, 0x740, 0x069, 0x7DD, 0x208, 0x7B9, 0x5C7, 0x7FF, 0x53B, 0x029, 0x48E, 0x4C4, 0x367, 0x684 + +static const u16 lut_prbs_8k[13] = { + 0x740, 0x069, 0x7DD, + 0x208, 0x7B9, 0x5C7, + 0x7FF, 0x53B, 0x029, + 0x48E, 0x4C4, 0x367, + 0x684 }; static u16 dib8000_get_init_prbs(struct dib8000_state *state, u16 subchannel) { int sub_channel_prbs_group = 0; + int prbs_group; - sub_channel_prbs_group = (subchannel / 3) + 1; - dprintk("sub_channel_prbs_group = %d , subchannel =%d prbs = 0x%04x\n", sub_channel_prbs_group, subchannel, lut_prbs_8k[sub_channel_prbs_group]); + sub_channel_prbs_group = subchannel / 3; + if (sub_channel_prbs_group >= ARRAY_SIZE(lut_prbs_2k)) + return 0; switch (state->fe[0]->dtv_property_cache.transmission_mode) { case TRANSMISSION_MODE_2K: - return lut_prbs_2k[sub_channel_prbs_group]; + prbs_group = lut_prbs_2k[sub_channel_prbs_group]; + break; case TRANSMISSION_MODE_4K: - return lut_prbs_4k[sub_channel_prbs_group]; + prbs_group = lut_prbs_4k[sub_channel_prbs_group]; + break; default: case TRANSMISSION_MODE_8K: - return lut_prbs_8k[sub_channel_prbs_group]; + prbs_group = lut_prbs_8k[sub_channel_prbs_group]; } + + dprintk("sub_channel_prbs_group = %d , subchannel =%d prbs = 0x%04x\n", + sub_channel_prbs_group, subchannel, prbs_group); + + return prbs_group; } static void dib8000_set_13seg_channel(struct dib8000_state *state) @@ -2409,10 +2432,8 @@ static void dib8000_set_isdbt_common_channel(struct dib8000_state *state, u8 seq /* TSB or ISDBT ? apply it now */ if (c->isdbt_sb_mode) { dib8000_set_sb_channel(state); - if (c->isdbt_sb_subchannel < 14) - init_prbs = dib8000_get_init_prbs(state, c->isdbt_sb_subchannel); - else - init_prbs = 0; + init_prbs = dib8000_get_init_prbs(state, + c->isdbt_sb_subchannel); } else { dib8000_set_13seg_channel(state); init_prbs = 0xfff; @@ -3004,6 +3025,7 @@ static int dib8000_tune(struct dvb_frontend *fe) unsigned long *timeout = &state->timeout; unsigned long now = jiffies; + u16 init_prbs; #ifdef DIB8000_AGC_FREEZE u16 agc1, agc2; #endif @@ -3302,8 +3324,10 @@ static int dib8000_tune(struct dvb_frontend *fe) break; case CT_DEMOD_STEP_11: /* 41 : init prbs autosearch */ - if (state->subchannel <= 41) { - dib8000_set_subchannel_prbs(state, dib8000_get_init_prbs(state, state->subchannel)); + init_prbs = dib8000_get_init_prbs(state, state->subchannel); + + if (init_prbs) { + dib8000_set_subchannel_prbs(state, init_prbs); *tune_state = CT_DEMOD_STEP_9; } else { *tune_state = CT_DEMOD_STOP; From 04a20075d503df2517775676104ed2aedbcb0ceb Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Thu, 1 Jul 2021 14:56:38 -0400 Subject: [PATCH 0711/5344] crypto: mxs-dcp - Use sg_mapping_iter to copy data [ Upstream commit 2e6d793e1bf07fe5e20cfbbdcec9e1af7e5097eb ] This uses the sg_pcopy_from_buffer to copy data, instead of doing it ourselves. In addition to reducing code size, this fixes the following oops resulting from failing to kmap the page: [ 68.896381] Unable to handle kernel NULL pointer dereference at virtual address 00000ab8 [ 68.904539] pgd = 3561adb3 [ 68.907475] [00000ab8] *pgd=00000000 [ 68.911153] Internal error: Oops: 805 [#1] ARM [ 68.915618] Modules linked in: cfg80211 rfkill des_generic libdes arc4 libarc4 cbc ecb algif_skcipher sha256_generic libsha256 sha1_generic hmac aes_generic libaes cmac sha512_generic md5 md4 algif_hash af_alg i2c_imx i2c_core ci_hdrc_imx ci_hdrc mxs_dcp ulpi roles udc_core imx_sdma usbmisc_imx usb_common firmware_class virt_dma phy_mxs_usb nf_tables nfnetlink ip_tables x_tables ipv6 autofs4 [ 68.950741] CPU: 0 PID: 139 Comm: mxs_dcp_chan/ae Not tainted 5.10.34 #296 [ 68.958501] Hardware name: Freescale i.MX6 Ultralite (Device Tree) [ 68.964710] PC is at memcpy+0xa8/0x330 [ 68.968479] LR is at 0xd7b2bc9d [ 68.971638] pc : [] lr : [] psr: 000f0013 [ 68.977920] sp : c2cbbee4 ip : 00000010 fp : 00000010 [ 68.983159] r10: 00000000 r9 : c3283a40 r8 : 1a5a6f08 [ 68.988402] r7 : 4bfe0ecc r6 : 76d8a220 r5 : c32f9050 r4 : 00000001 [ 68.994945] r3 : 00000ab8 r2 : fffffff0 r1 : c32f9050 r0 : 00000ab8 [ 69.001492] Flags: nzcv IRQs on FIQs on Mode SVC_32 ISA ARM Segment none [ 69.008646] Control: 10c53c7d Table: 83664059 DAC: 00000051 [ 69.014414] Process mxs_dcp_chan/ae (pid: 139, stack limit = 0x667b57ab) [ 69.021133] Stack: (0xc2cbbee4 to 0xc2cbc000) [ 69.025519] bee0: c32f9050 c3235408 00000010 00000010 00000ab8 00000001 bf10406c [ 69.033720] bf00: 00000000 00000000 00000010 00000000 c32355d0 832fb080 00000000 c13de2fc [ 69.041921] bf20: c3628010 00000010 c33d5780 00000ab8 bf1067e8 00000002 c21e5010 c2cba000 [ 69.050125] bf40: c32f8040 00000000 bf106a40 c32f9040 c3283a80 00000001 bf105240 c3234040 [ 69.058327] bf60: ffffe000 c3204100 c2c69800 c2cba000 00000000 bf103b84 00000000 c2eddc54 [ 69.066530] bf80: c3204144 c0140d1c c2cba000 c2c69800 c0140be8 00000000 00000000 00000000 [ 69.074730] bfa0: 00000000 00000000 00000000 c0100114 00000000 00000000 00000000 00000000 [ 69.082932] bfc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 [ 69.091131] bfe0: 00000000 00000000 00000000 00000000 00000013 00000000 00000000 00000000 [ 69.099364] [] (memcpy) from [] (dcp_chan_thread_aes+0x4e8/0x840 [mxs_dcp]) [ 69.108117] [] (dcp_chan_thread_aes [mxs_dcp]) from [] (kthread+0x134/0x160) [ 69.116941] [] (kthread) from [] (ret_from_fork+0x14/0x20) [ 69.124178] Exception stack(0xc2cbbfb0 to 0xc2cbbff8) [ 69.129250] bfa0: 00000000 00000000 00000000 00000000 [ 69.137450] bfc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 [ 69.145648] bfe0: 00000000 00000000 00000000 00000000 00000013 00000000 [ 69.152289] Code: e320f000 e4803004 e4804004 e4805004 (e4806004) Signed-off-by: Sean Anderson Signed-off-by: Herbert Xu Signed-off-by: Sasha Levin --- drivers/crypto/mxs-dcp.c | 36 +++++++++--------------------------- 1 file changed, 9 insertions(+), 27 deletions(-) diff --git a/drivers/crypto/mxs-dcp.c b/drivers/crypto/mxs-dcp.c index 66fa524b6261e..5471110792071 100644 --- a/drivers/crypto/mxs-dcp.c +++ b/drivers/crypto/mxs-dcp.c @@ -298,21 +298,20 @@ static int mxs_dcp_aes_block_crypt(struct crypto_async_request *arq) struct scatterlist *dst = req->dst; struct scatterlist *src = req->src; - const int nents = sg_nents(req->src); + int dst_nents = sg_nents(dst); const int out_off = DCP_BUF_SZ; uint8_t *in_buf = sdcp->coh->aes_in_buf; uint8_t *out_buf = sdcp->coh->aes_out_buf; - uint8_t *out_tmp, *src_buf, *dst_buf = NULL; uint32_t dst_off = 0; + uint8_t *src_buf = NULL; uint32_t last_out_len = 0; uint8_t *key = sdcp->coh->aes_key; int ret = 0; - int split = 0; - unsigned int i, len, clen, rem = 0, tlen = 0; + unsigned int i, len, clen, tlen = 0; int init = 0; bool limit_hit = false; @@ -330,7 +329,7 @@ static int mxs_dcp_aes_block_crypt(struct crypto_async_request *arq) memset(key + AES_KEYSIZE_128, 0, AES_KEYSIZE_128); } - for_each_sg(req->src, src, nents, i) { + for_each_sg(req->src, src, sg_nents(src), i) { src_buf = sg_virt(src); len = sg_dma_len(src); tlen += len; @@ -355,34 +354,17 @@ static int mxs_dcp_aes_block_crypt(struct crypto_async_request *arq) * submit the buffer. */ if (actx->fill == out_off || sg_is_last(src) || - limit_hit) { + limit_hit) { ret = mxs_dcp_run_aes(actx, req, init); if (ret) return ret; init = 0; - out_tmp = out_buf; + sg_pcopy_from_buffer(dst, dst_nents, out_buf, + actx->fill, dst_off); + dst_off += actx->fill; last_out_len = actx->fill; - while (dst && actx->fill) { - if (!split) { - dst_buf = sg_virt(dst); - dst_off = 0; - } - rem = min(sg_dma_len(dst) - dst_off, - actx->fill); - - memcpy(dst_buf + dst_off, out_tmp, rem); - out_tmp += rem; - dst_off += rem; - actx->fill -= rem; - - if (dst_off == sg_dma_len(dst)) { - dst = sg_next(dst); - split = 0; - } else { - split = 1; - } - } + actx->fill = 0; } } while (len); From 19db48b0b88c7183c7bd2964d20e47159a97ec45 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 8 Jul 2021 15:25:06 +0200 Subject: [PATCH 0712/5344] PCI: Use pci_update_current_state() in pci_enable_device_flags() [ Upstream commit 14858dcc3b3587f4bb5c48e130ee7d68fc2b0a29 ] Updating the current_state field of struct pci_dev the way it is done in pci_enable_device_flags() before calling do_pci_enable_device() may not work. For example, if the given PCI device depends on an ACPI power resource whose _STA method initially returns 0 ("off"), but the config space of the PCI device is accessible and the power state retrieved from the PCI_PM_CTRL register is D0, the current_state field in the struct pci_dev representing that device will get out of sync with the power.state of its ACPI companion object and that will lead to power management issues going forward. To avoid such issues, make pci_enable_device_flags() call pci_update_current_state() which takes ACPI device power management into account, if present, to retrieve the current power state of the device. Link: https://lore.kernel.org/lkml/20210314000439.3138941-1-luzmaximilian@gmail.com/ Reported-by: Maximilian Luz Signed-off-by: Rafael J. Wysocki Tested-by: Maximilian Luz Signed-off-by: Sasha Levin --- drivers/pci/pci.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 76e4cc3ebe63e..84f3e0e5dfe11 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1674,11 +1674,7 @@ static int pci_enable_device_flags(struct pci_dev *dev, unsigned long flags) * so that things like MSI message writing will behave as expected * (e.g. if the device really is in D0 at enable time). */ - if (dev->pm_cap) { - u16 pmcsr; - pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr); - dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK); - } + pci_update_current_state(dev, dev->current_state); if (atomic_inc_return(&dev->enable_cnt) > 1) return 0; /* already enabled */ From dc1ba50f296589251f61be1a3e3d3da2846daf25 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 16 Jul 2021 17:44:07 -0400 Subject: [PATCH 0713/5344] tipc: keep the skb in rcv queue until the whole data is read [ Upstream commit f4919ff59c2828064b4156e3c3600a169909bcf4 ] Currently, when userspace reads a datagram with a buffer that is smaller than this datagram, the data will be truncated and only part of it can be received by users. It doesn't seem right that users don't know the datagram size and have to use a huge buffer to read it to avoid the truncation. This patch to fix it by keeping the skb in rcv queue until the whole data is read by users. Only the last msg of the datagram will be marked with MSG_EOR, just as TCP/SCTP does. Note that this will work as above only when MSG_EOR is set in the flags parameter of recvmsg(), so that it won't break any old user applications. Signed-off-by: Xin Long Acked-by: Jon Maloy Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/tipc/socket.c | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index a5922ce9109cf..231f9e1bf6bb2 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1756,6 +1756,7 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, bool connected = !tipc_sk_type_connectionless(sk); struct tipc_sock *tsk = tipc_sk(sk); int rc, err, hlen, dlen, copy; + struct tipc_skb_cb *skb_cb; struct sk_buff_head xmitq; struct tipc_msg *hdr; struct sk_buff *skb; @@ -1779,6 +1780,7 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, if (unlikely(rc)) goto exit; skb = skb_peek(&sk->sk_receive_queue); + skb_cb = TIPC_SKB_CB(skb); hdr = buf_msg(skb); dlen = msg_data_sz(hdr); hlen = msg_hdr_sz(hdr); @@ -1798,18 +1800,33 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, /* Capture data if non-error msg, otherwise just set return value */ if (likely(!err)) { - copy = min_t(int, dlen, buflen); - if (unlikely(copy != dlen)) - m->msg_flags |= MSG_TRUNC; - rc = skb_copy_datagram_msg(skb, hlen, m, copy); + int offset = skb_cb->bytes_read; + + copy = min_t(int, dlen - offset, buflen); + rc = skb_copy_datagram_msg(skb, hlen + offset, m, copy); + if (unlikely(rc)) + goto exit; + if (unlikely(offset + copy < dlen)) { + if (flags & MSG_EOR) { + if (!(flags & MSG_PEEK)) + skb_cb->bytes_read = offset + copy; + } else { + m->msg_flags |= MSG_TRUNC; + skb_cb->bytes_read = 0; + } + } else { + if (flags & MSG_EOR) + m->msg_flags |= MSG_EOR; + skb_cb->bytes_read = 0; + } } else { copy = 0; rc = 0; - if (err != TIPC_CONN_SHUTDOWN && connected && !m->msg_control) + if (err != TIPC_CONN_SHUTDOWN && connected && !m->msg_control) { rc = -ECONNRESET; + goto exit; + } } - if (unlikely(rc)) - goto exit; /* Mark message as group event if applicable */ if (unlikely(grp_evt)) { @@ -1832,9 +1849,10 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, tipc_node_distr_xmit(sock_net(sk), &xmitq); } - tsk_advance_rx_queue(sk); + if (!skb_cb->bytes_read) + tsk_advance_rx_queue(sk); - if (likely(!connected)) + if (likely(!connected) || skb_cb->bytes_read) goto exit; /* Send connection flow control advertisement when applicable */ From 99c633fbd5ab9e64bf5c3c01b7e90732811cb5f3 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Sun, 27 Jun 2021 17:32:37 +0100 Subject: [PATCH 0714/5344] iio: dac: ad5624r: Fix incorrect handling of an optional regulator. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 97683c851f9cdbd3ea55697cbe2dcb6af4287bbd ] The naming of the regulator is problematic. VCC is usually a supply voltage whereas these devices have a separate VREF pin. Secondly, the regulator core might have provided a stub regulator if a real regulator wasn't provided. That would in turn have failed to provide a voltage when queried. So reality was that there was no way to use the internal reference. In order to avoid breaking any dts out in the wild, make sure to fallback to the original vcc naming if vref is not available. Signed-off-by: Jonathan Cameron Reported-by: kernel test robot Acked-by: Nuno Sá Link: https://lore.kernel.org/r/20210627163244.1090296-9-jic23@kernel.org Signed-off-by: Sasha Levin --- drivers/iio/dac/ad5624r_spi.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/iio/dac/ad5624r_spi.c b/drivers/iio/dac/ad5624r_spi.c index e6c022e1dc1cf..17cc8b3fc5d82 100644 --- a/drivers/iio/dac/ad5624r_spi.c +++ b/drivers/iio/dac/ad5624r_spi.c @@ -229,7 +229,7 @@ static int ad5624r_probe(struct spi_device *spi) if (!indio_dev) return -ENOMEM; st = iio_priv(indio_dev); - st->reg = devm_regulator_get(&spi->dev, "vcc"); + st->reg = devm_regulator_get_optional(&spi->dev, "vref"); if (!IS_ERR(st->reg)) { ret = regulator_enable(st->reg); if (ret) @@ -240,6 +240,22 @@ static int ad5624r_probe(struct spi_device *spi) goto error_disable_reg; voltage_uv = ret; + } else { + if (PTR_ERR(st->reg) != -ENODEV) + return PTR_ERR(st->reg); + /* Backwards compatibility. This naming is not correct */ + st->reg = devm_regulator_get_optional(&spi->dev, "vcc"); + if (!IS_ERR(st->reg)) { + ret = regulator_enable(st->reg); + if (ret) + return ret; + + ret = regulator_get_voltage(st->reg); + if (ret < 0) + goto error_disable_reg; + + voltage_uv = ret; + } } spi_set_drvdata(spi, indio_dev); From 444f07f1a5361f4268d439edc761d176e5a95c70 Mon Sep 17 00:00:00 2001 From: Stefan Assmann Date: Fri, 5 Mar 2021 13:38:56 +0100 Subject: [PATCH 0715/5344] iavf: do not override the adapter state in the watchdog task [ Upstream commit 22c8fd71d3a5e6fe584ccc2c1e8760e5baefd5aa ] The iavf watchdog task overrides adapter->state to __IAVF_RESETTING when it detects a pending reset. Then schedules iavf_reset_task() which takes care of the reset. The reset task is capable of handling the reset without changing adapter->state. In fact we lose the state information when the watchdog task prematurely changes the adapter state. This may lead to a crash if instead of the reset task the iavf_remove() function gets called before the reset task. In that case (if we were in state __IAVF_RUNNING previously) the iavf_remove() function triggers iavf_close() which fails to close the device because of the incorrect state information. This may result in a crash due to pending interrupts. kernel BUG at drivers/pci/msi.c:357! [...] Call Trace: [] pci_disable_msix+0x3d/0x50 [] iavf_reset_interrupt_capability+0x23/0x40 [iavf] [] iavf_remove+0x10a/0x350 [iavf] [] pci_device_remove+0x39/0xc0 [] __device_release_driver+0x7f/0xf0 [] device_release_driver+0x23/0x30 [] pci_stop_bus_device+0x84/0xa0 [] pci_stop_and_remove_bus_device+0x12/0x20 [] pci_iov_remove_virtfn+0xaf/0x160 [] sriov_disable+0x3c/0xf0 [] pci_disable_sriov+0x23/0x30 [] i40e_free_vfs+0x265/0x2d0 [i40e] [] i40e_pci_sriov_configure+0x144/0x1f0 [i40e] [] sriov_numvfs_store+0x177/0x1d0 Code: 00 00 e8 3c 25 e3 ff 49 c7 86 88 08 00 00 00 00 00 00 5b 41 5c 41 5d 41 5e 41 5f 5d c3 48 8b 7b 28 e8 0d 44 RIP [] free_msi_irqs+0x188/0x190 The solution is to not touch the adapter->state in iavf_watchdog_task() and let the reset task handle the state transition. Signed-off-by: Stefan Assmann Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/iavf/iavf_main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 94a3f000e999b..3e76111af872c 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -1961,7 +1961,6 @@ static void iavf_watchdog_task(struct work_struct *work) /* check for hw reset */ reg_val = rd32(hw, IAVF_VF_ARQLEN1) & IAVF_VF_ARQLEN1_ARQENABLE_MASK; if (!reg_val) { - adapter->state = __IAVF_RESETTING; adapter->flags |= IAVF_FLAG_RESET_PENDING; adapter->aq_required = 0; adapter->current_op = VIRTCHNL_OP_UNKNOWN; From 8cda84b120a8ee1e463eda2078e2e60d9e78c9b0 Mon Sep 17 00:00:00 2001 From: Stefan Assmann Date: Tue, 16 Mar 2021 11:01:41 +0100 Subject: [PATCH 0716/5344] iavf: fix locking of critical sections [ Upstream commit 226d528512cfac890a1619aea4301f3dd314fe60 ] To avoid races between iavf_init_task(), iavf_reset_task(), iavf_watchdog_task(), iavf_adminq_task() as well as the shutdown and remove functions more locking is required. The current protection by __IAVF_IN_CRITICAL_TASK is needed in additional places. - The reset task performs state transitions, therefore needs locking. - The adminq task acts on replies from the PF in iavf_virtchnl_completion() which may alter the states. - The init task is not only run during probe but also if a VF gets stuck to reinitialize it. - The shutdown function performs a state transition. - The remove function performs a state transition and also free's resources. iavf_lock_timeout() is introduced to avoid waiting infinitely and cause a deadlock. Rather unlock and print a warning. Signed-off-by: Stefan Assmann Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/iavf/iavf_main.c | 57 ++++++++++++++++++--- 1 file changed, 50 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 3e76111af872c..bc46c262b42d8 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -142,6 +142,30 @@ enum iavf_status iavf_free_virt_mem_d(struct iavf_hw *hw, return 0; } +/** + * iavf_lock_timeout - try to set bit but give up after timeout + * @adapter: board private structure + * @bit: bit to set + * @msecs: timeout in msecs + * + * Returns 0 on success, negative on failure + **/ +static int iavf_lock_timeout(struct iavf_adapter *adapter, + enum iavf_critical_section_t bit, + unsigned int msecs) +{ + unsigned int wait, delay = 10; + + for (wait = 0; wait < msecs; wait += delay) { + if (!test_and_set_bit(bit, &adapter->crit_section)) + return 0; + + msleep(delay); + } + + return -1; +} + /** * iavf_schedule_reset - Set the flags and schedule a reset event * @adapter: board private structure @@ -2076,6 +2100,10 @@ static void iavf_reset_task(struct work_struct *work) if (test_bit(__IAVF_IN_REMOVE_TASK, &adapter->crit_section)) return; + if (iavf_lock_timeout(adapter, __IAVF_IN_CRITICAL_TASK, 200)) { + schedule_work(&adapter->reset_task); + return; + } while (test_and_set_bit(__IAVF_IN_CLIENT_TASK, &adapter->crit_section)) usleep_range(500, 1000); @@ -2290,6 +2318,8 @@ static void iavf_adminq_task(struct work_struct *work) if (!event.msg_buf) goto out; + if (iavf_lock_timeout(adapter, __IAVF_IN_CRITICAL_TASK, 200)) + goto freedom; do { ret = iavf_clean_arq_element(hw, &event, &pending); v_op = (enum virtchnl_ops)le32_to_cpu(event.desc.cookie_high); @@ -2303,6 +2333,7 @@ static void iavf_adminq_task(struct work_struct *work) if (pending != 0) memset(event.msg_buf, 0, IAVF_MAX_AQ_BUF_SIZE); } while (pending); + clear_bit(__IAVF_IN_CRITICAL_TASK, &adapter->crit_section); if ((adapter->flags & (IAVF_FLAG_RESET_PENDING | IAVF_FLAG_RESET_NEEDED)) || @@ -3599,6 +3630,10 @@ static void iavf_init_task(struct work_struct *work) init_task.work); struct iavf_hw *hw = &adapter->hw; + if (iavf_lock_timeout(adapter, __IAVF_IN_CRITICAL_TASK, 5000)) { + dev_warn(&adapter->pdev->dev, "failed to set __IAVF_IN_CRITICAL_TASK in %s\n", __FUNCTION__); + return; + } switch (adapter->state) { case __IAVF_STARTUP: if (iavf_startup(adapter) < 0) @@ -3611,14 +3646,14 @@ static void iavf_init_task(struct work_struct *work) case __IAVF_INIT_GET_RESOURCES: if (iavf_init_get_resources(adapter) < 0) goto init_failed; - return; + goto out; default: goto init_failed; } queue_delayed_work(iavf_wq, &adapter->init_task, msecs_to_jiffies(30)); - return; + goto out; init_failed: if (++adapter->aq_wait_count > IAVF_AQ_MAX_ERR) { dev_err(&adapter->pdev->dev, @@ -3627,9 +3662,11 @@ static void iavf_init_task(struct work_struct *work) iavf_shutdown_adminq(hw); adapter->state = __IAVF_STARTUP; queue_delayed_work(iavf_wq, &adapter->init_task, HZ * 5); - return; + goto out; } queue_delayed_work(iavf_wq, &adapter->init_task, HZ); +out: + clear_bit(__IAVF_IN_CRITICAL_TASK, &adapter->crit_section); } /** @@ -3646,9 +3683,12 @@ static void iavf_shutdown(struct pci_dev *pdev) if (netif_running(netdev)) iavf_close(netdev); + if (iavf_lock_timeout(adapter, __IAVF_IN_CRITICAL_TASK, 5000)) + dev_warn(&adapter->pdev->dev, "failed to set __IAVF_IN_CRITICAL_TASK in %s\n", __FUNCTION__); /* Prevent the watchdog from running. */ adapter->state = __IAVF_REMOVE; adapter->aq_required = 0; + clear_bit(__IAVF_IN_CRITICAL_TASK, &adapter->crit_section); #ifdef CONFIG_PM pci_save_state(pdev); @@ -3877,10 +3917,6 @@ static void iavf_remove(struct pci_dev *pdev) err); } - /* Shut down all the garbage mashers on the detention level */ - adapter->state = __IAVF_REMOVE; - adapter->aq_required = 0; - adapter->flags &= ~IAVF_FLAG_REINIT_ITR_NEEDED; iavf_request_reset(adapter); msleep(50); /* If the FW isn't responding, kick it once, but only once. */ @@ -3888,6 +3924,13 @@ static void iavf_remove(struct pci_dev *pdev) iavf_request_reset(adapter); msleep(50); } + if (iavf_lock_timeout(adapter, __IAVF_IN_CRITICAL_TASK, 5000)) + dev_warn(&adapter->pdev->dev, "failed to set __IAVF_IN_CRITICAL_TASK in %s\n", __FUNCTION__); + + /* Shut down all the garbage mashers on the detention level */ + adapter->state = __IAVF_REMOVE; + adapter->aq_required = 0; + adapter->flags &= ~IAVF_FLAG_REINIT_ITR_NEEDED; iavf_free_all_tx_resources(adapter); iavf_free_all_rx_resources(adapter); iavf_misc_irq_disable(adapter); From 078b28e87ff1d4be354febeab371cdeca63580d8 Mon Sep 17 00:00:00 2001 From: David Heidelberg Date: Wed, 7 Jul 2021 15:14:53 +0200 Subject: [PATCH 0717/5344] ARM: dts: qcom: apq8064: correct clock names [ Upstream commit 0dc6c59892ead17a9febd11202c9f6794aac1895 ] Since new code doesn't take old clk names in account, it does fixes error: msm_dsi 4700000.mdss_dsi: dev_pm_opp_set_clkname: Couldn't find clock: -2 and following kernel oops introduced by b0530eb1191 ("drm/msm/dpu: Use OPP API to set clk/perf state"). Also removes warning about deprecated clock names. Tested against linux-5.10.y LTS on Nexus 7 2013. Reviewed-by: Brian Masney Signed-off-by: David Heidelberg Link: https://lore.kernel.org/r/20210707131453.24041-1-david@ixit.cz Signed-off-by: Bjorn Andersson Signed-off-by: Sasha Levin --- arch/arm/boot/dts/qcom-apq8064.dtsi | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm/boot/dts/qcom-apq8064.dtsi b/arch/arm/boot/dts/qcom-apq8064.dtsi index 8b79b4112ee1a..2b075e287610f 100644 --- a/arch/arm/boot/dts/qcom-apq8064.dtsi +++ b/arch/arm/boot/dts/qcom-apq8064.dtsi @@ -1261,9 +1261,9 @@ <&mmcc DSI1_BYTE_CLK>, <&mmcc DSI_PIXEL_CLK>, <&mmcc DSI1_ESC_CLK>; - clock-names = "iface_clk", "bus_clk", "core_mmss_clk", - "src_clk", "byte_clk", "pixel_clk", - "core_clk"; + clock-names = "iface", "bus", "core_mmss", + "src", "byte", "pixel", + "core"; assigned-clocks = <&mmcc DSI1_BYTE_SRC>, <&mmcc DSI1_ESC_SRC>, From a37eeea060efb488bb5ac40e9463f668acff0cc2 Mon Sep 17 00:00:00 2001 From: Zheyu Ma Date: Wed, 14 Jul 2021 04:09:22 +0000 Subject: [PATCH 0718/5344] video: fbdev: kyro: fix a DoS bug by restricting user input [ Upstream commit 98a65439172dc69cb16834e62e852afc2adb83ed ] The user can pass in any value to the driver through the 'ioctl' interface. The driver dost not check, which may cause DoS bugs. The following log reveals it: divide error: 0000 [#1] PREEMPT SMP KASAN PTI RIP: 0010:SetOverlayViewPort+0x133/0x5f0 drivers/video/fbdev/kyro/STG4000OverlayDevice.c:476 Call Trace: kyro_dev_overlay_viewport_set drivers/video/fbdev/kyro/fbdev.c:378 [inline] kyrofb_ioctl+0x2eb/0x330 drivers/video/fbdev/kyro/fbdev.c:603 do_fb_ioctl+0x1f3/0x700 drivers/video/fbdev/core/fbmem.c:1171 fb_ioctl+0xeb/0x130 drivers/video/fbdev/core/fbmem.c:1185 vfs_ioctl fs/ioctl.c:48 [inline] __do_sys_ioctl fs/ioctl.c:753 [inline] __se_sys_ioctl fs/ioctl.c:739 [inline] __x64_sys_ioctl+0x19b/0x220 fs/ioctl.c:739 do_syscall_64+0x32/0x80 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xae Signed-off-by: Zheyu Ma Signed-off-by: Sam Ravnborg Link: https://patchwork.freedesktop.org/patch/msgid/1626235762-2590-1-git-send-email-zheyuma97@gmail.com Signed-off-by: Sasha Levin --- drivers/video/fbdev/kyro/fbdev.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/video/fbdev/kyro/fbdev.c b/drivers/video/fbdev/kyro/fbdev.c index a7bd9f25911b5..d7aa431e68460 100644 --- a/drivers/video/fbdev/kyro/fbdev.c +++ b/drivers/video/fbdev/kyro/fbdev.c @@ -372,6 +372,11 @@ static int kyro_dev_overlay_viewport_set(u32 x, u32 y, u32 ulWidth, u32 ulHeight /* probably haven't called CreateOverlay yet */ return -EINVAL; + if (ulWidth == 0 || ulWidth == 0xffffffff || + ulHeight == 0 || ulHeight == 0xffffffff || + (x < 2 && ulWidth + 2 == 0)) + return -EINVAL; + /* Stop Ramdac Output */ DisableRamdacOutput(deviceInfo.pSTGReg); From 5b73a68a0adcd9bc5b177becbc652dbd9aed714d Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Mon, 19 Jul 2021 13:18:16 +0800 Subject: [PATCH 0719/5344] netlink: Deal with ESRCH error in nlmsg_notify() [ Upstream commit fef773fc8110d8124c73a5e6610f89e52814637d ] Yonghong Song report: The bpf selftest tc_bpf failed with latest bpf-next. The following is the command to run and the result: $ ./test_progs -n 132 [ 40.947571] bpf_testmod: loading out-of-tree module taints kernel. test_tc_bpf:PASS:test_tc_bpf__open_and_load 0 nsec test_tc_bpf:PASS:bpf_tc_hook_create(BPF_TC_INGRESS) 0 nsec test_tc_bpf:PASS:bpf_tc_hook_create invalid hook.attach_point 0 nsec test_tc_bpf_basic:PASS:bpf_obj_get_info_by_fd 0 nsec test_tc_bpf_basic:PASS:bpf_tc_attach 0 nsec test_tc_bpf_basic:PASS:handle set 0 nsec test_tc_bpf_basic:PASS:priority set 0 nsec test_tc_bpf_basic:PASS:prog_id set 0 nsec test_tc_bpf_basic:PASS:bpf_tc_attach replace mode 0 nsec test_tc_bpf_basic:PASS:bpf_tc_query 0 nsec test_tc_bpf_basic:PASS:handle set 0 nsec test_tc_bpf_basic:PASS:priority set 0 nsec test_tc_bpf_basic:PASS:prog_id set 0 nsec libbpf: Kernel error message: Failed to send filter delete notification test_tc_bpf_basic:FAIL:bpf_tc_detach unexpected error: -3 (errno 3) test_tc_bpf:FAIL:test_tc_internal ingress unexpected error: -3 (errno 3) The failure seems due to the commit cfdf0d9ae75b ("rtnetlink: use nlmsg_notify() in rtnetlink_send()") Deal with ESRCH error in nlmsg_notify() even the report variable is zero. Reported-by: Yonghong Song Signed-off-by: Yajun Deng Link: https://lore.kernel.org/r/20210719051816.11762-1-yajun.deng@linux.dev Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/netlink/af_netlink.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 9d993b4cf1aff..acc76a738cfd8 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2521,13 +2521,15 @@ int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid, /* errors reported via destination sk->sk_err, but propagate * delivery errors if NETLINK_BROADCAST_ERROR flag is set */ err = nlmsg_multicast(sk, skb, exclude_portid, group, flags); + if (err == -ESRCH) + err = 0; } if (report) { int err2; err2 = nlmsg_unicast(sk, skb, portid); - if (!err || err == -ESRCH) + if (!err) err = err2; } From accee4d8c80f64a3323d5f0ee44ecb2ee932af9a Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Thu, 15 Jul 2021 17:17:24 +0800 Subject: [PATCH 0720/5344] Smack: Fix wrong semantics in smk_access_entry() [ Upstream commit 6d14f5c7028eea70760df284057fe198ce7778dd ] In the smk_access_entry() function, if no matching rule is found in the rust_list, a negative error code will be used to perform bit operations with the MAY_ enumeration value. This is semantically wrong. This patch fixes this issue. Signed-off-by: Tianjia Zhang Signed-off-by: Casey Schaufler Signed-off-by: Sasha Levin --- security/smack/smack_access.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/security/smack/smack_access.c b/security/smack/smack_access.c index 38ac3da4e791e..beeba1a9be170 100644 --- a/security/smack/smack_access.c +++ b/security/smack/smack_access.c @@ -81,23 +81,22 @@ int log_policy = SMACK_AUDIT_DENIED; int smk_access_entry(char *subject_label, char *object_label, struct list_head *rule_list) { - int may = -ENOENT; struct smack_rule *srp; list_for_each_entry_rcu(srp, rule_list, list) { if (srp->smk_object->smk_known == object_label && srp->smk_subject->smk_known == subject_label) { - may = srp->smk_access; - break; + int may = srp->smk_access; + /* + * MAY_WRITE implies MAY_LOCK. + */ + if ((may & MAY_WRITE) == MAY_WRITE) + may |= MAY_LOCK; + return may; } } - /* - * MAY_WRITE implies MAY_LOCK. - */ - if ((may & MAY_WRITE) == MAY_WRITE) - may |= MAY_LOCK; - return may; + return -ENOENT; } /** From 71bd1e9e8ae5fd03e5d9943f2de9236a4abc87d3 Mon Sep 17 00:00:00 2001 From: Desmond Cheong Zhi Xi Date: Mon, 12 Jul 2021 12:35:05 +0800 Subject: [PATCH 0721/5344] drm: avoid blocking in drm_clients_info's rcu section [ Upstream commit 5eff9585de220cdd131237f5665db5e6c6bdf590 ] Inside drm_clients_info, the rcu_read_lock is held to lock pid_task()->comm. However, within this protected section, a call to drm_is_current_master is made, which involves a mutex lock in a future patch. However, this is illegal because the mutex lock might block while in the RCU read-side critical section. Since drm_is_current_master isn't protected by rcu_read_lock, we avoid this by moving it out of the RCU critical section. The following report came from intel-gfx ci's igt@debugfs_test@read_all_entries testcase: ============================= [ BUG: Invalid wait context ] 5.13.0-CI-Patchwork_20515+ #1 Tainted: G W ----------------------------- debugfs_test/1101 is trying to lock: ffff888132d901a8 (&dev->master_mutex){+.+.}-{3:3}, at: drm_is_current_master+0x1e/0x50 other info that might help us debug this: context-{4:4} 3 locks held by debugfs_test/1101: #0: ffff88810fdffc90 (&p->lock){+.+.}-{3:3}, at: seq_read_iter+0x53/0x3b0 #1: ffff888132d90240 (&dev->filelist_mutex){+.+.}-{3:3}, at: drm_clients_info+0x63/0x2a0 #2: ffffffff82734220 (rcu_read_lock){....}-{1:2}, at: drm_clients_info+0x1b1/0x2a0 stack backtrace: CPU: 8 PID: 1101 Comm: debugfs_test Tainted: G W 5.13.0-CI-Patchwork_20515+ #1 Hardware name: Intel Corporation CometLake Client Platform/CometLake S UDIMM (ERB/CRB), BIOS CMLSFWR1.R00.1263.D00.1906260926 06/26/2019 Call Trace: dump_stack+0x7f/0xad __lock_acquire.cold.78+0x2af/0x2ca lock_acquire+0xd3/0x300 ? drm_is_current_master+0x1e/0x50 ? __mutex_lock+0x76/0x970 ? lockdep_hardirqs_on+0xbf/0x130 __mutex_lock+0xab/0x970 ? drm_is_current_master+0x1e/0x50 ? drm_is_current_master+0x1e/0x50 ? drm_is_current_master+0x1e/0x50 drm_is_current_master+0x1e/0x50 drm_clients_info+0x107/0x2a0 seq_read_iter+0x178/0x3b0 seq_read+0x104/0x150 full_proxy_read+0x4e/0x80 vfs_read+0xa5/0x1b0 ksys_read+0x5a/0xd0 do_syscall_64+0x39/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae Signed-off-by: Desmond Cheong Zhi Xi Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20210712043508.11584-3-desmondcheongzx@gmail.com Signed-off-by: Sasha Levin --- drivers/gpu/drm/drm_debugfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_debugfs.c b/drivers/gpu/drm/drm_debugfs.c index 00debd02c3220..0ba92428ef560 100644 --- a/drivers/gpu/drm/drm_debugfs.c +++ b/drivers/gpu/drm/drm_debugfs.c @@ -91,6 +91,7 @@ static int drm_clients_info(struct seq_file *m, void *data) mutex_lock(&dev->filelist_mutex); list_for_each_entry_reverse(priv, &dev->filelist, lhead) { struct task_struct *task; + bool is_current_master = drm_is_current_master(priv); rcu_read_lock(); /* locks pid_task()->comm */ task = pid_task(priv->pid, PIDTYPE_PID); @@ -99,7 +100,7 @@ static int drm_clients_info(struct seq_file *m, void *data) task ? task->comm : "", pid_vnr(priv->pid), priv->minor->index, - drm_is_current_master(priv) ? 'y' : 'n', + is_current_master ? 'y' : 'n', priv->authenticated ? 'y' : 'n', from_kuid_munged(seq_user_ns(m), uid), priv->magic); From 2dcbb503ca0133ae01da70e93aa3470b74af913f Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Mon, 14 Jun 2021 15:19:39 +0300 Subject: [PATCH 0722/5344] igc: Check if num of q_vectors is smaller than max before array access [ Upstream commit 373e2829e7c2e1e606503cdb5c97749f512a4be9 ] Ensure that the adapter->q_vector[MAX_Q_VECTORS] array isn't accessed beyond its size. It was fixed by using a local variable num_q_vectors as a limit for loop index, and ensure that num_q_vectors is not bigger than MAX_Q_VECTORS. Suggested-by: Aleksandr Loktionov Signed-off-by: Sasha Neftin Tested-by: Dvora Fuxbrumer Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/igc/igc_main.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 084cf4a4114ad..9ba05d9aa8e08 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -2693,6 +2693,7 @@ static irqreturn_t igc_msix_ring(int irq, void *data) */ static int igc_request_msix(struct igc_adapter *adapter) { + unsigned int num_q_vectors = adapter->num_q_vectors; int i = 0, err = 0, vector = 0, free_vector = 0; struct net_device *netdev = adapter->netdev; @@ -2701,7 +2702,13 @@ static int igc_request_msix(struct igc_adapter *adapter) if (err) goto err_out; - for (i = 0; i < adapter->num_q_vectors; i++) { + if (num_q_vectors > MAX_Q_VECTORS) { + num_q_vectors = MAX_Q_VECTORS; + dev_warn(&adapter->pdev->dev, + "The number of queue vectors (%d) is higher than max allowed (%d)\n", + adapter->num_q_vectors, MAX_Q_VECTORS); + } + for (i = 0; i < num_q_vectors; i++) { struct igc_q_vector *q_vector = adapter->q_vector[i]; vector++; From 8d8d0fb41905b93bcbb1593bc60c8d7945701591 Mon Sep 17 00:00:00 2001 From: Kelly Devilliv Date: Sun, 27 Jun 2021 20:57:46 +0800 Subject: [PATCH 0723/5344] usb: host: fotg210: fix the endpoint's transactional opportunities calculation [ Upstream commit c2e898764245c852bc8ee4857613ba4f3a6d761d ] Now that usb_endpoint_maxp() only returns the lowest 11 bits from wMaxPacketSize, we should make use of the usb_endpoint_* helpers instead and remove the unnecessary max_packet()/hb_mult() macro. Signed-off-by: Kelly Devilliv Link: https://lore.kernel.org/r/20210627125747.127646-3-kelly.devilliv@gmail.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/usb/host/fotg210-hcd.c | 36 ++++++++++++++++------------------ 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/drivers/usb/host/fotg210-hcd.c b/drivers/usb/host/fotg210-hcd.c index c3f74d6674e1d..e62c456e78ac1 100644 --- a/drivers/usb/host/fotg210-hcd.c +++ b/drivers/usb/host/fotg210-hcd.c @@ -2511,11 +2511,6 @@ static unsigned qh_completions(struct fotg210_hcd *fotg210, return count; } -/* high bandwidth multiplier, as encoded in highspeed endpoint descriptors */ -#define hb_mult(wMaxPacketSize) (1 + (((wMaxPacketSize) >> 11) & 0x03)) -/* ... and packet size, for any kind of endpoint descriptor */ -#define max_packet(wMaxPacketSize) ((wMaxPacketSize) & 0x07ff) - /* reverse of qh_urb_transaction: free a list of TDs. * used for cleanup after errors, before HC sees an URB's TDs. */ @@ -2601,7 +2596,7 @@ static struct list_head *qh_urb_transaction(struct fotg210_hcd *fotg210, token |= (1 /* "in" */ << 8); /* else it's already initted to "out" pid (0 << 8) */ - maxpacket = max_packet(usb_maxpacket(urb->dev, urb->pipe, !is_input)); + maxpacket = usb_maxpacket(urb->dev, urb->pipe, !is_input); /* * buffer gets wrapped in one or more qtds; @@ -2715,9 +2710,11 @@ static struct fotg210_qh *qh_make(struct fotg210_hcd *fotg210, struct urb *urb, gfp_t flags) { struct fotg210_qh *qh = fotg210_qh_alloc(fotg210, flags); + struct usb_host_endpoint *ep; u32 info1 = 0, info2 = 0; int is_input, type; int maxp = 0; + int mult; struct usb_tt *tt = urb->dev->tt; struct fotg210_qh_hw *hw; @@ -2732,14 +2729,15 @@ static struct fotg210_qh *qh_make(struct fotg210_hcd *fotg210, struct urb *urb, is_input = usb_pipein(urb->pipe); type = usb_pipetype(urb->pipe); - maxp = usb_maxpacket(urb->dev, urb->pipe, !is_input); + ep = usb_pipe_endpoint(urb->dev, urb->pipe); + maxp = usb_endpoint_maxp(&ep->desc); + mult = usb_endpoint_maxp_mult(&ep->desc); /* 1024 byte maxpacket is a hardware ceiling. High bandwidth * acts like up to 3KB, but is built from smaller packets. */ - if (max_packet(maxp) > 1024) { - fotg210_dbg(fotg210, "bogus qh maxpacket %d\n", - max_packet(maxp)); + if (maxp > 1024) { + fotg210_dbg(fotg210, "bogus qh maxpacket %d\n", maxp); goto done; } @@ -2753,8 +2751,7 @@ static struct fotg210_qh *qh_make(struct fotg210_hcd *fotg210, struct urb *urb, */ if (type == PIPE_INTERRUPT) { qh->usecs = NS_TO_US(usb_calc_bus_time(USB_SPEED_HIGH, - is_input, 0, - hb_mult(maxp) * max_packet(maxp))); + is_input, 0, mult * maxp)); qh->start = NO_FRAME; if (urb->dev->speed == USB_SPEED_HIGH) { @@ -2791,7 +2788,7 @@ static struct fotg210_qh *qh_make(struct fotg210_hcd *fotg210, struct urb *urb, think_time = tt ? tt->think_time : 0; qh->tt_usecs = NS_TO_US(think_time + usb_calc_bus_time(urb->dev->speed, - is_input, 0, max_packet(maxp))); + is_input, 0, maxp)); qh->period = urb->interval; if (qh->period > fotg210->periodic_size) { qh->period = fotg210->periodic_size; @@ -2854,11 +2851,11 @@ static struct fotg210_qh *qh_make(struct fotg210_hcd *fotg210, struct urb *urb, * to help them do so. So now people expect to use * such nonconformant devices with Linux too; sigh. */ - info1 |= max_packet(maxp) << 16; + info1 |= maxp << 16; info2 |= (FOTG210_TUNE_MULT_HS << 30); } else { /* PIPE_INTERRUPT */ - info1 |= max_packet(maxp) << 16; - info2 |= hb_mult(maxp) << 30; + info1 |= maxp << 16; + info2 |= mult << 30; } break; default: @@ -3928,6 +3925,7 @@ static void iso_stream_init(struct fotg210_hcd *fotg210, int is_input; long bandwidth; unsigned multi; + struct usb_host_endpoint *ep; /* * this might be a "high bandwidth" highspeed endpoint, @@ -3935,14 +3933,14 @@ static void iso_stream_init(struct fotg210_hcd *fotg210, */ epnum = usb_pipeendpoint(pipe); is_input = usb_pipein(pipe) ? USB_DIR_IN : 0; - maxp = usb_maxpacket(dev, pipe, !is_input); + ep = usb_pipe_endpoint(dev, pipe); + maxp = usb_endpoint_maxp(&ep->desc); if (is_input) buf1 = (1 << 11); else buf1 = 0; - maxp = max_packet(maxp); - multi = hb_mult(maxp); + multi = usb_endpoint_maxp_mult(&ep->desc); buf1 |= maxp; maxp *= multi; From 32eedea70089a8666582ff7225d3af498e413cbf Mon Sep 17 00:00:00 2001 From: Kelly Devilliv Date: Sun, 27 Jun 2021 20:57:47 +0800 Subject: [PATCH 0724/5344] usb: host: fotg210: fix the actual_length of an iso packet [ Upstream commit 091cb2f782f32ab68c6f5f326d7868683d3d4875 ] We should acquire the actual_length of an iso packet from the iTD directly using FOTG210_ITD_LENGTH() macro. Signed-off-by: Kelly Devilliv Link: https://lore.kernel.org/r/20210627125747.127646-4-kelly.devilliv@gmail.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/usb/host/fotg210-hcd.c | 5 ++--- drivers/usb/host/fotg210.h | 5 ----- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/usb/host/fotg210-hcd.c b/drivers/usb/host/fotg210-hcd.c index e62c456e78ac1..f457e083a6f89 100644 --- a/drivers/usb/host/fotg210-hcd.c +++ b/drivers/usb/host/fotg210-hcd.c @@ -4461,13 +4461,12 @@ static bool itd_complete(struct fotg210_hcd *fotg210, struct fotg210_itd *itd) /* HC need not update length with this error */ if (!(t & FOTG210_ISOC_BABBLE)) { - desc->actual_length = - fotg210_itdlen(urb, desc, t); + desc->actual_length = FOTG210_ITD_LENGTH(t); urb->actual_length += desc->actual_length; } } else if (likely((t & FOTG210_ISOC_ACTIVE) == 0)) { desc->status = 0; - desc->actual_length = fotg210_itdlen(urb, desc, t); + desc->actual_length = FOTG210_ITD_LENGTH(t); urb->actual_length += desc->actual_length; } else { /* URB was too late */ diff --git a/drivers/usb/host/fotg210.h b/drivers/usb/host/fotg210.h index 1b4db95e5c43a..291add93d84ee 100644 --- a/drivers/usb/host/fotg210.h +++ b/drivers/usb/host/fotg210.h @@ -686,11 +686,6 @@ static inline unsigned fotg210_read_frame_index(struct fotg210_hcd *fotg210) return fotg210_readl(fotg210, &fotg210->regs->frame_index); } -#define fotg210_itdlen(urb, desc, t) ({ \ - usb_pipein((urb)->pipe) ? \ - (desc)->length - FOTG210_ITD_LENGTH(t) : \ - FOTG210_ITD_LENGTH(t); \ -}) /*-------------------------------------------------------------------------*/ #endif /* __LINUX_FOTG210_H */ From 87ccbcb8e76480c5591cd16ea998f3fe30b18c5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Thu, 1 Jul 2021 04:48:34 -0700 Subject: [PATCH 0725/5344] usb: gadget: u_ether: fix a potential null pointer dereference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 8ae01239609b29ec2eff55967c8e0fe3650cfa09 ] f_ncm tx timeout can call us with null skb to flush a pending frame. In this case skb is NULL to begin with but ceases to be null after dev->wrap() completes. In such a case in->maxpacket will be read, even though we've failed to check that 'in' is not NULL. Though I've never observed this fail in practice, however the 'flush operation' simply does not make sense with a null usb IN endpoint - there's nowhere to flush to... (note that we're the gadget/device, and IN is from the point of view of the host, so here IN actually means outbound...) Cc: Brooke Basile Cc: "Bryan O'Donoghue" Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Lorenzo Colitti Signed-off-by: Maciej Żenczykowski Link: https://lore.kernel.org/r/20210701114834.884597-6-zenczykowski@gmail.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/usb/gadget/function/u_ether.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/function/u_ether.c b/drivers/usb/gadget/function/u_ether.c index 99b840daf3d94..57da62e331848 100644 --- a/drivers/usb/gadget/function/u_ether.c +++ b/drivers/usb/gadget/function/u_ether.c @@ -491,8 +491,9 @@ static netdev_tx_t eth_start_xmit(struct sk_buff *skb, } spin_unlock_irqrestore(&dev->lock, flags); - if (skb && !in) { - dev_kfree_skb_any(skb); + if (!in) { + if (skb) + dev_kfree_skb_any(skb); return NETDEV_TX_OK; } From 560c105da195974d4b39b3f685bfa8f006b00602 Mon Sep 17 00:00:00 2001 From: Evgeny Novikov Date: Thu, 8 Jul 2021 11:30:56 +0300 Subject: [PATCH 0726/5344] USB: EHCI: ehci-mv: improve error handling in mv_ehci_enable() [ Upstream commit 61136a12cbed234374ec6f588af57c580b20b772 ] mv_ehci_enable() did not disable and unprepare clocks in case of failures of phy_init(). Besides, it did not take into account failures of ehci_clock_enable() (in effect, failures of clk_prepare_enable()). The patch fixes both issues and gets rid of redundant wrappers around clk_prepare_enable() and clk_disable_unprepare() to simplify this a bit. Found by Linux Driver Verification project (linuxtesting.org). Acked-by: Alan Stern Signed-off-by: Evgeny Novikov Link: https://lore.kernel.org/r/20210708083056.21543-1-novikov@ispras.ru Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/usb/host/ehci-mv.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/drivers/usb/host/ehci-mv.c b/drivers/usb/host/ehci-mv.c index b6f196f5e252e..b0e0f8ea98a9c 100644 --- a/drivers/usb/host/ehci-mv.c +++ b/drivers/usb/host/ehci-mv.c @@ -41,26 +41,25 @@ struct ehci_hcd_mv { int (*set_vbus)(unsigned int vbus); }; -static void ehci_clock_enable(struct ehci_hcd_mv *ehci_mv) +static int mv_ehci_enable(struct ehci_hcd_mv *ehci_mv) { - clk_prepare_enable(ehci_mv->clk); -} + int retval; -static void ehci_clock_disable(struct ehci_hcd_mv *ehci_mv) -{ - clk_disable_unprepare(ehci_mv->clk); -} + retval = clk_prepare_enable(ehci_mv->clk); + if (retval) + return retval; -static int mv_ehci_enable(struct ehci_hcd_mv *ehci_mv) -{ - ehci_clock_enable(ehci_mv); - return phy_init(ehci_mv->phy); + retval = phy_init(ehci_mv->phy); + if (retval) + clk_disable_unprepare(ehci_mv->clk); + + return retval; } static void mv_ehci_disable(struct ehci_hcd_mv *ehci_mv) { phy_exit(ehci_mv->phy); - ehci_clock_disable(ehci_mv); + clk_disable_unprepare(ehci_mv->clk); } static int mv_ehci_reset(struct usb_hcd *hcd) From 698dc8feabd2c298f38f0c6c74001c4c0245122a Mon Sep 17 00:00:00 2001 From: Jack Pham Date: Tue, 20 Jul 2021 01:09:07 -0700 Subject: [PATCH 0727/5344] usb: gadget: composite: Allow bMaxPower=0 if self-powered [ Upstream commit bcacbf06c891374e7fdd7b72d11cda03b0269b43 ] Currently the composite driver encodes the MaxPower field of the configuration descriptor by reading the c->MaxPower of the usb_configuration only if it is non-zero, otherwise it falls back to using the value hard-coded in CONFIG_USB_GADGET_VBUS_DRAW. However, there are cases when a configuration must explicitly set bMaxPower to 0, particularly if its bmAttributes also has the Self-Powered bit set, which is a valid combination. This is specifically called out in the USB PD specification section 9.1, in which a PDUSB device "shall report zero in the bMaxPower field after negotiating a mutually agreeable Contract", and also verified by the USB Type-C Functional Test TD.4.10.2 Sink Power Precedence Test. The fix allows the c->MaxPower to be used for encoding the bMaxPower even if it is 0, if the self-powered bit is also set. An example usage of this would be for a ConfigFS gadget to be dynamically updated by userspace when the Type-C connection is determined to be operating in Power Delivery mode. Co-developed-by: Ronak Vijay Raheja Acked-by: Felipe Balbi Signed-off-by: Ronak Vijay Raheja Signed-off-by: Jack Pham Link: https://lore.kernel.org/r/20210720080907.30292-1-jackp@codeaurora.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/usb/gadget/composite.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c index 24dad1d78d1ea..6bd3fdb925cd9 100644 --- a/drivers/usb/gadget/composite.c +++ b/drivers/usb/gadget/composite.c @@ -481,7 +481,7 @@ static u8 encode_bMaxPower(enum usb_device_speed speed, { unsigned val; - if (c->MaxPower) + if (c->MaxPower || (c->bmAttributes & USB_CONFIG_ATT_SELFPOWER)) val = c->MaxPower; else val = CONFIG_USB_GADGET_VBUS_DRAW; @@ -905,7 +905,11 @@ static int set_config(struct usb_composite_dev *cdev, } /* when we return, be sure our power usage is valid */ - power = c->MaxPower ? c->MaxPower : CONFIG_USB_GADGET_VBUS_DRAW; + if (c->MaxPower || (c->bmAttributes & USB_CONFIG_ATT_SELFPOWER)) + power = c->MaxPower; + else + power = CONFIG_USB_GADGET_VBUS_DRAW; + if (gadget->speed < USB_SPEED_SUPER) power = min(power, 500U); else From 87154658e058250a33eff289557feb5a36392fd4 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 14 Jul 2021 12:13:46 +0200 Subject: [PATCH 0728/5344] staging: board: Fix uninitialized spinlock when attaching genpd [ Upstream commit df00609821bf17f50a75a446266d19adb8339d84 ] On Armadillo-800-EVA with CONFIG_DEBUG_SPINLOCK=y: BUG: spinlock bad magic on CPU#0, swapper/1 lock: lcdc0_device+0x10c/0x308, .magic: 00000000, .owner: /-1, .owner_cpu: 0 CPU: 0 PID: 1 Comm: swapper Not tainted 5.11.0-rc5-armadillo-00036-gbbca04be7a80-dirty #287 Hardware name: Generic R8A7740 (Flattened Device Tree) [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [] (show_stack) from [] (do_raw_spin_lock+0x20/0x94) [] (do_raw_spin_lock) from [] (dev_pm_get_subsys_data+0x8c/0x11c) [] (dev_pm_get_subsys_data) from [] (genpd_add_device+0x78/0x2b8) [] (genpd_add_device) from [] (of_genpd_add_device+0x34/0x4c) [] (of_genpd_add_device) from [] (board_staging_register_device+0x11c/0x148) [] (board_staging_register_device) from [] (board_staging_register_devices+0x24/0x28) of_genpd_add_device() is called before platform_device_register(), as it needs to attach the genpd before the device is probed. But the spinlock is only initialized when the device is registered. Fix this by open-coding the spinlock initialization, cfr. device_pm_init_common() in the internal drivers/base code, and in the SuperH early platform code. Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/57783ece7ddae55f2bda2f59f452180bff744ea0.1626257398.git.geert+renesas@glider.be Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/staging/board/board.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/staging/board/board.c b/drivers/staging/board/board.c index cb6feb34dd401..f980af0373452 100644 --- a/drivers/staging/board/board.c +++ b/drivers/staging/board/board.c @@ -136,6 +136,7 @@ int __init board_staging_register_clock(const struct board_staging_clk *bsc) static int board_staging_add_dev_domain(struct platform_device *pdev, const char *domain) { + struct device *dev = &pdev->dev; struct of_phandle_args pd_args; struct device_node *np; @@ -148,7 +149,11 @@ static int board_staging_add_dev_domain(struct platform_device *pdev, pd_args.np = np; pd_args.args_count = 0; - return of_genpd_add_device(&pd_args, &pdev->dev); + /* Initialization similar to device_pm_init_common() */ + spin_lock_init(&dev->power.lock); + dev->power.early_init = true; + + return of_genpd_add_device(&pd_args, dev); } #else static inline int board_staging_add_dev_domain(struct platform_device *pdev, From 637b196b1b1e34a066aae9aad98d14ec7d22bfe9 Mon Sep 17 00:00:00 2001 From: Zheyu Ma Date: Wed, 14 Jul 2021 05:53:23 +0000 Subject: [PATCH 0729/5344] tty: serial: jsm: hold port lock when reporting modem line changes [ Upstream commit 240e126c28df084222f0b661321e8e3ecb0d232e ] uart_handle_dcd_change() requires a port lock to be held and will emit a warning when lockdep is enabled. Held corresponding lock to fix the following warnings. [ 132.528648] WARNING: CPU: 5 PID: 11600 at drivers/tty/serial/serial_core.c:3046 uart_handle_dcd_change+0xf4/0x120 [ 132.530482] Modules linked in: [ 132.531050] CPU: 5 PID: 11600 Comm: jsm Not tainted 5.14.0-rc1-00003-g7fef2edf7cc7-dirty #31 [ 132.535268] RIP: 0010:uart_handle_dcd_change+0xf4/0x120 [ 132.557100] Call Trace: [ 132.557562] ? __free_pages+0x83/0xb0 [ 132.558213] neo_parse_modem+0x156/0x220 [ 132.558897] neo_param+0x399/0x840 [ 132.559495] jsm_tty_open+0x12f/0x2d0 [ 132.560131] uart_startup.part.18+0x153/0x340 [ 132.560888] ? lock_is_held_type+0xe9/0x140 [ 132.561660] uart_port_activate+0x7f/0xe0 [ 132.562351] ? uart_startup.part.18+0x340/0x340 [ 132.563003] tty_port_open+0x8d/0xf0 [ 132.563523] ? uart_set_options+0x1e0/0x1e0 [ 132.564125] uart_open+0x24/0x40 [ 132.564604] tty_open+0x15c/0x630 Signed-off-by: Zheyu Ma Link: https://lore.kernel.org/r/1626242003-3809-1-git-send-email-zheyuma97@gmail.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/tty/serial/jsm/jsm_neo.c | 2 ++ drivers/tty/serial/jsm/jsm_tty.c | 3 +++ 2 files changed, 5 insertions(+) diff --git a/drivers/tty/serial/jsm/jsm_neo.c b/drivers/tty/serial/jsm/jsm_neo.c index bf0e2a4cb0cef..c6f927a76c3be 100644 --- a/drivers/tty/serial/jsm/jsm_neo.c +++ b/drivers/tty/serial/jsm/jsm_neo.c @@ -815,7 +815,9 @@ static void neo_parse_isr(struct jsm_board *brd, u32 port) /* Parse any modem signal changes */ jsm_dbg(INTR, &ch->ch_bd->pci_dev, "MOD_STAT: sending to parse_modem_sigs\n"); + spin_lock_irqsave(&ch->uart_port.lock, lock_flags); neo_parse_modem(ch, readb(&ch->ch_neo_uart->msr)); + spin_unlock_irqrestore(&ch->uart_port.lock, lock_flags); } } diff --git a/drivers/tty/serial/jsm/jsm_tty.c b/drivers/tty/serial/jsm/jsm_tty.c index 689774c073ca4..8438454ca653f 100644 --- a/drivers/tty/serial/jsm/jsm_tty.c +++ b/drivers/tty/serial/jsm/jsm_tty.c @@ -187,6 +187,7 @@ static void jsm_tty_break(struct uart_port *port, int break_state) static int jsm_tty_open(struct uart_port *port) { + unsigned long lock_flags; struct jsm_board *brd; struct jsm_channel *channel = container_of(port, struct jsm_channel, uart_port); @@ -240,6 +241,7 @@ static int jsm_tty_open(struct uart_port *port) channel->ch_cached_lsr = 0; channel->ch_stops_sent = 0; + spin_lock_irqsave(&port->lock, lock_flags); termios = &port->state->port.tty->termios; channel->ch_c_cflag = termios->c_cflag; channel->ch_c_iflag = termios->c_iflag; @@ -259,6 +261,7 @@ static int jsm_tty_open(struct uart_port *port) jsm_carrier(channel); channel->ch_open_count++; + spin_unlock_irqrestore(&port->lock, lock_flags); jsm_dbg(OPEN, &channel->ch_bd->pci_dev, "finish\n"); return 0; From 3217b86dd98ef7b068688d26f7614aa5040839e0 Mon Sep 17 00:00:00 2001 From: Oliver Logush Date: Wed, 23 Jun 2021 15:04:04 -0400 Subject: [PATCH 0730/5344] drm/amd/display: Fix timer_per_pixel unit error [ Upstream commit 23e55639b87fb16a9f0f66032ecb57060df6c46c ] [why] The units of the time_per_pixel variable were incorrect, this had to be changed for the code to properly function. [how] The change was very straightforward, only required one line of code to be changed where the calculation was done. Acked-by: Rodrigo Siqueira Signed-off-by: Oliver Logush Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin --- drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c index 2b1175bb2daee..d2ea4c003d442 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c @@ -2232,7 +2232,7 @@ void dcn20_set_mcif_arb_params( wb_arb_params->cli_watermark[k] = get_wm_writeback_urgent(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; wb_arb_params->pstate_watermark[k] = get_wm_writeback_dram_clock_change(&context->bw_ctx.dml, pipes, pipe_cnt) * 1000; } - wb_arb_params->time_per_pixel = 16.0 / context->res_ctx.pipe_ctx[i].stream->phy_pix_clk; /* 4 bit fraction, ms */ + wb_arb_params->time_per_pixel = 16.0 * 1000 / (context->res_ctx.pipe_ctx[i].stream->phy_pix_clk / 1000); /* 4 bit fraction, ms */ wb_arb_params->slice_lines = 32; wb_arb_params->arbitration_slice = 2; wb_arb_params->max_scaled_time = dcn20_calc_max_scaled_time(wb_arb_params->time_per_pixel, From 96e09191a24bdb767e175eda61484a0a71a19ce0 Mon Sep 17 00:00:00 2001 From: Anson Jacob Date: Tue, 20 Jul 2021 11:00:44 -0400 Subject: [PATCH 0731/5344] drm/amd/amdgpu: Update debugfs link_settings output link_rate field in hex [ Upstream commit 1a394b3c3de2577f200cb623c52a5c2b82805cec ] link_rate is updated via debugfs using hex values, set it to output in hex as well. eg: Resolution: 1920x1080@144Hz cat /sys/kernel/debug/dri/0/DP-1/link_settings Current: 4 0x14 0 Verified: 4 0x1e 0 Reported: 4 0x1e 16 Preferred: 0 0x0 0 echo "4 0x1e" > /sys/kernel/debug/dri/0/DP-1/link_settings cat /sys/kernel/debug/dri/0/DP-1/link_settings Current: 4 0x1e 0 Verified: 4 0x1e 0 Reported: 4 0x1e 16 Preferred: 4 0x1e 0 Signed-off-by: Anson Jacob Reviewed-by: Harry Wentland Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin --- .../amd/display/amdgpu_dm/amdgpu_dm_debugfs.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c index f3dfb2887ae0b..2cdcefab2d7d4 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c @@ -95,29 +95,29 @@ static ssize_t dp_link_settings_read(struct file *f, char __user *buf, rd_buf_ptr = rd_buf; - str_len = strlen("Current: %d %d %d "); - snprintf(rd_buf_ptr, str_len, "Current: %d %d %d ", + str_len = strlen("Current: %d 0x%x %d "); + snprintf(rd_buf_ptr, str_len, "Current: %d 0x%x %d ", link->cur_link_settings.lane_count, link->cur_link_settings.link_rate, link->cur_link_settings.link_spread); rd_buf_ptr += str_len; - str_len = strlen("Verified: %d %d %d "); - snprintf(rd_buf_ptr, str_len, "Verified: %d %d %d ", + str_len = strlen("Verified: %d 0x%x %d "); + snprintf(rd_buf_ptr, str_len, "Verified: %d 0x%x %d ", link->verified_link_cap.lane_count, link->verified_link_cap.link_rate, link->verified_link_cap.link_spread); rd_buf_ptr += str_len; - str_len = strlen("Reported: %d %d %d "); - snprintf(rd_buf_ptr, str_len, "Reported: %d %d %d ", + str_len = strlen("Reported: %d 0x%x %d "); + snprintf(rd_buf_ptr, str_len, "Reported: %d 0x%x %d ", link->reported_link_cap.lane_count, link->reported_link_cap.link_rate, link->reported_link_cap.link_spread); rd_buf_ptr += str_len; - str_len = strlen("Preferred: %d %d %d "); - snprintf(rd_buf_ptr, str_len, "Preferred: %d %d %d\n", + str_len = strlen("Preferred: %d 0x%x %d "); + snprintf(rd_buf_ptr, str_len, "Preferred: %d 0x%x %d\n", link->preferred_link_setting.lane_count, link->preferred_link_setting.link_rate, link->preferred_link_setting.link_spread); From 26bf21c0b0d5a024fbc5a9cf2730b0acaed54f72 Mon Sep 17 00:00:00 2001 From: Johan Almbladh Date: Wed, 21 Jul 2021 12:40:58 +0200 Subject: [PATCH 0732/5344] bpf/tests: Fix copy-and-paste error in double word test [ Upstream commit ae7f47041d928b1a2f28717d095b4153c63cbf6a ] This test now operates on DW as stated instead of W, which was already covered by another test. Signed-off-by: Johan Almbladh Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210721104058.3755254-1-johan.almbladh@anyfinetworks.com Signed-off-by: Sasha Levin --- lib/test_bpf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 5ef3eccee27cb..5e985ed68b2ad 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -4286,8 +4286,8 @@ static struct bpf_test tests[] = { .u.insns_int = { BPF_LD_IMM64(R0, 0), BPF_LD_IMM64(R1, 0xffffffffffffffffLL), - BPF_STX_MEM(BPF_W, R10, R1, -40), - BPF_LDX_MEM(BPF_W, R0, R10, -40), + BPF_STX_MEM(BPF_DW, R10, R1, -40), + BPF_LDX_MEM(BPF_DW, R0, R10, -40), BPF_EXIT_INSN(), }, INTERNAL, From 91c4fb79e3fdb93bd895044f36085ab3ceb19dda Mon Sep 17 00:00:00 2001 From: Johan Almbladh Date: Wed, 21 Jul 2021 12:38:22 +0200 Subject: [PATCH 0733/5344] bpf/tests: Do not PASS tests without actually testing the result [ Upstream commit 2b7e9f25e590726cca76700ebdb10e92a7a72ca1 ] Each test case can have a set of sub-tests, where each sub-test can run the cBPF/eBPF test snippet with its own data_size and expected result. Before, the end of the sub-test array was indicated by both data_size and result being zero. However, most or all of the internal eBPF tests has a data_size of zero already. When such a test also had an expected value of zero, the test was never run but reported as PASS anyway. Now the test runner always runs the first sub-test, regardless of the data_size and result values. The sub-test array zero-termination only applies for any additional sub-tests. There are other ways fix it of course, but this solution at least removes the surprise of eBPF tests with a zero result always succeeding. Signed-off-by: Johan Almbladh Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210721103822.3755111-1-johan.almbladh@anyfinetworks.com Signed-off-by: Sasha Levin --- lib/test_bpf.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 5e985ed68b2ad..3ae002ced4c7a 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -6684,7 +6684,14 @@ static int run_one(const struct bpf_prog *fp, struct bpf_test *test) u64 duration; u32 ret; - if (test->test[i].data_size == 0 && + /* + * NOTE: Several sub-tests may be present, in which case + * a zero {data_size, result} tuple indicates the end of + * the sub-test array. The first test is always run, + * even if both data_size and result happen to be zero. + */ + if (i > 0 && + test->test[i].data_size == 0 && test->test[i].result == 0) break; From 954b58c60c598e67ed4eaa11c3365c2380b9346a Mon Sep 17 00:00:00 2001 From: Zheyu Ma Date: Mon, 26 Jul 2021 10:03:53 +0000 Subject: [PATCH 0734/5344] video: fbdev: asiliantfb: Error out if 'pixclock' equals zero [ Upstream commit b36b242d4b8ea178f7fd038965e3cac7f30c3f09 ] The userspace program could pass any values to the driver through ioctl() interface. If the driver doesn't check the value of 'pixclock', it may cause divide error. Fix this by checking whether 'pixclock' is zero first. The following log reveals it: [ 43.861711] divide error: 0000 [#1] PREEMPT SMP KASAN PTI [ 43.861737] CPU: 2 PID: 11764 Comm: i740 Not tainted 5.14.0-rc2-00513-gac532c9bbcfb-dirty #224 [ 43.861756] RIP: 0010:asiliantfb_check_var+0x4e/0x730 [ 43.861843] Call Trace: [ 43.861848] ? asiliantfb_remove+0x190/0x190 [ 43.861858] fb_set_var+0x2e4/0xeb0 [ 43.861866] ? fb_blank+0x1a0/0x1a0 [ 43.861873] ? lock_acquire+0x1ef/0x530 [ 43.861884] ? lock_release+0x810/0x810 [ 43.861892] ? lock_is_held_type+0x100/0x140 [ 43.861903] ? ___might_sleep+0x1ee/0x2d0 [ 43.861914] ? __mutex_lock+0x620/0x1190 [ 43.861921] ? do_fb_ioctl+0x313/0x700 [ 43.861929] ? mutex_lock_io_nested+0xfa0/0xfa0 [ 43.861936] ? __this_cpu_preempt_check+0x1d/0x30 [ 43.861944] ? _raw_spin_unlock_irqrestore+0x46/0x60 [ 43.861952] ? lockdep_hardirqs_on+0x59/0x100 [ 43.861959] ? _raw_spin_unlock_irqrestore+0x46/0x60 [ 43.861967] ? trace_hardirqs_on+0x6a/0x1c0 [ 43.861978] do_fb_ioctl+0x31e/0x700 Signed-off-by: Zheyu Ma Signed-off-by: Sam Ravnborg Link: https://patchwork.freedesktop.org/patch/msgid/1627293835-17441-2-git-send-email-zheyuma97@gmail.com Signed-off-by: Sasha Levin --- drivers/video/fbdev/asiliantfb.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/video/fbdev/asiliantfb.c b/drivers/video/fbdev/asiliantfb.c index ea31054a28ca8..c1d6e63362259 100644 --- a/drivers/video/fbdev/asiliantfb.c +++ b/drivers/video/fbdev/asiliantfb.c @@ -227,6 +227,9 @@ static int asiliantfb_check_var(struct fb_var_screeninfo *var, { unsigned long Ftarget, ratio, remainder; + if (!var->pixclock) + return -EINVAL; + ratio = 1000000 / var->pixclock; remainder = 1000000 % var->pixclock; Ftarget = 1000000 * ratio + (1000000 * remainder) / var->pixclock; From 006caeeaf0b1435ed27074911a6cc2c2246d030d Mon Sep 17 00:00:00 2001 From: Zheyu Ma Date: Mon, 26 Jul 2021 10:03:54 +0000 Subject: [PATCH 0735/5344] video: fbdev: kyro: Error out if 'pixclock' equals zero [ Upstream commit 1520b4b7ba964f8eec2e7dd14c571d50de3e5191 ] The userspace program could pass any values to the driver through ioctl() interface. if the driver doesn't check the value of 'pixclock', it may cause divide error because the value of 'lineclock' and 'frameclock' will be zero. Fix this by checking whether 'pixclock' is zero in kyrofb_check_var(). The following log reveals it: [ 103.073930] divide error: 0000 [#1] PREEMPT SMP KASAN PTI [ 103.073942] CPU: 4 PID: 12483 Comm: syz-executor Not tainted 5.14.0-rc2-00478-g2734d6c1b1a0-dirty #118 [ 103.073959] RIP: 0010:kyrofb_set_par+0x316/0xc80 [ 103.074045] Call Trace: [ 103.074048] ? ___might_sleep+0x1ee/0x2d0 [ 103.074060] ? kyrofb_ioctl+0x330/0x330 [ 103.074069] fb_set_var+0x5bf/0xeb0 [ 103.074078] ? fb_blank+0x1a0/0x1a0 [ 103.074085] ? lock_acquire+0x3bd/0x530 [ 103.074094] ? lock_release+0x810/0x810 [ 103.074103] ? ___might_sleep+0x1ee/0x2d0 [ 103.074114] ? __mutex_lock+0x620/0x1190 [ 103.074126] ? trace_hardirqs_on+0x6a/0x1c0 [ 103.074137] do_fb_ioctl+0x31e/0x700 [ 103.074144] ? fb_getput_cmap+0x280/0x280 [ 103.074152] ? rcu_read_lock_sched_held+0x11/0x80 [ 103.074162] ? rcu_read_lock_sched_held+0x11/0x80 [ 103.074171] ? __sanitizer_cov_trace_switch+0x67/0xf0 [ 103.074181] ? __sanitizer_cov_trace_const_cmp2+0x20/0x80 [ 103.074191] ? do_vfs_ioctl+0x14b/0x16c0 [ 103.074199] ? vfs_fileattr_set+0xb60/0xb60 [ 103.074207] ? rcu_read_lock_sched_held+0x11/0x80 [ 103.074216] ? lock_release+0x483/0x810 [ 103.074224] ? __fget_files+0x217/0x3d0 [ 103.074234] ? __fget_files+0x239/0x3d0 [ 103.074243] ? do_fb_ioctl+0x700/0x700 [ 103.074250] fb_ioctl+0xe6/0x130 Signed-off-by: Zheyu Ma Signed-off-by: Sam Ravnborg Link: https://patchwork.freedesktop.org/patch/msgid/1627293835-17441-3-git-send-email-zheyuma97@gmail.com Signed-off-by: Sasha Levin --- drivers/video/fbdev/kyro/fbdev.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/video/fbdev/kyro/fbdev.c b/drivers/video/fbdev/kyro/fbdev.c index d7aa431e68460..74bf26b527b91 100644 --- a/drivers/video/fbdev/kyro/fbdev.c +++ b/drivers/video/fbdev/kyro/fbdev.c @@ -399,6 +399,9 @@ static int kyrofb_check_var(struct fb_var_screeninfo *var, struct fb_info *info) { struct kyrofb_info *par = info->par; + if (!var->pixclock) + return -EINVAL; + if (var->bits_per_pixel != 16 && var->bits_per_pixel != 32) { printk(KERN_WARNING "kyrofb: depth not supported: %u\n", var->bits_per_pixel); return -EINVAL; From 701876c5d7d73e06c93436a8a283aeece29929a0 Mon Sep 17 00:00:00 2001 From: Zheyu Ma Date: Mon, 26 Jul 2021 10:03:55 +0000 Subject: [PATCH 0736/5344] video: fbdev: riva: Error out if 'pixclock' equals zero [ Upstream commit f92763cb0feba247e0939ed137b495601fd072a5 ] The userspace program could pass any values to the driver through ioctl() interface. If the driver doesn't check the value of 'pixclock', it may cause divide error. Fix this by checking whether 'pixclock' is zero first. The following log reveals it: [ 33.396850] divide error: 0000 [#1] PREEMPT SMP KASAN PTI [ 33.396864] CPU: 5 PID: 11754 Comm: i740 Not tainted 5.14.0-rc2-00513-gac532c9bbcfb-dirty #222 [ 33.396883] RIP: 0010:riva_load_video_mode+0x417/0xf70 [ 33.396969] Call Trace: [ 33.396973] ? debug_smp_processor_id+0x1c/0x20 [ 33.396984] ? tick_nohz_tick_stopped+0x1a/0x90 [ 33.396996] ? rivafb_copyarea+0x3c0/0x3c0 [ 33.397003] ? wake_up_klogd.part.0+0x99/0xd0 [ 33.397014] ? vprintk_emit+0x110/0x4b0 [ 33.397024] ? vprintk_default+0x26/0x30 [ 33.397033] ? vprintk+0x9c/0x1f0 [ 33.397041] ? printk+0xba/0xed [ 33.397054] ? record_print_text.cold+0x16/0x16 [ 33.397063] ? __kasan_check_read+0x11/0x20 [ 33.397074] ? profile_tick+0xc0/0x100 [ 33.397084] ? __sanitizer_cov_trace_const_cmp4+0x24/0x80 [ 33.397094] ? riva_set_rop_solid+0x2a0/0x2a0 [ 33.397102] rivafb_set_par+0xbe/0x610 [ 33.397111] ? riva_set_rop_solid+0x2a0/0x2a0 [ 33.397119] fb_set_var+0x5bf/0xeb0 [ 33.397127] ? fb_blank+0x1a0/0x1a0 [ 33.397134] ? lock_acquire+0x1ef/0x530 [ 33.397143] ? lock_release+0x810/0x810 [ 33.397151] ? lock_is_held_type+0x100/0x140 [ 33.397159] ? ___might_sleep+0x1ee/0x2d0 [ 33.397170] ? __mutex_lock+0x620/0x1190 [ 33.397180] ? trace_hardirqs_on+0x6a/0x1c0 [ 33.397190] do_fb_ioctl+0x31e/0x700 Signed-off-by: Zheyu Ma Signed-off-by: Sam Ravnborg Link: https://patchwork.freedesktop.org/patch/msgid/1627293835-17441-4-git-send-email-zheyuma97@gmail.com Signed-off-by: Sasha Levin --- drivers/video/fbdev/riva/fbdev.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/video/fbdev/riva/fbdev.c b/drivers/video/fbdev/riva/fbdev.c index ca593a3e41d74..51c9d9508c0b0 100644 --- a/drivers/video/fbdev/riva/fbdev.c +++ b/drivers/video/fbdev/riva/fbdev.c @@ -1088,6 +1088,9 @@ static int rivafb_check_var(struct fb_var_screeninfo *var, struct fb_info *info) int mode_valid = 0; NVTRACE_ENTER(); + if (!var->pixclock) + return -EINVAL; + switch (var->bits_per_pixel) { case 1 ... 8: var->red.offset = var->green.offset = var->blue.offset = 0; From e1776e32e319be7089a40787d2f355f4de0c2f9f Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 26 Jul 2021 14:52:51 -0500 Subject: [PATCH 0737/5344] ipv4: ip_output.c: Fix out-of-bounds warning in ip_copy_addrs() [ Upstream commit 6321c7acb82872ef6576c520b0e178eaad3a25c0 ] Fix the following out-of-bounds warning: In function 'ip_copy_addrs', inlined from '__ip_queue_xmit' at net/ipv4/ip_output.c:517:2: net/ipv4/ip_output.c:449:2: warning: 'memcpy' offset [40, 43] from the object at 'fl' is out of the bounds of referenced subobject 'saddr' with type 'unsigned int' at offset 36 [-Warray-bounds] 449 | memcpy(&iph->saddr, &fl4->saddr, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 450 | sizeof(fl4->saddr) + sizeof(fl4->daddr)); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The problem is that the original code is trying to copy data into a couple of struct members adjacent to each other in a single call to memcpy(). This causes a legitimate compiler warning because memcpy() overruns the length of &iph->saddr and &fl4->saddr. As these are just a couple of struct members, fix this by using direct assignments, instead of memcpy(). This helps with the ongoing efforts to globally enable -Warray-bounds and get us closer to being able to tighten the FORTIFY_SOURCE routines on memcpy(). Link: https://github.com/KSPP/linux/issues/109 Reported-by: kernel test robot Link: https://lore.kernel.org/lkml/d5ae2e65-1f18-2577-246f-bada7eee6ccd@intel.com/ Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/ipv4/ip_output.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index f52bc9c22e5b8..0ec529d77a56e 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -446,8 +446,9 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) { BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) != offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr)); - memcpy(&iph->saddr, &fl4->saddr, - sizeof(fl4->saddr) + sizeof(fl4->daddr)); + + iph->saddr = fl4->saddr; + iph->daddr = fl4->daddr; } /* Note: skb->sk can be different from sk, in case of tunnels */ From e39579f163c1487c0eb27865659276e18b9fb62c Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 26 Jul 2021 14:25:11 -0500 Subject: [PATCH 0738/5344] flow_dissector: Fix out-of-bounds warnings [ Upstream commit 323e0cb473e2a8706ff162b6b4f4fa16023c9ba7 ] Fix the following out-of-bounds warnings: net/core/flow_dissector.c: In function '__skb_flow_dissect': >> net/core/flow_dissector.c:1104:4: warning: 'memcpy' offset [24, 39] from the object at '' is out of the bounds of referenced subobject 'saddr' with type 'struct in6_addr' at offset 8 [-Warray-bounds] 1104 | memcpy(&key_addrs->v6addrs, &iph->saddr, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1105 | sizeof(key_addrs->v6addrs)); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ In file included from include/linux/ipv6.h:5, from net/core/flow_dissector.c:6: include/uapi/linux/ipv6.h:133:18: note: subobject 'saddr' declared here 133 | struct in6_addr saddr; | ^~~~~ >> net/core/flow_dissector.c:1059:4: warning: 'memcpy' offset [16, 19] from the object at '' is out of the bounds of referenced subobject 'saddr' with type 'unsigned int' at offset 12 [-Warray-bounds] 1059 | memcpy(&key_addrs->v4addrs, &iph->saddr, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1060 | sizeof(key_addrs->v4addrs)); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ In file included from include/linux/ip.h:17, from net/core/flow_dissector.c:5: include/uapi/linux/ip.h:103:9: note: subobject 'saddr' declared here 103 | __be32 saddr; | ^~~~~ The problem is that the original code is trying to copy data into a couple of struct members adjacent to each other in a single call to memcpy(). So, the compiler legitimately complains about it. As these are just a couple of members, fix this by copying each one of them in separate calls to memcpy(). This helps with the ongoing efforts to globally enable -Warray-bounds and get us closer to being able to tighten the FORTIFY_SOURCE routines on memcpy(). Link: https://github.com/KSPP/linux/issues/109 Reported-by: kernel test robot Link: https://lore.kernel.org/lkml/d5ae2e65-1f18-2577-246f-bada7eee6ccd@intel.com/ Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/core/flow_dissector.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 96957a7c732fa..b740a74f06f22 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1025,8 +1025,10 @@ bool __skb_flow_dissect(const struct net *net, FLOW_DISSECTOR_KEY_IPV4_ADDRS, target_container); - memcpy(&key_addrs->v4addrs, &iph->saddr, - sizeof(key_addrs->v4addrs)); + memcpy(&key_addrs->v4addrs.src, &iph->saddr, + sizeof(key_addrs->v4addrs.src)); + memcpy(&key_addrs->v4addrs.dst, &iph->daddr, + sizeof(key_addrs->v4addrs.dst)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } @@ -1070,8 +1072,10 @@ bool __skb_flow_dissect(const struct net *net, FLOW_DISSECTOR_KEY_IPV6_ADDRS, target_container); - memcpy(&key_addrs->v6addrs, &iph->saddr, - sizeof(key_addrs->v6addrs)); + memcpy(&key_addrs->v6addrs.src, &iph->saddr, + sizeof(key_addrs->v6addrs.src)); + memcpy(&key_addrs->v6addrs.dst, &iph->daddr, + sizeof(key_addrs->v6addrs.dst)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } From 317923351759c3a4b1cd3074135d02ad5974abf8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 12 Jul 2021 19:26:01 +0200 Subject: [PATCH 0739/5344] s390/jump_label: print real address in a case of a jump label bug [ Upstream commit 5492886c14744d239e87f1b0b774b5a341e755cc ] In case of a jump label print the real address of the piece of code where a mismatch was detected. This is right before the system panics, so there is nothing revealed. Signed-off-by: Heiko Carstens Signed-off-by: Sasha Levin --- arch/s390/kernel/jump_label.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kernel/jump_label.c b/arch/s390/kernel/jump_label.c index ab584e8e35275..9156653b56f69 100644 --- a/arch/s390/kernel/jump_label.c +++ b/arch/s390/kernel/jump_label.c @@ -36,7 +36,7 @@ static void jump_label_bug(struct jump_entry *entry, struct insn *expected, unsigned char *ipe = (unsigned char *)expected; unsigned char *ipn = (unsigned char *)new; - pr_emerg("Jump label code mismatch at %pS [%p]\n", ipc, ipc); + pr_emerg("Jump label code mismatch at %pS [%px]\n", ipc, ipc); pr_emerg("Found: %6ph\n", ipc); pr_emerg("Expected: %6ph\n", ipe); pr_emerg("New: %6ph\n", ipn); From ce8d8057ad254a5cf9a84299973883c27e90765d Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Thu, 8 Jul 2021 14:55:42 +0200 Subject: [PATCH 0740/5344] s390: make PCI mio support a machine flag [ Upstream commit 3322ba0d7bea1e24ae464418626f6a15b69533ab ] Kernel support for the newer PCI mio instructions can be toggled off with the pci=nomio command line option which needs to integrate with common code PCI option parsing. However this option then toggles static branches which can't be toggled yet in an early_param() call. Thus commit 9964f396f1d0 ("s390: fix setting of mio addressing control") moved toggling the static branches to the PCI init routine. With this setup however we can't check for mio support outside the PCI code during early boot, i.e. before switching the static branches, which we need to be able to export this as an ELF HWCAP. Improve on this by turning mio availability into a machine flag that gets initially set based on CONFIG_PCI and the facility bit and gets toggled off if pci=nomio is found during PCI option parsing allowing simple access to this machine flag after early init. Reviewed-by: Heiko Carstens Signed-off-by: Niklas Schnelle Signed-off-by: Heiko Carstens Signed-off-by: Sasha Levin --- arch/s390/include/asm/setup.h | 2 ++ arch/s390/kernel/early.c | 4 ++++ arch/s390/pci/pci.c | 5 ++--- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index 1932088686a68..e6a5007f017d8 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -39,6 +39,7 @@ #define MACHINE_FLAG_NX BIT(15) #define MACHINE_FLAG_GS BIT(16) #define MACHINE_FLAG_SCC BIT(17) +#define MACHINE_FLAG_PCI_MIO BIT(18) #define LPP_MAGIC BIT(31) #define LPP_PID_MASK _AC(0xffffffff, UL) @@ -106,6 +107,7 @@ extern unsigned long __swsusp_reset_dma; #define MACHINE_HAS_NX (S390_lowcore.machine_flags & MACHINE_FLAG_NX) #define MACHINE_HAS_GS (S390_lowcore.machine_flags & MACHINE_FLAG_GS) #define MACHINE_HAS_SCC (S390_lowcore.machine_flags & MACHINE_FLAG_SCC) +#define MACHINE_HAS_PCI_MIO (S390_lowcore.machine_flags & MACHINE_FLAG_PCI_MIO) /* * Console mode. Override with conmode= diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index 2531776cf6cf9..eb89cb0aa60b4 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -252,6 +252,10 @@ static __init void detect_machine_facilities(void) clock_comparator_max = -1ULL >> 1; __ctl_set_bit(0, 53); } + if (IS_ENABLED(CONFIG_PCI) && test_facility(153)) { + S390_lowcore.machine_flags |= MACHINE_FLAG_PCI_MIO; + /* the control bit is set during PCI initialization */ + } } static inline void save_vector_registers(void) diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 6105b1b6e49b7..b8ddacf1efe11 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -854,7 +854,6 @@ static void zpci_mem_exit(void) } static unsigned int s390_pci_probe __initdata = 1; -static unsigned int s390_pci_no_mio __initdata; unsigned int s390_pci_force_floating __initdata; static unsigned int s390_pci_initialized; @@ -865,7 +864,7 @@ char * __init pcibios_setup(char *str) return NULL; } if (!strcmp(str, "nomio")) { - s390_pci_no_mio = 1; + S390_lowcore.machine_flags &= ~MACHINE_FLAG_PCI_MIO; return NULL; } if (!strcmp(str, "force_floating")) { @@ -890,7 +889,7 @@ static int __init pci_base_init(void) if (!test_facility(69) || !test_facility(71)) return 0; - if (test_facility(153) && !s390_pci_no_mio) { + if (MACHINE_HAS_PCI_MIO) { static_branch_enable(&have_mio); ctl_set_bit(2, 5); } From 117a3a51093e4c14f0254ea1f48c273ff8b3f1d4 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Sat, 26 Jun 2021 06:11:51 +0200 Subject: [PATCH 0741/5344] serial: 8250: Define RX trigger levels for OxSemi 950 devices [ Upstream commit d7aff291d069c4418285f3c8ee27b0ff67ce5998 ] Oxford Semiconductor 950 serial port devices have a 128-byte FIFO and in the enhanced (650) mode, which we select in `autoconfig_has_efr' with the ECB bit set in the EFR register, they support the receive interrupt trigger level selectable with FCR bits 7:6 from the set of 16, 32, 112, 120. This applies to the original OX16C950 discrete UART[1] as well as 950 cores embedded into more complex devices. For these devices we set the default to 112, which sets an excessively high level of 112 or 7/8 of the FIFO capacity, unlike with other port types where we choose at most 1/2 of their respective FIFO capacities. Additionally we don't make the trigger level configurable. Consequently frequent input overruns happen with high bit rates where hardware flow control cannot be used (e.g. terminal applications) even with otherwise highly-performant systems. Lower the default receive interrupt trigger level to 32 then, and make it configurable. Document the trigger levels along with other port types, including the set of 16, 32, 64, 112 for the transmit interrupt as well[2]. References: [1] "OX16C950 rev B High Performance UART with 128 byte FIFOs", Oxford Semiconductor, Inc., DS-0031, Sep 05, Table 10: "Receiver Trigger Levels", p. 22 [2] same, Table 9: "Transmit Interrupt Trigger Levels", p. 22 Signed-off-by: Maciej W. Rozycki Link: https://lore.kernel.org/r/alpine.DEB.2.21.2106260608480.37803@angie.orcam.me.uk Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/tty/serial/8250/8250_port.c | 3 ++- include/uapi/linux/serial_reg.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index 8a7c6d65f10ef..777ef1a9591c0 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -125,7 +125,8 @@ static const struct serial8250_config uart_config[] = { .name = "16C950/954", .fifo_size = 128, .tx_loadsz = 128, - .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, + .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01, + .rxtrig_bytes = {16, 32, 112, 120}, /* UART_CAP_EFR breaks billionon CF bluetooth card. */ .flags = UART_CAP_FIFO | UART_CAP_SLEEP, }, diff --git a/include/uapi/linux/serial_reg.h b/include/uapi/linux/serial_reg.h index be07b5470f4bb..f51bc8f368134 100644 --- a/include/uapi/linux/serial_reg.h +++ b/include/uapi/linux/serial_reg.h @@ -62,6 +62,7 @@ * ST16C654: 8 16 56 60 8 16 32 56 PORT_16654 * TI16C750: 1 16 32 56 xx xx xx xx PORT_16750 * TI16C752: 8 16 56 60 8 16 32 56 + * OX16C950: 16 32 112 120 16 32 64 112 PORT_16C950 * Tegra: 1 4 8 14 16 8 4 1 PORT_TEGRA */ #define UART_FCR_R_TRIG_00 0x00 From 8cf02e31510c17dcce455513c07464f8e139beb1 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 23 Jul 2021 09:43:10 +0200 Subject: [PATCH 0742/5344] xtensa: ISS: don't panic in rs_init [ Upstream commit 23411c720052ad860b3e579ee4873511e367130a ] While alloc_tty_driver failure in rs_init would mean we have much bigger problem, there is no reason to panic when tty_register_driver fails there. It can fail for various reasons. So handle the failure gracefully. Actually handle them both while at it. This will make at least the console functional as it was enabled earlier by console_initcall in iss_console_init. Instead of shooting down the whole system. We move tty_port_init() after alloc_tty_driver(), so that we don't need to destroy the port in case the latter function fails. Cc: Chris Zankel Cc: Max Filippov Cc: linux-xtensa@linux-xtensa.org Acked-by: Max Filippov Signed-off-by: Jiri Slaby Link: https://lore.kernel.org/r/20210723074317.32690-2-jslaby@suse.cz Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- arch/xtensa/platforms/iss/console.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/arch/xtensa/platforms/iss/console.c b/arch/xtensa/platforms/iss/console.c index af81a62faba64..e7faea3d73d3b 100644 --- a/arch/xtensa/platforms/iss/console.c +++ b/arch/xtensa/platforms/iss/console.c @@ -168,9 +168,13 @@ static const struct tty_operations serial_ops = { int __init rs_init(void) { - tty_port_init(&serial_port); + int ret; serial_driver = alloc_tty_driver(SERIAL_MAX_NUM_LINES); + if (!serial_driver) + return -ENOMEM; + + tty_port_init(&serial_port); pr_info("%s %s\n", serial_name, serial_version); @@ -190,8 +194,15 @@ int __init rs_init(void) tty_set_operations(serial_driver, &serial_ops); tty_port_link_device(&serial_port, serial_driver, 0); - if (tty_register_driver(serial_driver)) - panic("Couldn't register serial driver\n"); + ret = tty_register_driver(serial_driver); + if (ret) { + pr_err("Couldn't register serial driver\n"); + tty_driver_kref_put(serial_driver); + tty_port_destroy(&serial_port); + + return ret; + } + return 0; } From 44bf1ca6ad24b7a350638f9aaacc81e515d69e3e Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 23 Jul 2021 09:43:11 +0200 Subject: [PATCH 0743/5344] hvsi: don't panic on tty_register_driver failure [ Upstream commit 7ccbdcc4d08a6d7041e4849219bbb12ffa45db4c ] The alloc_tty_driver failure is handled gracefully in hvsi_init. But tty_register_driver is not. panic is called if that one fails. So handle the failure of tty_register_driver gracefully too. This will keep at least the console functional as it was enabled earlier by console_initcall in hvsi_console_init. Instead of shooting down the whole system. This means, we disable interrupts and restore hvsi_wait back to poll_for_state(). Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Jiri Slaby Link: https://lore.kernel.org/r/20210723074317.32690-3-jslaby@suse.cz Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/tty/hvc/hvsi.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/tty/hvc/hvsi.c b/drivers/tty/hvc/hvsi.c index 66f95f758be05..73226337f5610 100644 --- a/drivers/tty/hvc/hvsi.c +++ b/drivers/tty/hvc/hvsi.c @@ -1038,7 +1038,7 @@ static const struct tty_operations hvsi_ops = { static int __init hvsi_init(void) { - int i; + int i, ret; hvsi_driver = alloc_tty_driver(hvsi_count); if (!hvsi_driver) @@ -1069,12 +1069,25 @@ static int __init hvsi_init(void) } hvsi_wait = wait_for_state; /* irqs active now */ - if (tty_register_driver(hvsi_driver)) - panic("Couldn't register hvsi console driver\n"); + ret = tty_register_driver(hvsi_driver); + if (ret) { + pr_err("Couldn't register hvsi console driver\n"); + goto err_free_irq; + } printk(KERN_DEBUG "HVSI: registered %i devices\n", hvsi_count); return 0; +err_free_irq: + hvsi_wait = poll_for_state; + for (i = 0; i < hvsi_count; i++) { + struct hvsi_struct *hp = &hvsi_ports[i]; + + free_irq(hp->virq, hp); + } + tty_driver_kref_put(hvsi_driver); + + return ret; } device_initcall(hvsi_init); From 9b30e6b7b6a6c6ce1e4894968de80b562233fe1f Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 26 Jul 2021 15:07:17 +0200 Subject: [PATCH 0744/5344] serial: 8250_pci: make setup_port() parameters explicitly unsigned [ Upstream commit 3a96e97ab4e835078e6f27b7e1c0947814df3841 ] The bar and offset parameters to setup_port() are used in pointer math, and while it would be very difficult to get them to wrap as a negative number, just be "safe" and make them unsigned so that static checkers do not trip over them unintentionally. Cc: Jiri Slaby Reported-by: Jordy Zomer Link: https://lore.kernel.org/r/20210726130717.2052096-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/tty/serial/8250/8250_pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/8250/8250_pci.c b/drivers/tty/serial/8250/8250_pci.c index 43fc5b6a25d35..a2bb103f22fc6 100644 --- a/drivers/tty/serial/8250/8250_pci.c +++ b/drivers/tty/serial/8250/8250_pci.c @@ -89,7 +89,7 @@ static void moan_device(const char *str, struct pci_dev *dev) static int setup_port(struct serial_private *priv, struct uart_8250_port *port, - int bar, int offset, int regshift) + u8 bar, unsigned int offset, int regshift) { struct pci_dev *dev = priv->dev; From b399296feaa255ae71013697436dadebc0489eed Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Wed, 21 Jul 2021 10:45:11 +0200 Subject: [PATCH 0745/5344] staging: ks7010: Fix the initialization of the 'sleep_status' structure [ Upstream commit 56315e55119c0ea57e142b6efb7c31208628ad86 ] 'sleep_status' has 3 atomic_t members. Initialize the 3 of them instead of initializing only 2 of them and setting 0 twice to the same variable. Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/d2e52a33a9beab41879551d0ae2fdfc99970adab.1626856991.git.christophe.jaillet@wanadoo.fr Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/staging/ks7010/ks7010_sdio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/ks7010/ks7010_sdio.c b/drivers/staging/ks7010/ks7010_sdio.c index 4b379542ecd50..3fbe223d59b8e 100644 --- a/drivers/staging/ks7010/ks7010_sdio.c +++ b/drivers/staging/ks7010/ks7010_sdio.c @@ -938,9 +938,9 @@ static void ks7010_private_init(struct ks_wlan_private *priv, memset(&priv->wstats, 0, sizeof(priv->wstats)); /* sleep mode */ + atomic_set(&priv->sleepstatus.status, 0); atomic_set(&priv->sleepstatus.doze_request, 0); atomic_set(&priv->sleepstatus.wakeup_request, 0); - atomic_set(&priv->sleepstatus.wakeup_request, 0); trx_device_init(priv); hostif_init(priv); From f9a937bd72662d4235399a41034eefedd8b3d653 Mon Sep 17 00:00:00 2001 From: Juhee Kang Date: Tue, 27 Jul 2021 04:10:55 +0000 Subject: [PATCH 0746/5344] samples: bpf: Fix tracex7 error raised on the missing argument [ Upstream commit 7d07006f05922b95518be403f08ef8437b67aa32 ] The current behavior of 'tracex7' doesn't consist with other bpf samples tracex{1..6}. Other samples do not require any argument to run with, but tracex7 should be run with btrfs device argument. (it should be executed with test_override_return.sh) Currently, tracex7 doesn't have any description about how to run this program and raises an unexpected error. And this result might be confusing since users might not have a hunch about how to run this program. // Current behavior # ./tracex7 sh: 1: Syntax error: word unexpected (expecting ")") // Fixed behavior # ./tracex7 ERROR: Run with the btrfs device argument! In order to fix this error, this commit adds logic to report a message and exit when running this program with a missing argument. Additionally in test_override_return.sh, there is a problem with multiple directory(tmpmnt) creation. So in this commit adds a line with removing the directory with every execution. Signed-off-by: Juhee Kang Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210727041056.23455-1-claudiajkang@gmail.com Signed-off-by: Sasha Levin --- samples/bpf/test_override_return.sh | 1 + samples/bpf/tracex7_user.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/samples/bpf/test_override_return.sh b/samples/bpf/test_override_return.sh index e68b9ee6814b8..35db26f736b9d 100755 --- a/samples/bpf/test_override_return.sh +++ b/samples/bpf/test_override_return.sh @@ -1,5 +1,6 @@ #!/bin/bash +rm -r tmpmnt rm -f testfile.img dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1 DEVICE=$(losetup --show -f testfile.img) diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c index ea6dae78f0dff..2ed13e9f3fcb0 100644 --- a/samples/bpf/tracex7_user.c +++ b/samples/bpf/tracex7_user.c @@ -13,6 +13,11 @@ int main(int argc, char **argv) char command[256]; int ret; + if (!argv[1]) { + fprintf(stderr, "ERROR: Run with the btrfs device argument!\n"); + return 0; + } + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); if (load_bpf_file(filename)) { From 38d289b34aeb16a3f34bbe50977f2a758b7cbb3f Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 27 Jul 2021 15:51:30 +0300 Subject: [PATCH 0747/5344] ata: sata_dwc_460ex: No need to call phy_exit() befre phy_init() [ Upstream commit 3ad4a31620355358316fa08fcfab37b9d6c33347 ] Last change to device managed APIs cleaned up error path to simple phy_exit() call, which in some cases has been executed with NULL parameter. This per se is not a problem, but rather logical misconception: no need to free resource when it's for sure has not been allocated yet. Fix the driver accordingly. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210727125130.19977-1-andriy.shevchenko@linux.intel.com Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- drivers/ata/sata_dwc_460ex.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/ata/sata_dwc_460ex.c b/drivers/ata/sata_dwc_460ex.c index 9dcef6ac643b9..982fe91125322 100644 --- a/drivers/ata/sata_dwc_460ex.c +++ b/drivers/ata/sata_dwc_460ex.c @@ -1249,24 +1249,20 @@ static int sata_dwc_probe(struct platform_device *ofdev) irq = irq_of_parse_and_map(np, 0); if (irq == NO_IRQ) { dev_err(&ofdev->dev, "no SATA DMA irq\n"); - err = -ENODEV; - goto error_out; + return -ENODEV; } #ifdef CONFIG_SATA_DWC_OLD_DMA if (!of_find_property(np, "dmas", NULL)) { err = sata_dwc_dma_init_old(ofdev, hsdev); if (err) - goto error_out; + return err; } #endif hsdev->phy = devm_phy_optional_get(hsdev->dev, "sata-phy"); - if (IS_ERR(hsdev->phy)) { - err = PTR_ERR(hsdev->phy); - hsdev->phy = NULL; - goto error_out; - } + if (IS_ERR(hsdev->phy)) + return PTR_ERR(hsdev->phy); err = phy_init(hsdev->phy); if (err) From e465ab62c514a682c85db4863ca6bc95cbdb57c9 Mon Sep 17 00:00:00 2001 From: Desmond Cheong Zhi Xi Date: Wed, 28 Jul 2021 15:51:04 +0800 Subject: [PATCH 0748/5344] Bluetooth: skip invalid hci_sync_conn_complete_evt [ Upstream commit 92fe24a7db751b80925214ede43f8d2be792ea7b ] Syzbot reported a corrupted list in kobject_add_internal [1]. This happens when multiple HCI_EV_SYNC_CONN_COMPLETE event packets with status 0 are sent for the same HCI connection. This causes us to register the device more than once which corrupts the kset list. As this is forbidden behavior, we add a check for whether we're trying to process the same HCI_EV_SYNC_CONN_COMPLETE event multiple times for one connection. If that's the case, the event is invalid, so we report an error that the device is misbehaving, and ignore the packet. Link: https://syzkaller.appspot.com/bug?extid=66264bf2fd0476be7e6c [1] Reported-by: syzbot+66264bf2fd0476be7e6c@syzkaller.appspotmail.com Tested-by: syzbot+66264bf2fd0476be7e6c@syzkaller.appspotmail.com Signed-off-by: Desmond Cheong Zhi Xi Signed-off-by: Marcel Holtmann Signed-off-by: Sasha Levin --- net/bluetooth/hci_event.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index e8e7f108b0161..82e42d8e2ea03 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4202,6 +4202,21 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, switch (ev->status) { case 0x00: + /* The synchronous connection complete event should only be + * sent once per new connection. Receiving a successful + * complete event when the connection status is already + * BT_CONNECTED means that the device is misbehaving and sent + * multiple complete event packets for the same new connection. + * + * Registering the device more than once can corrupt kernel + * memory, hence upon detecting this invalid event, we report + * an error and ignore the packet. + */ + if (conn->state == BT_CONNECTED) { + bt_dev_err(hdev, "Ignoring connect complete event for existing connection"); + goto unlock; + } + conn->handle = __le16_to_cpu(ev->handle); conn->state = BT_CONNECTED; conn->type = ev->link_type; From 30ff001e9049247dec0b37c0f92d98cafbd0420e Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Thu, 22 Jul 2021 11:03:52 +0800 Subject: [PATCH 0749/5344] workqueue: Fix possible memory leaks in wq_numa_init() [ Upstream commit f728c4a9e8405caae69d4bc1232c54ff57b5d20f ] In error handling branch "if (WARN_ON(node == NUMA_NO_NODE))", the previously allocated memories are not released. Doing this before allocating memory eliminates memory leaks. tj: Note that the condition only occurs when the arch code is pretty broken and the WARN_ON might as well be BUG_ON(). Signed-off-by: Zhen Lei Reviewed-by: Lai Jiangshan Signed-off-by: Tejun Heo Signed-off-by: Sasha Levin --- kernel/workqueue.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6aeb53b4e19f8..885d4792abdfc 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5869,6 +5869,13 @@ static void __init wq_numa_init(void) return; } + for_each_possible_cpu(cpu) { + if (WARN_ON(cpu_to_node(cpu) == NUMA_NO_NODE)) { + pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu); + return; + } + } + wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(); BUG_ON(!wq_update_unbound_numa_attrs_buf); @@ -5886,11 +5893,6 @@ static void __init wq_numa_init(void) for_each_possible_cpu(cpu) { node = cpu_to_node(cpu); - if (WARN_ON(node == NUMA_NO_NODE)) { - pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu); - /* happens iff arch is bonkers, let's just proceed */ - return; - } cpumask_set_cpu(cpu, tbl[node]); } From b45d3a4aaf6f8e4d945b9d7cebee04d8a86a4f06 Mon Sep 17 00:00:00 2001 From: Yufeng Mo Date: Fri, 30 Jul 2021 10:19:11 +0800 Subject: [PATCH 0750/5344] bonding: 3ad: fix the concurrency between __bond_release_one() and bond_3ad_state_machine_handler() [ Upstream commit 220ade77452c15ecb1ab94c3f8aaeb6d033c3582 ] Some time ago, I reported a calltrace issue "did not find a suitable aggregator", please see[1]. After a period of analysis and reproduction, I find that this problem is caused by concurrency. Before the problem occurs, the bond structure is like follows: bond0 - slaver0(eth0) - agg0.lag_ports -> port0 - port1 \ port0 \ slaver1(eth1) - agg1.lag_ports -> NULL \ port1 If we run 'ifenslave bond0 -d eth1', the process is like below: excuting __bond_release_one() | bond_upper_dev_unlink()[step1] | | | | | bond_3ad_lacpdu_recv() | | ->bond_3ad_rx_indication() | | spin_lock_bh() | | ->ad_rx_machine() | | ->__record_pdu()[step2] | | spin_unlock_bh() | | | | bond_3ad_state_machine_handler() | spin_lock_bh() | ->ad_port_selection_logic() | ->try to find free aggregator[step3] | ->try to find suitable aggregator[step4] | ->did not find a suitable aggregator[step5] | spin_unlock_bh() | | | | bond_3ad_unbind_slave() | spin_lock_bh() spin_unlock_bh() step1: already removed slaver1(eth1) from list, but port1 remains step2: receive a lacpdu and update port0 step3: port0 will be removed from agg0.lag_ports. The struct is "agg0.lag_ports -> port1" now, and agg0 is not free. At the same time, slaver1/agg1 has been removed from the list by step1. So we can't find a free aggregator now. step4: can't find suitable aggregator because of step2 step5: cause a calltrace since port->aggregator is NULL To solve this concurrency problem, put bond_upper_dev_unlink() after bond_3ad_unbind_slave(). In this way, we can invalid the port first and skip this port in bond_3ad_state_machine_handler(). This eliminates the situation that the slaver has been removed from the list but the port is still valid. [1]https://lore.kernel.org/netdev/10374.1611947473@famine/ Signed-off-by: Yufeng Mo Acked-by: Jay Vosburgh Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/bonding/bond_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index e21643377162b..1949f631e1bc5 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1926,7 +1926,6 @@ static int __bond_release_one(struct net_device *bond_dev, /* recompute stats just before removing the slave */ bond_get_stats(bond->dev, &bond->bond_stats); - bond_upper_dev_unlink(bond, slave); /* unregister rx_handler early so bond_handle_frame wouldn't be called * for this slave anymore. */ @@ -1935,6 +1934,8 @@ static int __bond_release_one(struct net_device *bond_dev, if (BOND_MODE(bond) == BOND_MODE_8023AD) bond_3ad_unbind_slave(slave); + bond_upper_dev_unlink(bond, slave); + if (bond_mode_can_use_xmit_hash(bond)) bond_update_slave_arr(bond, slave); From 7e84bd0e8b2faad4dbc6079d2f21a5e846b04e3d Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Wed, 28 Jul 2021 00:20:55 +0530 Subject: [PATCH 0751/5344] arm64: tegra: Fix Tegra194 PCIe EP compatible string [ Upstream commit bf2942a8b7c38e8cc2d5157b4f0323d7f4e5ec71 ] The initialization sequence performed by the generic platform driver pcie-designware-plat.c for a DWC based implementation doesn't work for Tegra194. Tegra194 has a different initialization sequence requirement which can only be satisfied by the Tegra194 specific platform driver pcie-tegra194.c. So, remove the generic compatible string "snps,dw-pcie-ep" from Tegra194's endpoint controller nodes. Signed-off-by: Vidya Sagar Reviewed-by: Jon Hunter Signed-off-by: Thierry Reding Signed-off-by: Sasha Levin --- arch/arm64/boot/dts/nvidia/tegra194.dtsi | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/boot/dts/nvidia/tegra194.dtsi b/arch/arm64/boot/dts/nvidia/tegra194.dtsi index 0821754f0fd6d..90adff8aa9baf 100644 --- a/arch/arm64/boot/dts/nvidia/tegra194.dtsi +++ b/arch/arm64/boot/dts/nvidia/tegra194.dtsi @@ -1434,7 +1434,7 @@ }; pcie_ep@14160000 { - compatible = "nvidia,tegra194-pcie-ep", "snps,dw-pcie-ep"; + compatible = "nvidia,tegra194-pcie-ep"; power-domains = <&bpmp TEGRA194_POWER_DOMAIN_PCIEX4A>; reg = <0x00 0x14160000 0x0 0x00020000 /* appl registers (128K) */ 0x00 0x36040000 0x0 0x00040000 /* iATU_DMA reg space (256K) */ @@ -1466,7 +1466,7 @@ }; pcie_ep@14180000 { - compatible = "nvidia,tegra194-pcie-ep", "snps,dw-pcie-ep"; + compatible = "nvidia,tegra194-pcie-ep"; power-domains = <&bpmp TEGRA194_POWER_DOMAIN_PCIEX8B>; reg = <0x00 0x14180000 0x0 0x00020000 /* appl registers (128K) */ 0x00 0x38040000 0x0 0x00040000 /* iATU_DMA reg space (256K) */ @@ -1498,7 +1498,7 @@ }; pcie_ep@141a0000 { - compatible = "nvidia,tegra194-pcie-ep", "snps,dw-pcie-ep"; + compatible = "nvidia,tegra194-pcie-ep"; power-domains = <&bpmp TEGRA194_POWER_DOMAIN_PCIEX8A>; reg = <0x00 0x141a0000 0x0 0x00020000 /* appl registers (128K) */ 0x00 0x3a040000 0x0 0x00040000 /* iATU_DMA reg space (256K) */ From 5de42a18cdb3825cdab580ad051b628c64db9912 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 2 Aug 2021 16:24:56 +0200 Subject: [PATCH 0752/5344] ASoC: Intel: bytcr_rt5640: Move "Platform Clock" routes to the maps for the matching in-/output [ Upstream commit dccd1dfd0770bfd494b68d1135b4547b2c602c42 ] Move the "Platform Clock" routes for the "Internal Mic" and "Speaker" routes to the intmic_*_map[] / *_spk_map[] arrays. This ensures that these "Platform Clock" routes do not get added when the BYT_RT5640_NO_INTERNAL_MIC_MAP / BYT_RT5640_NO_SPEAKERS quirks are used. Signed-off-by: Hans de Goede Acked-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/20210802142501.991985-2-hdegoede@redhat.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/intel/boards/bytcr_rt5640.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sound/soc/intel/boards/bytcr_rt5640.c b/sound/soc/intel/boards/bytcr_rt5640.c index c67b86e2d0c0a..7830d014d9247 100644 --- a/sound/soc/intel/boards/bytcr_rt5640.c +++ b/sound/soc/intel/boards/bytcr_rt5640.c @@ -284,9 +284,6 @@ static const struct snd_soc_dapm_widget byt_rt5640_widgets[] = { static const struct snd_soc_dapm_route byt_rt5640_audio_map[] = { {"Headphone", NULL, "Platform Clock"}, {"Headset Mic", NULL, "Platform Clock"}, - {"Internal Mic", NULL, "Platform Clock"}, - {"Speaker", NULL, "Platform Clock"}, - {"Headset Mic", NULL, "MICBIAS1"}, {"IN2P", NULL, "Headset Mic"}, {"Headphone", NULL, "HPOL"}, @@ -294,19 +291,23 @@ static const struct snd_soc_dapm_route byt_rt5640_audio_map[] = { }; static const struct snd_soc_dapm_route byt_rt5640_intmic_dmic1_map[] = { + {"Internal Mic", NULL, "Platform Clock"}, {"DMIC1", NULL, "Internal Mic"}, }; static const struct snd_soc_dapm_route byt_rt5640_intmic_dmic2_map[] = { + {"Internal Mic", NULL, "Platform Clock"}, {"DMIC2", NULL, "Internal Mic"}, }; static const struct snd_soc_dapm_route byt_rt5640_intmic_in1_map[] = { + {"Internal Mic", NULL, "Platform Clock"}, {"Internal Mic", NULL, "MICBIAS1"}, {"IN1P", NULL, "Internal Mic"}, }; static const struct snd_soc_dapm_route byt_rt5640_intmic_in3_map[] = { + {"Internal Mic", NULL, "Platform Clock"}, {"Internal Mic", NULL, "MICBIAS1"}, {"IN3P", NULL, "Internal Mic"}, }; @@ -348,6 +349,7 @@ static const struct snd_soc_dapm_route byt_rt5640_ssp0_aif2_map[] = { }; static const struct snd_soc_dapm_route byt_rt5640_stereo_spk_map[] = { + {"Speaker", NULL, "Platform Clock"}, {"Speaker", NULL, "SPOLP"}, {"Speaker", NULL, "SPOLN"}, {"Speaker", NULL, "SPORP"}, @@ -355,6 +357,7 @@ static const struct snd_soc_dapm_route byt_rt5640_stereo_spk_map[] = { }; static const struct snd_soc_dapm_route byt_rt5640_mono_spk_map[] = { + {"Speaker", NULL, "Platform Clock"}, {"Speaker", NULL, "SPOLP"}, {"Speaker", NULL, "SPOLN"}, }; From 7b34327e54daea204570258102d682a9f69739d4 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Fri, 23 Jul 2021 13:22:32 +0200 Subject: [PATCH 0753/5344] media: imx258: Rectify mismatch of VTS value [ Upstream commit 51f93add3669f1b1f540de1cf397815afbd4c756 ] The frame_length_lines (0x0340) registers are hard-coded as follows: - 4208x3118 frame_length_lines = 0x0c50 - 2104x1560 frame_length_lines = 0x0638 - 1048x780 frame_length_lines = 0x034c The driver exposes the V4L2_CID_VBLANK control in read-only mode and sets its value to vts_def - height, where vts_def is a mode-dependent value coming from the supported_modes array. It is set using one of the following macros defined in the driver: #define IMX258_VTS_30FPS 0x0c98 #define IMX258_VTS_30FPS_2K 0x0638 #define IMX258_VTS_30FPS_VGA 0x034c There's a clear mismatch in the value for the full resolution mode i.e. IMX258_VTS_30FPS. Fix it by rectifying the macro with the value set for the frame_length_lines register as stated above. Signed-off-by: Laurent Pinchart Signed-off-by: Umang Jain Reviewed-by: Bingbu Cao Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/i2c/imx258.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/i2c/imx258.c b/drivers/media/i2c/imx258.c index f86ae18bc104b..5f5e50c01b129 100644 --- a/drivers/media/i2c/imx258.c +++ b/drivers/media/i2c/imx258.c @@ -22,7 +22,7 @@ #define IMX258_CHIP_ID 0x0258 /* V_TIMING internal */ -#define IMX258_VTS_30FPS 0x0c98 +#define IMX258_VTS_30FPS 0x0c50 #define IMX258_VTS_30FPS_2K 0x0638 #define IMX258_VTS_30FPS_VGA 0x034c #define IMX258_VTS_MAX 0xffff From d7affb6787c902228a92adf0a5369412b68dfee7 Mon Sep 17 00:00:00 2001 From: Umang Jain Date: Fri, 23 Jul 2021 13:22:33 +0200 Subject: [PATCH 0754/5344] media: imx258: Limit the max analogue gain to 480 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit f809665ee75fff3f4ea8907f406a66d380aeb184 ] The range for analog gain mentioned in the datasheet is [0, 480]. The real gain formula mentioned in the datasheet is: Gain = 512 / (512 – X) Hence, values larger than 511 clearly makes no sense. The gain register field is also documented to be of 9-bits in the datasheet. Certainly, it is enough to infer that, the kernel driver currently advertises an arbitrary analog gain max. Fix it by rectifying the value as per the data sheet i.e. 480. Signed-off-by: Umang Jain Reviewed-by: Laurent Pinchart Reviewed-by: Dave Stevenson Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/i2c/imx258.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/i2c/imx258.c b/drivers/media/i2c/imx258.c index 5f5e50c01b129..ffaa4a91e5713 100644 --- a/drivers/media/i2c/imx258.c +++ b/drivers/media/i2c/imx258.c @@ -46,7 +46,7 @@ /* Analog gain control */ #define IMX258_REG_ANALOG_GAIN 0x0204 #define IMX258_ANA_GAIN_MIN 0 -#define IMX258_ANA_GAIN_MAX 0x1fff +#define IMX258_ANA_GAIN_MAX 480 #define IMX258_ANA_GAIN_STEP 1 #define IMX258_ANA_GAIN_DEFAULT 0x0 From e8d77f4bf3a562749b5169e39baa9cd0fab51037 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Fri, 23 Jul 2021 10:22:59 +0200 Subject: [PATCH 0755/5344] media: v4l2-dv-timings.c: fix wrong condition in two for-loops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 4108b3e6db31acc4c68133290bbcc87d4db905c9 ] These for-loops should test against v4l2_dv_timings_presets[i].bt.width, not if i < v4l2_dv_timings_presets[i].bt.width. Luckily nothing ever broke, since the smallest width is still a lot higher than the total number of presets, but it is wrong. The last item in the presets array is all 0, so the for-loop must stop when it reaches that sentinel. Signed-off-by: Hans Verkuil Reported-by: Krzysztof Hałasa Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/v4l2-core/v4l2-dv-timings.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/media/v4l2-core/v4l2-dv-timings.c b/drivers/media/v4l2-core/v4l2-dv-timings.c index 4f23e939ead0b..60454e1b727e9 100644 --- a/drivers/media/v4l2-core/v4l2-dv-timings.c +++ b/drivers/media/v4l2-core/v4l2-dv-timings.c @@ -196,7 +196,7 @@ bool v4l2_find_dv_timings_cap(struct v4l2_dv_timings *t, if (!v4l2_valid_dv_timings(t, cap, fnc, fnc_handle)) return false; - for (i = 0; i < v4l2_dv_timings_presets[i].bt.width; i++) { + for (i = 0; v4l2_dv_timings_presets[i].bt.width; i++) { if (v4l2_valid_dv_timings(v4l2_dv_timings_presets + i, cap, fnc, fnc_handle) && v4l2_match_dv_timings(t, v4l2_dv_timings_presets + i, @@ -218,7 +218,7 @@ bool v4l2_find_dv_timings_cea861_vic(struct v4l2_dv_timings *t, u8 vic) { unsigned int i; - for (i = 0; i < v4l2_dv_timings_presets[i].bt.width; i++) { + for (i = 0; v4l2_dv_timings_presets[i].bt.width; i++) { const struct v4l2_bt_timings *bt = &v4l2_dv_timings_presets[i].bt; From 4f73299a6d7a659cab270e64279dfcb1731a6ac3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Ha=C5=82asa?= Date: Mon, 26 Jul 2021 12:46:28 +0200 Subject: [PATCH 0756/5344] media: TDA1997x: fix tda1997x_query_dv_timings() return value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 7dee1030871a48d4f3c5a74227a4b4188463479a ] Correctly propagate the tda1997x_detect_std error value. Signed-off-by: Krzysztof Hałasa Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/i2c/tda1997x.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/media/i2c/tda1997x.c b/drivers/media/i2c/tda1997x.c index 1088161498df0..18a2027ba1450 100644 --- a/drivers/media/i2c/tda1997x.c +++ b/drivers/media/i2c/tda1997x.c @@ -1695,14 +1695,15 @@ static int tda1997x_query_dv_timings(struct v4l2_subdev *sd, struct v4l2_dv_timings *timings) { struct tda1997x_state *state = to_state(sd); + int ret; v4l_dbg(1, debug, state->client, "%s\n", __func__); memset(timings, 0, sizeof(struct v4l2_dv_timings)); mutex_lock(&state->lock); - tda1997x_detect_std(state, timings); + ret = tda1997x_detect_std(state, timings); mutex_unlock(&state->lock); - return 0; + return ret; } static const struct v4l2_subdev_video_ops tda1997x_video_ops = { From 1a9591671695c208b67f83c0e53a70cc026dc360 Mon Sep 17 00:00:00 2001 From: Evgeny Novikov Date: Wed, 28 Jul 2021 16:44:32 +0200 Subject: [PATCH 0757/5344] media: tegra-cec: Handle errors of clk_prepare_enable() [ Upstream commit 38367073c796a37a61549b1f66a71b3adb03802d ] tegra_cec_probe() and tegra_cec_resume() ignored possible errors of clk_prepare_enable(). The patch fixes this. Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Evgeny Novikov Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/platform/tegra-cec/tegra_cec.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/media/platform/tegra-cec/tegra_cec.c b/drivers/media/platform/tegra-cec/tegra_cec.c index a632602131f21..efb80a78d2fa2 100644 --- a/drivers/media/platform/tegra-cec/tegra_cec.c +++ b/drivers/media/platform/tegra-cec/tegra_cec.c @@ -366,7 +366,11 @@ static int tegra_cec_probe(struct platform_device *pdev) return -ENOENT; } - clk_prepare_enable(cec->clk); + ret = clk_prepare_enable(cec->clk); + if (ret) { + dev_err(&pdev->dev, "Unable to prepare clock for CEC\n"); + return ret; + } /* set context info. */ cec->dev = &pdev->dev; @@ -446,9 +450,7 @@ static int tegra_cec_resume(struct platform_device *pdev) dev_notice(&pdev->dev, "Resuming\n"); - clk_prepare_enable(cec->clk); - - return 0; + return clk_prepare_enable(cec->clk); } #endif From f81a40ef4b46df3fc36331f06f6f465f697a4bb9 Mon Sep 17 00:00:00 2001 From: Sebastian Reichel Date: Mon, 2 Aug 2021 19:23:08 +0200 Subject: [PATCH 0758/5344] ARM: dts: imx53-ppd: Fix ACHC entry [ Upstream commit cd7cd5b716d594e27a933c12f026d4f2426d7bf4 ] PPD has only one ACHC device, which effectively is a Kinetis microcontroller. It has one SPI interface used for normal communication. Additionally it's possible to flash the device firmware using NXP's EzPort protocol by correctly driving a second chip select pin and the device reset pin. Signed-off-by: Sebastian Reichel Link: https://lore.kernel.org/r/20210802172309.164365-3-sebastian.reichel@collabora.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- arch/arm/boot/dts/imx53-ppd.dts | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/arch/arm/boot/dts/imx53-ppd.dts b/arch/arm/boot/dts/imx53-ppd.dts index 5ff9a179c83c3..c80d1700e0949 100644 --- a/arch/arm/boot/dts/imx53-ppd.dts +++ b/arch/arm/boot/dts/imx53-ppd.dts @@ -70,6 +70,12 @@ clock-frequency = <11289600>; }; + achc_24M: achc-clock { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <24000000>; + }; + sgtlsound: sound { compatible = "fsl,imx53-cpuvo-sgtl5000", "fsl,imx-audio-sgtl5000"; @@ -287,16 +293,13 @@ &gpio4 12 GPIO_ACTIVE_LOW>; status = "okay"; - spidev0: spi@0 { - compatible = "ge,achc"; - reg = <0>; - spi-max-frequency = <1000000>; - }; - - spidev1: spi@1 { - compatible = "ge,achc"; - reg = <1>; - spi-max-frequency = <1000000>; + spidev0: spi@1 { + compatible = "ge,achc", "nxp,kinetis-k20"; + reg = <1>, <0>; + vdd-supply = <®_3v3>; + vdda-supply = <®_3v3>; + clocks = <&achc_24M>; + reset-gpios = <&gpio3 6 GPIO_ACTIVE_LOW>; }; gpioxra0: gpio@2 { From a24a17476cfcc69ace1d4bf566ef930c3a401e98 Mon Sep 17 00:00:00 2001 From: Vinod Koul Date: Mon, 8 Mar 2021 11:38:25 +0530 Subject: [PATCH 0759/5344] arm64: dts: qcom: sdm660: use reg value for memory node [ Upstream commit c81210e38966cfa1c784364e4035081c3227cf5b ] memory node like other node should be node@reg, which is missing in this case, so fix it up arch/arm64/boot/dts/qcom/ipq8074-hk01.dt.yaml: /: memory: False schema does not allow {'device_type': ['memory'], 'reg': [[0, 1073741824, 0, 536870912]]} Signed-off-by: Vinod Koul Link: https://lore.kernel.org/r/20210308060826.3074234-18-vkoul@kernel.org Signed-off-by: Bjorn Andersson Signed-off-by: Sasha Levin --- arch/arm64/boot/dts/qcom/ipq8074-hk01.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/qcom/ipq8074-hk01.dts b/arch/arm64/boot/dts/qcom/ipq8074-hk01.dts index 70be3f95209bc..830d9f2c1e5f2 100644 --- a/arch/arm64/boot/dts/qcom/ipq8074-hk01.dts +++ b/arch/arm64/boot/dts/qcom/ipq8074-hk01.dts @@ -20,7 +20,7 @@ stdout-path = "serial0"; }; - memory { + memory@40000000 { device_type = "memory"; reg = <0x0 0x40000000 0x0 0x20000000>; }; From 465fd416f99c32029aba0fdf25d67433a1376efb Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 6 Aug 2021 12:13:40 -0700 Subject: [PATCH 0760/5344] net: ethernet: stmmac: Do not use unreachable() in ipq806x_gmac_probe() [ Upstream commit 4367355dd90942a71641c98c40c74589c9bddf90 ] When compiling with clang in certain configurations, an objtool warning appears: drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.o: warning: objtool: ipq806x_gmac_probe() falls through to next function phy_modes() This happens because the unreachable annotation in the third switch statement is not eliminated. The compiler should know that the first default case would prevent the second and third from being reached as the comment notes but sanitizer options can make it harder for the compiler to reason this out. Help the compiler out by eliminating the unreachable() annotation and unifying the default case error handling so that there is no objtool warning, the meaning of the code stays the same, and there is less duplication. Reported-by: Sami Tolvanen Tested-by: Sami Tolvanen Signed-off-by: Nathan Chancellor Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- .../ethernet/stmicro/stmmac/dwmac-ipq806x.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c index 0f56f8e336917..03b11f191c262 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c @@ -288,10 +288,7 @@ static int ipq806x_gmac_probe(struct platform_device *pdev) val &= ~NSS_COMMON_GMAC_CTL_PHY_IFACE_SEL; break; default: - dev_err(&pdev->dev, "Unsupported PHY mode: \"%s\"\n", - phy_modes(gmac->phy_mode)); - err = -EINVAL; - goto err_remove_config_dt; + goto err_unsupported_phy; } regmap_write(gmac->nss_common, NSS_COMMON_GMAC_CTL(gmac->id), val); @@ -308,10 +305,7 @@ static int ipq806x_gmac_probe(struct platform_device *pdev) NSS_COMMON_CLK_SRC_CTRL_OFFSET(gmac->id); break; default: - dev_err(&pdev->dev, "Unsupported PHY mode: \"%s\"\n", - phy_modes(gmac->phy_mode)); - err = -EINVAL; - goto err_remove_config_dt; + goto err_unsupported_phy; } regmap_write(gmac->nss_common, NSS_COMMON_CLK_SRC_CTRL, val); @@ -328,8 +322,7 @@ static int ipq806x_gmac_probe(struct platform_device *pdev) NSS_COMMON_CLK_GATE_GMII_TX_EN(gmac->id); break; default: - /* We don't get here; the switch above will have errored out */ - unreachable(); + goto err_unsupported_phy; } regmap_write(gmac->nss_common, NSS_COMMON_CLK_GATE, val); @@ -360,6 +353,11 @@ static int ipq806x_gmac_probe(struct platform_device *pdev) return 0; +err_unsupported_phy: + dev_err(&pdev->dev, "Unsupported PHY mode: \"%s\"\n", + phy_modes(gmac->phy_mode)); + err = -EINVAL; + err_remove_config_dt: stmmac_remove_config_dt(pdev, plat_dat); From e29e9e1845deb4c969ee7e78624eaee8d64a44e0 Mon Sep 17 00:00:00 2001 From: David Heidelberg Date: Thu, 15 Jul 2021 08:09:25 +0200 Subject: [PATCH 0761/5344] drm/msm: mdp4: drop vblank get/put from prepare/complete_commit [ Upstream commit 56bd931ae506730c9ab1e4cc4bfefa43fc2d18fa ] msm_atomic is doing vblank get/put's already, currently there no need to duplicate the effort in MDP4 Fix warning: ... WARNING: CPU: 3 PID: 79 at drivers/gpu/drm/drm_vblank.c:1194 drm_vblank_put+0x1cc/0x1d4 ... and multiple vblank time-outs: ... msm 5100000.mdp: vblank time out, crtc=1 ... Tested on Nexus 7 2013 (deb), LTS 5.10.50. Introduced by: 119ecb7fd3b5 ("drm/msm/mdp4: request vblank during modeset") Signed-off-by: David Heidelberg Link: https://lore.kernel.org/r/20210715060925.7880-1-david@ixit.cz Signed-off-by: Dmitry Baryshkov Signed-off-by: Rob Clark Signed-off-by: Sasha Levin --- drivers/gpu/drm/msm/disp/mdp4/mdp4_kms.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/mdp4/mdp4_kms.c b/drivers/gpu/drm/msm/disp/mdp4/mdp4_kms.c index 20194d86d0339..5d50e93efe363 100644 --- a/drivers/gpu/drm/msm/disp/mdp4/mdp4_kms.c +++ b/drivers/gpu/drm/msm/disp/mdp4/mdp4_kms.c @@ -108,13 +108,6 @@ static void mdp4_disable_commit(struct msm_kms *kms) static void mdp4_prepare_commit(struct msm_kms *kms, struct drm_atomic_state *state) { - int i; - struct drm_crtc *crtc; - struct drm_crtc_state *crtc_state; - - /* see 119ecb7fd */ - for_each_new_crtc_in_state(state, crtc, crtc_state, i) - drm_crtc_vblank_get(crtc); } static void mdp4_flush_commit(struct msm_kms *kms, unsigned crtc_mask) @@ -133,12 +126,6 @@ static void mdp4_wait_flush(struct msm_kms *kms, unsigned crtc_mask) static void mdp4_complete_commit(struct msm_kms *kms, unsigned crtc_mask) { - struct mdp4_kms *mdp4_kms = to_mdp4_kms(to_mdp_kms(kms)); - struct drm_crtc *crtc; - - /* see 119ecb7fd */ - for_each_crtc_mask(mdp4_kms->dev, crtc, crtc_mask) - drm_crtc_vblank_put(crtc); } static long mdp4_round_pixclk(struct msm_kms *kms, unsigned long rate, From fc8333bb6e72c779d655771af748e152dbc20747 Mon Sep 17 00:00:00 2001 From: Jussi Maki Date: Sat, 31 Jul 2021 05:57:37 +0000 Subject: [PATCH 0762/5344] selftests/bpf: Fix xdp_tx.c prog section name [ Upstream commit 95413846cca37f20000dd095cf6d91f8777129d7 ] The program type cannot be deduced from 'tx' which causes an invalid argument error when trying to load xdp_tx.o using the skeleton. Rename the section name to "xdp" so that libbpf can deduce the type. Signed-off-by: Jussi Maki Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210731055738.16820-7-joamaki@gmail.com Signed-off-by: Sasha Levin --- tools/testing/selftests/bpf/progs/xdp_tx.c | 2 +- tools/testing/selftests/bpf/test_xdp_veth.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/xdp_tx.c b/tools/testing/selftests/bpf/progs/xdp_tx.c index 57912e7c94b0a..9ed477776eca8 100644 --- a/tools/testing/selftests/bpf/progs/xdp_tx.c +++ b/tools/testing/selftests/bpf/progs/xdp_tx.c @@ -3,7 +3,7 @@ #include #include "bpf_helpers.h" -SEC("tx") +SEC("xdp") int xdp_tx(struct xdp_md *xdp) { return XDP_TX; diff --git a/tools/testing/selftests/bpf/test_xdp_veth.sh b/tools/testing/selftests/bpf/test_xdp_veth.sh index ba8ffcdaac302..995278e684b6e 100755 --- a/tools/testing/selftests/bpf/test_xdp_veth.sh +++ b/tools/testing/selftests/bpf/test_xdp_veth.sh @@ -108,7 +108,7 @@ ip link set dev veth2 xdp pinned $BPF_DIR/progs/redirect_map_1 ip link set dev veth3 xdp pinned $BPF_DIR/progs/redirect_map_2 ip -n ns1 link set dev veth11 xdp obj xdp_dummy.o sec xdp_dummy -ip -n ns2 link set dev veth22 xdp obj xdp_tx.o sec tx +ip -n ns2 link set dev veth22 xdp obj xdp_tx.o sec xdp ip -n ns3 link set dev veth33 xdp obj xdp_dummy.o sec xdp_dummy trap cleanup EXIT From 4d139e3f35740a9bbc54a6119668524eeddce342 Mon Sep 17 00:00:00 2001 From: Desmond Cheong Zhi Xi Date: Tue, 10 Aug 2021 12:14:05 +0800 Subject: [PATCH 0763/5344] Bluetooth: schedule SCO timeouts with delayed_work [ Upstream commit ba316be1b6a00db7126ed9a39f9bee434a508043 ] struct sock.sk_timer should be used as a sock cleanup timer. However, SCO uses it to implement sock timeouts. This causes issues because struct sock.sk_timer's callback is run in an IRQ context, and the timer callback function sco_sock_timeout takes a spin lock on the socket. However, other functions such as sco_conn_del and sco_conn_ready take the spin lock with interrupts enabled. This inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} lock usage could lead to deadlocks as reported by Syzbot [1]: CPU0 ---- lock(slock-AF_BLUETOOTH-BTPROTO_SCO); lock(slock-AF_BLUETOOTH-BTPROTO_SCO); To fix this, we use delayed work to implement SCO sock timouts instead. This allows us to avoid taking the spin lock on the socket in an IRQ context, and corrects the misuse of struct sock.sk_timer. As a note, cancel_delayed_work is used instead of cancel_delayed_work_sync in sco_sock_set_timer and sco_sock_clear_timer to avoid a deadlock. In the future, the call to bh_lock_sock inside sco_sock_timeout should be changed to lock_sock to synchronize with other functions using lock_sock. However, since sco_sock_set_timer and sco_sock_clear_timer are sometimes called under the locked socket (in sco_connect and __sco_sock_close), cancel_delayed_work_sync might cause them to sleep until an sco_sock_timeout that has started finishes running. But sco_sock_timeout would also sleep until it can grab the lock_sock. Using cancel_delayed_work is fine because sco_sock_timeout does not change from run to run, hence there is no functional difference between: 1. waiting for a timeout to finish running before scheduling another timeout 2. scheduling another timeout while a timeout is running. Link: https://syzkaller.appspot.com/bug?id=9089d89de0502e120f234ca0fc8a703f7368b31e [1] Reported-by: syzbot+2f6d7c28bb4bf7e82060@syzkaller.appspotmail.com Tested-by: syzbot+2f6d7c28bb4bf7e82060@syzkaller.appspotmail.com Signed-off-by: Desmond Cheong Zhi Xi Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Sasha Levin --- net/bluetooth/sco.c | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 1b7540cb8e5c4..818f244aee46c 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -48,6 +48,8 @@ struct sco_conn { spinlock_t lock; struct sock *sk; + struct delayed_work timeout_work; + unsigned int mtu; }; @@ -73,9 +75,20 @@ struct sco_pinfo { #define SCO_CONN_TIMEOUT (HZ * 40) #define SCO_DISCONN_TIMEOUT (HZ * 2) -static void sco_sock_timeout(struct timer_list *t) +static void sco_sock_timeout(struct work_struct *work) { - struct sock *sk = from_timer(sk, t, sk_timer); + struct sco_conn *conn = container_of(work, struct sco_conn, + timeout_work.work); + struct sock *sk; + + sco_conn_lock(conn); + sk = conn->sk; + if (sk) + sock_hold(sk); + sco_conn_unlock(conn); + + if (!sk) + return; BT_DBG("sock %p state %d", sk, sk->sk_state); @@ -89,14 +102,21 @@ static void sco_sock_timeout(struct timer_list *t) static void sco_sock_set_timer(struct sock *sk, long timeout) { + if (!sco_pi(sk)->conn) + return; + BT_DBG("sock %p state %d timeout %ld", sk, sk->sk_state, timeout); - sk_reset_timer(sk, &sk->sk_timer, jiffies + timeout); + cancel_delayed_work(&sco_pi(sk)->conn->timeout_work); + schedule_delayed_work(&sco_pi(sk)->conn->timeout_work, timeout); } static void sco_sock_clear_timer(struct sock *sk) { + if (!sco_pi(sk)->conn) + return; + BT_DBG("sock %p state %d", sk, sk->sk_state); - sk_stop_timer(sk, &sk->sk_timer); + cancel_delayed_work(&sco_pi(sk)->conn->timeout_work); } /* ---- SCO connections ---- */ @@ -176,6 +196,9 @@ static void sco_conn_del(struct hci_conn *hcon, int err) sco_chan_del(sk, err); bh_unlock_sock(sk); sock_put(sk); + + /* Ensure no more work items will run before freeing conn. */ + cancel_delayed_work_sync(&conn->timeout_work); } hcon->sco_data = NULL; @@ -190,6 +213,8 @@ static void __sco_chan_add(struct sco_conn *conn, struct sock *sk, sco_pi(sk)->conn = conn; conn->sk = sk; + INIT_DELAYED_WORK(&conn->timeout_work, sco_sock_timeout); + if (parent) bt_accept_enqueue(parent, sk, true); } @@ -484,8 +509,6 @@ static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, sco_pi(sk)->setting = BT_VOICE_CVSD_16BIT; - timer_setup(&sk->sk_timer, sco_sock_timeout, 0); - bt_sock_link(&sco_sk_list, sk); return sk; } From 45fc8dadf6f4e7569d11c521e275954bbdcf5526 Mon Sep 17 00:00:00 2001 From: Desmond Cheong Zhi Xi Date: Tue, 10 Aug 2021 12:14:06 +0800 Subject: [PATCH 0764/5344] Bluetooth: avoid circular locks in sco_sock_connect [ Upstream commit 734bc5ff783115aa3164f4e9dd5967ae78e0a8ab ] In a future patch, calls to bh_lock_sock in sco.c should be replaced by lock_sock now that none of the functions are run in IRQ context. However, doing so results in a circular locking dependency: ====================================================== WARNING: possible circular locking dependency detected 5.14.0-rc4-syzkaller #0 Not tainted ------------------------------------------------------ syz-executor.2/14867 is trying to acquire lock: ffff88803e3c1120 (sk_lock-AF_BLUETOOTH-BTPROTO_SCO){+.+.}-{0:0}, at: lock_sock include/net/sock.h:1613 [inline] ffff88803e3c1120 (sk_lock-AF_BLUETOOTH-BTPROTO_SCO){+.+.}-{0:0}, at: sco_conn_del+0x12a/0x2a0 net/bluetooth/sco.c:191 but task is already holding lock: ffffffff8d2dc7c8 (hci_cb_list_lock){+.+.}-{3:3}, at: hci_disconn_cfm include/net/bluetooth/hci_core.h:1497 [inline] ffffffff8d2dc7c8 (hci_cb_list_lock){+.+.}-{3:3}, at: hci_conn_hash_flush+0xda/0x260 net/bluetooth/hci_conn.c:1608 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (hci_cb_list_lock){+.+.}-{3:3}: __mutex_lock_common kernel/locking/mutex.c:959 [inline] __mutex_lock+0x12a/0x10a0 kernel/locking/mutex.c:1104 hci_connect_cfm include/net/bluetooth/hci_core.h:1482 [inline] hci_remote_features_evt net/bluetooth/hci_event.c:3263 [inline] hci_event_packet+0x2f4d/0x7c50 net/bluetooth/hci_event.c:6240 hci_rx_work+0x4f8/0xd30 net/bluetooth/hci_core.c:5122 process_one_work+0x98d/0x1630 kernel/workqueue.c:2276 worker_thread+0x658/0x11f0 kernel/workqueue.c:2422 kthread+0x3e5/0x4d0 kernel/kthread.c:319 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 -> #1 (&hdev->lock){+.+.}-{3:3}: __mutex_lock_common kernel/locking/mutex.c:959 [inline] __mutex_lock+0x12a/0x10a0 kernel/locking/mutex.c:1104 sco_connect net/bluetooth/sco.c:245 [inline] sco_sock_connect+0x227/0xa10 net/bluetooth/sco.c:601 __sys_connect_file+0x155/0x1a0 net/socket.c:1879 __sys_connect+0x161/0x190 net/socket.c:1896 __do_sys_connect net/socket.c:1906 [inline] __se_sys_connect net/socket.c:1903 [inline] __x64_sys_connect+0x6f/0xb0 net/socket.c:1903 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae -> #0 (sk_lock-AF_BLUETOOTH-BTPROTO_SCO){+.+.}-{0:0}: check_prev_add kernel/locking/lockdep.c:3051 [inline] check_prevs_add kernel/locking/lockdep.c:3174 [inline] validate_chain kernel/locking/lockdep.c:3789 [inline] __lock_acquire+0x2a07/0x54a0 kernel/locking/lockdep.c:5015 lock_acquire kernel/locking/lockdep.c:5625 [inline] lock_acquire+0x1ab/0x510 kernel/locking/lockdep.c:5590 lock_sock_nested+0xca/0x120 net/core/sock.c:3170 lock_sock include/net/sock.h:1613 [inline] sco_conn_del+0x12a/0x2a0 net/bluetooth/sco.c:191 sco_disconn_cfm+0x71/0xb0 net/bluetooth/sco.c:1202 hci_disconn_cfm include/net/bluetooth/hci_core.h:1500 [inline] hci_conn_hash_flush+0x127/0x260 net/bluetooth/hci_conn.c:1608 hci_dev_do_close+0x528/0x1130 net/bluetooth/hci_core.c:1778 hci_unregister_dev+0x1c0/0x5a0 net/bluetooth/hci_core.c:4015 vhci_release+0x70/0xe0 drivers/bluetooth/hci_vhci.c:340 __fput+0x288/0x920 fs/file_table.c:280 task_work_run+0xdd/0x1a0 kernel/task_work.c:164 exit_task_work include/linux/task_work.h:32 [inline] do_exit+0xbd4/0x2a60 kernel/exit.c:825 do_group_exit+0x125/0x310 kernel/exit.c:922 get_signal+0x47f/0x2160 kernel/signal.c:2808 arch_do_signal_or_restart+0x2a9/0x1c40 arch/x86/kernel/signal.c:865 handle_signal_work kernel/entry/common.c:148 [inline] exit_to_user_mode_loop kernel/entry/common.c:172 [inline] exit_to_user_mode_prepare+0x17d/0x290 kernel/entry/common.c:209 __syscall_exit_to_user_mode_work kernel/entry/common.c:291 [inline] syscall_exit_to_user_mode+0x19/0x60 kernel/entry/common.c:302 ret_from_fork+0x15/0x30 arch/x86/entry/entry_64.S:288 other info that might help us debug this: Chain exists of: sk_lock-AF_BLUETOOTH-BTPROTO_SCO --> &hdev->lock --> hci_cb_list_lock Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(hci_cb_list_lock); lock(&hdev->lock); lock(hci_cb_list_lock); lock(sk_lock-AF_BLUETOOTH-BTPROTO_SCO); *** DEADLOCK *** The issue is that the lock hierarchy should go from &hdev->lock --> hci_cb_list_lock --> sk_lock-AF_BLUETOOTH-BTPROTO_SCO. For example, one such call trace is: hci_dev_do_close(): hci_dev_lock(); hci_conn_hash_flush(): hci_disconn_cfm(): mutex_lock(&hci_cb_list_lock); sco_disconn_cfm(): sco_conn_del(): lock_sock(sk); However, in sco_sock_connect, we call lock_sock before calling hci_dev_lock inside sco_connect, thus inverting the lock hierarchy. We fix this by pulling the call to hci_dev_lock out from sco_connect. Signed-off-by: Desmond Cheong Zhi Xi Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Sasha Levin --- net/bluetooth/sco.c | 39 ++++++++++++++++----------------------- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 818f244aee46c..1915943bb646a 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -234,44 +234,32 @@ static int sco_chan_add(struct sco_conn *conn, struct sock *sk, return err; } -static int sco_connect(struct sock *sk) +static int sco_connect(struct hci_dev *hdev, struct sock *sk) { struct sco_conn *conn; struct hci_conn *hcon; - struct hci_dev *hdev; int err, type; BT_DBG("%pMR -> %pMR", &sco_pi(sk)->src, &sco_pi(sk)->dst); - hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src, BDADDR_BREDR); - if (!hdev) - return -EHOSTUNREACH; - - hci_dev_lock(hdev); - if (lmp_esco_capable(hdev) && !disable_esco) type = ESCO_LINK; else type = SCO_LINK; if (sco_pi(sk)->setting == BT_VOICE_TRANSPARENT && - (!lmp_transp_capable(hdev) || !lmp_esco_capable(hdev))) { - err = -EOPNOTSUPP; - goto done; - } + (!lmp_transp_capable(hdev) || !lmp_esco_capable(hdev))) + return -EOPNOTSUPP; hcon = hci_connect_sco(hdev, type, &sco_pi(sk)->dst, sco_pi(sk)->setting); - if (IS_ERR(hcon)) { - err = PTR_ERR(hcon); - goto done; - } + if (IS_ERR(hcon)) + return PTR_ERR(hcon); conn = sco_conn_add(hcon); if (!conn) { hci_conn_drop(hcon); - err = -ENOMEM; - goto done; + return -ENOMEM; } /* Update source addr of the socket */ @@ -279,7 +267,7 @@ static int sco_connect(struct sock *sk) err = sco_chan_add(conn, sk, NULL); if (err) - goto done; + return err; if (hcon->state == BT_CONNECTED) { sco_sock_clear_timer(sk); @@ -289,9 +277,6 @@ static int sco_connect(struct sock *sk) sco_sock_set_timer(sk, sk->sk_sndtimeo); } -done: - hci_dev_unlock(hdev); - hci_dev_put(hdev); return err; } @@ -573,6 +558,7 @@ static int sco_sock_connect(struct socket *sock, struct sockaddr *addr, int alen { struct sockaddr_sco *sa = (struct sockaddr_sco *) addr; struct sock *sk = sock->sk; + struct hci_dev *hdev; int err; BT_DBG("sk %p", sk); @@ -587,12 +573,19 @@ static int sco_sock_connect(struct socket *sock, struct sockaddr *addr, int alen if (sk->sk_type != SOCK_SEQPACKET) return -EINVAL; + hdev = hci_get_route(&sa->sco_bdaddr, &sco_pi(sk)->src, BDADDR_BREDR); + if (!hdev) + return -EHOSTUNREACH; + hci_dev_lock(hdev); + lock_sock(sk); /* Set destination address and psm */ bacpy(&sco_pi(sk)->dst, &sa->sco_bdaddr); - err = sco_connect(sk); + err = sco_connect(hdev, sk); + hci_dev_unlock(hdev); + hci_dev_put(hdev); if (err) goto done; From db91449f3bd05a54c0b31eb75c83a4e218206c48 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Tue, 10 Aug 2021 21:15:05 +0300 Subject: [PATCH 0765/5344] net/mlx5: Fix variable type to match 64bit [ Upstream commit 979aa51967add26b37f9d77e01729d44a2da8e5f ] Fix the following smatch warning: wait_func_handle_exec_timeout() warn: should '1 << ent->idx' be a 64 bit type? Use 1ULL, to have a 64 bit type variable. Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Eran Ben Elisha Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed Signed-off-by: Sasha Levin --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 402dae3bca4da..f40c04bc8f3d1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -870,7 +870,7 @@ static void cb_timeout_handler(struct work_struct *work) ent->ret = -ETIMEDOUT; mlx5_core_warn(dev, "cmd[%d]: %s(0x%x) Async, timeout. Will cause a leak of a command resource\n", ent->idx, mlx5_command_str(msg_to_opcode(ent->in)), msg_to_opcode(ent->in)); - mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true); + mlx5_cmd_comp_handler(dev, 1ULL << ent->idx, true); out: cmd_ent_put(ent); /* for the cmd_ent_get() took on schedule delayed work */ @@ -982,7 +982,7 @@ static void cmd_work_handler(struct work_struct *work) MLX5_SET(mbox_out, ent->out, status, status); MLX5_SET(mbox_out, ent->out, syndrome, drv_synd); - mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true); + mlx5_cmd_comp_handler(dev, 1ULL << ent->idx, true); return; } @@ -996,7 +996,7 @@ static void cmd_work_handler(struct work_struct *work) poll_timeout(ent); /* make sure we read the descriptor after ownership is SW */ rmb(); - mlx5_cmd_comp_handler(dev, 1UL << ent->idx, (ent->ret == -ETIMEDOUT)); + mlx5_cmd_comp_handler(dev, 1ULL << ent->idx, (ent->ret == -ETIMEDOUT)); } } @@ -1056,7 +1056,7 @@ static void wait_func_handle_exec_timeout(struct mlx5_core_dev *dev, mlx5_command_str(msg_to_opcode(ent->in)), msg_to_opcode(ent->in)); ent->ret = -ETIMEDOUT; - mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true); + mlx5_cmd_comp_handler(dev, 1ULL << ent->idx, true); } static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent) From a80152332e2039467a9c787dae9e4fdc46bd5293 Mon Sep 17 00:00:00 2001 From: Tuo Li Date: Wed, 11 Aug 2021 04:34:58 -0700 Subject: [PATCH 0766/5344] gpu: drm: amd: amdgpu: amdgpu_i2c: fix possible uninitialized-variable access in amdgpu_i2c_router_select_ddc_port() [ Upstream commit a211260c34cfadc6068fece8c9e99e0fe1e2a2b6 ] The variable val is declared without initialization, and its address is passed to amdgpu_i2c_get_byte(). In this function, the value of val is accessed in: DRM_DEBUG("i2c 0x%02x 0x%02x read failed\n", addr, *val); Also, when amdgpu_i2c_get_byte() returns, val may remain uninitialized, but it is accessed in: val &= ~amdgpu_connector->router.ddc_mux_control_pin; To fix this possible uninitialized-variable access, initialize val to 0 in amdgpu_i2c_router_select_ddc_port(). Reported-by: TOTE Robot Signed-off-by: Tuo Li Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin --- drivers/gpu/drm/amd/amdgpu/amdgpu_i2c.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_i2c.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_i2c.c index 70dbe343f51df..89cecdba81ace 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_i2c.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_i2c.c @@ -339,7 +339,7 @@ static void amdgpu_i2c_put_byte(struct amdgpu_i2c_chan *i2c_bus, void amdgpu_i2c_router_select_ddc_port(const struct amdgpu_connector *amdgpu_connector) { - u8 val; + u8 val = 0; if (!amdgpu_connector->router.ddc_valid) return; From 5e0c6c614a84694779966aea8bf7eeab8055ead5 Mon Sep 17 00:00:00 2001 From: Tuo Li Date: Tue, 10 Aug 2021 21:07:03 -0700 Subject: [PATCH 0767/5344] drm/display: fix possible null-pointer dereference in dcn10_set_clock() [ Upstream commit 554594567b1fa3da74f88ec7b2dc83d000c58e98 ] The variable dc->clk_mgr is checked in: if (dc->clk_mgr && dc->clk_mgr->funcs->get_clock) This indicates dc->clk_mgr can be NULL. However, it is dereferenced in: if (!dc->clk_mgr->funcs->get_clock) To fix this null-pointer dereference, check dc->clk_mgr and the function pointer dc->clk_mgr->funcs->get_clock earlier, and return if one of them is NULL. Reported-by: TOTE Robot Signed-off-by: Tuo Li Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin --- .../gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c index 60123db7ba02f..bc5ebea1abede 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c @@ -3264,13 +3264,12 @@ static enum dc_status dcn10_set_clock(struct dc *dc, struct dc_clock_config clock_cfg = {0}; struct dc_clocks *current_clocks = &context->bw_ctx.bw.dcn.clk; - if (dc->clk_mgr && dc->clk_mgr->funcs->get_clock) - dc->clk_mgr->funcs->get_clock(dc->clk_mgr, - context, clock_type, &clock_cfg); - - if (!dc->clk_mgr->funcs->get_clock) + if (!dc->clk_mgr || !dc->clk_mgr->funcs->get_clock) return DC_FAIL_UNSUPPORTED_1; + dc->clk_mgr->funcs->get_clock(dc->clk_mgr, + context, clock_type, &clock_cfg); + if (clk_khz > clock_cfg.max_clock_khz) return DC_FAIL_CLK_EXCEED_MAX; @@ -3288,7 +3287,7 @@ static enum dc_status dcn10_set_clock(struct dc *dc, else return DC_ERROR_UNEXPECTED; - if (dc->clk_mgr && dc->clk_mgr->funcs->update_clocks) + if (dc->clk_mgr->funcs->update_clocks) dc->clk_mgr->funcs->update_clocks(dc->clk_mgr, context, true); return DC_OK; From 218f8dff9ce182343cc0476737ad1fcc5cfd5b63 Mon Sep 17 00:00:00 2001 From: Johan Almbladh Date: Mon, 28 Jun 2021 14:32:46 +0200 Subject: [PATCH 0768/5344] mac80211: Fix monitor MTU limit so that A-MSDUs get through [ Upstream commit 79f5962baea74ce1cd4e5949598944bff854b166 ] The maximum MTU was set to 2304, which is the maximum MSDU size. While this is valid for normal WLAN interfaces, it is too low for monitor interfaces. A monitor interface may receive and inject MPDU frames, and the maximum MPDU frame size is larger than 2304. The MPDU may also contain an A-MSDU frame, in which case the size may be much larger than the MTU limit. Since the maximum size of an A-MSDU depends on the PHY mode of the transmitting STA, it is not possible to set an exact MTU limit for a monitor interface. Now the maximum MTU for a monitor interface is unrestricted. Signed-off-by: Johan Almbladh Link: https://lore.kernel.org/r/20210628123246.2070558-1-johan.almbladh@anyfinetworks.com Signed-off-by: Johannes Berg Signed-off-by: Sasha Levin --- net/mac80211/iface.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 6f576306a4d74..ddc001ad90555 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1875,9 +1875,16 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, netdev_set_default_ethtool_ops(ndev, &ieee80211_ethtool_ops); - /* MTU range: 256 - 2304 */ + /* MTU range is normally 256 - 2304, where the upper limit is + * the maximum MSDU size. Monitor interfaces send and receive + * MPDU and A-MSDU frames which may be much larger so we do + * not impose an upper limit in that case. + */ ndev->min_mtu = 256; - ndev->max_mtu = local->hw.max_mtu; + if (type == NL80211_IFTYPE_MONITOR) + ndev->max_mtu = 0; + else + ndev->max_mtu = local->hw.max_mtu; ret = register_netdevice(ndev); if (ret) { From e6626c761b11a199007666e323e7509e72e182fe Mon Sep 17 00:00:00 2001 From: Andreas Obergschwandtner Date: Thu, 29 Jul 2021 16:42:26 +0200 Subject: [PATCH 0769/5344] ARM: tegra: tamonten: Fix UART pad setting [ Upstream commit 2270ad2f4e123336af685ecedd1618701cb4ca1e ] This patch fixes the tristate and pullup configuration for UART 1 to 3 on the Tamonten SOM. Signed-off-by: Andreas Obergschwandtner Signed-off-by: Thierry Reding Signed-off-by: Sasha Levin --- arch/arm/boot/dts/tegra20-tamonten.dtsi | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/arm/boot/dts/tegra20-tamonten.dtsi b/arch/arm/boot/dts/tegra20-tamonten.dtsi index 20137fc578b1b..394a6b4dc69d5 100644 --- a/arch/arm/boot/dts/tegra20-tamonten.dtsi +++ b/arch/arm/boot/dts/tegra20-tamonten.dtsi @@ -185,8 +185,9 @@ nvidia,pins = "ata", "atb", "atc", "atd", "ate", "cdev1", "cdev2", "dap1", "dtb", "gma", "gmb", "gmc", "gmd", "gme", "gpu7", - "gpv", "i2cp", "pta", "rm", "slxa", - "slxk", "spia", "spib", "uac"; + "gpv", "i2cp", "irrx", "irtx", "pta", + "rm", "slxa", "slxk", "spia", "spib", + "uac"; nvidia,pull = ; nvidia,tristate = ; }; @@ -211,7 +212,7 @@ conf_ddc { nvidia,pins = "ddc", "dta", "dtd", "kbca", "kbcb", "kbcc", "kbcd", "kbce", "kbcf", - "sdc"; + "sdc", "uad", "uca"; nvidia,pull = ; nvidia,tristate = ; }; @@ -221,10 +222,9 @@ "lvp0", "owc", "sdb"; nvidia,tristate = ; }; - conf_irrx { - nvidia,pins = "irrx", "irtx", "sdd", "spic", - "spie", "spih", "uaa", "uab", "uad", - "uca", "ucb"; + conf_sdd { + nvidia,pins = "sdd", "spic", "spie", "spih", + "uaa", "uab", "ucb"; nvidia,pull = ; nvidia,tristate = ; }; From debdd878bac6db27248a05b5bea7e16227411ca7 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Tue, 22 Jun 2021 15:44:22 +0200 Subject: [PATCH 0770/5344] arm64: tegra: Fix compatible string for Tegra132 CPUs [ Upstream commit f865d0292ff3c0ca09414436510eb4c815815509 ] The documented compatible string for the CPUs found on Tegra132 is "nvidia,tegra132-denver", rather than the previously used compatible string "nvidia,denver". Signed-off-by: Thierry Reding Signed-off-by: Sasha Levin --- arch/arm64/boot/dts/nvidia/tegra132.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/nvidia/tegra132.dtsi b/arch/arm64/boot/dts/nvidia/tegra132.dtsi index 631a7f77c3869..0b3eb8c0b8df0 100644 --- a/arch/arm64/boot/dts/nvidia/tegra132.dtsi +++ b/arch/arm64/boot/dts/nvidia/tegra132.dtsi @@ -1082,13 +1082,13 @@ cpu@0 { device_type = "cpu"; - compatible = "nvidia,denver"; + compatible = "nvidia,tegra132-denver"; reg = <0>; }; cpu@1 { device_type = "cpu"; - compatible = "nvidia,denver"; + compatible = "nvidia,tegra132-denver"; reg = <1>; }; }; From 5f9ec5886f0c196bdb6bc843102c065d9ed194ad Mon Sep 17 00:00:00 2001 From: Raag Jadav Date: Thu, 5 Aug 2021 00:54:45 +0530 Subject: [PATCH 0771/5344] arm64: dts: ls1046a: fix eeprom entries [ Upstream commit c1a6018d1839c9cb8f807dc863a50102a1a5c412 ] ls1046afrwy and ls1046ardb boards have CAT24C04[1] and CAT24C05[2] eeproms respectively. Both are 4Kb (512 bytes) in size, and compatible with AT24C04[3]. Remove multi-address entries, as both the boards have a single chip each. [1] https://www.onsemi.com/pdf/datasheet/cat24c01-d.pdf [2] https://www.onsemi.com/pdf/datasheet/cat24c03-d.pdf [3] https://ww1.microchip.com/downloads/en/DeviceDoc/doc0180.pdf Signed-off-by: Raag Jadav Acked-by: Li Yang Signed-off-by: Shawn Guo Signed-off-by: Sasha Levin --- arch/arm64/boot/dts/freescale/fsl-ls1046a-frwy.dts | 8 +------- arch/arm64/boot/dts/freescale/fsl-ls1046a-rdb.dts | 7 +------ 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1046a-frwy.dts b/arch/arm64/boot/dts/freescale/fsl-ls1046a-frwy.dts index 3595be0f25277..2d6c73d7d397c 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1046a-frwy.dts +++ b/arch/arm64/boot/dts/freescale/fsl-ls1046a-frwy.dts @@ -83,15 +83,9 @@ }; eeprom@52 { - compatible = "atmel,24c512"; + compatible = "onnn,cat24c04", "atmel,24c04"; reg = <0x52>; }; - - eeprom@53 { - compatible = "atmel,24c512"; - reg = <0x53>; - }; - }; }; }; diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1046a-rdb.dts b/arch/arm64/boot/dts/freescale/fsl-ls1046a-rdb.dts index 2743397591141..8858c1e92f23c 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1046a-rdb.dts +++ b/arch/arm64/boot/dts/freescale/fsl-ls1046a-rdb.dts @@ -58,14 +58,9 @@ }; eeprom@52 { - compatible = "atmel,24c512"; + compatible = "onnn,cat24c05", "atmel,24c04"; reg = <0x52>; }; - - eeprom@53 { - compatible = "atmel,24c512"; - reg = <0x53>; - }; }; &i2c3 { From d47014692a91d1631a1671907cecccfef1659f1a Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 16 Jun 2021 14:19:35 -0700 Subject: [PATCH 0772/5344] nvme-tcp: don't check blk_mq_tag_to_rq when receiving pdu data [ Upstream commit 3b01a9d0caa8276d9ce314e09610f7fb70f49a00 ] We already validate it when receiving the c2hdata pdu header and this is not changing so this is a redundant check. Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg Reviewed-by: Daniel Wagner Signed-off-by: Christoph Hellwig Signed-off-by: Sasha Levin --- drivers/nvme/host/tcp.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index f6427a10a9908..38bbbbbc6f47f 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -642,17 +642,9 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, unsigned int *offset, size_t *len) { struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu; - struct nvme_tcp_request *req; - struct request *rq; - - rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id); - if (!rq) { - dev_err(queue->ctrl->ctrl.device, - "queue %d tag %#x not found\n", - nvme_tcp_queue_id(queue), pdu->command_id); - return -ENOENT; - } - req = blk_mq_rq_to_pdu(rq); + struct request *rq = + blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id); + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); while (true) { int recv_len, ret; From 93e7279302cfa1bbabe056656f3995c6c1d0d0aa Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 11 Aug 2021 16:20:15 -0700 Subject: [PATCH 0773/5344] Bluetooth: Fix handling of LE Enhanced Connection Complete [ Upstream commit cafae4cd625502f65d1798659c1aa9b62d38cc56 ] LE Enhanced Connection Complete contains the Local RPA used in the connection which must be used when set otherwise there could problems when pairing since the address used by the remote stack could be the Local RPA: BLUETOOTH CORE SPECIFICATION Version 5.2 | Vol 4, Part E page 2396 'Resolvable Private Address being used by the local device for this connection. This is only valid when the Own_Address_Type (from the HCI_LE_Create_Connection, HCI_LE_Set_Advertising_Parameters, HCI_LE_Set_Extended_Advertising_Parameters, or HCI_LE_Extended_Create_Connection commands) is set to 0x02 or 0x03, and the Controller generated a resolvable private address for the local device using a non-zero local IRK. For other Own_Address_Type values, the Controller shall return all zeros.' Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann Signed-off-by: Sasha Levin --- net/bluetooth/hci_event.c | 93 ++++++++++++++++++++++++++------------- 1 file changed, 62 insertions(+), 31 deletions(-) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 82e42d8e2ea03..31469ff084cd3 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4920,9 +4920,64 @@ static void hci_disconn_phylink_complete_evt(struct hci_dev *hdev, } #endif +static void le_conn_update_addr(struct hci_conn *conn, bdaddr_t *bdaddr, + u8 bdaddr_type, bdaddr_t *local_rpa) +{ + if (conn->out) { + conn->dst_type = bdaddr_type; + conn->resp_addr_type = bdaddr_type; + bacpy(&conn->resp_addr, bdaddr); + + /* Check if the controller has set a Local RPA then it must be + * used instead or hdev->rpa. + */ + if (local_rpa && bacmp(local_rpa, BDADDR_ANY)) { + conn->init_addr_type = ADDR_LE_DEV_RANDOM; + bacpy(&conn->init_addr, local_rpa); + } else if (hci_dev_test_flag(conn->hdev, HCI_PRIVACY)) { + conn->init_addr_type = ADDR_LE_DEV_RANDOM; + bacpy(&conn->init_addr, &conn->hdev->rpa); + } else { + hci_copy_identity_address(conn->hdev, &conn->init_addr, + &conn->init_addr_type); + } + } else { + conn->resp_addr_type = conn->hdev->adv_addr_type; + /* Check if the controller has set a Local RPA then it must be + * used instead or hdev->rpa. + */ + if (local_rpa && bacmp(local_rpa, BDADDR_ANY)) { + conn->resp_addr_type = ADDR_LE_DEV_RANDOM; + bacpy(&conn->resp_addr, local_rpa); + } else if (conn->hdev->adv_addr_type == ADDR_LE_DEV_RANDOM) { + /* In case of ext adv, resp_addr will be updated in + * Adv Terminated event. + */ + if (!ext_adv_capable(conn->hdev)) + bacpy(&conn->resp_addr, + &conn->hdev->random_addr); + } else { + bacpy(&conn->resp_addr, &conn->hdev->bdaddr); + } + + conn->init_addr_type = bdaddr_type; + bacpy(&conn->init_addr, bdaddr); + + /* For incoming connections, set the default minimum + * and maximum connection interval. They will be used + * to check if the parameters are in range and if not + * trigger the connection update procedure. + */ + conn->le_conn_min_interval = conn->hdev->le_conn_min_interval; + conn->le_conn_max_interval = conn->hdev->le_conn_max_interval; + } +} + static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, - bdaddr_t *bdaddr, u8 bdaddr_type, u8 role, u16 handle, - u16 interval, u16 latency, u16 supervision_timeout) + bdaddr_t *bdaddr, u8 bdaddr_type, + bdaddr_t *local_rpa, u8 role, u16 handle, + u16 interval, u16 latency, + u16 supervision_timeout) { struct hci_conn_params *params; struct hci_conn *conn; @@ -4970,32 +5025,7 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, cancel_delayed_work(&conn->le_conn_timeout); } - if (!conn->out) { - /* Set the responder (our side) address type based on - * the advertising address type. - */ - conn->resp_addr_type = hdev->adv_addr_type; - if (hdev->adv_addr_type == ADDR_LE_DEV_RANDOM) { - /* In case of ext adv, resp_addr will be updated in - * Adv Terminated event. - */ - if (!ext_adv_capable(hdev)) - bacpy(&conn->resp_addr, &hdev->random_addr); - } else { - bacpy(&conn->resp_addr, &hdev->bdaddr); - } - - conn->init_addr_type = bdaddr_type; - bacpy(&conn->init_addr, bdaddr); - - /* For incoming connections, set the default minimum - * and maximum connection interval. They will be used - * to check if the parameters are in range and if not - * trigger the connection update procedure. - */ - conn->le_conn_min_interval = hdev->le_conn_min_interval; - conn->le_conn_max_interval = hdev->le_conn_max_interval; - } + le_conn_update_addr(conn, bdaddr, bdaddr_type, local_rpa); /* Lookup the identity address from the stored connection * address and address type. @@ -5089,7 +5119,7 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) BT_DBG("%s status 0x%2.2x", hdev->name, ev->status); le_conn_complete_evt(hdev, ev->status, &ev->bdaddr, ev->bdaddr_type, - ev->role, le16_to_cpu(ev->handle), + NULL, ev->role, le16_to_cpu(ev->handle), le16_to_cpu(ev->interval), le16_to_cpu(ev->latency), le16_to_cpu(ev->supervision_timeout)); @@ -5103,7 +5133,7 @@ static void hci_le_enh_conn_complete_evt(struct hci_dev *hdev, BT_DBG("%s status 0x%2.2x", hdev->name, ev->status); le_conn_complete_evt(hdev, ev->status, &ev->bdaddr, ev->bdaddr_type, - ev->role, le16_to_cpu(ev->handle), + &ev->local_rpa, ev->role, le16_to_cpu(ev->handle), le16_to_cpu(ev->interval), le16_to_cpu(ev->latency), le16_to_cpu(ev->supervision_timeout)); @@ -5134,7 +5164,8 @@ static void hci_le_ext_adv_term_evt(struct hci_dev *hdev, struct sk_buff *skb) if (conn) { struct adv_info *adv_instance; - if (hdev->adv_addr_type != ADDR_LE_DEV_RANDOM) + if (hdev->adv_addr_type != ADDR_LE_DEV_RANDOM || + bacmp(&conn->resp_addr, BDADDR_ANY)) return; if (!hdev->cur_adv_instance) { From bb16822597221a4623dae3c72e53aa6f7a761ea6 Mon Sep 17 00:00:00 2001 From: Rajendra Nayak Date: Thu, 12 Aug 2021 16:57:20 +0530 Subject: [PATCH 0774/5344] opp: Don't print an error if required-opps is missing [ Upstream commit 020d86fc0df8b865f6dc168d88a7c2dccabd0a9e ] The 'required-opps' property is considered optional, hence remove the pr_err() in of_parse_required_opp() when we find the property is missing. While at it, also fix the return value of of_get_required_opp_performance_state() when of_parse_required_opp() fails, return a -ENODEV instead of the -EINVAL. Signed-off-by: Rajendra Nayak Reviewed-by: Ulf Hansson Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki Signed-off-by: Sasha Levin --- drivers/opp/of.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 603c688fe23dc..30cc407c8f93f 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -95,15 +95,7 @@ static struct dev_pm_opp *_find_opp_of_np(struct opp_table *opp_table, static struct device_node *of_parse_required_opp(struct device_node *np, int index) { - struct device_node *required_np; - - required_np = of_parse_phandle(np, "required-opps", index); - if (unlikely(!required_np)) { - pr_err("%s: Unable to parse required-opps: %pOF, index: %d\n", - __func__, np, index); - } - - return required_np; + return of_parse_phandle(np, "required-opps", index); } /* The caller must call dev_pm_opp_put_opp_table() after the table is used */ @@ -996,7 +988,7 @@ int of_get_required_opp_performance_state(struct device_node *np, int index) required_np = of_parse_required_opp(np, index); if (!required_np) - return -EINVAL; + return -ENODEV; opp_table = _find_table_of_opp_np(required_np); if (IS_ERR(opp_table)) { From 4715ee6294bce2c4ecd852f65faec373967d1bc9 Mon Sep 17 00:00:00 2001 From: Ulrich Hecht Date: Mon, 16 Aug 2021 18:22:01 +0200 Subject: [PATCH 0775/5344] serial: sh-sci: fix break handling for sysrq [ Upstream commit 87b8061bad9bd4b549b2daf36ffbaa57be2789a2 ] This fixes two issues that cause the sysrq sequence to be inadvertently aborted on SCIF serial consoles: - a NUL character remains in the RX queue after a break has been detected, which is then passed on to uart_handle_sysrq_char() - the break interrupt is handled twice on controllers with multiplexed ERI and BRI interrupts Signed-off-by: Ulrich Hecht Link: https://lore.kernel.org/r/20210816162201.28801-1-uli+renesas@fpond.eu Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/tty/serial/sh-sci.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/tty/serial/sh-sci.c b/drivers/tty/serial/sh-sci.c index 97ee1fc1cd247..ecff9b2088087 100644 --- a/drivers/tty/serial/sh-sci.c +++ b/drivers/tty/serial/sh-sci.c @@ -1763,6 +1763,10 @@ static irqreturn_t sci_br_interrupt(int irq, void *ptr) /* Handle BREAKs */ sci_handle_breaks(port); + + /* drop invalid character received before break was detected */ + serial_port_in(port, SCxRDR); + sci_clear_SCxSR(port, SCxSR_BREAK_CLEAR(port)); return IRQ_HANDLED; @@ -1842,7 +1846,8 @@ static irqreturn_t sci_mpxed_interrupt(int irq, void *ptr) ret = sci_er_interrupt(irq, ptr); /* Break Interrupt */ - if ((ssr_status & SCxSR_BRK(port)) && err_enabled) + if (s->irqs[SCIx_ERI_IRQ] != s->irqs[SCIx_BRI_IRQ] && + (ssr_status & SCxSR_BRK(port)) && err_enabled) ret = sci_br_interrupt(irq, ptr); /* Overrun Interrupt */ From 148d97433a6e67b64b86435ddf167770ecdcae29 Mon Sep 17 00:00:00 2001 From: Luke Hsiao Date: Mon, 16 Aug 2021 20:51:06 +0000 Subject: [PATCH 0776/5344] tcp: enable data-less, empty-cookie SYN with TFO_SERVER_COOKIE_NOT_REQD [ Upstream commit e3faa49bcecdfcc80e94dd75709d6acb1a5d89f6 ] Since the original TFO server code was implemented in commit 168a8f58059a22feb9e9a2dcc1b8053dbbbc12ef ("tcp: TCP Fast Open Server - main code path") the TFO server code has supported the sysctl bit flag TFO_SERVER_COOKIE_NOT_REQD. Currently, when the TFO_SERVER_ENABLE and TFO_SERVER_COOKIE_NOT_REQD sysctl bit flags are set, a server connection will accept a SYN with N bytes of data (N > 0) that has no TFO cookie, create a new fast open connection, process the incoming data in the SYN, and make the connection ready for accepting. After accepting, the connection is ready for read()/recvmsg() to read the N bytes of data in the SYN, ready for write()/sendmsg() calls and data transmissions to transmit data. This commit changes an edge case in this feature by changing this behavior to apply to (N >= 0) bytes of data in the SYN rather than only (N > 0) bytes of data in the SYN. Now, a server will accept a data-less SYN without a TFO cookie if TFO_SERVER_COOKIE_NOT_REQD is set. Caveat! While this enables a new kind of TFO (data-less empty-cookie SYN), some firewall rules setup may not work if they assume such packets are not legit TFOs and will filter them. Signed-off-by: Luke Hsiao Acked-by: Neal Cardwell Acked-by: Yuchung Cheng Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20210816205105.2533289-1-luke.w.hsiao@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/ipv4/tcp_fastopen.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 9934cea2b020f..db11efaee04a2 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -393,8 +393,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, return NULL; } - if (syn_data && - tcp_fastopen_no_cookie(sk, dst, TFO_SERVER_COOKIE_NOT_REQD)) + if (tcp_fastopen_no_cookie(sk, dst, TFO_SERVER_COOKIE_NOT_REQD)) goto fastopen; if (foc->len == 0) { From 9236adebab033943856151b809f71730c41ca3b3 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 12 Aug 2021 16:41:42 -0400 Subject: [PATCH 0777/5344] rpc: fix gss_svc_init cleanup on failure [ Upstream commit 5a4753446253a427c0ff1e433b9c4933e5af207c ] The failure case here should be rare, but it's obviously wrong. Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever Signed-off-by: Sasha Levin --- net/sunrpc/auth_gss/svcauth_gss.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index d5470c7fe8792..c0016473a255a 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1937,7 +1937,7 @@ gss_svc_init_net(struct net *net) goto out2; return 0; out2: - destroy_use_gss_proxy_proc_entry(net); + rsi_cache_destroy_net(net); out1: rsc_cache_destroy_net(net); return rv; From 16be3c01bc501c7506c3a8e8c8fa26a540b6a309 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 17 Aug 2021 21:42:52 -0700 Subject: [PATCH 0778/5344] staging: rts5208: Fix get_ms_information() heap buffer size [ Upstream commit cbe34165cc1b7d1110b268ba8b9f30843c941639 ] Fix buf allocation size (it needs to be 2 bytes larger). Found when __alloc_size() annotations were added to kmalloc() interfaces. In file included from ./include/linux/string.h:253, from ./include/linux/bitmap.h:10, from ./include/linux/cpumask.h:12, from ./arch/x86/include/asm/paravirt.h:17, from ./arch/x86/include/asm/irqflags.h:63, from ./include/linux/irqflags.h:16, from ./include/linux/rcupdate.h:26, from ./include/linux/rculist.h:11, from ./include/linux/pid.h:5, from ./include/linux/sched.h:14, from ./include/linux/blkdev.h:5, from drivers/staging/rts5208/rtsx_scsi.c:12: In function 'get_ms_information', inlined from 'ms_sp_cmnd' at drivers/staging/rts5208/rtsx_scsi.c:2877:12, inlined from 'rtsx_scsi_handler' at drivers/staging/rts5208/rtsx_scsi.c:3247:12: ./include/linux/fortify-string.h:54:29: warning: '__builtin_memcpy' forming offset [106, 107] is out of the bounds [0, 106] [-Warray-bounds] 54 | #define __underlying_memcpy __builtin_memcpy | ^ ./include/linux/fortify-string.h:417:2: note: in expansion of macro '__underlying_memcpy' 417 | __underlying_##op(p, q, __fortify_size); \ | ^~~~~~~~~~~~~ ./include/linux/fortify-string.h:463:26: note: in expansion of macro '__fortify_memcpy_chk' 463 | #define memcpy(p, q, s) __fortify_memcpy_chk(p, q, s, \ | ^~~~~~~~~~~~~~~~~~~~ drivers/staging/rts5208/rtsx_scsi.c:2851:3: note: in expansion of macro 'memcpy' 2851 | memcpy(buf + i, ms_card->raw_sys_info, 96); | ^~~~~~ Cc: Greg Kroah-Hartman Cc: linux-staging@lists.linux.dev Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20210818044252.1533634-1-keescook@chromium.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/staging/rts5208/rtsx_scsi.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/staging/rts5208/rtsx_scsi.c b/drivers/staging/rts5208/rtsx_scsi.c index 1deb74112ad43..11d9d9155eef2 100644 --- a/drivers/staging/rts5208/rtsx_scsi.c +++ b/drivers/staging/rts5208/rtsx_scsi.c @@ -2802,10 +2802,10 @@ static int get_ms_information(struct scsi_cmnd *srb, struct rtsx_chip *chip) } if (dev_info_id == 0x15) { - buf_len = 0x3A; + buf_len = 0x3C; data_len = 0x3A; } else { - buf_len = 0x6A; + buf_len = 0x6C; data_len = 0x6A; } @@ -2855,11 +2855,7 @@ static int get_ms_information(struct scsi_cmnd *srb, struct rtsx_chip *chip) } rtsx_stor_set_xfer_buf(buf, buf_len, srb); - - if (dev_info_id == 0x15) - scsi_set_resid(srb, scsi_bufflen(srb) - 0x3C); - else - scsi_set_resid(srb, scsi_bufflen(srb) - 0x6C); + scsi_set_resid(srb, scsi_bufflen(srb) - buf_len); kfree(buf); return STATUS_SUCCESS; From 1788054c59d372ccc797e815da3189f781092b77 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Fri, 30 Jul 2021 12:41:49 -0500 Subject: [PATCH 0779/5344] gfs2: Don't call dlm after protocol is unmounted [ Upstream commit d1340f80f0b8066321b499a376780da00560e857 ] In the gfs2 withdraw sequence, the dlm protocol is unmounted with a call to lm_unmount. After a withdraw, users are allowed to unmount the withdrawn file system. But at that point we may still have glocks left over that we need to free via unmount's call to gfs2_gl_hash_clear. These glocks may have never been completed because of whatever problem caused the withdraw (IO errors or whatever). Before this patch, function gdlm_put_lock would still try to call into dlm to unlock these leftover glocks, which resulted in dlm returning -EINVAL because the lock space was abandoned. These glocks were never freed because there was no mechanism after that to free them. This patch adds a check to gdlm_put_lock to see if the locking protocol was inactive (DFL_UNMOUNT flag) and if so, free the glock and not make the invalid call into dlm. I could have combined this "if" with the one that follows, related to leftover glock LVBs, but I felt the code was more readable with its own if clause. Signed-off-by: Bob Peterson Signed-off-by: Sasha Levin --- fs/gfs2/lock_dlm.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 72dec177b3494..94c290a333a0a 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -292,6 +292,11 @@ static void gdlm_put_lock(struct gfs2_glock *gl) gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT); gfs2_update_request_times(gl); + /* don't want to call dlm if we've unmounted the lock protocol */ + if (test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) { + gfs2_glock_free(gl); + return; + } /* don't want to skip dlm_unlock writing the lvb when lock has one */ if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) && From 1128cfa799020339a2b61dbe6afa8d6635fec439 Mon Sep 17 00:00:00 2001 From: Li Jun Date: Fri, 18 Jun 2021 16:28:58 +0800 Subject: [PATCH 0780/5344] usb: chipidea: host: fix port index underflow and UBSAN complains [ Upstream commit e5d6a7c6cfae9e714a0e8ff64facd1ac68a784c6 ] If wIndex is 0 (and it often is), these calculations underflow and UBSAN complains, here resolve this by not decrementing the index when it is equal to 0, this copies the solution from commit 85e3990bea49 ("USB: EHCI: avoid undefined pointer arithmetic and placate UBSAN") Reported-by: Zhipeng Wang Signed-off-by: Li Jun Link: https://lore.kernel.org/r/1624004938-2399-1-git-send-email-jun.li@nxp.com Signed-off-by: Peter Chen Signed-off-by: Sasha Levin --- drivers/usb/chipidea/host.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/usb/chipidea/host.c b/drivers/usb/chipidea/host.c index 48e4a5ca18359..f5f56ee07729f 100644 --- a/drivers/usb/chipidea/host.c +++ b/drivers/usb/chipidea/host.c @@ -233,18 +233,26 @@ static int ci_ehci_hub_control( ) { struct ehci_hcd *ehci = hcd_to_ehci(hcd); + unsigned int ports = HCS_N_PORTS(ehci->hcs_params); u32 __iomem *status_reg; - u32 temp; + u32 temp, port_index; unsigned long flags; int retval = 0; struct device *dev = hcd->self.controller; struct ci_hdrc *ci = dev_get_drvdata(dev); - status_reg = &ehci->regs->port_status[(wIndex & 0xff) - 1]; + port_index = wIndex & 0xff; + port_index -= (port_index > 0); + status_reg = &ehci->regs->port_status[port_index]; spin_lock_irqsave(&ehci->lock, flags); if (typeReq == SetPortFeature && wValue == USB_PORT_FEAT_SUSPEND) { + if (!wIndex || wIndex > ports) { + retval = -EPIPE; + goto done; + } + temp = ehci_readl(ehci, status_reg); if ((temp & PORT_PE) == 0 || (temp & PORT_RESET) != 0) { retval = -EPIPE; @@ -273,7 +281,7 @@ static int ci_ehci_hub_control( ehci_writel(ehci, temp, status_reg); } - set_bit((wIndex & 0xff) - 1, &ehci->suspended_ports); + set_bit(port_index, &ehci->suspended_ports); goto done; } From 7dbed1801058b619e68a9ba492ccda259a1becce Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 20 Aug 2021 17:01:59 -0400 Subject: [PATCH 0781/5344] lockd: lockd server-side shouldn't set fl_ops [ Upstream commit 7de875b231edb807387a81cde288aa9e1015ef9e ] Locks have two sets of op arrays, fl_lmops for the lock manager (lockd or nfsd), fl_ops for the filesystem. The server-side lockd code has been setting its own fl_ops, which leads to confusion (and crashes) in the reexport case, where the filesystem expects to be the only one setting fl_ops. And there's no reason for it that I can see-the lm_get/put_owner ops do the same job. Reported-by: Daire Byrne Tested-by: Daire Byrne Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever Signed-off-by: Sasha Levin --- fs/lockd/svclock.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 498cb70c2c0d0..273a81971ed57 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -395,28 +395,10 @@ nlmsvc_release_lockowner(struct nlm_lock *lock) nlmsvc_put_lockowner(lock->fl.fl_owner); } -static void nlmsvc_locks_copy_lock(struct file_lock *new, struct file_lock *fl) -{ - struct nlm_lockowner *nlm_lo = (struct nlm_lockowner *)fl->fl_owner; - new->fl_owner = nlmsvc_get_lockowner(nlm_lo); -} - -static void nlmsvc_locks_release_private(struct file_lock *fl) -{ - nlmsvc_put_lockowner((struct nlm_lockowner *)fl->fl_owner); -} - -static const struct file_lock_operations nlmsvc_lock_ops = { - .fl_copy_lock = nlmsvc_locks_copy_lock, - .fl_release_private = nlmsvc_locks_release_private, -}; - void nlmsvc_locks_init_private(struct file_lock *fl, struct nlm_host *host, pid_t pid) { fl->fl_owner = nlmsvc_find_lockowner(host, pid); - if (fl->fl_owner != NULL) - fl->fl_ops = &nlmsvc_lock_ops; } /* @@ -788,9 +770,21 @@ nlmsvc_notify_blocked(struct file_lock *fl) printk(KERN_WARNING "lockd: notification for unknown block!\n"); } +static fl_owner_t nlmsvc_get_owner(fl_owner_t owner) +{ + return nlmsvc_get_lockowner(owner); +} + +static void nlmsvc_put_owner(fl_owner_t owner) +{ + nlmsvc_put_lockowner(owner); +} + const struct lock_manager_operations nlmsvc_lock_operations = { .lm_notify = nlmsvc_notify_blocked, .lm_grant = nlmsvc_grant_deferred, + .lm_get_owner = nlmsvc_get_owner, + .lm_put_owner = nlmsvc_put_owner, }; /* From 4163576684d1abdcc389d46206f74dfcf64d8a7c Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 27 Jul 2021 16:36:56 -0700 Subject: [PATCH 0782/5344] drm/exynos: Always initialize mapping in exynos_drm_register_dma() [ Upstream commit c626f3864bbbb28bbe06476b0b497c1330aa4463 ] In certain randconfigs, clang warns: drivers/gpu/drm/exynos/exynos_drm_dma.c:121:19: warning: variable 'mapping' is uninitialized when used here [-Wuninitialized] priv->mapping = mapping; ^~~~~~~ drivers/gpu/drm/exynos/exynos_drm_dma.c:111:16: note: initialize the variable 'mapping' to silence this warning void *mapping; ^ = NULL 1 warning generated. This occurs when CONFIG_EXYNOS_IOMMU is enabled and both CONFIG_ARM_DMA_USE_IOMMU and CONFIG_IOMMU_DMA are disabled, which makes the code look like void *mapping; if (0) mapping = arm_iommu_create_mapping() else if (0) mapping = iommu_get_domain_for_dev() ... priv->mapping = mapping; Add an else branch that initializes mapping to the -ENODEV error pointer so that there is no more warning and the driver does not change during runtime. Reported-by: kernel test robot Signed-off-by: Nathan Chancellor Signed-off-by: Inki Dae Signed-off-by: Sasha Levin --- drivers/gpu/drm/exynos/exynos_drm_dma.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/exynos/exynos_drm_dma.c b/drivers/gpu/drm/exynos/exynos_drm_dma.c index 58b89ec11b0eb..a3c9d8b9e1a18 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_dma.c +++ b/drivers/gpu/drm/exynos/exynos_drm_dma.c @@ -140,6 +140,8 @@ int exynos_drm_register_dma(struct drm_device *drm, struct device *dev, EXYNOS_DEV_ADDR_START, EXYNOS_DEV_ADDR_SIZE); else if (IS_ENABLED(CONFIG_IOMMU_DMA)) mapping = iommu_get_domain_for_dev(priv->dma_dev); + else + mapping = ERR_PTR(-ENODEV); if (IS_ERR(mapping)) return PTR_ERR(mapping); From a2799f8353120683a789d332a5b48f16f0eb4e61 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 9 Aug 2021 18:01:31 +1000 Subject: [PATCH 0783/5344] m68knommu: only set CONFIG_ISA_DMA_API for ColdFire sub-arch [ Upstream commit db87db65c1059f3be04506d122f8ec9b2fa3b05e ] > Hi Arnd, > > First bad commit (maybe != root cause): > > tree: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git master > head: 2f73937c9aa561e2082839bc1a8efaac75d6e244 > commit: 47fd22f2b84765a2f7e3f150282497b902624547 [4771/5318] cs89x0: rework driver configuration > config: m68k-randconfig-c003-20210804 (attached as .config) > compiler: m68k-linux-gcc (GCC) 10.3.0 > reproduce (this is a W=1 build): > wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross > chmod +x ~/bin/make.cross > # https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=47fd22f2b84765a2f7e3f150282497b902624547 > git remote add linux-next https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git > git fetch --no-tags linux-next master > git checkout 47fd22f2b84765a2f7e3f150282497b902624547 > # save the attached .config to linux build tree > COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-10.3.0 make.cross ARCH=m68k > > If you fix the issue, kindly add following tag as appropriate > Reported-by: kernel test robot > > All errors (new ones prefixed by >>): > > In file included from include/linux/kernel.h:19, > from include/linux/list.h:9, > from include/linux/module.h:12, > from drivers/net/ethernet/cirrus/cs89x0.c:51: > drivers/net/ethernet/cirrus/cs89x0.c: In function 'net_open': > drivers/net/ethernet/cirrus/cs89x0.c:897:20: error: implicit declaration of function 'isa_virt_to_bus'; did you mean 'virt_to_bus'? [-Werror=implicit-function-declaration] > 897 | (unsigned long)isa_virt_to_bus(lp->dma_buff)); > | ^~~~~~~~~~~~~~~ > include/linux/printk.h:141:17: note: in definition of macro 'no_printk' > 141 | printk(fmt, ##__VA_ARGS__); \ > | ^~~~~~~~~~~ > drivers/net/ethernet/cirrus/cs89x0.c:86:3: note: in expansion of macro 'pr_debug' > 86 | pr_##level(fmt, ##__VA_ARGS__); \ > | ^~~ > drivers/net/ethernet/cirrus/cs89x0.c:894:3: note: in expansion of macro 'cs89_dbg' > 894 | cs89_dbg(1, debug, "%s: dma %lx %lx\n", > | ^~~~~~~~ > >> drivers/net/ethernet/cirrus/cs89x0.c:914:3: error: implicit declaration of function 'disable_dma'; did you mean 'disable_irq'? [-Werror=implicit-function-declaration] As far as I can tell, this is a bug with the m68kmmu architecture, not with my driver: The CONFIG_ISA_DMA_API option is provided for coldfire, which implements it, but dragonball also sets the option as a side-effect, without actually implementing the interfaces. The patch below should fix it. Signed-off-by: Arnd Bergmann Signed-off-by: Greg Ungerer Signed-off-by: Sasha Levin --- arch/m68k/Kconfig.bus | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/m68k/Kconfig.bus b/arch/m68k/Kconfig.bus index 9d0a3a23d50e5..355c51309ed85 100644 --- a/arch/m68k/Kconfig.bus +++ b/arch/m68k/Kconfig.bus @@ -63,7 +63,7 @@ source "drivers/zorro/Kconfig" endif -if !MMU +if COLDFIRE config ISA_DMA_API def_bool !M5272 From e94639f7d266a08718e722a8c3a0472dd8b0f2da Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Mon, 2 Aug 2021 09:34:00 -0300 Subject: [PATCH 0784/5344] btrfs: tree-log: check btrfs_lookup_data_extent return value [ Upstream commit 3736127a3aa805602b7a2ad60ec9cfce68065fbb ] Function btrfs_lookup_data_extent calls btrfs_search_slot to verify if the EXTENT_ITEM exists in the extent tree. btrfs_search_slot can return values bellow zero if an error happened. Function replay_one_extent currently checks if the search found something (0 returned) and increments the reference, and if not, it seems to evaluate as 'not found'. Fix the condition by checking if the value was bellow zero and return early. Reviewed-by: Filipe Manana Signed-off-by: Marcos Paulo de Souza Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/tree-log.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 5412361d0c270..8ea4b3da85d1a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -719,7 +719,9 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, */ ret = btrfs_lookup_data_extent(fs_info, ins.objectid, ins.offset); - if (ret == 0) { + if (ret < 0) { + goto out; + } else if (ret == 0) { btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, ins.objectid, ins.offset, 0); From b38a8554c50287bf6b8df7c3c34612517cc3fd3f Mon Sep 17 00:00:00 2001 From: Cezary Rojewski Date: Wed, 18 Aug 2021 09:57:36 +0200 Subject: [PATCH 0785/5344] ASoC: Intel: Skylake: Fix module configuration for KPB and MIXER [ Upstream commit e4e0633bcadc950b4b4af06c7f1bb7f7e3e86321 ] KeyPhrasebuffer, Mixin and Mixout modules configuration is described by firmware's basic module configuration structure. There are no extended parameters required. Update functions taking part in building INIT_INSTANCE IPC payload to reflect that. Signed-off-by: Cezary Rojewski Tested-by: Lukasz Majczak Link: https://lore.kernel.org/r/20210818075742.1515155-6-cezary.rojewski@intel.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/intel/skylake/skl-messages.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/sound/soc/intel/skylake/skl-messages.c b/sound/soc/intel/skylake/skl-messages.c index 476ef1897961d..79c6cf2c14bfb 100644 --- a/sound/soc/intel/skylake/skl-messages.c +++ b/sound/soc/intel/skylake/skl-messages.c @@ -802,9 +802,12 @@ static u16 skl_get_module_param_size(struct skl_dev *skl, case SKL_MODULE_TYPE_BASE_OUTFMT: case SKL_MODULE_TYPE_MIC_SELECT: - case SKL_MODULE_TYPE_KPB: return sizeof(struct skl_base_outfmt_cfg); + case SKL_MODULE_TYPE_MIXER: + case SKL_MODULE_TYPE_KPB: + return sizeof(struct skl_base_cfg); + default: /* * return only base cfg when no specific module type is @@ -857,10 +860,14 @@ static int skl_set_module_format(struct skl_dev *skl, case SKL_MODULE_TYPE_BASE_OUTFMT: case SKL_MODULE_TYPE_MIC_SELECT: - case SKL_MODULE_TYPE_KPB: skl_set_base_outfmt_format(skl, module_config, *param_data); break; + case SKL_MODULE_TYPE_MIXER: + case SKL_MODULE_TYPE_KPB: + skl_set_base_module_format(skl, module_config, *param_data); + break; + default: skl_set_base_module_format(skl, module_config, *param_data); break; From 37127c526e62e1369d24ff533eef28688d180433 Mon Sep 17 00:00:00 2001 From: Gustaw Lewandowski Date: Wed, 18 Aug 2021 09:57:37 +0200 Subject: [PATCH 0786/5344] ASoC: Intel: Skylake: Fix passing loadable flag for module [ Upstream commit c5ed9c547cba1dc1238c6e8a0c290fd62ee6e127 ] skl_get_module_info() tries to set mconfig->module->loadable before mconfig->module has been assigned thus flag was always set to false and driver did not try to load module binaries. Signed-off-by: Gustaw Lewandowski Signed-off-by: Cezary Rojewski Tested-by: Lukasz Majczak Link: https://lore.kernel.org/r/20210818075742.1515155-7-cezary.rojewski@intel.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/intel/skylake/skl-pcm.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/sound/soc/intel/skylake/skl-pcm.c b/sound/soc/intel/skylake/skl-pcm.c index 7f287424af9b7..439dd4ba690c4 100644 --- a/sound/soc/intel/skylake/skl-pcm.c +++ b/sound/soc/intel/skylake/skl-pcm.c @@ -1333,21 +1333,6 @@ static int skl_get_module_info(struct skl_dev *skl, return -EIO; } - list_for_each_entry(module, &skl->uuid_list, list) { - if (guid_equal(uuid_mod, &module->uuid)) { - mconfig->id.module_id = module->id; - if (mconfig->module) - mconfig->module->loadable = module->is_loadable; - ret = 0; - break; - } - } - - if (ret) - return ret; - - uuid_mod = &module->uuid; - ret = -EIO; for (i = 0; i < skl->nr_modules; i++) { skl_module = skl->modules[i]; uuid_tplg = &skl_module->uuid; @@ -1357,10 +1342,18 @@ static int skl_get_module_info(struct skl_dev *skl, break; } } + if (skl->nr_modules && ret) return ret; + ret = -EIO; list_for_each_entry(module, &skl->uuid_list, list) { + if (guid_equal(uuid_mod, &module->uuid)) { + mconfig->id.module_id = module->id; + mconfig->module->loadable = module->is_loadable; + ret = 0; + } + for (i = 0; i < MAX_IN_QUEUE; i++) { pin_id = &mconfig->m_in_pin[i].id; if (guid_equal(&pin_id->mod_uuid, &module->uuid)) @@ -1374,7 +1367,7 @@ static int skl_get_module_info(struct skl_dev *skl, } } - return 0; + return ret; } static int skl_populate_modules(struct skl_dev *skl) From f8235164cdf19b98faadbaab9ebd3d3a187cbbbd Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 20 Aug 2021 15:47:22 +0100 Subject: [PATCH 0787/5344] of: Don't allow __of_attached_node_sysfs() without CONFIG_SYSFS [ Upstream commit 6211e9cb2f8faf7faae0b6caf844bfe9527cc607 ] Trying to boot without SYSFS, but with OF_DYNAMIC quickly results in a crash: [ 0.088460] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000070 [...] [ 0.103927] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 5.14.0-rc3 #4179 [ 0.105810] Hardware name: linux,dummy-virt (DT) [ 0.107147] pstate: 80000005 (Nzcv daif -PAN -UAO -TCO BTYPE=--) [ 0.108876] pc : kernfs_find_and_get_ns+0x3c/0x7c [ 0.110244] lr : kernfs_find_and_get_ns+0x3c/0x7c [...] [ 0.134087] Call trace: [ 0.134800] kernfs_find_and_get_ns+0x3c/0x7c [ 0.136054] safe_name+0x4c/0xd0 [ 0.136994] __of_attach_node_sysfs+0xf8/0x124 [ 0.138287] of_core_init+0x90/0xfc [ 0.139296] driver_init+0x30/0x4c [ 0.140283] kernel_init_freeable+0x160/0x1b8 [ 0.141543] kernel_init+0x30/0x140 [ 0.142561] ret_from_fork+0x10/0x18 While not having sysfs isn't a very common option these days, it is still expected that such configuration would work. Paper over it by bailing out from __of_attach_node_sysfs() if CONFIG_SYSFS isn't enabled. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210820144722.169226-1-maz@kernel.org Signed-off-by: Rob Herring Signed-off-by: Sasha Levin --- drivers/of/kobj.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/of/kobj.c b/drivers/of/kobj.c index a32e60b024b8d..6675b5e56960c 100644 --- a/drivers/of/kobj.c +++ b/drivers/of/kobj.c @@ -119,7 +119,7 @@ int __of_attach_node_sysfs(struct device_node *np) struct property *pp; int rc; - if (!of_kset) + if (!IS_ENABLED(CONFIG_SYSFS) || !of_kset) return 0; np->kobj.kset = of_kset; From 04000d34b36d3f5e2147da8e2f880c7169c8bffc Mon Sep 17 00:00:00 2001 From: Manish Narani Date: Tue, 15 Jun 2021 16:13:54 +0530 Subject: [PATCH 0788/5344] mmc: sdhci-of-arasan: Check return value of non-void funtions [ Upstream commit 66bad6ed2204fdb78a0a8fb89d824397106a5471 ] At a couple of places, the return values of the non-void functions were not getting checked. This was reported by the coverity tool. Modify the code to check the return values of the same. Addresses-Coverity: ("check_return") Signed-off-by: Manish Narani Acked-by: Adrian Hunter Link: https://lore.kernel.org/r/1623753837-21035-5-git-send-email-manish.narani@xilinx.com Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin --- drivers/mmc/host/sdhci-of-arasan.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/mmc/host/sdhci-of-arasan.c b/drivers/mmc/host/sdhci-of-arasan.c index 7023cbec4017b..dd10f7abf5a71 100644 --- a/drivers/mmc/host/sdhci-of-arasan.c +++ b/drivers/mmc/host/sdhci-of-arasan.c @@ -192,7 +192,12 @@ static void sdhci_arasan_set_clock(struct sdhci_host *host, unsigned int clock) * through low speeds without power cycling. */ sdhci_set_clock(host, host->max_clk); - phy_power_on(sdhci_arasan->phy); + if (phy_power_on(sdhci_arasan->phy)) { + pr_err("%s: Cannot power on phy.\n", + mmc_hostname(host->mmc)); + return; + } + sdhci_arasan->is_phy_on = true; /* @@ -228,7 +233,12 @@ static void sdhci_arasan_set_clock(struct sdhci_host *host, unsigned int clock) msleep(20); if (ctrl_phy) { - phy_power_on(sdhci_arasan->phy); + if (phy_power_on(sdhci_arasan->phy)) { + pr_err("%s: Cannot power on phy.\n", + mmc_hostname(host->mmc)); + return; + } + sdhci_arasan->is_phy_on = true; } } @@ -416,7 +426,9 @@ static int sdhci_arasan_suspend(struct device *dev) ret = phy_power_off(sdhci_arasan->phy); if (ret) { dev_err(dev, "Cannot power off phy.\n"); - sdhci_resume_host(host); + if (sdhci_resume_host(host)) + dev_err(dev, "Cannot resume host.\n"); + return ret; } sdhci_arasan->is_phy_on = false; From b8c4bdf4090fc4f46b6c6c6680d5194ce81cd7d3 Mon Sep 17 00:00:00 2001 From: Thomas Hebb Date: Sun, 1 Aug 2021 04:46:14 -0700 Subject: [PATCH 0789/5344] mmc: rtsx_pci: Fix long reads when clock is prescaled [ Upstream commit 3ac5e45291f3f0d699a721357380d4593bc2dcb3 ] For unexplained reasons, the prescaler register for this device needs to be cleared (set to 1) while performing a data read or else the command will hang. This does not appear to affect the real clock rate sent out on the bus, so I assume it's purely to work around a hardware bug. During normal operation, the prescaler is already set to 1, so nothing needs to be done. However, in "initial mode" (which is used for sub-MHz clock speeds, like the core sets while enumerating cards), it's set to 128 and so we need to reset it during data reads. We currently fail to do this for long reads. This has no functional affect on the driver's operation currently written, as the MMC core always sets a clock above 1MHz before attempting any long reads. However, the core could conceivably set any clock speed at any time and the driver should still work, so I think this fix is worthwhile. I personally encountered this issue while performing data recovery on an external chip. My connections had poor signal integrity, so I modified the core code to reduce the clock speed. Without this change, I saw the card enumerate but was unable to actually read any data. Writes don't seem to work in the situation described above even with this change (and even if the workaround is extended to encompass data write commands). I was not able to find a way to get them working. Signed-off-by: Thomas Hebb Link: https://lore.kernel.org/r/2fef280d8409ab0100c26c6ac7050227defd098d.1627818365.git.tommyhebb@gmail.com Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin --- drivers/mmc/host/rtsx_pci_sdmmc.c | 36 ++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/drivers/mmc/host/rtsx_pci_sdmmc.c b/drivers/mmc/host/rtsx_pci_sdmmc.c index 11087976ab19c..9ff718b61c72e 100644 --- a/drivers/mmc/host/rtsx_pci_sdmmc.c +++ b/drivers/mmc/host/rtsx_pci_sdmmc.c @@ -539,9 +539,22 @@ static int sd_write_long_data(struct realtek_pci_sdmmc *host, return 0; } +static inline void sd_enable_initial_mode(struct realtek_pci_sdmmc *host) +{ + rtsx_pci_write_register(host->pcr, SD_CFG1, + SD_CLK_DIVIDE_MASK, SD_CLK_DIVIDE_128); +} + +static inline void sd_disable_initial_mode(struct realtek_pci_sdmmc *host) +{ + rtsx_pci_write_register(host->pcr, SD_CFG1, + SD_CLK_DIVIDE_MASK, SD_CLK_DIVIDE_0); +} + static int sd_rw_multi(struct realtek_pci_sdmmc *host, struct mmc_request *mrq) { struct mmc_data *data = mrq->data; + int err; if (host->sg_count < 0) { data->error = host->sg_count; @@ -550,22 +563,19 @@ static int sd_rw_multi(struct realtek_pci_sdmmc *host, struct mmc_request *mrq) return data->error; } - if (data->flags & MMC_DATA_READ) - return sd_read_long_data(host, mrq); + if (data->flags & MMC_DATA_READ) { + if (host->initial_mode) + sd_disable_initial_mode(host); - return sd_write_long_data(host, mrq); -} + err = sd_read_long_data(host, mrq); -static inline void sd_enable_initial_mode(struct realtek_pci_sdmmc *host) -{ - rtsx_pci_write_register(host->pcr, SD_CFG1, - SD_CLK_DIVIDE_MASK, SD_CLK_DIVIDE_128); -} + if (host->initial_mode) + sd_enable_initial_mode(host); -static inline void sd_disable_initial_mode(struct realtek_pci_sdmmc *host) -{ - rtsx_pci_write_register(host->pcr, SD_CFG1, - SD_CLK_DIVIDE_MASK, SD_CLK_DIVIDE_0); + return err; + } + + return sd_write_long_data(host, mrq); } static void sd_normal_rw(struct realtek_pci_sdmmc *host, From e33ae8f3340a4f03d3d020ce5dbc7c004d545b44 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Fri, 20 Aug 2021 09:55:53 +0800 Subject: [PATCH 0790/5344] selftests/bpf: Enlarge select() timeout for test_maps [ Upstream commit 2d82d73da35b72b53fe0d96350a2b8d929d07e42 ] 0Day robot observed that it's easily timeout on a heavy load host. ------------------- # selftests: bpf: test_maps # Fork 1024 tasks to 'test_update_delete' # Fork 1024 tasks to 'test_update_delete' # Fork 100 tasks to 'test_hashmap' # Fork 100 tasks to 'test_hashmap_percpu' # Fork 100 tasks to 'test_hashmap_sizes' # Fork 100 tasks to 'test_hashmap_walk' # Fork 100 tasks to 'test_arraymap' # Fork 100 tasks to 'test_arraymap_percpu' # Failed sockmap unexpected timeout not ok 3 selftests: bpf: test_maps # exit=1 # selftests: bpf: test_lru_map # nr_cpus:8 ------------------- Since this test will be scheduled by 0Day to a random host that could have only a few cpus(2-8), enlarge the timeout to avoid a false NG report. In practice, i tried to pin it to only one cpu by 'taskset 0x01 ./test_maps', and knew 10S is likely enough, but i still perfer to a larger value 30. Reported-by: kernel test robot Signed-off-by: Li Zhijian Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210820015556.23276-2-lizhijian@cn.fujitsu.com Signed-off-by: Sasha Levin --- tools/testing/selftests/bpf/test_maps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 1c4219ceced2f..45c7a55f0b8b5 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -972,7 +972,7 @@ static void test_sockmap(unsigned int tasks, void *data) FD_ZERO(&w); FD_SET(sfd[3], &w); - to.tv_sec = 1; + to.tv_sec = 30; to.tv_usec = 0; s = select(sfd[3] + 1, &w, NULL, NULL, &to); if (s == -1) { From 62723ba505a2c4008e53c47dbd2ddfa2a36a259f Mon Sep 17 00:00:00 2001 From: Nishad Kamdar Date: Wed, 25 Aug 2021 00:47:26 +0530 Subject: [PATCH 0791/5344] mmc: core: Return correct emmc response in case of ioctl error [ Upstream commit e72a55f2e5ddcfb3dce0701caf925ce435b87682 ] When a read/write command is sent via ioctl to the kernel, and the command fails, the actual error response of the emmc is not sent to the user. IOCTL read/write tests are carried out using commands 17 (Single BLock Read), 24 (Single Block Write), 18 (Multi Block Read), 25 (Multi Block Write) The tests are carried out on a 64Gb emmc device. All of these tests try to access an "out of range" sector address (0x09B2FFFF). It is seen that without the patch the response received by the user is not OUT_OF_RANGE error (R1 response 31st bit is not set) as per JEDEC specification. After applying the patch proper response is seen. This is because the function returns without copying the response to the user in case of failure. This patch fixes the issue. Hence, this memcpy is required whether we get an error response or not. Therefor it is moved up from the current position up to immediately after we have called mmc_wait_for_req(). The test code and the output of only the CMD17 is included in the commit to limit the message length. CMD17 (Test Code Snippet): ========================== printf("Forming CMD%d\n", opt_idx); /* single block read */ cmd.blksz = 512; cmd.blocks = 1; cmd.write_flag = 0; cmd.opcode = 17; //cmd.arg = atoi(argv[3]); cmd.arg = 0x09B2FFFF; /* Expecting response R1B */ cmd.flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 | MMC_CMD_ADTC; memset(data, 0, sizeof(__u8) * 512); mmc_ioc_cmd_set_data(cmd, data); printf("Sending CMD%d: ARG[0x%08x]\n", opt_idx, cmd.arg); if(ioctl(fd, MMC_IOC_CMD, &cmd)) perror("Error"); printf("\nResponse: %08x\n", cmd.response[0]); CMD17 (Output without patch): ============================= test@test-LIVA-Z:~$ sudo ./mmc cmd_test /dev/mmcblk0 17 Entering the do_mmc_commands:Device: /dev/mmcblk0 nargs:4 Entering the do_mmc_commands:Device: /dev/mmcblk0 options[17, 0x09B2FFF] Forming CMD17 Sending CMD17: ARG[0x09b2ffff] Error: Connection timed out Response: 00000000 (Incorrect response) CMD17 (Output with patch): ========================== test@test-LIVA-Z:~$ sudo ./mmc cmd_test /dev/mmcblk0 17 [sudo] password for test: Entering the do_mmc_commands:Device: /dev/mmcblk0 nargs:4 Entering the do_mmc_commands:Device: /dev/mmcblk0 options[17, 09B2FFFF] Forming CMD17 Sending CMD17: ARG[0x09b2ffff] Error: Connection timed out Response: 80000900 (Correct OUT_OF_ERROR response as per JEDEC specification) Signed-off-by: Nishad Kamdar Reviewed-by: Avri Altman Link: https://lore.kernel.org/r/20210824191726.8296-1-nishadkamdar@gmail.com Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin --- drivers/mmc/core/block.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 8322d22a59c45..e92f9373e2274 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -591,6 +591,7 @@ static int __mmc_blk_ioctl_cmd(struct mmc_card *card, struct mmc_blk_data *md, } mmc_wait_for_req(card->host, &mrq); + memcpy(&idata->ic.response, cmd.resp, sizeof(cmd.resp)); if (cmd.error) { dev_err(mmc_dev(card->host), "%s: cmd error %d\n", @@ -640,8 +641,6 @@ static int __mmc_blk_ioctl_cmd(struct mmc_card *card, struct mmc_blk_data *md, if (idata->ic.postsleep_min_us) usleep_range(idata->ic.postsleep_min_us, idata->ic.postsleep_max_us); - memcpy(&(idata->ic.response), cmd.resp, sizeof(cmd.resp)); - if (idata->rpmb || (cmd.flags & MMC_RSP_R1B) == MMC_RSP_R1B) { /* * Ensure RPMB/R1B command has completed by polling CMD13 From c18844acf18688c0f0ac579c545dea5a78401093 Mon Sep 17 00:00:00 2001 From: Ding Hui Date: Tue, 17 Aug 2021 22:55:10 +0800 Subject: [PATCH 0792/5344] cifs: fix wrong release in sess_alloc_buffer() failed path [ Upstream commit d72c74197b70bc3c95152f351a568007bffa3e11 ] smb_buf is allocated by small_smb_init_no_tc(), and buf type is CIFS_SMALL_BUFFER, so we should use cifs_small_buf_release() to release it in failed path. Signed-off-by: Ding Hui Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French Signed-off-by: Sasha Levin --- fs/cifs/sess.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 85bd644f9773b..30f841a880acd 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -610,7 +610,7 @@ sess_alloc_buffer(struct sess_data *sess_data, int wct) return 0; out_free_smb_buf: - kfree(smb_buf); + cifs_small_buf_release(smb_buf); sess_data->iov[0].iov_base = NULL; sess_data->iov[0].iov_len = 0; sess_data->buf0_type = CIFS_NO_BUFFER; From 284848d42c3e3ab5d9d998851be8510253744034 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Fri, 20 Aug 2021 15:35:01 +0300 Subject: [PATCH 0793/5344] Revert "USB: xhci: fix U1/U2 handling for hardware with XHCI_INTEL_HOST quirk set" [ Upstream commit 2847c46c61486fd8bca9136a6e27177212e78c69 ] This reverts commit 5d5323a6f3625f101dbfa94ba3ef7706cce38760. That commit effectively disabled Intel host initiated U1/U2 lpm for devices with periodic endpoints. Before that commit we disabled host initiated U1/U2 lpm if the exit latency was larger than any periodic endpoint service interval, this is according to xhci spec xhci 1.1 specification section 4.23.5.2 After that commit we incorrectly checked that service interval was smaller than U1/U2 inactivity timeout. This is not relevant, and can't happen for Intel hosts as previously set U1/U2 timeout = 105% * service interval. Patch claimed it solved cases where devices can't be enumerated because of bandwidth issues. This might be true but it's a side effect of accidentally turning off lpm. exit latency calculations have been revised since then Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20210820123503.2605901-5-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/usb/host/xhci.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index a3813c75a3de8..505da4999e208 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -4662,19 +4662,19 @@ static u16 xhci_calculate_u1_timeout(struct xhci_hcd *xhci, { unsigned long long timeout_ns; - if (xhci->quirks & XHCI_INTEL_HOST) - timeout_ns = xhci_calculate_intel_u1_timeout(udev, desc); - else - timeout_ns = udev->u1_params.sel; - /* Prevent U1 if service interval is shorter than U1 exit latency */ if (usb_endpoint_xfer_int(desc) || usb_endpoint_xfer_isoc(desc)) { - if (xhci_service_interval_to_ns(desc) <= timeout_ns) { + if (xhci_service_interval_to_ns(desc) <= udev->u1_params.mel) { dev_dbg(&udev->dev, "Disable U1, ESIT shorter than exit latency\n"); return USB3_LPM_DISABLED; } } + if (xhci->quirks & XHCI_INTEL_HOST) + timeout_ns = xhci_calculate_intel_u1_timeout(udev, desc); + else + timeout_ns = udev->u1_params.sel; + /* The U1 timeout is encoded in 1us intervals. * Don't return a timeout of zero, because that's USB3_LPM_DISABLED. */ @@ -4726,19 +4726,19 @@ static u16 xhci_calculate_u2_timeout(struct xhci_hcd *xhci, { unsigned long long timeout_ns; - if (xhci->quirks & XHCI_INTEL_HOST) - timeout_ns = xhci_calculate_intel_u2_timeout(udev, desc); - else - timeout_ns = udev->u2_params.sel; - /* Prevent U2 if service interval is shorter than U2 exit latency */ if (usb_endpoint_xfer_int(desc) || usb_endpoint_xfer_isoc(desc)) { - if (xhci_service_interval_to_ns(desc) <= timeout_ns) { + if (xhci_service_interval_to_ns(desc) <= udev->u2_params.mel) { dev_dbg(&udev->dev, "Disable U2, ESIT shorter than exit latency\n"); return USB3_LPM_DISABLED; } } + if (xhci->quirks & XHCI_INTEL_HOST) + timeout_ns = xhci_calculate_intel_u2_timeout(udev, desc); + else + timeout_ns = udev->u2_params.sel; + /* The U2 timeout is encoded in 256us intervals */ timeout_ns = DIV_ROUND_UP_ULL(timeout_ns, 256 * 1000); /* If the necessary timeout value is bigger than what we can set in the From f697762c4f36305a9c0b679c2eeb5dcc1469d666 Mon Sep 17 00:00:00 2001 From: Nadezda Lutovinova Date: Thu, 19 Aug 2021 19:33:23 +0300 Subject: [PATCH 0794/5344] usb: musb: musb_dsps: request_irq() after initializing musb [ Upstream commit 7c75bde329d7e2a93cf86a5c15c61f96f1446cdc ] If IRQ occurs between calling dsps_setup_optional_vbus_irq() and dsps_create_musb_pdev(), then null pointer dereference occurs since glue->musb wasn't initialized yet. The patch puts initializing of neccesery data before registration of the interrupt handler. Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Nadezda Lutovinova Link: https://lore.kernel.org/r/20210819163323.17714-1-lutovinova@ispras.ru Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/usb/musb/musb_dsps.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/usb/musb/musb_dsps.c b/drivers/usb/musb/musb_dsps.c index 327d4f7baaf7c..89d659cef5c63 100644 --- a/drivers/usb/musb/musb_dsps.c +++ b/drivers/usb/musb/musb_dsps.c @@ -890,23 +890,22 @@ static int dsps_probe(struct platform_device *pdev) if (!glue->usbss_base) return -ENXIO; - if (usb_get_dr_mode(&pdev->dev) == USB_DR_MODE_PERIPHERAL) { - ret = dsps_setup_optional_vbus_irq(pdev, glue); - if (ret) - goto err_iounmap; - } - platform_set_drvdata(pdev, glue); pm_runtime_enable(&pdev->dev); ret = dsps_create_musb_pdev(glue, pdev); if (ret) goto err; + if (usb_get_dr_mode(&pdev->dev) == USB_DR_MODE_PERIPHERAL) { + ret = dsps_setup_optional_vbus_irq(pdev, glue); + if (ret) + goto err; + } + return 0; err: pm_runtime_disable(&pdev->dev); -err_iounmap: iounmap(glue->usbss_base); return ret; } From 5d8f2ce37e5b39b3063936a6f9d6bc4322b84707 Mon Sep 17 00:00:00 2001 From: Anirudh Rayabharam Date: Sat, 21 Aug 2021 00:31:21 +0530 Subject: [PATCH 0795/5344] usbip: give back URBs for unsent unlink requests during cleanup [ Upstream commit 258c81b341c8025d79073ce2d6ce19dcdc7d10d2 ] In vhci_device_unlink_cleanup(), the URBs for unsent unlink requests are not given back. This sometimes causes usb_kill_urb to wait indefinitely for that urb to be given back. syzbot has reported a hung task issue [1] for this. To fix this, give back the urbs corresponding to unsent unlink requests (unlink_tx list) similar to how urbs corresponding to unanswered unlink requests (unlink_rx list) are given back. [1]: https://syzkaller.appspot.com/bug?id=08f12df95ae7da69814e64eb5515d5a85ed06b76 Reported-by: syzbot+74d6ef051d3d2eacf428@syzkaller.appspotmail.com Tested-by: syzbot+74d6ef051d3d2eacf428@syzkaller.appspotmail.com Reviewed-by: Shuah Khan Signed-off-by: Anirudh Rayabharam Link: https://lore.kernel.org/r/20210820190122.16379-2-mail@anirudhrb.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/usb/usbip/vhci_hcd.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/drivers/usb/usbip/vhci_hcd.c b/drivers/usb/usbip/vhci_hcd.c index 98636fbf71882..46a46cde2070a 100644 --- a/drivers/usb/usbip/vhci_hcd.c +++ b/drivers/usb/usbip/vhci_hcd.c @@ -952,8 +952,32 @@ static void vhci_device_unlink_cleanup(struct vhci_device *vdev) spin_lock(&vdev->priv_lock); list_for_each_entry_safe(unlink, tmp, &vdev->unlink_tx, list) { + struct urb *urb; + + /* give back urb of unsent unlink request */ pr_info("unlink cleanup tx %lu\n", unlink->unlink_seqnum); + + urb = pickup_urb_and_free_priv(vdev, unlink->unlink_seqnum); + if (!urb) { + list_del(&unlink->list); + kfree(unlink); + continue; + } + + urb->status = -ENODEV; + + usb_hcd_unlink_urb_from_ep(hcd, urb); + list_del(&unlink->list); + + spin_unlock(&vdev->priv_lock); + spin_unlock_irqrestore(&vhci->lock, flags); + + usb_hcd_giveback_urb(hcd, urb, urb->status); + + spin_lock_irqsave(&vhci->lock, flags); + spin_lock(&vdev->priv_lock); + kfree(unlink); } From 66432c5a5bf835c8032d93993819c92c701f80e3 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 19 Aug 2021 16:59:37 -0600 Subject: [PATCH 0796/5344] usbip:vhci_hcd USB port can get stuck in the disabled state [ Upstream commit 66cce9e73ec61967ed1f97f30cee79bd9a2bb7ee ] When a remote usb device is attached to the local Virtual USB Host Controller Root Hub port, the bound device driver may send a port reset command. vhci_hcd accepts port resets only when the device doesn't have port address assigned to it. When reset happens device is in assigned/used state and vhci_hcd rejects it leaving the port in a stuck state. This problem was found when a blue-tooth or xbox wireless dongle was passed through using usbip. A few drivers reset the port during probe including mt76 driver specific to this bug report. Fix the problem with a change to honor reset requests when device is in used state (VDEV_ST_USED). Reported-and-tested-by: Michael Suggested-by: Michael Signed-off-by: Shuah Khan Link: https://lore.kernel.org/r/20210819225937.41037-1-skhan@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/usb/usbip/vhci_hcd.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/usb/usbip/vhci_hcd.c b/drivers/usb/usbip/vhci_hcd.c index 46a46cde2070a..170abb06a8a4d 100644 --- a/drivers/usb/usbip/vhci_hcd.c +++ b/drivers/usb/usbip/vhci_hcd.c @@ -455,8 +455,14 @@ static int vhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, vhci_hcd->port_status[rhport] &= ~(1 << USB_PORT_FEAT_RESET); vhci_hcd->re_timeout = 0; + /* + * A few drivers do usb reset during probe when + * the device could be in VDEV_ST_USED state + */ if (vhci_hcd->vdev[rhport].ud.status == - VDEV_ST_NOTASSIGNED) { + VDEV_ST_NOTASSIGNED || + vhci_hcd->vdev[rhport].ud.status == + VDEV_ST_USED) { usbip_dbg_vhci_rh( " enable rhport %d (status %u)\n", rhport, From a37e87e3fe64fdd8d23ead184d8540859c40d205 Mon Sep 17 00:00:00 2001 From: Sugar Zhang Date: Thu, 26 Aug 2021 12:01:50 +0800 Subject: [PATCH 0797/5344] ASoC: rockchip: i2s: Fix regmap_ops hang [ Upstream commit 53ca9b9777b95cdd689181d7c547e38dc79adad0 ] API 'set_fmt' maybe called when PD is off, in the situation, any register access will hang the system. so, enable PD before r/w register. Signed-off-by: Sugar Zhang Link: https://lore.kernel.org/r/1629950520-14190-4-git-send-email-sugar.zhang@rock-chips.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/rockchip/rockchip_i2s.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/sound/soc/rockchip/rockchip_i2s.c b/sound/soc/rockchip/rockchip_i2s.c index 61c984f10d8e6..f48b146cd96a1 100644 --- a/sound/soc/rockchip/rockchip_i2s.c +++ b/sound/soc/rockchip/rockchip_i2s.c @@ -186,7 +186,9 @@ static int rockchip_i2s_set_fmt(struct snd_soc_dai *cpu_dai, { struct rk_i2s_dev *i2s = to_info(cpu_dai); unsigned int mask = 0, val = 0; + int ret = 0; + pm_runtime_get_sync(cpu_dai->dev); mask = I2S_CKR_MSS_MASK; switch (fmt & SND_SOC_DAIFMT_MASTER_MASK) { case SND_SOC_DAIFMT_CBS_CFS: @@ -199,7 +201,8 @@ static int rockchip_i2s_set_fmt(struct snd_soc_dai *cpu_dai, i2s->is_master_mode = false; break; default: - return -EINVAL; + ret = -EINVAL; + goto err_pm_put; } regmap_update_bits(i2s->regmap, I2S_CKR, mask, val); @@ -213,7 +216,8 @@ static int rockchip_i2s_set_fmt(struct snd_soc_dai *cpu_dai, val = I2S_CKR_CKP_POS; break; default: - return -EINVAL; + ret = -EINVAL; + goto err_pm_put; } regmap_update_bits(i2s->regmap, I2S_CKR, mask, val); @@ -236,7 +240,8 @@ static int rockchip_i2s_set_fmt(struct snd_soc_dai *cpu_dai, val = I2S_TXCR_TFS_PCM | I2S_TXCR_PBM_MODE(1); break; default: - return -EINVAL; + ret = -EINVAL; + goto err_pm_put; } regmap_update_bits(i2s->regmap, I2S_TXCR, mask, val); @@ -259,12 +264,16 @@ static int rockchip_i2s_set_fmt(struct snd_soc_dai *cpu_dai, val = I2S_RXCR_TFS_PCM | I2S_RXCR_PBM_MODE(1); break; default: - return -EINVAL; + ret = -EINVAL; + goto err_pm_put; } regmap_update_bits(i2s->regmap, I2S_RXCR, mask, val); - return 0; +err_pm_put: + pm_runtime_put(cpu_dai->dev); + + return ret; } static int rockchip_i2s_hw_params(struct snd_pcm_substream *substream, From 890bf801ac289a6489472ba8a1605b17722fe554 Mon Sep 17 00:00:00 2001 From: Xiaotan Luo Date: Thu, 26 Aug 2021 12:02:36 +0800 Subject: [PATCH 0798/5344] ASoC: rockchip: i2s: Fixup config for DAIFMT_DSP_A/B [ Upstream commit 1bf56843e664eef2525bdbfae6a561e98910f676 ] - DSP_A: PCM delay 1 bit mode, L data MSB after FRM LRC - DSP_B: PCM no delay mode, L data MSB during FRM LRC Signed-off-by: Xiaotan Luo Signed-off-by: Sugar Zhang Link: https://lore.kernel.org/r/1629950562-14281-3-git-send-email-sugar.zhang@rock-chips.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/rockchip/rockchip_i2s.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sound/soc/rockchip/rockchip_i2s.c b/sound/soc/rockchip/rockchip_i2s.c index f48b146cd96a1..086c90e095770 100644 --- a/sound/soc/rockchip/rockchip_i2s.c +++ b/sound/soc/rockchip/rockchip_i2s.c @@ -233,12 +233,12 @@ static int rockchip_i2s_set_fmt(struct snd_soc_dai *cpu_dai, case SND_SOC_DAIFMT_I2S: val = I2S_TXCR_IBM_NORMAL; break; - case SND_SOC_DAIFMT_DSP_A: /* PCM no delay mode */ - val = I2S_TXCR_TFS_PCM; - break; - case SND_SOC_DAIFMT_DSP_B: /* PCM delay 1 mode */ + case SND_SOC_DAIFMT_DSP_A: /* PCM delay 1 bit mode */ val = I2S_TXCR_TFS_PCM | I2S_TXCR_PBM_MODE(1); break; + case SND_SOC_DAIFMT_DSP_B: /* PCM no delay mode */ + val = I2S_TXCR_TFS_PCM; + break; default: ret = -EINVAL; goto err_pm_put; @@ -257,12 +257,12 @@ static int rockchip_i2s_set_fmt(struct snd_soc_dai *cpu_dai, case SND_SOC_DAIFMT_I2S: val = I2S_RXCR_IBM_NORMAL; break; - case SND_SOC_DAIFMT_DSP_A: /* PCM no delay mode */ - val = I2S_RXCR_TFS_PCM; - break; - case SND_SOC_DAIFMT_DSP_B: /* PCM delay 1 mode */ + case SND_SOC_DAIFMT_DSP_A: /* PCM delay 1 bit mode */ val = I2S_RXCR_TFS_PCM | I2S_RXCR_PBM_MODE(1); break; + case SND_SOC_DAIFMT_DSP_B: /* PCM no delay mode */ + val = I2S_RXCR_TFS_PCM; + break; default: ret = -EINVAL; goto err_pm_put; From 9720355a4619b5cf6e06a1e4c1a1e3ae1fe87add Mon Sep 17 00:00:00 2001 From: Sean Keely Date: Thu, 19 Aug 2021 21:26:14 -0500 Subject: [PATCH 0799/5344] drm/amdkfd: Account for SH/SE count when setting up cu masks. [ Upstream commit 1ec06c2dee679e9f089e78ed20cb74ee90155f61 ] On systems with multiple SH per SE compute_static_thread_mgmt_se# is split into independent masks, one for each SH, in the upper and lower 16 bits. We need to detect this and apply cu masking to each SH. The cu mask bits are assigned first to each SE, then to alternate SHs, then finally to higher CU id. This ensures that the maximum number of SPIs are engaged as early as possible while balancing CU assignment to each SH. v2: Use max SH/SE rather than max SH in cu_per_sh. v3: Fix comment blocks, ensure se_mask is initially zero filled, and correctly assign se.sh.cu positions to unset bits in cu_mask. Signed-off-by: Sean Keely Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin --- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 84 +++++++++++++++----- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h | 1 + 2 files changed, 64 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c index 88813dad731fa..c021519af8106 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c @@ -98,36 +98,78 @@ void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm, uint32_t *se_mask) { struct kfd_cu_info cu_info; - uint32_t cu_per_se[KFD_MAX_NUM_SE] = {0}; - int i, se, sh, cu = 0; - + uint32_t cu_per_sh[KFD_MAX_NUM_SE][KFD_MAX_NUM_SH_PER_SE] = {0}; + int i, se, sh, cu; amdgpu_amdkfd_get_cu_info(mm->dev->kgd, &cu_info); if (cu_mask_count > cu_info.cu_active_number) cu_mask_count = cu_info.cu_active_number; + /* Exceeding these bounds corrupts the stack and indicates a coding error. + * Returning with no CU's enabled will hang the queue, which should be + * attention grabbing. + */ + if (cu_info.num_shader_engines > KFD_MAX_NUM_SE) { + pr_err("Exceeded KFD_MAX_NUM_SE, chip reports %d\n", cu_info.num_shader_engines); + return; + } + if (cu_info.num_shader_arrays_per_engine > KFD_MAX_NUM_SH_PER_SE) { + pr_err("Exceeded KFD_MAX_NUM_SH, chip reports %d\n", + cu_info.num_shader_arrays_per_engine * cu_info.num_shader_engines); + return; + } + /* Count active CUs per SH. + * + * Some CUs in an SH may be disabled. HW expects disabled CUs to be + * represented in the high bits of each SH's enable mask (the upper and lower + * 16 bits of se_mask) and will take care of the actual distribution of + * disabled CUs within each SH automatically. + * Each half of se_mask must be filled only on bits 0-cu_per_sh[se][sh]-1. + * + * See note on Arcturus cu_bitmap layout in gfx_v9_0_get_cu_info. + */ for (se = 0; se < cu_info.num_shader_engines; se++) for (sh = 0; sh < cu_info.num_shader_arrays_per_engine; sh++) - cu_per_se[se] += hweight32(cu_info.cu_bitmap[se % 4][sh + (se / 4)]); - - /* Symmetrically map cu_mask to all SEs: - * cu_mask[0] bit0 -> se_mask[0] bit0; - * cu_mask[0] bit1 -> se_mask[1] bit0; - * ... (if # SE is 4) - * cu_mask[0] bit4 -> se_mask[0] bit1; + cu_per_sh[se][sh] = hweight32(cu_info.cu_bitmap[se % 4][sh + (se / 4)]); + + /* Symmetrically map cu_mask to all SEs & SHs: + * se_mask programs up to 2 SH in the upper and lower 16 bits. + * + * Examples + * Assuming 1 SH/SE, 4 SEs: + * cu_mask[0] bit0 -> se_mask[0] bit0 + * cu_mask[0] bit1 -> se_mask[1] bit0 + * ... + * cu_mask[0] bit4 -> se_mask[0] bit1 + * ... + * + * Assuming 2 SH/SE, 4 SEs + * cu_mask[0] bit0 -> se_mask[0] bit0 (SE0,SH0,CU0) + * cu_mask[0] bit1 -> se_mask[1] bit0 (SE1,SH0,CU0) + * ... + * cu_mask[0] bit4 -> se_mask[0] bit16 (SE0,SH1,CU0) + * cu_mask[0] bit5 -> se_mask[1] bit16 (SE1,SH1,CU0) + * ... + * cu_mask[0] bit8 -> se_mask[0] bit1 (SE0,SH0,CU1) * ... + * + * First ensure all CUs are disabled, then enable user specified CUs. */ - se = 0; - for (i = 0; i < cu_mask_count; i++) { - if (cu_mask[i / 32] & (1 << (i % 32))) - se_mask[se] |= 1 << cu; - - do { - se++; - if (se == cu_info.num_shader_engines) { - se = 0; - cu++; + for (i = 0; i < cu_info.num_shader_engines; i++) + se_mask[i] = 0; + + i = 0; + for (cu = 0; cu < 16; cu++) { + for (sh = 0; sh < cu_info.num_shader_arrays_per_engine; sh++) { + for (se = 0; se < cu_info.num_shader_engines; se++) { + if (cu_per_sh[se][sh] > cu) { + if (cu_mask[i / 32] & (1 << (i % 32))) + se_mask[se] |= 1 << (cu + sh * 16); + i++; + if (i == cu_mask_count) + return; + } } - } while (cu >= cu_per_se[se] && cu < 32); + } } } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h index fbdb16418847c..4edc012e31387 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h @@ -27,6 +27,7 @@ #include "kfd_priv.h" #define KFD_MAX_NUM_SE 8 +#define KFD_MAX_NUM_SH_PER_SE 2 /** * struct mqd_manager From 9dbbe2c4bd09a9abed6e327015df9eae638c1d88 Mon Sep 17 00:00:00 2001 From: Zhang Qilong Date: Mon, 2 Aug 2021 17:28:24 +0300 Subject: [PATCH 0800/5344] iwlwifi: mvm: fix a memory leak in iwl_mvm_mac_ctxt_beacon_changed [ Upstream commit 0f5d44ac6e55551798dd3da0ff847c8df5990822 ] If beacon_inject_active is true, we will return without freeing beacon. Fid that by freeing it before returning. Signed-off-by: Zhang Qilong [reworded the commit message] Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210802172232.d16206ca60fc.I9984a9b442c84814c307cee3213044e24d26f38a@changeid Signed-off-by: Luca Coelho Signed-off-by: Sasha Levin --- drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c index 9c417dd062913..7736621dca653 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c @@ -1043,8 +1043,10 @@ int iwl_mvm_mac_ctxt_beacon_changed(struct iwl_mvm *mvm, return -ENOMEM; #ifdef CONFIG_IWLWIFI_DEBUGFS - if (mvm->beacon_inject_active) + if (mvm->beacon_inject_active) { + dev_kfree_skb(beacon); return -EBUSY; + } #endif ret = iwl_mvm_mac_ctxt_send_beacon(mvm, vif, beacon); From 4c3ba97159a13ff553f393593e11a2346c612dc5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 2 Aug 2021 17:28:27 +0300 Subject: [PATCH 0801/5344] iwlwifi: mvm: avoid static queue number aliasing [ Upstream commit c6ce1c74ef2923b8ffd85f7f8b486f804f343b39 ] When TVQM is enabled (iwl_mvm_has_new_tx_api() is true), then queue numbers are just sequentially assigned 0, 1, 2, ... Prior to TVQM, in DQA, there were some statically allocated queue numbers: * IWL_MVM_DQA_AUX_QUEUE == 1, * both IWL_MVM_DQA_INJECT_MONITOR_QUEUE and IWL_MVM_DQA_P2P_DEVICE_QUEUE == 2, and * IWL_MVM_DQA_AP_PROBE_RESP_QUEUE == 9. Now, these values are assigned to the members mvm->aux_queue, mvm->snif_queue, mvm->probe_queue and mvm->p2p_dev_queue by default. Normally, this doesn't really matter, and if TVQM is in fact available we override them to the real values after allocating a queue for use there. However, this allocation doesn't always happen. For example, for mvm->p2p_dev_queue (== 2) it only happens when the P2P Device interface is started, if any. If it's not started, the value in mvm->p2p_dev_queue remains 2. This wouldn't really matter all that much if it weren't for iwl_mvm_is_static_queue() which checks a queue number against one of those four static numbers. Now, if no P2P Device or monitor interface is added then queue 2 may be dynamically allocated, yet alias mvm->p2p_dev_queue or mvm->snif_queue, and thus iwl_mvm_is_static_queue() erroneously returns true for it. If it then gets full, all interface queues are stopped, instead of just backpressuring against the one TXQ that's really the only affected one. This clearly can lead to issues, as everything is stopped even if just a single TXQ filled its corresponding HW queue, if it happens to have an appropriate number (2 or 9, AUX is always reassigned.) Due to a mac80211 bug, this also led to a situation in which the queues remained stopped across a deauthentication and then attempts to connect to a new AP started failing, but that's fixed separately. Fix all of this by simply initializing the queue numbers to the invalid value until they're used, if TVQM is enabled, and also setting them back to that value when the queues are later freed again. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210802172232.2e47e623f9e2.I9b0830dafbb68ef35b7b8f0f46160abec02ac7d0@changeid Signed-off-by: Luca Coelho Signed-off-by: Sasha Levin --- drivers/net/wireless/intel/iwlwifi/mvm/ops.c | 24 +++++++++++++--- drivers/net/wireless/intel/iwlwifi/mvm/sta.c | 30 ++++++++++++-------- 2 files changed, 38 insertions(+), 16 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c index 8b0576cde797e..a9aab6c690e85 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c @@ -687,10 +687,26 @@ iwl_op_mode_mvm_start(struct iwl_trans *trans, const struct iwl_cfg *cfg, mvm->fw_restart = iwlwifi_mod_params.fw_restart ? -1 : 0; - mvm->aux_queue = IWL_MVM_DQA_AUX_QUEUE; - mvm->snif_queue = IWL_MVM_DQA_INJECT_MONITOR_QUEUE; - mvm->probe_queue = IWL_MVM_DQA_AP_PROBE_RESP_QUEUE; - mvm->p2p_dev_queue = IWL_MVM_DQA_P2P_DEVICE_QUEUE; + if (iwl_mvm_has_new_tx_api(mvm)) { + /* + * If we have the new TX/queue allocation API initialize them + * all to invalid numbers. We'll rewrite the ones that we need + * later, but that doesn't happen for all of them all of the + * time (e.g. P2P Device is optional), and if a dynamic queue + * ends up getting number 2 (IWL_MVM_DQA_P2P_DEVICE_QUEUE) then + * iwl_mvm_is_static_queue() erroneously returns true, and we + * might have things getting stuck. + */ + mvm->aux_queue = IWL_MVM_INVALID_QUEUE; + mvm->snif_queue = IWL_MVM_INVALID_QUEUE; + mvm->probe_queue = IWL_MVM_INVALID_QUEUE; + mvm->p2p_dev_queue = IWL_MVM_INVALID_QUEUE; + } else { + mvm->aux_queue = IWL_MVM_DQA_AUX_QUEUE; + mvm->snif_queue = IWL_MVM_DQA_INJECT_MONITOR_QUEUE; + mvm->probe_queue = IWL_MVM_DQA_AP_PROBE_RESP_QUEUE; + mvm->p2p_dev_queue = IWL_MVM_DQA_P2P_DEVICE_QUEUE; + } mvm->sf_state = SF_UNINIT; if (iwl_mvm_has_unified_ucode(mvm)) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c index 40cafcf40ccf0..5df4bbb6c6de3 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c @@ -346,8 +346,9 @@ static int iwl_mvm_invalidate_sta_queue(struct iwl_mvm *mvm, int queue, } static int iwl_mvm_disable_txq(struct iwl_mvm *mvm, struct ieee80211_sta *sta, - int queue, u8 tid, u8 flags) + u16 *queueptr, u8 tid, u8 flags) { + int queue = *queueptr; struct iwl_scd_txq_cfg_cmd cmd = { .scd_queue = queue, .action = SCD_CFG_DISABLE_QUEUE, @@ -356,6 +357,7 @@ static int iwl_mvm_disable_txq(struct iwl_mvm *mvm, struct ieee80211_sta *sta, if (iwl_mvm_has_new_tx_api(mvm)) { iwl_trans_txq_free(mvm->trans, queue); + *queueptr = IWL_MVM_INVALID_QUEUE; return 0; } @@ -517,6 +519,7 @@ static int iwl_mvm_free_inactive_queue(struct iwl_mvm *mvm, int queue, u8 sta_id, tid; unsigned long disable_agg_tids = 0; bool same_sta; + u16 queue_tmp = queue; int ret; lockdep_assert_held(&mvm->mutex); @@ -539,7 +542,7 @@ static int iwl_mvm_free_inactive_queue(struct iwl_mvm *mvm, int queue, iwl_mvm_invalidate_sta_queue(mvm, queue, disable_agg_tids, false); - ret = iwl_mvm_disable_txq(mvm, old_sta, queue, tid, 0); + ret = iwl_mvm_disable_txq(mvm, old_sta, &queue_tmp, tid, 0); if (ret) { IWL_ERR(mvm, "Failed to free inactive queue %d (ret=%d)\n", @@ -1209,6 +1212,7 @@ static int iwl_mvm_sta_alloc_queue(struct iwl_mvm *mvm, unsigned int wdg_timeout = iwl_mvm_get_wd_timeout(mvm, mvmsta->vif, false, false); int queue = -1; + u16 queue_tmp; unsigned long disable_agg_tids = 0; enum iwl_mvm_agg_state queue_state; bool shared_queue = false, inc_ssn; @@ -1357,7 +1361,8 @@ static int iwl_mvm_sta_alloc_queue(struct iwl_mvm *mvm, return 0; out_err: - iwl_mvm_disable_txq(mvm, sta, queue, tid, 0); + queue_tmp = queue; + iwl_mvm_disable_txq(mvm, sta, &queue_tmp, tid, 0); return ret; } @@ -1795,7 +1800,7 @@ static void iwl_mvm_disable_sta_queues(struct iwl_mvm *mvm, if (mvm_sta->tid_data[i].txq_id == IWL_MVM_INVALID_QUEUE) continue; - iwl_mvm_disable_txq(mvm, sta, mvm_sta->tid_data[i].txq_id, i, + iwl_mvm_disable_txq(mvm, sta, &mvm_sta->tid_data[i].txq_id, i, 0); mvm_sta->tid_data[i].txq_id = IWL_MVM_INVALID_QUEUE; } @@ -2005,7 +2010,7 @@ static int iwl_mvm_add_int_sta_with_queue(struct iwl_mvm *mvm, int macidx, ret = iwl_mvm_add_int_sta_common(mvm, sta, NULL, macidx, maccolor); if (ret) { if (!iwl_mvm_has_new_tx_api(mvm)) - iwl_mvm_disable_txq(mvm, NULL, *queue, + iwl_mvm_disable_txq(mvm, NULL, queue, IWL_MAX_TID_COUNT, 0); return ret; } @@ -2073,7 +2078,7 @@ int iwl_mvm_rm_snif_sta(struct iwl_mvm *mvm, struct ieee80211_vif *vif) if (WARN_ON_ONCE(mvm->snif_sta.sta_id == IWL_MVM_INVALID_STA)) return -EINVAL; - iwl_mvm_disable_txq(mvm, NULL, mvm->snif_queue, IWL_MAX_TID_COUNT, 0); + iwl_mvm_disable_txq(mvm, NULL, &mvm->snif_queue, IWL_MAX_TID_COUNT, 0); ret = iwl_mvm_rm_sta_common(mvm, mvm->snif_sta.sta_id); if (ret) IWL_WARN(mvm, "Failed sending remove station\n"); @@ -2090,7 +2095,7 @@ int iwl_mvm_rm_aux_sta(struct iwl_mvm *mvm) if (WARN_ON_ONCE(mvm->aux_sta.sta_id == IWL_MVM_INVALID_STA)) return -EINVAL; - iwl_mvm_disable_txq(mvm, NULL, mvm->aux_queue, IWL_MAX_TID_COUNT, 0); + iwl_mvm_disable_txq(mvm, NULL, &mvm->aux_queue, IWL_MAX_TID_COUNT, 0); ret = iwl_mvm_rm_sta_common(mvm, mvm->aux_sta.sta_id); if (ret) IWL_WARN(mvm, "Failed sending remove station\n"); @@ -2186,7 +2191,7 @@ static void iwl_mvm_free_bcast_sta_queues(struct iwl_mvm *mvm, struct ieee80211_vif *vif) { struct iwl_mvm_vif *mvmvif = iwl_mvm_vif_from_mac80211(vif); - int queue; + u16 *queueptr, queue; lockdep_assert_held(&mvm->mutex); @@ -2195,10 +2200,10 @@ static void iwl_mvm_free_bcast_sta_queues(struct iwl_mvm *mvm, switch (vif->type) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_ADHOC: - queue = mvm->probe_queue; + queueptr = &mvm->probe_queue; break; case NL80211_IFTYPE_P2P_DEVICE: - queue = mvm->p2p_dev_queue; + queueptr = &mvm->p2p_dev_queue; break; default: WARN(1, "Can't free bcast queue on vif type %d\n", @@ -2206,7 +2211,8 @@ static void iwl_mvm_free_bcast_sta_queues(struct iwl_mvm *mvm, return; } - iwl_mvm_disable_txq(mvm, NULL, queue, IWL_MAX_TID_COUNT, 0); + queue = *queueptr; + iwl_mvm_disable_txq(mvm, NULL, queueptr, IWL_MAX_TID_COUNT, 0); if (iwl_mvm_has_new_tx_api(mvm)) return; @@ -2441,7 +2447,7 @@ int iwl_mvm_rm_mcast_sta(struct iwl_mvm *mvm, struct ieee80211_vif *vif) iwl_mvm_flush_sta(mvm, &mvmvif->mcast_sta, true, 0); - iwl_mvm_disable_txq(mvm, NULL, mvmvif->cab_queue, 0, 0); + iwl_mvm_disable_txq(mvm, NULL, &mvmvif->cab_queue, 0, 0); ret = iwl_mvm_rm_sta_common(mvm, mvmvif->mcast_sta.sta_id); if (ret) From ed16f498aef106aa6f188dd6818ea846d021086c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 5 Aug 2021 13:19:31 +0300 Subject: [PATCH 0802/5344] iwlwifi: mvm: fix access to BSS elements [ Upstream commit 6c608cd6962ebdf84fd3de6d42f88ed64d2f4e1b ] BSS elements are protected using RCU, so we need to use RCU properly to access them, fix that. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210805130823.fd8b5791ab44.Iba26800a6301078d3782fb249c476dd8ac2bf3c6@changeid Signed-off-by: Luca Coelho Signed-off-by: Sasha Levin --- drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 09b1a6beee77c..081cbc9ec7368 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -2970,16 +2970,20 @@ static void iwl_mvm_check_he_obss_narrow_bw_ru_iter(struct wiphy *wiphy, void *_data) { struct iwl_mvm_he_obss_narrow_bw_ru_data *data = _data; + const struct cfg80211_bss_ies *ies; const struct element *elem; - elem = cfg80211_find_elem(WLAN_EID_EXT_CAPABILITY, bss->ies->data, - bss->ies->len); + rcu_read_lock(); + ies = rcu_dereference(bss->ies); + elem = cfg80211_find_elem(WLAN_EID_EXT_CAPABILITY, ies->data, + ies->len); if (!elem || elem->datalen < 10 || !(elem->data[10] & WLAN_EXT_CAPA10_OBSS_NARROW_BW_RU_TOLERANCE_SUPPORT)) { data->tolerated = false; } + rcu_read_unlock(); } static void iwl_mvm_check_he_obss_narrow_bw_ru(struct ieee80211_hw *hw, From 6807438cc8dccd12e33c95fa83ed93248dd4d6d3 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Thu, 24 Sep 2020 20:58:50 +0300 Subject: [PATCH 0803/5344] net/mlx5: DR, Enable QP retransmission [ Upstream commit ec449ed8230cd30769de3cb70ee0fce293047372 ] Under high stress, SW steering might get stuck on polling for completion that never comes. For such cases QP needs to have protocol retransmission mechanism enabled. Currently the retransmission timeout is defined as 0 (unlimited). Fix this by defining a real timeout. Signed-off-by: Yevgeny Kliteynik Reviewed-by: Alex Vesker Signed-off-by: Saeed Mahameed Signed-off-by: Sasha Levin --- drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c index fae3c89649631..e253afa5e7419 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c @@ -603,6 +603,7 @@ static int dr_cmd_modify_qp_rtr2rts(struct mlx5_core_dev *mdev, MLX5_SET(qpc, qpc, log_ack_req_freq, 0); MLX5_SET(qpc, qpc, retry_count, attr->retry_cnt); MLX5_SET(qpc, qpc, rnr_retry, attr->rnr_retry); + MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x8); /* ~1ms */ return mlx5_core_qp_modify(mdev, MLX5_CMD_OP_RTR2RTS_QP, 0, qpc, &dr_qp->mqp); From e7b2996741c20a01dc4d0339f8884c79e372b256 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 30 Jul 2021 11:07:10 +0100 Subject: [PATCH 0804/5344] parport: remove non-zero check on count [ Upstream commit 0be883a0d795d9146f5325de582584147dd0dcdc ] The check for count appears to be incorrect since a non-zero count check occurs a couple of statements earlier. Currently the check is always false and the dev->port->irq != PARPORT_IRQ_NONE part of the check is never tested and the if statement is dead-code. Fix this by removing the check on count. Note that this code is pre-git history, so I can't find a sha for it. Acked-by: Sudip Mukherjee Signed-off-by: Colin Ian King Addresses-Coverity: ("Logically dead code") Link: https://lore.kernel.org/r/20210730100710.27405-1-colin.king@canonical.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/parport/ieee1284_ops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/parport/ieee1284_ops.c b/drivers/parport/ieee1284_ops.c index 5d41dda6da4e7..75daa16f38b7f 100644 --- a/drivers/parport/ieee1284_ops.c +++ b/drivers/parport/ieee1284_ops.c @@ -535,7 +535,7 @@ size_t parport_ieee1284_ecp_read_data (struct parport *port, goto out; /* Yield the port for a while. */ - if (count && dev->port->irq != PARPORT_IRQ_NONE) { + if (dev->port->irq != PARPORT_IRQ_NONE) { parport_release (dev); schedule_timeout_interruptible(msecs_to_jiffies(40)); parport_claim_or_block (dev); From 7aab6a5e11bd08214d7cf09f965ff73967e65b72 Mon Sep 17 00:00:00 2001 From: Zekun Shen Date: Sat, 19 Jun 2021 09:29:14 -0400 Subject: [PATCH 0805/5344] ath9k: fix OOB read ar9300_eeprom_restore_internal [ Upstream commit 23151b9ae79e3bc4f6a0c4cd3a7f355f68dad128 ] Bad header can have large length field which can cause OOB. cptr is the last bytes for read, and the eeprom is parsed from high to low address. The OOB, triggered by the condition length > cptr could cause memory error with a read on negative index. There are some sanity check around length, but it is not compared with cptr (the remaining bytes). Here, the corrupted/bad EEPROM can cause panic. I was able to reproduce the crash, but I cannot find the log and the reproducer now. After I applied the patch, the bug is no longer reproducible. Signed-off-by: Zekun Shen Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/YM3xKsQJ0Hw2hjrc@Zekuns-MBP-16.fios-router.home Signed-off-by: Sasha Levin --- drivers/net/wireless/ath/ath9k/ar9003_eeprom.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath9k/ar9003_eeprom.c b/drivers/net/wireless/ath/ath9k/ar9003_eeprom.c index b4885a700296e..b0a4ca3559fd8 100644 --- a/drivers/net/wireless/ath/ath9k/ar9003_eeprom.c +++ b/drivers/net/wireless/ath/ath9k/ar9003_eeprom.c @@ -3351,7 +3351,8 @@ static int ar9300_eeprom_restore_internal(struct ath_hw *ah, "Found block at %x: code=%d ref=%d length=%d major=%d minor=%d\n", cptr, code, reference, length, major, minor); if ((!AR_SREV_9485(ah) && length >= 1024) || - (AR_SREV_9485(ah) && length > EEPROM_DATA_LEN_9485)) { + (AR_SREV_9485(ah) && length > EEPROM_DATA_LEN_9485) || + (length > cptr)) { ath_dbg(common, EEPROM, "Skipping bad header\n"); cptr -= COMP_HDR_LEN; continue; From 9defa7a908ed61b6276bea84ec2052b50fdff652 Mon Sep 17 00:00:00 2001 From: Miaoqing Pan Date: Mon, 9 Aug 2021 12:05:16 +0800 Subject: [PATCH 0806/5344] ath9k: fix sleeping in atomic context [ Upstream commit 7c48662b9d56666219f526a71ace8c15e6e12f1f ] The problem is that gpio_free() can sleep and the cfg_soc() can be called with spinlocks held. One problematic call tree is: --> ath_reset_internal() takes &sc->sc_pcu_lock spin lock --> ath9k_hw_reset() --> ath9k_hw_gpio_request_in() --> ath9k_hw_gpio_request() --> ath9k_hw_gpio_cfg_soc() Remove gpio_free(), use error message instead, so we should make sure there is no GPIO conflict. Also remove ath9k_hw_gpio_free() from ath9k_hw_apply_gpio_override(), as gpio_mask will never be set for SOC chips. Signed-off-by: Miaoqing Pan Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1628481916-15030-1-git-send-email-miaoqing@codeaurora.org Signed-off-by: Sasha Levin --- drivers/net/wireless/ath/ath9k/hw.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/ath/ath9k/hw.c b/drivers/net/wireless/ath/ath9k/hw.c index 9fd8e64288ffa..7e2e22b6bbbc5 100644 --- a/drivers/net/wireless/ath/ath9k/hw.c +++ b/drivers/net/wireless/ath/ath9k/hw.c @@ -1622,7 +1622,6 @@ static void ath9k_hw_apply_gpio_override(struct ath_hw *ah) ath9k_hw_gpio_request_out(ah, i, NULL, AR_GPIO_OUTPUT_MUX_AS_OUTPUT); ath9k_hw_set_gpio(ah, i, !!(ah->gpio_val & BIT(i))); - ath9k_hw_gpio_free(ah, i); } } @@ -2730,14 +2729,17 @@ static void ath9k_hw_gpio_cfg_output_mux(struct ath_hw *ah, u32 gpio, u32 type) static void ath9k_hw_gpio_cfg_soc(struct ath_hw *ah, u32 gpio, bool out, const char *label) { + int err; + if (ah->caps.gpio_requested & BIT(gpio)) return; - /* may be requested by BSP, free anyway */ - gpio_free(gpio); - - if (gpio_request_one(gpio, out ? GPIOF_OUT_INIT_LOW : GPIOF_IN, label)) + err = gpio_request_one(gpio, out ? GPIOF_OUT_INIT_LOW : GPIOF_IN, label); + if (err) { + ath_err(ath9k_hw_common(ah), "request GPIO%d failed:%d\n", + gpio, err); return; + } ah->caps.gpio_requested |= BIT(gpio); } From 3ac6c6e17a9e32991773d0b615e3ace17c685744 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B4=87?= Date: Mon, 30 Aug 2021 18:28:01 +0800 Subject: [PATCH 0807/5344] net: fix NULL pointer reference in cipso_v4_doi_free [ Upstream commit 733c99ee8be9a1410287cdbb943887365e83b2d6 ] In netlbl_cipsov4_add_std() when 'doi_def->map.std' alloc failed, we sometime observe panic: BUG: kernel NULL pointer dereference, address: ... RIP: 0010:cipso_v4_doi_free+0x3a/0x80 ... Call Trace: netlbl_cipsov4_add_std+0xf4/0x8c0 netlbl_cipsov4_add+0x13f/0x1b0 genl_family_rcv_msg_doit.isra.15+0x132/0x170 genl_rcv_msg+0x125/0x240 This is because in cipso_v4_doi_free() there is no check on 'doi_def->map.std' when 'doi_def->type' equal 1, which is possibe, since netlbl_cipsov4_add_std() haven't initialize it before alloc 'doi_def->map.std'. This patch just add the check to prevent panic happen for similar cases. Reported-by: Abaci Signed-off-by: Michael Wang Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/netlabel/netlabel_cipso_v4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c index 8cd3daf0e3db6..1778e4e8ce247 100644 --- a/net/netlabel/netlabel_cipso_v4.c +++ b/net/netlabel/netlabel_cipso_v4.c @@ -144,8 +144,8 @@ static int netlbl_cipsov4_add_std(struct genl_info *info, return -ENOMEM; doi_def->map.std = kzalloc(sizeof(*doi_def->map.std), GFP_KERNEL); if (doi_def->map.std == NULL) { - ret_val = -ENOMEM; - goto add_std_failure; + kfree(doi_def); + return -ENOMEM; } doi_def->type = CIPSO_V4_MAP_TRANS; From 9c66b2d483caa57b98786a5f1bdf42422bc35ad7 Mon Sep 17 00:00:00 2001 From: Haimin Zhang Date: Mon, 30 Aug 2021 11:47:01 +0800 Subject: [PATCH 0808/5344] fix array-index-out-of-bounds in taprio_change [ Upstream commit efe487fce3061d94222c6501d7be3aa549b3dc78 ] syzbot report an array-index-out-of-bounds in taprio_change index 16 is out of range for type '__u16 [16]' that's because mqprio->num_tc is lager than TC_MAX_QUEUE,so we check the return value of netdev_set_num_tc. Reported-by: syzbot+2b3e5fb6c7ef285a94f6@syzkaller.appspotmail.com Signed-off-by: Haimin Zhang Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/sched/sch_taprio.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 53f181206eef4..e14a66ce4884d 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1503,7 +1503,9 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, taprio_set_picos_per_byte(dev, q); if (mqprio) { - netdev_set_num_tc(dev, mqprio->num_tc); + err = netdev_set_num_tc(dev, mqprio->num_tc); + if (err) + goto free_sched; for (i = 0; i < mqprio->num_tc; i++) netdev_set_tc_queue(dev, i, mqprio->count[i], From ee09050340f553bbc393eca3aa156fb01674cd32 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 31 Aug 2021 16:40:18 +0800 Subject: [PATCH 0809/5344] net: w5100: check return value after calling platform_get_resource() [ Upstream commit a39ff4a47f3e1da3b036817ef436b1a9be10783a ] It will cause null-ptr-deref if platform_get_resource() returns NULL, we need check the return value. Signed-off-by: Yang Yingliang Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/wiznet/w5100.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/wiznet/w5100.c b/drivers/net/ethernet/wiznet/w5100.c index bede1ff289c59..a65b7291e12a2 100644 --- a/drivers/net/ethernet/wiznet/w5100.c +++ b/drivers/net/ethernet/wiznet/w5100.c @@ -1052,6 +1052,8 @@ static int w5100_mmio_probe(struct platform_device *pdev) mac_addr = data->mac_addr; mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!mem) + return -EINVAL; if (resource_size(mem) < W5100_BUS_DIRECT_SIZE) ops = &w5100_mmio_indirect_ops; else From cbb004b928b86e9fb8742b8cd5d44df7de80bf20 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 30 Aug 2021 05:42:27 -0400 Subject: [PATCH 0810/5344] parisc: fix crash with signals and alloca commit 030f653078316a9cc9ca6bd1b0234dcf858be35d upstream. I was debugging some crashes on parisc and I found out that there is a crash possibility if a function using alloca is interrupted by a signal. The reason for the crash is that the gcc alloca implementation leaves garbage in the upper 32 bits of the sp register. This normally doesn't matter (the upper bits are ignored because the PSW W-bit is clear), however the signal delivery routine in the kernel uses full 64 bits of sp and it fails with -EFAULT if the upper 32 bits are not zero. I created this program that demonstrates the problem: #include #include #include #include static __attribute__((noinline,noclone)) void aa(int *size) { void * volatile p = alloca(-*size); while (1) ; } static void handler(int sig) { write(1, "signal delivered\n", 17); _exit(0); } int main(void) { int size = -0x100; signal(SIGALRM, handler); alarm(1); aa(&size); } If you compile it with optimizations, it will crash. The "aa" function has this disassembly: 000106a0 : 106a0: 08 03 02 41 copy r3,r1 106a4: 08 1e 02 43 copy sp,r3 106a8: 6f c1 00 80 stw,ma r1,40(sp) 106ac: 37 dc 3f c1 ldo -20(sp),ret0 106b0: 0c 7c 12 90 stw ret0,8(r3) 106b4: 0f 40 10 9c ldw 0(r26),ret0 ; ret0 = 0x00000000FFFFFF00 106b8: 97 9c 00 7e subi 3f,ret0,ret0 ; ret0 = 0xFFFFFFFF0000013F 106bc: d7 80 1c 1a depwi 0,31,6,ret0 ; ret0 = 0xFFFFFFFF00000100 106c0: 0b 9e 0a 1e add,l sp,ret0,sp ; sp = 0xFFFFFFFFxxxxxxxx 106c4: e8 1f 1f f7 b,l,n 106c4 ,r0 This patch fixes the bug by truncating the "usp" variable to 32 bits. Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Signed-off-by: Helge Deller Signed-off-by: Greg Kroah-Hartman --- arch/parisc/kernel/signal.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c index def33d4d5c0ef..396ca3b75d860 100644 --- a/arch/parisc/kernel/signal.c +++ b/arch/parisc/kernel/signal.c @@ -238,6 +238,12 @@ setup_rt_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs, #endif usp = (regs->gr[30] & ~(0x01UL)); +#ifdef CONFIG_64BIT + if (is_compat_task()) { + /* The gcc alloca implementation leaves garbage in the upper 32 bits of sp */ + usp = (compat_uint_t)usp; + } +#endif /*FIXME: frame_size parameter is unused, remove it. */ frame = get_sigframe(&ksig->ka, usp, sizeof(*frame)); From ef1f2e57d8ee274c4e1a8311e3094b4e581ba320 Mon Sep 17 00:00:00 2001 From: chenying Date: Mon, 16 Aug 2021 18:02:56 +0800 Subject: [PATCH 0811/5344] ovl: fix BUG_ON() in may_delete() when called from ovl_cleanup() commit 52d5a0c6bd8a89f460243ed937856354f8f253a3 upstream. If function ovl_instantiate() returns an error, ovl_cleanup will be called and try to remove newdentry from wdir, but the newdentry has been moved to udir at this time. This will causes BUG_ON(victim->d_parent->d_inode != dir) in fs/namei.c:may_delete. Signed-off-by: chenying Fixes: 01b39dcc9568 ("ovl: use inode_insert5() to hash a newly created inode") Link: https://lore.kernel.org/linux-unionfs/e6496a94-a161-dc04-c38a-d2544633acb4@bytedance.com/ Cc: # v4.18 Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/overlayfs/dir.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 6509ec3cb3730..073be36b0686c 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -513,8 +513,10 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, goto out_cleanup; } err = ovl_instantiate(dentry, inode, newdentry, hardlink); - if (err) - goto out_cleanup; + if (err) { + ovl_cleanup(udir, newdentry); + dput(newdentry); + } out_dput: dput(upper); out_unlock: From fd905a568843f1ed747cc7443b378fdd953c6a73 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Tue, 20 Apr 2021 20:01:47 +0200 Subject: [PATCH 0812/5344] scsi: BusLogic: Fix missing pr_cont() use commit 44d01fc86d952f5a8b8b32bdb4841504d5833d95 upstream. Update BusLogic driver's messaging system to use pr_cont() for continuation lines, bringing messy output: pci 0000:00:13.0: PCI->APIC IRQ transform: INT A -> IRQ 17 scsi: ***** BusLogic SCSI Driver Version 2.1.17 of 12 September 2013 ***** scsi: Copyright 1995-1998 by Leonard N. Zubkoff scsi0: Configuring BusLogic Model BT-958 PCI Wide Ultra SCSI Host Adapter scsi0: Firmware Version: 5.07B, I/O Address: 0x7000, IRQ Channel: 17/Level scsi0: PCI Bus: 0, Device: 19, Address: 0xE0012000, Host Adapter SCSI ID: 7 scsi0: Parity Checking: Enabled, Extended Translation: Enabled scsi0: Synchronous Negotiation: Ultra, Wide Negotiation: Enabled scsi0: Disconnect/Reconnect: Enabled, Tagged Queuing: Enabled scsi0: Scatter/Gather Limit: 128 of 8192 segments, Mailboxes: 211 scsi0: Driver Queue Depth: 211, Host Adapter Queue Depth: 192 scsi0: Tagged Queue Depth: Automatic , Untagged Queue Depth: 3 scsi0: SCSI Bus Termination: Both Enabled , SCAM: Disabled scsi0: *** BusLogic BT-958 Initialized Successfully *** scsi host0: BusLogic BT-958 back to order: pci 0000:00:13.0: PCI->APIC IRQ transform: INT A -> IRQ 17 scsi: ***** BusLogic SCSI Driver Version 2.1.17 of 12 September 2013 ***** scsi: Copyright 1995-1998 by Leonard N. Zubkoff scsi0: Configuring BusLogic Model BT-958 PCI Wide Ultra SCSI Host Adapter scsi0: Firmware Version: 5.07B, I/O Address: 0x7000, IRQ Channel: 17/Level scsi0: PCI Bus: 0, Device: 19, Address: 0xE0012000, Host Adapter SCSI ID: 7 scsi0: Parity Checking: Enabled, Extended Translation: Enabled scsi0: Synchronous Negotiation: Ultra, Wide Negotiation: Enabled scsi0: Disconnect/Reconnect: Enabled, Tagged Queuing: Enabled scsi0: Scatter/Gather Limit: 128 of 8192 segments, Mailboxes: 211 scsi0: Driver Queue Depth: 211, Host Adapter Queue Depth: 192 scsi0: Tagged Queue Depth: Automatic, Untagged Queue Depth: 3 scsi0: SCSI Bus Termination: Both Enabled, SCAM: Disabled scsi0: *** BusLogic BT-958 Initialized Successfully *** scsi host0: BusLogic BT-958 Also diagnostic output such as with the BusLogic=TraceConfiguration parameter is affected and becomes vertical and therefore hard to read. This has now been corrected, e.g.: pci 0000:00:13.0: PCI->APIC IRQ transform: INT A -> IRQ 17 blogic_cmd(86) Status = 30: 4 ==> 4: FF 05 93 00 blogic_cmd(95) Status = 28: (Modify I/O Address) blogic_cmd(91) Status = 30: 1 ==> 1: 01 blogic_cmd(04) Status = 30: 4 ==> 4: 41 41 35 30 blogic_cmd(8D) Status = 30: 14 ==> 14: 45 DC 00 20 00 00 00 00 00 40 30 37 42 1D scsi: ***** BusLogic SCSI Driver Version 2.1.17 of 12 September 2013 ***** scsi: Copyright 1995-1998 by Leonard N. Zubkoff blogic_cmd(04) Status = 30: 4 ==> 4: 41 41 35 30 blogic_cmd(0B) Status = 30: 3 ==> 3: 00 08 07 blogic_cmd(0D) Status = 30: 34 ==> 34: 03 01 07 04 00 00 00 00 00 00 00 00 00 00 00 00 FF 42 44 46 FF 00 00 00 00 00 00 00 00 00 FF 00 FF 00 blogic_cmd(8D) Status = 30: 14 ==> 14: 45 DC 00 20 00 00 00 00 00 40 30 37 42 1D blogic_cmd(84) Status = 30: 1 ==> 1: 37 blogic_cmd(8B) Status = 30: 5 ==> 5: 39 35 38 20 20 blogic_cmd(85) Status = 30: 1 ==> 1: 42 blogic_cmd(86) Status = 30: 4 ==> 4: FF 05 93 00 blogic_cmd(91) Status = 30: 64 ==> 64: 41 46 3E 20 39 35 38 20 20 00 C4 00 04 01 07 2F 07 04 35 FF FF FF FF FF FF FF FF FF FF 01 00 FE FF 08 FF FF 00 00 00 00 00 00 00 01 00 01 00 00 FF FF 00 00 00 00 00 00 00 00 00 00 00 00 00 FC scsi0: Configuring BusLogic Model BT-958 PCI Wide Ultra SCSI Host Adapter etc. Link: https://lore.kernel.org/r/alpine.DEB.2.21.2104201940430.44318@angie.orcam.me.uk Fixes: 4bcc595ccd80 ("printk: reinstate KERN_CONT for printing continuation lines") Cc: stable@vger.kernel.org # v4.9+ Acked-by: Khalid Aziz Signed-off-by: Maciej W. Rozycki Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/BusLogic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/BusLogic.c b/drivers/scsi/BusLogic.c index 6e988233fb81f..6a54556119dd6 100644 --- a/drivers/scsi/BusLogic.c +++ b/drivers/scsi/BusLogic.c @@ -3601,7 +3601,7 @@ static void blogic_msg(enum blogic_msglevel msglevel, char *fmt, if (buf[0] != '\n' || len > 1) printk("%sscsi%d: %s", blogic_msglevelmap[msglevel], adapter->host_no, buf); } else - printk("%s", buf); + pr_cont("%s", buf); } else { if (begin) { if (adapter != NULL && adapter->adapter_initd) @@ -3609,7 +3609,7 @@ static void blogic_msg(enum blogic_msglevel msglevel, char *fmt, else printk("%s%s", blogic_msglevelmap[msglevel], buf); } else - printk("%s", buf); + pr_cont("%s", buf); } begin = (buf[len - 1] == '\n'); } From b5892ebffbd32f76dcc77bc988195cf74eba4023 Mon Sep 17 00:00:00 2001 From: Saurav Kashyap Date: Mon, 9 Aug 2021 21:37:17 -0700 Subject: [PATCH 0813/5344] scsi: qla2xxx: Changes to support kdump kernel commit 62e0dec59c1e139dab55aff5aa442adc97804271 upstream. Avoid allocating firmware dump and only allocate a single queue for a kexec kernel. Link: https://lore.kernel.org/r/20210810043720.1137-12-njavali@marvell.com Cc: stable@vger.kernel.org Reviewed-by: Himanshu Madhani Signed-off-by: Saurav Kashyap Signed-off-by: Nilesh Javali Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/qla2xxx/qla_os.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 052ce78814075..28cbefe715e59 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -2799,6 +2800,11 @@ qla2x00_probe_one(struct pci_dev *pdev, const struct pci_device_id *id) return ret; } + if (is_kdump_kernel()) { + ql2xmqsupport = 0; + ql2xallocfwdump = 0; + } + /* This may fail but that's ok */ pci_enable_pcie_error_reporting(pdev); From 51c55bf7a80683183fbc76cdffd086a0d56bef63 Mon Sep 17 00:00:00 2001 From: Saurav Kashyap Date: Mon, 9 Aug 2021 21:37:19 -0700 Subject: [PATCH 0814/5344] scsi: qla2xxx: Sync queue idx with queue_pair_map idx commit c8fadf019964d0eb1da410ba8b629494d3339db9 upstream. The first invocation of function find_first_zero_bit will return 0 and queue_id gets set to 0. An index of queue_pair_map also gets set to 0. qpair_id = find_first_zero_bit(ha->qpair_qid_map, ha->max_qpairs); set_bit(qpair_id, ha->qpair_qid_map); ha->queue_pair_map[qpair_id] = qpair; In the alloc_queue callback driver checks the map, if queue is already allocated: ha->queue_pair_map[qidx] This works fine as long as max_qpairs is greater than nvme_max_hw_queues(8) since the size of the queue_pair_map is equal to max_qpair. In case nr_cpus is less than 8, max_qpairs is less than 8. This creates wrong value returned as qpair. [ 1572.353669] qla2xxx [0000:24:00.3]-2121:6: Returning existing qpair of 4e00000000000000 for idx=2 [ 1572.354458] general protection fault: 0000 [#1] SMP PTI [ 1572.354461] CPU: 1 PID: 44 Comm: kworker/1:1H Kdump: loaded Tainted: G IOE --------- - - 4.18.0-304.el8.x86_64 #1 [ 1572.354462] Hardware name: HP ProLiant DL380p Gen8, BIOS P70 03/01/2013 [ 1572.354467] Workqueue: kblockd blk_mq_run_work_fn [ 1572.354485] RIP: 0010:qla_nvme_post_cmd+0x92/0x760 [qla2xxx] [ 1572.354486] Code: 84 24 5c 01 00 00 00 00 b8 0a 74 1e 66 83 79 48 00 0f 85 a8 03 00 00 48 8b 44 24 08 48 89 ee 4c 89 e7 8b 50 24 e8 5e 8e 00 00 41 ff 47 04 0f ae f0 41 f6 47 24 04 74 19 f0 41 ff 4f 04 b8 f0 [ 1572.354487] RSP: 0018:ffff9c81c645fc90 EFLAGS: 00010246 [ 1572.354489] RAX: 0000000000000001 RBX: ffff8ea3e5070138 RCX: 0000000000000001 [ 1572.354490] RDX: 0000000000000001 RSI: 0000000000000001 RDI: ffff8ea4c866b800 [ 1572.354491] RBP: ffff8ea4c866b800 R08: 0000000000005010 R09: ffff8ea4c866b800 [ 1572.354492] R10: 0000000000000001 R11: 000000069d1ca3ff R12: ffff8ea4bc460000 [ 1572.354493] R13: ffff8ea3e50702b0 R14: ffff8ea4c4c16a58 R15: 4e00000000000000 [ 1572.354494] FS: 0000000000000000(0000) GS:ffff8ea4dfd00000(0000) knlGS:0000000000000000 [ 1572.354495] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1572.354496] CR2: 000055884504fa58 CR3: 00000005a1410001 CR4: 00000000000606e0 [ 1572.354497] Call Trace: [ 1572.354503] ? check_preempt_curr+0x62/0x90 [ 1572.354506] ? dma_direct_map_sg+0x72/0x1f0 [ 1572.354509] ? nvme_fc_start_fcp_op.part.32+0x175/0x460 [nvme_fc] [ 1572.354511] ? blk_mq_dispatch_rq_list+0x11c/0x730 [ 1572.354515] ? __switch_to_asm+0x35/0x70 [ 1572.354516] ? __switch_to_asm+0x41/0x70 [ 1572.354518] ? __switch_to_asm+0x35/0x70 [ 1572.354519] ? __switch_to_asm+0x41/0x70 [ 1572.354521] ? __switch_to_asm+0x35/0x70 [ 1572.354522] ? __switch_to_asm+0x41/0x70 [ 1572.354523] ? __switch_to_asm+0x35/0x70 [ 1572.354525] ? entry_SYSCALL_64_after_hwframe+0xb9/0xca [ 1572.354527] ? __switch_to_asm+0x41/0x70 [ 1572.354529] ? __blk_mq_sched_dispatch_requests+0xc6/0x170 [ 1572.354531] ? blk_mq_sched_dispatch_requests+0x30/0x60 [ 1572.354532] ? __blk_mq_run_hw_queue+0x51/0xd0 [ 1572.354535] ? process_one_work+0x1a7/0x360 [ 1572.354537] ? create_worker+0x1a0/0x1a0 [ 1572.354538] ? worker_thread+0x30/0x390 [ 1572.354540] ? create_worker+0x1a0/0x1a0 [ 1572.354541] ? kthread+0x116/0x130 [ 1572.354543] ? kthread_flush_work_fn+0x10/0x10 [ 1572.354545] ? ret_from_fork+0x35/0x40 Fix is to use index 0 for admin and first IO queue. Link: https://lore.kernel.org/r/20210810043720.1137-14-njavali@marvell.com Fixes: e84067d74301 ("scsi: qla2xxx: Add FC-NVMe F/W initialization and transport registration") Cc: stable@vger.kernel.org Reviewed-by: Himanshu Madhani Signed-off-by: Saurav Kashyap Signed-off-by: Nilesh Javali Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/qla2xxx/qla_nvme.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_nvme.c b/drivers/scsi/qla2xxx/qla_nvme.c index 11656e864fca9..97453c12b7358 100644 --- a/drivers/scsi/qla2xxx/qla_nvme.c +++ b/drivers/scsi/qla2xxx/qla_nvme.c @@ -84,8 +84,9 @@ static int qla_nvme_alloc_queue(struct nvme_fc_local_port *lport, struct qla_hw_data *ha; struct qla_qpair *qpair; - if (!qidx) - qidx++; + /* Map admin queue and 1st IO queue to index 0 */ + if (qidx) + qidx--; vha = (struct scsi_qla_host *)lport->private; ha = vha->hw; From f84a9d3e40afb016b0391007c0f5a92c2ab07924 Mon Sep 17 00:00:00 2001 From: "Pratik R. Sampat" Date: Wed, 28 Jul 2021 17:35:00 +0530 Subject: [PATCH 0815/5344] cpufreq: powernv: Fix init_chip_info initialization in numa=off commit f34ee9cb2c5ac5af426fee6fa4591a34d187e696 upstream. In the numa=off kernel command-line configuration init_chip_info() loops around the number of chips and attempts to copy the cpumask of that node which is NULL for all iterations after the first chip. Hence, store the cpu mask for each chip instead of derving cpumask from node while populating the "chips" struct array and copy that to the chips[i].mask Fixes: 053819e0bf84 ("cpufreq: powernv: Handle throttling due to Pmax capping at chip level") Cc: stable@vger.kernel.org # v4.3+ Reported-by: Shirisha Ganta Signed-off-by: Pratik R. Sampat Reviewed-by: Gautham R. Shenoy [mpe: Rename goto label to out_free_chip_cpu_mask] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210728120500.87549-2-psampat@linux.ibm.com Signed-off-by: Greg Kroah-Hartman --- drivers/cpufreq/powernv-cpufreq.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index bc6ccf2c7aae0..c636c9ba01008 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -36,6 +36,7 @@ #define MAX_PSTATE_SHIFT 32 #define LPSTATE_SHIFT 48 #define GPSTATE_SHIFT 56 +#define MAX_NR_CHIPS 32 #define MAX_RAMP_DOWN_TIME 5120 /* @@ -1050,12 +1051,20 @@ static int init_chip_info(void) unsigned int *chip; unsigned int cpu, i; unsigned int prev_chip_id = UINT_MAX; + cpumask_t *chip_cpu_mask; int ret = 0; chip = kcalloc(num_possible_cpus(), sizeof(*chip), GFP_KERNEL); if (!chip) return -ENOMEM; + /* Allocate a chip cpu mask large enough to fit mask for all chips */ + chip_cpu_mask = kcalloc(MAX_NR_CHIPS, sizeof(cpumask_t), GFP_KERNEL); + if (!chip_cpu_mask) { + ret = -ENOMEM; + goto free_and_return; + } + for_each_possible_cpu(cpu) { unsigned int id = cpu_to_chip_id(cpu); @@ -1063,22 +1072,25 @@ static int init_chip_info(void) prev_chip_id = id; chip[nr_chips++] = id; } + cpumask_set_cpu(cpu, &chip_cpu_mask[nr_chips-1]); } chips = kcalloc(nr_chips, sizeof(struct chip), GFP_KERNEL); if (!chips) { ret = -ENOMEM; - goto free_and_return; + goto out_free_chip_cpu_mask; } for (i = 0; i < nr_chips; i++) { chips[i].id = chip[i]; - cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i])); + cpumask_copy(&chips[i].mask, &chip_cpu_mask[i]); INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn); for_each_cpu(cpu, &chips[i].mask) per_cpu(chip_info, cpu) = &chips[i]; } +out_free_chip_cpu_mask: + kfree(chip_cpu_mask); free_and_return: kfree(chip); return ret; From 12f151486417f4fd533abd55ef2466816b7fe0a5 Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Sat, 24 Jul 2021 01:17:46 +0200 Subject: [PATCH 0816/5344] s390/pv: fix the forcing of the swiotlb commit 93ebb6828723b8aef114415c4dc3518342f7dcad upstream. Since commit 903cd0f315fe ("swiotlb: Use is_swiotlb_force_bounce for swiotlb data bouncing") if code sets swiotlb_force it needs to do so before the swiotlb is initialised. Otherwise io_tlb_default_mem->force_bounce will not get set to true, and devices that use (the default) swiotlb will not bounce despite switolb_force having the value of SWIOTLB_FORCE. Let us restore swiotlb functionality for PV by fulfilling this new requirement. This change addresses what turned out to be a fragility in commit 64e1f0c531d1 ("s390/mm: force swiotlb for protected virtualization"), which ain't exactly broken in its original context, but could give us some more headache if people backport the broken change and forget this fix. Signed-off-by: Halil Pasic Tested-by: Christian Borntraeger Reviewed-by: Christian Borntraeger Fixes: 903cd0f315fe ("swiotlb: Use is_swiotlb_force_bounce for swiotlb data bouncing") Fixes: 64e1f0c531d1 ("s390/mm: force swiotlb for protected virtualization") Cc: stable@vger.kernel.org #5.3+ Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Greg Kroah-Hartman --- arch/s390/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index c1d96e588152b..5521f593cd20a 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -168,9 +168,9 @@ static void pv_init(void) return; /* make sure bounce buffers are shared */ + swiotlb_force = SWIOTLB_FORCE; swiotlb_init(1); swiotlb_update_mem_attributes(); - swiotlb_force = SWIOTLB_FORCE; } void __init mem_init(void) From cc2f7eed16fcf468e5cbc6c8f632bc39317c135d Mon Sep 17 00:00:00 2001 From: Patryk Duda Date: Tue, 18 May 2021 16:07:58 +0200 Subject: [PATCH 0817/5344] platform/chrome: cros_ec_proto: Send command again when timeout occurs commit 3abc16af57c9939724df92fcbda296b25cc95168 upstream. Sometimes kernel is trying to probe Fingerprint MCU (FPMCU) when it hasn't initialized SPI yet. This can happen because FPMCU is restarted during system boot and kernel can send message in short window eg. between sysjump to RW and SPI initialization. Cc: # 4.4+ Signed-off-by: Patryk Duda Link: https://lore.kernel.org/r/20210518140758.29318-1-pdk@semihalf.com Signed-off-by: Benson Leung Signed-off-by: Greg Kroah-Hartman --- drivers/platform/chrome/cros_ec_proto.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/platform/chrome/cros_ec_proto.c b/drivers/platform/chrome/cros_ec_proto.c index f659f96bda128..9b575e9dd71c5 100644 --- a/drivers/platform/chrome/cros_ec_proto.c +++ b/drivers/platform/chrome/cros_ec_proto.c @@ -213,6 +213,15 @@ static int cros_ec_host_command_proto_query(struct cros_ec_device *ec_dev, msg->insize = sizeof(struct ec_response_get_protocol_info); ret = send_command(ec_dev, msg); + /* + * Send command once again when timeout occurred. + * Fingerprint MCU (FPMCU) is restarted during system boot which + * introduces small window in which FPMCU won't respond for any + * messages sent by kernel. There is no need to wait before next + * attempt because we waited at least EC_MSG_DEADLINE_MS. + */ + if (ret == -ETIMEDOUT) + ret = send_command(ec_dev, msg); if (ret < 0) { dev_dbg(ec_dev->dev, From 55bbe60cb7727f36b8e6c6f63dddf0a42e9fcf2f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 23 Jul 2021 15:19:31 -0700 Subject: [PATCH 0818/5344] lib/test_stackinit: Fix static initializer test commit f9398f15605a50110bf570aaa361163a85113dd1 upstream. The static initializer test got accidentally converted to a dynamic initializer. Fix this and retain the giant padding hole without using an aligned struct member. Fixes: 50ceaa95ea09 ("lib: Introduce test_stackinit module") Cc: Ard Biesheuvel Cc: stable@vger.kernel.org Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20210723221933.3431999-2-keescook@chromium.org Signed-off-by: Greg Kroah-Hartman --- lib/test_stackinit.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/lib/test_stackinit.c b/lib/test_stackinit.c index 2d7d257a430e6..35d398b065e4f 100644 --- a/lib/test_stackinit.c +++ b/lib/test_stackinit.c @@ -67,10 +67,10 @@ static bool range_contains(char *haystack_start, size_t haystack_size, #define INIT_STRUCT_none /**/ #define INIT_STRUCT_zero = { } #define INIT_STRUCT_static_partial = { .two = 0, } -#define INIT_STRUCT_static_all = { .one = arg->one, \ - .two = arg->two, \ - .three = arg->three, \ - .four = arg->four, \ +#define INIT_STRUCT_static_all = { .one = 0, \ + .two = 0, \ + .three = 0, \ + .four = 0, \ } #define INIT_STRUCT_dynamic_partial = { .two = arg->two, } #define INIT_STRUCT_dynamic_all = { .one = arg->one, \ @@ -84,8 +84,7 @@ static bool range_contains(char *haystack_start, size_t haystack_size, var.one = 0; \ var.two = 0; \ var.three = 0; \ - memset(&var.four, 0, \ - sizeof(var.four)) + var.four = 0 /* * @name: unique string name for the test @@ -208,18 +207,13 @@ struct test_small_hole { unsigned long four; }; -/* Try to trigger unhandled padding in a structure. */ -struct test_aligned { - u32 internal1; - u64 internal2; -} __aligned(64); - +/* Trigger unhandled padding in a structure. */ struct test_big_hole { u8 one; u8 two; u8 three; /* 61 byte padding hole here. */ - struct test_aligned four; + u8 four __aligned(64); } __aligned(64); struct test_trailing_hole { From 061f493416424efbf3144ffd71fc76b3fddbf91f Mon Sep 17 00:00:00 2001 From: Jan Hoffmann Date: Wed, 1 Sep 2021 20:49:33 +0200 Subject: [PATCH 0819/5344] net: dsa: lantiq_gswip: fix maximum frame length commit 552799f8b3b0074d2617f53a63a088f9514a66e3 upstream. Currently, outgoing packets larger than 1496 bytes are dropped when tagged VLAN is used on a switch port. Add the frame check sequence length to the value of the register GSWIP_MAC_FLEN to fix this. This matches the lantiq_ppa vendor driver, which uses a value consisting of 1518 bytes for the MAC frame, plus the lengths of special tag and VLAN tags. Fixes: 14fceff4771e ("net: dsa: Add Lantiq / Intel DSA driver for vrx200") Cc: stable@vger.kernel.org Signed-off-by: Jan Hoffmann Acked-by: Hauke Mehrtens Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/dsa/lantiq_gswip.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/lantiq_gswip.c b/drivers/net/dsa/lantiq_gswip.c index af3d56636a076..3225de0f655f2 100644 --- a/drivers/net/dsa/lantiq_gswip.c +++ b/drivers/net/dsa/lantiq_gswip.c @@ -837,7 +837,8 @@ static int gswip_setup(struct dsa_switch *ds) gswip_switch_mask(priv, 0, GSWIP_MAC_CTRL_2_MLEN, GSWIP_MAC_CTRL_2p(cpu_port)); - gswip_switch_w(priv, VLAN_ETH_FRAME_LEN + 8, GSWIP_MAC_FLEN); + gswip_switch_w(priv, VLAN_ETH_FRAME_LEN + 8 + ETH_FCS_LEN, + GSWIP_MAC_FLEN); gswip_switch_mask(priv, 0, GSWIP_BM_QUEUE_GCTRL_GL_MOD, GSWIP_BM_QUEUE_GCTRL); From eea33f3f26af52774f7a51686dc6580084c3587d Mon Sep 17 00:00:00 2001 From: David Heidelberg Date: Wed, 11 Aug 2021 19:06:31 +0200 Subject: [PATCH 0820/5344] drm/msi/mdp4: populate priv->kms in mdp4_kms_init commit cb0927ab80d224c9074f53d1a55b087d12ec5a85 upstream. Without this fix boot throws NULL ptr exception at msm_dsi_manager_setup_encoder on devices like Nexus 7 2013 (MDP4 v4.4). Fixes: 03436e3ec69c ("drm/msm/dsi: Move setup_encoder to modeset_init") Cc: Signed-off-by: David Heidelberg Link: https://lore.kernel.org/r/20210811170631.39296-1-david@ixit.cz Signed-off-by: Rob Clark Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/msm/disp/mdp4/mdp4_kms.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/disp/mdp4/mdp4_kms.c b/drivers/gpu/drm/msm/disp/mdp4/mdp4_kms.c index 5d50e93efe363..4f0c6d58e06fa 100644 --- a/drivers/gpu/drm/msm/disp/mdp4/mdp4_kms.c +++ b/drivers/gpu/drm/msm/disp/mdp4/mdp4_kms.c @@ -405,6 +405,7 @@ struct msm_kms *mdp4_kms_init(struct drm_device *dev) { struct platform_device *pdev = to_platform_device(dev->dev); struct mdp4_platform_config *config = mdp4_get_config(pdev); + struct msm_drm_private *priv = dev->dev_private; struct mdp4_kms *mdp4_kms; struct msm_kms *kms = NULL; struct msm_gem_address_space *aspace; @@ -419,7 +420,8 @@ struct msm_kms *mdp4_kms_init(struct drm_device *dev) mdp_kms_init(&mdp4_kms->base, &kms_funcs); - kms = &mdp4_kms->base.base; + priv->kms = &mdp4_kms->base.base; + kms = priv->kms; mdp4_kms->dev = dev; From 3941478b2afb9f777b93951bdf6181a8f9313cf1 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Tue, 22 Jun 2021 12:23:38 -0400 Subject: [PATCH 0821/5344] drm/amdgpu: Fix BUG_ON assert MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit ea7acd7c5967542353430947f3faf699e70602e5 upstream. With added CPU domain to placement you can have now 3 placemnts at once. CC: stable@kernel.org Signed-off-by: Andrey Grodzovsky Reviewed-by: Christian König Link: https://patchwork.freedesktop.org/patch/msgid/20210622162339.761651-5-andrey.grodzovsky@amd.com Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 28361a9c5addc..532d1842f6a30 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -200,7 +200,7 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain) c++; } - BUG_ON(c >= AMDGPU_BO_MAX_PLACEMENTS); + BUG_ON(c > AMDGPU_BO_MAX_PLACEMENTS); placement->num_placement = c; placement->placement = places; From 95400788d96ace69a51984ad67630e2aef574d12 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Tue, 24 Aug 2021 13:30:25 -0400 Subject: [PATCH 0822/5344] drm/panfrost: Simplify lock_region calculation commit b5fab345654c603c07525100d744498f28786929 upstream. In lock_region, simplify the calculation of the region_width parameter. This field is the size, but encoded as ceil(log2(size)) - 1. ceil(log2(size)) may be computed directly as fls(size - 1). However, we want to use the 64-bit versions as the amount to lock can exceed 32-bits. This avoids undefined (and completely wrong) behaviour when locking all memory (size ~0). In this case, the old code would "round up" ~0 to the nearest page, overflowing to 0. Since fls(0) == 0, this would calculate a region width of 10 + 0 = 10. But then the code would shift by (region_width - 11) = -1. As shifting by a negative number is undefined, UBSAN flags the bug. Of course, even if it were defined the behaviour is wrong, instead of locking all memory almost none would get locked. The new form of the calculation corrects this special case and avoids the undefined behaviour. Signed-off-by: Alyssa Rosenzweig Reported-and-tested-by: Chris Morgan Fixes: f3ba91228e8e ("drm/panfrost: Add initial panfrost driver") Cc: Reviewed-by: Steven Price Reviewed-by: Rob Herring Signed-off-by: Steven Price Link: https://patchwork.freedesktop.org/patch/msgid/20210824173028.7528-2-alyssa.rosenzweig@collabora.com Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/panfrost/panfrost_mmu.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c index bfd503d220881..dc21e30bbd0b3 100644 --- a/drivers/gpu/drm/panfrost/panfrost_mmu.c +++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c @@ -56,21 +56,12 @@ static void lock_region(struct panfrost_device *pfdev, u32 as_nr, { u8 region_width; u64 region = iova & PAGE_MASK; - /* - * fls returns: - * 1 .. 32 - * - * 10 + fls(num_pages) - * results in the range (11 .. 42) - */ - - size = round_up(size, PAGE_SIZE); - region_width = 10 + fls(size >> PAGE_SHIFT); - if ((size >> PAGE_SHIFT) != (1ul << (region_width - 11))) { - /* not pow2, so must go up to the next pow2 */ - region_width += 1; - } + /* The size is encoded as ceil(log2) minus(1), which may be calculated + * with fls. The size must be clamped to hardware bounds. + */ + size = max_t(u64, size, PAGE_SIZE); + region_width = fls64(size - 1) - 1; region |= region_width; /* Lock the region that needs to be updated */ From 8a4f544c7ddcaaf664fd21cd709e3750e0b5d407 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Tue, 24 Aug 2021 13:30:26 -0400 Subject: [PATCH 0823/5344] drm/panfrost: Use u64 for size in lock_region commit a77b58825d7221d4a45c47881c35a47ba003aa73 upstream. Mali virtual addresses are 48-bit. Use a u64 instead of size_t to ensure we can express the "lock everything" condition as ~0ULL without overflow. This code was silently broken on any platform where a size_t is less than 48-bits; in particular, it was broken on 32-bit armv7 platforms which remain in use with panfrost. (Mainly RK3288) Signed-off-by: Alyssa Rosenzweig Suggested-by: Rob Herring Tested-by: Chris Morgan Reviewed-by: Steven Price Reviewed-by: Rob Herring Fixes: f3ba91228e8e ("drm/panfrost: Add initial panfrost driver") Cc: Signed-off-by: Steven Price Link: https://patchwork.freedesktop.org/patch/msgid/20210824173028.7528-3-alyssa.rosenzweig@collabora.com Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/panfrost/panfrost_mmu.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c index dc21e30bbd0b3..0f2d458385167 100644 --- a/drivers/gpu/drm/panfrost/panfrost_mmu.c +++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c @@ -52,7 +52,7 @@ static int write_cmd(struct panfrost_device *pfdev, u32 as_nr, u32 cmd) } static void lock_region(struct panfrost_device *pfdev, u32 as_nr, - u64 iova, size_t size) + u64 iova, u64 size) { u8 region_width; u64 region = iova & PAGE_MASK; @@ -72,7 +72,7 @@ static void lock_region(struct panfrost_device *pfdev, u32 as_nr, static int mmu_hw_do_operation_locked(struct panfrost_device *pfdev, int as_nr, - u64 iova, size_t size, u32 op) + u64 iova, u64 size, u32 op) { if (as_nr < 0) return 0; @@ -89,7 +89,7 @@ static int mmu_hw_do_operation_locked(struct panfrost_device *pfdev, int as_nr, static int mmu_hw_do_operation(struct panfrost_device *pfdev, struct panfrost_mmu *mmu, - u64 iova, size_t size, u32 op) + u64 iova, u64 size, u32 op) { int ret; @@ -106,7 +106,7 @@ static void panfrost_mmu_enable(struct panfrost_device *pfdev, struct panfrost_m u64 transtab = cfg->arm_mali_lpae_cfg.transtab; u64 memattr = cfg->arm_mali_lpae_cfg.memattr; - mmu_hw_do_operation_locked(pfdev, as_nr, 0, ~0UL, AS_COMMAND_FLUSH_MEM); + mmu_hw_do_operation_locked(pfdev, as_nr, 0, ~0ULL, AS_COMMAND_FLUSH_MEM); mmu_write(pfdev, AS_TRANSTAB_LO(as_nr), transtab & 0xffffffffUL); mmu_write(pfdev, AS_TRANSTAB_HI(as_nr), transtab >> 32); @@ -122,7 +122,7 @@ static void panfrost_mmu_enable(struct panfrost_device *pfdev, struct panfrost_m static void panfrost_mmu_disable(struct panfrost_device *pfdev, u32 as_nr) { - mmu_hw_do_operation_locked(pfdev, as_nr, 0, ~0UL, AS_COMMAND_FLUSH_MEM); + mmu_hw_do_operation_locked(pfdev, as_nr, 0, ~0ULL, AS_COMMAND_FLUSH_MEM); mmu_write(pfdev, AS_TRANSTAB_LO(as_nr), 0); mmu_write(pfdev, AS_TRANSTAB_HI(as_nr), 0); @@ -222,7 +222,7 @@ static size_t get_pgsize(u64 addr, size_t size) static void panfrost_mmu_flush_range(struct panfrost_device *pfdev, struct panfrost_mmu *mmu, - u64 iova, size_t size) + u64 iova, u64 size) { if (mmu->as < 0) return; From fe3fb39bcd7f8d481f0f5da81398b1816e74ff9d Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Tue, 24 Aug 2021 13:30:27 -0400 Subject: [PATCH 0824/5344] drm/panfrost: Clamp lock region to Bifrost minimum commit bd7ffbc3ca12629aeb66fb9e28cf42b7f37e3e3b upstream. When locking a region, we currently clamp to a PAGE_SIZE as the minimum lock region. While this is valid for Midgard, it is invalid for Bifrost, where the minimum locking size is 8x larger than the 4k page size. Add a hardware definition for the minimum lock region size (corresponding to KBASE_LOCK_REGION_MIN_SIZE_LOG2 in kbase) and respect it. Signed-off-by: Alyssa Rosenzweig Tested-by: Chris Morgan Reviewed-by: Steven Price Reviewed-by: Rob Herring Cc: Signed-off-by: Steven Price Link: https://patchwork.freedesktop.org/patch/msgid/20210824173028.7528-4-alyssa.rosenzweig@collabora.com Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/panfrost/panfrost_mmu.c | 2 +- drivers/gpu/drm/panfrost/panfrost_regs.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c index 0f2d458385167..8a014dc115712 100644 --- a/drivers/gpu/drm/panfrost/panfrost_mmu.c +++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c @@ -60,7 +60,7 @@ static void lock_region(struct panfrost_device *pfdev, u32 as_nr, /* The size is encoded as ceil(log2) minus(1), which may be calculated * with fls. The size must be clamped to hardware bounds. */ - size = max_t(u64, size, PAGE_SIZE); + size = max_t(u64, size, AS_LOCK_REGION_MIN_SIZE); region_width = fls64(size - 1) - 1; region |= region_width; diff --git a/drivers/gpu/drm/panfrost/panfrost_regs.h b/drivers/gpu/drm/panfrost/panfrost_regs.h index eddaa62ad8b0e..2ae3a4d301d39 100644 --- a/drivers/gpu/drm/panfrost/panfrost_regs.h +++ b/drivers/gpu/drm/panfrost/panfrost_regs.h @@ -318,6 +318,8 @@ #define AS_FAULTSTATUS_ACCESS_TYPE_READ (0x2 << 8) #define AS_FAULTSTATUS_ACCESS_TYPE_WRITE (0x3 << 8) +#define AS_LOCK_REGION_MIN_SIZE (1ULL << 15) + #define gpu_write(dev, reg, data) writel(data, dev->iomem + reg) #define gpu_read(dev, reg) readl(dev->iomem + reg) From 0ce83b9dafa0af96f3fe6c9635ca2581abfa0e95 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 10 Aug 2021 23:23:44 +0800 Subject: [PATCH 0825/5344] btrfs: fix upper limit for max_inline for page size 64K commit 6f93e834fa7c5faa0372e46828b4b2a966ac61d7 upstream. The mount option max_inline ranges from 0 to the sectorsize (which is now equal to page size). But we parse the mount options too early and before the actual sectorsize is read from the superblock. So the upper limit of max_inline is unaware of the actual sectorsize and is limited by the temporary sectorsize 4096, even on a system where the default sectorsize is 64K. Fix this by reading the superblock sectorsize before the mount option parse. Reported-by: Alexander Tsvetkov CC: stable@vger.kernel.org # 5.4+ Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Anand Jain Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/disk-io.c | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index dacd67dca43fe..946ae198b3449 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2894,6 +2894,29 @@ int open_ctree(struct super_block *sb, */ fs_info->compress_type = BTRFS_COMPRESS_ZLIB; + /* + * Flag our filesystem as having big metadata blocks if they are bigger + * than the page size + */ + if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) { + if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) + btrfs_info(fs_info, + "flagging fs with big metadata feature"); + features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; + } + + /* Set up fs_info before parsing mount options */ + nodesize = btrfs_super_nodesize(disk_super); + sectorsize = btrfs_super_sectorsize(disk_super); + stripesize = sectorsize; + fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids)); + fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids)); + + /* Cache block sizes */ + fs_info->nodesize = nodesize; + fs_info->sectorsize = sectorsize; + fs_info->stripesize = stripesize; + ret = btrfs_parse_options(fs_info, options, sb->s_flags); if (ret) { err = ret; @@ -2920,28 +2943,6 @@ int open_ctree(struct super_block *sb, if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) btrfs_info(fs_info, "has skinny extents"); - /* - * flag our filesystem as having big metadata blocks if - * they are bigger than the page size - */ - if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) { - if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) - btrfs_info(fs_info, - "flagging fs with big metadata feature"); - features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; - } - - nodesize = btrfs_super_nodesize(disk_super); - sectorsize = btrfs_super_sectorsize(disk_super); - stripesize = sectorsize; - fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids)); - fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids)); - - /* Cache block sizes */ - fs_info->nodesize = nodesize; - fs_info->sectorsize = sectorsize; - fs_info->stripesize = stripesize; - /* * mixed block groups end up with duplicate but slightly offset * extent buffers for the same range. It leads to corruptions From f77fb5326fa0485869aa4a39b00ab292b61b90ae Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 3 Sep 2021 10:49:37 +0200 Subject: [PATCH 0826/5344] xen: reset legacy rtc flag for PV domU commit f68aa100d815b5b4467fd1c3abbe3b99d65fd028 upstream. A Xen PV guest doesn't have a legacy RTC device, so reset the legacy RTC flag. Otherwise the following WARN splat will occur at boot: [ 1.333404] WARNING: CPU: 1 PID: 1 at /home/gross/linux/head/drivers/rtc/rtc-mc146818-lib.c:25 mc146818_get_time+0x1be/0x210 [ 1.333404] Modules linked in: [ 1.333404] CPU: 1 PID: 1 Comm: swapper/0 Tainted: G W 5.14.0-rc7-default+ #282 [ 1.333404] RIP: e030:mc146818_get_time+0x1be/0x210 [ 1.333404] Code: c0 64 01 c5 83 fd 45 89 6b 14 7f 06 83 c5 64 89 6b 14 41 83 ec 01 b8 02 00 00 00 44 89 63 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 <0f> 0b 48 c7 c7 30 0e ef 82 4c 89 e6 e8 71 2a 24 00 48 c7 c0 ff ff [ 1.333404] RSP: e02b:ffffc90040093df8 EFLAGS: 00010002 [ 1.333404] RAX: 00000000000000ff RBX: ffffc90040093e34 RCX: 0000000000000000 [ 1.333404] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 000000000000000d [ 1.333404] RBP: ffffffff82ef0e30 R08: ffff888005013e60 R09: 0000000000000000 [ 1.333404] R10: ffffffff82373e9b R11: 0000000000033080 R12: 0000000000000200 [ 1.333404] R13: 0000000000000000 R14: 0000000000000002 R15: ffffffff82cdc6d4 [ 1.333404] FS: 0000000000000000(0000) GS:ffff88807d440000(0000) knlGS:0000000000000000 [ 1.333404] CS: 10000e030 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1.333404] CR2: 0000000000000000 CR3: 000000000260a000 CR4: 0000000000050660 [ 1.333404] Call Trace: [ 1.333404] ? wakeup_sources_sysfs_init+0x30/0x30 [ 1.333404] ? rdinit_setup+0x2b/0x2b [ 1.333404] early_resume_init+0x23/0xa4 [ 1.333404] ? cn_proc_init+0x36/0x36 [ 1.333404] do_one_initcall+0x3e/0x200 [ 1.333404] kernel_init_freeable+0x232/0x28e [ 1.333404] ? rest_init+0xd0/0xd0 [ 1.333404] kernel_init+0x16/0x120 [ 1.333404] ret_from_fork+0x1f/0x30 Cc: Fixes: 8d152e7a5c7537 ("x86/rtc: Replace paravirt rtc check with platform legacy quirk") Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Link: https://lore.kernel.org/r/20210903084937.19392-3-jgross@suse.com Signed-off-by: Juergen Gross Signed-off-by: Greg Kroah-Hartman --- arch/x86/xen/enlighten_pv.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 3e66feff524a8..b99074ca5e686 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1183,6 +1183,11 @@ static void __init xen_dom0_set_legacy_features(void) x86_platform.legacy.rtc = 1; } +static void __init xen_domu_set_legacy_features(void) +{ + x86_platform.legacy.rtc = 0; +} + /* First C function to be called on Xen boot */ asmlinkage __visible void __init xen_start_kernel(void) { @@ -1353,6 +1358,8 @@ asmlinkage __visible void __init xen_start_kernel(void) add_preferred_console("xenboot", 0, NULL); if (pci_xen) x86_init.pci.arch_init = pci_xen_init; + x86_platform.set_legacy_features = + xen_domu_set_legacy_features; } else { const struct dom0_vga_console_info *info = (void *)((char *)xen_start_info + From c13779fae76658fedcdfc68a11acf8e132ad1d61 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 9 Sep 2021 17:53:56 +0100 Subject: [PATCH 0827/5344] arm64/sve: Use correct size when reinitialising SVE state commit e35ac9d0b56e9efefaeeb84b635ea26c2839ea86 upstream. When we need a buffer for SVE register state we call sve_alloc() to make sure that one is there. In order to avoid repeated allocations and frees we keep the buffer around unless we change vector length and just memset() it to ensure a clean register state. The function that deals with this takes the task to operate on as an argument, however in the case where we do a memset() we initialise using the SVE state size for the current task rather than the task passed as an argument. This is only an issue in the case where we are setting the register state for a task via ptrace and the task being configured has a different vector length to the task tracing it. In the case where the buffer is larger in the traced process we will leak old state from the traced process to itself, in the case where the buffer is smaller in the traced process we will overflow the buffer and corrupt memory. Fixes: bc0ee4760364 ("arm64/sve: Core task context handling") Cc: # 4.15.x Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20210909165356.10675-1-broonie@kernel.org Signed-off-by: Catalin Marinas Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kernel/fpsimd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index e7d1fa00e2e56..37eec4d5ba908 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -498,7 +498,7 @@ size_t sve_state_size(struct task_struct const *task) void sve_alloc(struct task_struct *task) { if (task->thread.sve_state) { - memset(task->thread.sve_state, 0, sve_state_size(current)); + memset(task->thread.sve_state, 0, sve_state_size(task)); return; } From a5953cc4f16562891d77a44622692c089471ba90 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 3 Sep 2021 10:49:36 +0200 Subject: [PATCH 0828/5344] PM: base: power: don't try to use non-existing RTC for storing data commit 0560204b360a332c321124dbc5cdfd3364533a74 upstream. If there is no legacy RTC device, don't try to use it for storing trace data across suspend/resume. Cc: Signed-off-by: Juergen Gross Reviewed-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20210903084937.19392-2-jgross@suse.com Signed-off-by: Juergen Gross Signed-off-by: Greg Kroah-Hartman --- drivers/base/power/trace.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/base/power/trace.c b/drivers/base/power/trace.c index 977d27bd1a220..9a9dec657c166 100644 --- a/drivers/base/power/trace.c +++ b/drivers/base/power/trace.c @@ -13,6 +13,7 @@ #include #include #include +#include #include @@ -165,6 +166,9 @@ void generate_pm_trace(const void *tracedata, unsigned int user) const char *file = *(const char **)(tracedata + 2); unsigned int user_hash_value, file_hash_value; + if (!x86_platform.legacy.rtc) + return; + user_hash_value = user % USERHASH; file_hash_value = hash_string(lineno, file, FILEHASH); set_magic_time(user_hash_value, file_hash_value, dev_hash_value); @@ -267,6 +271,9 @@ static struct notifier_block pm_trace_nb = { static int early_resume_init(void) { + if (!x86_platform.legacy.rtc) + return 0; + hash_value_early_read = read_magic_time(); register_pm_notifier(&pm_trace_nb); return 0; @@ -277,6 +284,9 @@ static int late_resume_init(void) unsigned int val = hash_value_early_read; unsigned int user, file, dev; + if (!x86_platform.legacy.rtc) + return 0; + user = val % USERHASH; val = val / USERHASH; file = val % FILEHASH; From 2a5211b247184120eaeec918f840dc23128e4260 Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Fri, 3 Sep 2021 14:33:11 +0800 Subject: [PATCH 0829/5344] PCI: Add AMD GPU multi-function power dependencies commit 60b78ed088ebe1a872ee1320b6c5ad6ee2c4bd9a upstream. Some AMD GPUs have built-in USB xHCI and USB Type-C UCSI controllers with power dependencies between the GPU and the other functions as in 6d2e369f0d4c ("PCI: Add NVIDIA GPU multi-function power dependencies"). Add device link support for the AMD integrated USB xHCI and USB Type-C UCSI controllers. Without this, runtime power management, including GPU resume and temp and fan sensors don't work correctly. Reported-at: https://gitlab.freedesktop.org/drm/amd/-/issues/1704 Link: https://lore.kernel.org/r/20210903063311.3606226-1-evan.quan@amd.com Signed-off-by: Evan Quan Signed-off-by: Bjorn Helgaas Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/pci/quirks.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 391316a2bfb59..34c68a7313db1 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -5394,7 +5394,7 @@ DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID, PCI_CLASS_MULTIMEDIA_HD_AUDIO, 8, quirk_gpu_hda); /* - * Create device link for NVIDIA GPU with integrated USB xHCI Host + * Create device link for GPUs with integrated USB xHCI Host * controller to VGA. */ static void quirk_gpu_usb(struct pci_dev *usb) @@ -5403,9 +5403,11 @@ static void quirk_gpu_usb(struct pci_dev *usb) } DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID, PCI_CLASS_SERIAL_USB, 8, quirk_gpu_usb); +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_ATI, PCI_ANY_ID, + PCI_CLASS_SERIAL_USB, 8, quirk_gpu_usb); /* - * Create device link for NVIDIA GPU with integrated Type-C UCSI controller + * Create device link for GPUs with integrated Type-C UCSI controller * to VGA. Currently there is no class code defined for UCSI device over PCI * so using UNKNOWN class for now and it will be updated when UCSI * over PCI gets a class code. @@ -5418,6 +5420,9 @@ static void quirk_gpu_usb_typec_ucsi(struct pci_dev *ucsi) DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID, PCI_CLASS_SERIAL_UNKNOWN, 8, quirk_gpu_usb_typec_ucsi); +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_ATI, PCI_ANY_ID, + PCI_CLASS_SERIAL_UNKNOWN, 8, + quirk_gpu_usb_typec_ucsi); /* * Enable the NVIDIA GPU integrated HDA controller if the BIOS left it From ef492d9751b52699276de05d4cfad2577c4e92f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ernst=20Sj=C3=B6strand?= Date: Thu, 2 Sep 2021 09:50:27 +0200 Subject: [PATCH 0830/5344] drm/amd/amdgpu: Increase HWIP_MAX_INSTANCE to 10 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 67a44e659888569a133a8f858c8230e9d7aad1d5 upstream. Seems like newer cards can have even more instances now. Found by UBSAN: array-index-out-of-bounds in drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c:318:29 index 8 is out of range for type 'uint32_t *[8]' Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1697 Cc: stable@vger.kernel.org Signed-off-by: Ernst Sjöstrand Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index d1e278e999eeb..1b2fa83798304 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -762,7 +762,7 @@ enum amd_hw_ip_block_type { MAX_HWIP }; -#define HWIP_MAX_INSTANCE 8 +#define HWIP_MAX_INSTANCE 10 struct amd_powerplay { void *pp_handle; From 0b3b2f0827e50939ab7fe0634d95f96920c4d39d Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Fri, 20 Aug 2021 22:18:23 +0200 Subject: [PATCH 0831/5344] drm/etnaviv: return context from etnaviv_iommu_context_get commit 78edefc05e41352099ffb8f06f8d9b2d091e29cd upstream. Being able to have the refcount manipulation in an assignment makes it much easier to parse the code. Cc: stable@vger.kernel.org # 5.4 Signed-off-by: Lucas Stach Tested-by: Michael Walle Tested-by: Marek Vasut Reviewed-by: Christian Gmeiner Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/etnaviv/etnaviv_buffer.c | 3 +-- drivers/gpu/drm/etnaviv/etnaviv_gem.c | 3 +-- drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c | 3 +-- drivers/gpu/drm/etnaviv/etnaviv_gpu.c | 6 ++---- drivers/gpu/drm/etnaviv/etnaviv_mmu.h | 4 +++- 5 files changed, 8 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_buffer.c b/drivers/gpu/drm/etnaviv/etnaviv_buffer.c index 0c9c40720ca9a..35225ff8792dd 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_buffer.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_buffer.c @@ -397,8 +397,7 @@ void etnaviv_buffer_queue(struct etnaviv_gpu *gpu, u32 exec_state, if (switch_mmu_context) { struct etnaviv_iommu_context *old_context = gpu->mmu_context; - etnaviv_iommu_context_get(mmu_context); - gpu->mmu_context = mmu_context; + gpu->mmu_context = etnaviv_iommu_context_get(mmu_context); etnaviv_iommu_context_put(old_context); } diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index cb1faaac380a3..519948637186e 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -304,8 +304,7 @@ struct etnaviv_vram_mapping *etnaviv_gem_mapping_get( list_del(&mapping->obj_node); } - etnaviv_iommu_context_get(mmu_context); - mapping->context = mmu_context; + mapping->context = etnaviv_iommu_context_get(mmu_context); mapping->use = 1; ret = etnaviv_iommu_map_gem(mmu_context, etnaviv_obj, diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c index 1ba83a90cdef6..7085b08b1db42 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c @@ -534,8 +534,7 @@ int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data, goto err_submit_objects; submit->ctx = file->driver_priv; - etnaviv_iommu_context_get(submit->ctx->mmu); - submit->mmu_context = submit->ctx->mmu; + submit->mmu_context = etnaviv_iommu_context_get(submit->ctx->mmu); submit->exec_state = args->exec_state; submit->flags = args->flags; diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c index 85de8551ce866..0fa4885f9c183 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c @@ -1307,12 +1307,10 @@ struct dma_fence *etnaviv_gpu_submit(struct etnaviv_gem_submit *submit) } if (!gpu->mmu_context) { - etnaviv_iommu_context_get(submit->mmu_context); - gpu->mmu_context = submit->mmu_context; + gpu->mmu_context = etnaviv_iommu_context_get(submit->mmu_context); etnaviv_gpu_start_fe_idleloop(gpu); } else { - etnaviv_iommu_context_get(gpu->mmu_context); - submit->prev_mmu_context = gpu->mmu_context; + submit->prev_mmu_context = etnaviv_iommu_context_get(gpu->mmu_context); } if (submit->nr_pmrs) { diff --git a/drivers/gpu/drm/etnaviv/etnaviv_mmu.h b/drivers/gpu/drm/etnaviv/etnaviv_mmu.h index d1d6902fd13be..e4a0b7d09c2ea 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_mmu.h +++ b/drivers/gpu/drm/etnaviv/etnaviv_mmu.h @@ -105,9 +105,11 @@ void etnaviv_iommu_dump(struct etnaviv_iommu_context *ctx, void *buf); struct etnaviv_iommu_context * etnaviv_iommu_context_init(struct etnaviv_iommu_global *global, struct etnaviv_cmdbuf_suballoc *suballoc); -static inline void etnaviv_iommu_context_get(struct etnaviv_iommu_context *ctx) +static inline struct etnaviv_iommu_context * +etnaviv_iommu_context_get(struct etnaviv_iommu_context *ctx) { kref_get(&ctx->refcount); + return ctx; } void etnaviv_iommu_context_put(struct etnaviv_iommu_context *ctx); void etnaviv_iommu_restore(struct etnaviv_gpu *gpu, From ccd29caebee5b85059be1a78fc771438a9452cc6 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Fri, 20 Aug 2021 22:18:24 +0200 Subject: [PATCH 0832/5344] drm/etnaviv: put submit prev MMU context when it exists commit cda7532916f7bc860b36a1806cb8352e6f63dacb upstream. The prev context is the MMU context at the time of the job queueing in hardware. As a job might be queued multiple times due to recovery after a GPU hang, we need to make sure to put the stale prev MMU context from a prior queuing, to avoid the reference and thus the MMU context leaking. Cc: stable@vger.kernel.org # 5.4 Signed-off-by: Lucas Stach Tested-by: Michael Walle Tested-by: Marek Vasut Reviewed-by: Christian Gmeiner Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/etnaviv/etnaviv_gpu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c index 0fa4885f9c183..fc90ea6973b2d 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c @@ -1310,6 +1310,8 @@ struct dma_fence *etnaviv_gpu_submit(struct etnaviv_gem_submit *submit) gpu->mmu_context = etnaviv_iommu_context_get(submit->mmu_context); etnaviv_gpu_start_fe_idleloop(gpu); } else { + if (submit->prev_mmu_context) + etnaviv_iommu_context_put(submit->prev_mmu_context); submit->prev_mmu_context = etnaviv_iommu_context_get(gpu->mmu_context); } From d721a768ebbe83b15a021cd317305097dbf6440e Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Fri, 20 Aug 2021 22:18:25 +0200 Subject: [PATCH 0833/5344] drm/etnaviv: stop abusing mmu_context as FE running marker commit 23e0f5a57d0ecec86e1fc82194acd94aede21a46 upstream. While the DMA frontend can only be active when the MMU context is set, the reverse isn't necessarily true, as the frontend can be stopped while the MMU state is kept. Stop treating mmu_context being set as a indication that the frontend is running and instead add a explicit property. Cc: stable@vger.kernel.org # 5.4 Signed-off-by: Lucas Stach Tested-by: Michael Walle Tested-by: Marek Vasut Reviewed-by: Christian Gmeiner Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/etnaviv/etnaviv_gpu.c | 10 ++++++++-- drivers/gpu/drm/etnaviv/etnaviv_gpu.h | 1 + 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c index fc90ea6973b2d..60410c2fd2be2 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c @@ -545,6 +545,8 @@ static int etnaviv_hw_reset(struct etnaviv_gpu *gpu) /* We rely on the GPU running, so program the clock */ etnaviv_gpu_update_clock(gpu); + gpu->fe_running = false; + return 0; } @@ -607,6 +609,8 @@ void etnaviv_gpu_start_fe(struct etnaviv_gpu *gpu, u32 address, u16 prefetch) VIVS_MMUv2_SEC_COMMAND_CONTROL_ENABLE | VIVS_MMUv2_SEC_COMMAND_CONTROL_PREFETCH(prefetch)); } + + gpu->fe_running = true; } static void etnaviv_gpu_start_fe_idleloop(struct etnaviv_gpu *gpu) @@ -1306,7 +1310,7 @@ struct dma_fence *etnaviv_gpu_submit(struct etnaviv_gem_submit *submit) goto out_unlock; } - if (!gpu->mmu_context) { + if (!gpu->fe_running) { gpu->mmu_context = etnaviv_iommu_context_get(submit->mmu_context); etnaviv_gpu_start_fe_idleloop(gpu); } else { @@ -1530,7 +1534,7 @@ int etnaviv_gpu_wait_idle(struct etnaviv_gpu *gpu, unsigned int timeout_ms) static int etnaviv_gpu_hw_suspend(struct etnaviv_gpu *gpu) { - if (gpu->initialized && gpu->mmu_context) { + if (gpu->initialized && gpu->fe_running) { /* Replace the last WAIT with END */ mutex_lock(&gpu->lock); etnaviv_buffer_end(gpu); @@ -1545,6 +1549,8 @@ static int etnaviv_gpu_hw_suspend(struct etnaviv_gpu *gpu) etnaviv_iommu_context_put(gpu->mmu_context); gpu->mmu_context = NULL; + + gpu->fe_running = false; } gpu->exec_state = -1; diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.h b/drivers/gpu/drm/etnaviv/etnaviv_gpu.h index 8f9bd4edc96a5..02478c75f8968 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.h +++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.h @@ -101,6 +101,7 @@ struct etnaviv_gpu { struct workqueue_struct *wq; struct drm_gpu_scheduler sched; bool initialized; + bool fe_running; /* 'ring'-buffer: */ struct etnaviv_cmdbuf buffer; From c0e2b3961b130941f7c186b85d97dc57796b137d Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Fri, 20 Aug 2021 22:18:26 +0200 Subject: [PATCH 0834/5344] drm/etnaviv: keep MMU context across runtime suspend/resume commit 8f3eea9d01d7b0f95b0fe04187c0059019ada85b upstream. The MMU state may be kept across a runtime suspend/resume cycle, as we avoid a full hardware reset to keep the latency of the runtime PM small. Don't pretend that the MMU state is lost in driver state. The MMU context is pushed out when new HW jobs with a different context are coming in. The only exception to this is when the GPU is unbound, in which case we need to make sure to also free the last active context. Cc: stable@vger.kernel.org # 5.4 Reported-by: Michael Walle Signed-off-by: Lucas Stach Tested-by: Michael Walle Tested-by: Marek Vasut Reviewed-by: Christian Gmeiner Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/etnaviv/etnaviv_gpu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c index 60410c2fd2be2..efec0bbf85230 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c @@ -1547,9 +1547,6 @@ static int etnaviv_gpu_hw_suspend(struct etnaviv_gpu *gpu) */ etnaviv_gpu_wait_idle(gpu, 100); - etnaviv_iommu_context_put(gpu->mmu_context); - gpu->mmu_context = NULL; - gpu->fe_running = false; } @@ -1698,6 +1695,9 @@ static void etnaviv_gpu_unbind(struct device *dev, struct device *master, etnaviv_gpu_hw_suspend(gpu); #endif + if (gpu->mmu_context) + etnaviv_iommu_context_put(gpu->mmu_context); + if (gpu->initialized) { etnaviv_cmdbuf_free(&gpu->buffer); etnaviv_iommu_global_fini(gpu); From a1815d32f76ebf9dbaeae286a6eeef57ef32043b Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Fri, 20 Aug 2021 22:18:27 +0200 Subject: [PATCH 0835/5344] drm/etnaviv: exec and MMU state is lost when resetting the GPU commit 725cbc7884c37f3b4f1777bc1aea6432cded8ca5 upstream. When the GPU is reset both the current exec state, as well as all MMU state is lost. Move the driver side state tracking into the reset function to keep hardware and software state from diverging. Cc: stable@vger.kernel.org # 5.4 Signed-off-by: Lucas Stach Tested-by: Michael Walle Tested-by: Marek Vasut Reviewed-by: Christian Gmeiner Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/etnaviv/etnaviv_gpu.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c index efec0bbf85230..70d80b2016b6e 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c @@ -546,6 +546,8 @@ static int etnaviv_hw_reset(struct etnaviv_gpu *gpu) etnaviv_gpu_update_clock(gpu); gpu->fe_running = false; + gpu->exec_state = -1; + gpu->mmu_context = NULL; return 0; } @@ -794,7 +796,6 @@ int etnaviv_gpu_init(struct etnaviv_gpu *gpu) /* Now program the hardware */ mutex_lock(&gpu->lock); etnaviv_gpu_hw_init(gpu); - gpu->exec_state = -1; mutex_unlock(&gpu->lock); pm_runtime_mark_last_busy(gpu->dev); @@ -998,8 +999,6 @@ void etnaviv_gpu_recover_hang(struct etnaviv_gpu *gpu) spin_unlock(&gpu->event_spinlock); etnaviv_gpu_hw_init(gpu); - gpu->exec_state = -1; - gpu->mmu_context = NULL; mutex_unlock(&gpu->lock); pm_runtime_mark_last_busy(gpu->dev); From 74f6b2ae37821ab9b67b2f1716d02308d8558b67 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Fri, 20 Aug 2021 22:18:28 +0200 Subject: [PATCH 0836/5344] drm/etnaviv: fix MMU context leak on GPU reset commit f978a5302f5566480c58ffae64a16d34456801bd upstream. After a reset the GPU is no longer using the MMU context and may be restarted with a different context. While the mmu_state proeprly was cleared, the context wasn't unreferenced, leading to a memory leak. Cc: stable@vger.kernel.org # 5.4 Reported-by: Michael Walle Signed-off-by: Lucas Stach Tested-by: Michael Walle Tested-by: Marek Vasut Reviewed-by: Christian Gmeiner Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/etnaviv/etnaviv_gpu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c index 70d80b2016b6e..342435c6e94ef 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c @@ -547,6 +547,8 @@ static int etnaviv_hw_reset(struct etnaviv_gpu *gpu) gpu->fe_running = false; gpu->exec_state = -1; + if (gpu->mmu_context) + etnaviv_iommu_context_put(gpu->mmu_context); gpu->mmu_context = NULL; return 0; From 9db321cac9c409f1905d392736c2977c4b0cc3c8 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Fri, 20 Aug 2021 22:18:29 +0200 Subject: [PATCH 0837/5344] drm/etnaviv: reference MMU context when setting up hardware state commit d6408538f091fb22d47f792d4efa58143d56c3fb upstream. Move the refcount manipulation of the MMU context to the point where the hardware state is programmed. At that point it is also known if a previous MMU state is still there, or the state needs to be reprogrammed with a potentially different context. Cc: stable@vger.kernel.org # 5.4 Signed-off-by: Lucas Stach Tested-by: Michael Walle Tested-by: Marek Vasut Reviewed-by: Christian Gmeiner Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/etnaviv/etnaviv_gpu.c | 24 +++++++++++----------- drivers/gpu/drm/etnaviv/etnaviv_iommu.c | 4 ++++ drivers/gpu/drm/etnaviv/etnaviv_iommu_v2.c | 8 ++++++++ 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c index 342435c6e94ef..db35736d47af2 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c @@ -617,17 +617,19 @@ void etnaviv_gpu_start_fe(struct etnaviv_gpu *gpu, u32 address, u16 prefetch) gpu->fe_running = true; } -static void etnaviv_gpu_start_fe_idleloop(struct etnaviv_gpu *gpu) +static void etnaviv_gpu_start_fe_idleloop(struct etnaviv_gpu *gpu, + struct etnaviv_iommu_context *context) { - u32 address = etnaviv_cmdbuf_get_va(&gpu->buffer, - &gpu->mmu_context->cmdbuf_mapping); u16 prefetch; + u32 address; /* setup the MMU */ - etnaviv_iommu_restore(gpu, gpu->mmu_context); + etnaviv_iommu_restore(gpu, context); /* Start command processor */ prefetch = etnaviv_buffer_init(gpu); + address = etnaviv_cmdbuf_get_va(&gpu->buffer, + &gpu->mmu_context->cmdbuf_mapping); etnaviv_gpu_start_fe(gpu, address, prefetch); } @@ -1311,14 +1313,12 @@ struct dma_fence *etnaviv_gpu_submit(struct etnaviv_gem_submit *submit) goto out_unlock; } - if (!gpu->fe_running) { - gpu->mmu_context = etnaviv_iommu_context_get(submit->mmu_context); - etnaviv_gpu_start_fe_idleloop(gpu); - } else { - if (submit->prev_mmu_context) - etnaviv_iommu_context_put(submit->prev_mmu_context); - submit->prev_mmu_context = etnaviv_iommu_context_get(gpu->mmu_context); - } + if (!gpu->fe_running) + etnaviv_gpu_start_fe_idleloop(gpu, submit->mmu_context); + + if (submit->prev_mmu_context) + etnaviv_iommu_context_put(submit->prev_mmu_context); + submit->prev_mmu_context = etnaviv_iommu_context_get(gpu->mmu_context); if (submit->nr_pmrs) { gpu->event[event[1]].sync_point = &sync_point_perfmon_sample_pre; diff --git a/drivers/gpu/drm/etnaviv/etnaviv_iommu.c b/drivers/gpu/drm/etnaviv/etnaviv_iommu.c index 1a7c89a67bea3..afe5dd6a9925b 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_iommu.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_iommu.c @@ -92,6 +92,10 @@ static void etnaviv_iommuv1_restore(struct etnaviv_gpu *gpu, struct etnaviv_iommuv1_context *v1_context = to_v1_context(context); u32 pgtable; + if (gpu->mmu_context) + etnaviv_iommu_context_put(gpu->mmu_context); + gpu->mmu_context = etnaviv_iommu_context_get(context); + /* set base addresses */ gpu_write(gpu, VIVS_MC_MEMORY_BASE_ADDR_RA, context->global->memory_base); gpu_write(gpu, VIVS_MC_MEMORY_BASE_ADDR_FE, context->global->memory_base); diff --git a/drivers/gpu/drm/etnaviv/etnaviv_iommu_v2.c b/drivers/gpu/drm/etnaviv/etnaviv_iommu_v2.c index f8bf488e9d717..d664ae29ae209 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_iommu_v2.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_iommu_v2.c @@ -172,6 +172,10 @@ static void etnaviv_iommuv2_restore_nonsec(struct etnaviv_gpu *gpu, if (gpu_read(gpu, VIVS_MMUv2_CONTROL) & VIVS_MMUv2_CONTROL_ENABLE) return; + if (gpu->mmu_context) + etnaviv_iommu_context_put(gpu->mmu_context); + gpu->mmu_context = etnaviv_iommu_context_get(context); + prefetch = etnaviv_buffer_config_mmuv2(gpu, (u32)v2_context->mtlb_dma, (u32)context->global->bad_page_dma); @@ -192,6 +196,10 @@ static void etnaviv_iommuv2_restore_sec(struct etnaviv_gpu *gpu, if (gpu_read(gpu, VIVS_MMUv2_SEC_CONTROL) & VIVS_MMUv2_SEC_CONTROL_ENABLE) return; + if (gpu->mmu_context) + etnaviv_iommu_context_put(gpu->mmu_context); + gpu->mmu_context = etnaviv_iommu_context_get(context); + gpu_write(gpu, VIVS_MMUv2_PTA_ADDRESS_LOW, lower_32_bits(context->global->v2.pta_dma)); gpu_write(gpu, VIVS_MMUv2_PTA_ADDRESS_HIGH, From 3ae8bfe2166254e12368e6520e529b9f7fb3fa9c Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Fri, 20 Aug 2021 22:18:30 +0200 Subject: [PATCH 0838/5344] drm/etnaviv: add missing MMU context put when reaping MMU mapping commit f2faea8b64125852fa9acc6771c07fc0311a039b upstream. When we forcefully evict a mapping from the the address space and thus the MMU context, the MMU context is leaked, as the mapping no longer points to it, so it doesn't get freed when the GEM object is destroyed. Add the mssing context put to fix the leak. Cc: stable@vger.kernel.org # 5.4 Signed-off-by: Lucas Stach Tested-by: Michael Walle Tested-by: Marek Vasut Reviewed-by: Christian Gmeiner Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/etnaviv/etnaviv_mmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_mmu.c b/drivers/gpu/drm/etnaviv/etnaviv_mmu.c index 3607d348c2980..707f5c1a58740 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_mmu.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_mmu.c @@ -204,6 +204,7 @@ static int etnaviv_iommu_find_iova(struct etnaviv_iommu_context *context, */ list_for_each_entry_safe(m, n, &list, scan_node) { etnaviv_iommu_remove_mapping(context, m); + etnaviv_iommu_context_put(m->context); m->context = NULL; list_del_init(&m->mmu_node); list_del_init(&m->scan_node); From 64a011f86456b8e43dd49b4561398cd1f3366e0b Mon Sep 17 00:00:00 2001 From: Alexander Egorenkov Date: Thu, 9 Sep 2021 12:20:56 +0200 Subject: [PATCH 0839/5344] s390/sclp: fix Secure-IPL facility detection commit d76b14f3971a0638b6cd0da289f8b48acee287d0 upstream. Prevent out-of-range access if the returned SCLP SCCB response is smaller in size than the address of the Secure-IPL flag. Fixes: c9896acc7851 ("s390/ipl: Provide has_secure sysfs attribute") Cc: stable@vger.kernel.org # 5.2+ Signed-off-by: Alexander Egorenkov Reviewed-by: Christian Borntraeger Signed-off-by: Vasily Gorbik Signed-off-by: Greg Kroah-Hartman --- drivers/s390/char/sclp_early.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c index cc5e84b80c699..faa3a4b8ed91d 100644 --- a/drivers/s390/char/sclp_early.c +++ b/drivers/s390/char/sclp_early.c @@ -40,13 +40,14 @@ static void __init sclp_early_facilities_detect(struct read_info_sccb *sccb) sclp.has_gisaf = !!(sccb->fac118 & 0x08); sclp.has_hvs = !!(sccb->fac119 & 0x80); sclp.has_kss = !!(sccb->fac98 & 0x01); - sclp.has_sipl = !!(sccb->cbl & 0x4000); if (sccb->fac85 & 0x02) S390_lowcore.machine_flags |= MACHINE_FLAG_ESOP; if (sccb->fac91 & 0x40) S390_lowcore.machine_flags |= MACHINE_FLAG_TLB_GUEST; if (sccb->cpuoff > 134) sclp.has_diag318 = !!(sccb->byte_134 & 0x80); + if (sccb->cpuoff > 137) + sclp.has_sipl = !!(sccb->cbl & 0x4000); sclp.rnmax = sccb->rnmax ? sccb->rnmax : sccb->rnmax2; sclp.rzm = sccb->rnsize ? sccb->rnsize : sccb->rnsize2; sclp.rzm <<= 20; From c7cd004575722e422afcda5dbeaa3c1f9db30f16 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 23 Jul 2021 13:25:36 -0400 Subject: [PATCH 0840/5344] tipc: fix an use-after-free issue in tipc_recvmsg commit cc19862ffe454a5b632ca202e5a51bfec9f89fd2 upstream. syzbot reported an use-after-free crash: BUG: KASAN: use-after-free in tipc_recvmsg+0xf77/0xf90 net/tipc/socket.c:1979 Call Trace: tipc_recvmsg+0xf77/0xf90 net/tipc/socket.c:1979 sock_recvmsg_nosec net/socket.c:943 [inline] sock_recvmsg net/socket.c:961 [inline] sock_recvmsg+0xca/0x110 net/socket.c:957 tipc_conn_rcv_from_sock+0x162/0x2f0 net/tipc/topsrv.c:398 tipc_conn_recv_work+0xeb/0x190 net/tipc/topsrv.c:421 process_one_work+0x98d/0x1630 kernel/workqueue.c:2276 worker_thread+0x658/0x11f0 kernel/workqueue.c:2422 As Hoang pointed out, it was caused by skb_cb->bytes_read still accessed after calling tsk_advance_rx_queue() to free the skb in tipc_recvmsg(). This patch is to fix it by accessing skb_cb->bytes_read earlier than calling tsk_advance_rx_queue(). Fixes: f4919ff59c28 ("tipc: keep the skb in rcv queue until the whole data is read") Reported-by: syzbot+e6741b97d5552f97c24d@syzkaller.appspotmail.com Signed-off-by: Xin Long Acked-by: Jon Maloy Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/tipc/socket.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 231f9e1bf6bb2..3d6e460d99718 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1849,10 +1849,12 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, tipc_node_distr_xmit(sock_net(sk), &xmitq); } - if (!skb_cb->bytes_read) - tsk_advance_rx_queue(sk); + if (skb_cb->bytes_read) + goto exit; + + tsk_advance_rx_queue(sk); - if (likely(!connected) || skb_cb->bytes_read) + if (likely(!connected)) goto exit; /* Send connection flow control advertisement when applicable */ From dfc3d4cf4cf6b78e176a22c7c1df4e69ff0aac48 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 13 Sep 2021 11:08:36 -0700 Subject: [PATCH 0841/5344] net-caif: avoid user-triggerable WARN_ON(1) commit 550ac9c1aaaaf51fd42e20d461f0b1cdbd55b3d2 upstream. syszbot triggers this warning, which looks something we can easily prevent. If we initialize priv->list_field in chnl_net_init(), then always use list_del_init(), we can remove robust_list_del() completely. WARNING: CPU: 0 PID: 3233 at net/caif/chnl_net.c:67 robust_list_del net/caif/chnl_net.c:67 [inline] WARNING: CPU: 0 PID: 3233 at net/caif/chnl_net.c:67 chnl_net_uninit+0xc9/0x2e0 net/caif/chnl_net.c:375 Modules linked in: CPU: 0 PID: 3233 Comm: syz-executor.3 Not tainted 5.14.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:robust_list_del net/caif/chnl_net.c:67 [inline] RIP: 0010:chnl_net_uninit+0xc9/0x2e0 net/caif/chnl_net.c:375 Code: 89 eb e8 3a a3 ba f8 48 89 d8 48 c1 e8 03 42 80 3c 28 00 0f 85 bf 01 00 00 48 81 fb 00 14 4e 8d 48 8b 2b 75 d0 e8 17 a3 ba f8 <0f> 0b 5b 5d 41 5c 41 5d e9 0a a3 ba f8 4c 89 e3 e8 02 a3 ba f8 4c RSP: 0018:ffffc90009067248 EFLAGS: 00010202 RAX: 0000000000008780 RBX: ffffffff8d4e1400 RCX: ffffc9000fd34000 RDX: 0000000000040000 RSI: ffffffff88bb6e49 RDI: 0000000000000003 RBP: ffff88802cd9ee08 R08: 0000000000000000 R09: ffffffff8d0e6647 R10: ffffffff88bb6dc2 R11: 0000000000000000 R12: ffff88803791ae08 R13: dffffc0000000000 R14: 00000000e600ffce R15: ffff888073ed3480 FS: 00007fed10fa0700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b2c322000 CR3: 00000000164a6000 CR4: 00000000001506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: register_netdevice+0xadf/0x1500 net/core/dev.c:10347 ipcaif_newlink+0x4c/0x260 net/caif/chnl_net.c:468 __rtnl_newlink+0x106d/0x1750 net/core/rtnetlink.c:3458 rtnl_newlink+0x64/0xa0 net/core/rtnetlink.c:3506 rtnetlink_rcv_msg+0x413/0xb80 net/core/rtnetlink.c:5572 netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2504 netlink_unicast_kernel net/netlink/af_netlink.c:1314 [inline] netlink_unicast+0x533/0x7d0 net/netlink/af_netlink.c:1340 netlink_sendmsg+0x86d/0xdb0 net/netlink/af_netlink.c:1929 sock_sendmsg_nosec net/socket.c:704 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:724 __sys_sendto+0x21c/0x320 net/socket.c:2036 __do_sys_sendto net/socket.c:2048 [inline] __se_sys_sendto net/socket.c:2044 [inline] __x64_sys_sendto+0xdd/0x1b0 net/socket.c:2044 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae Fixes: cc36a070b590 ("net-caif: add CAIF netdevice") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/caif/chnl_net.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index a566289628522..910f164dd20cb 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -53,20 +53,6 @@ struct chnl_net { enum caif_states state; }; -static void robust_list_del(struct list_head *delete_node) -{ - struct list_head *list_node; - struct list_head *n; - ASSERT_RTNL(); - list_for_each_safe(list_node, n, &chnl_net_list) { - if (list_node == delete_node) { - list_del(list_node); - return; - } - } - WARN_ON(1); -} - static int chnl_recv_cb(struct cflayer *layr, struct cfpkt *pkt) { struct sk_buff *skb; @@ -368,6 +354,7 @@ static int chnl_net_init(struct net_device *dev) ASSERT_RTNL(); priv = netdev_priv(dev); strncpy(priv->name, dev->name, sizeof(priv->name)); + INIT_LIST_HEAD(&priv->list_field); return 0; } @@ -376,7 +363,7 @@ static void chnl_net_uninit(struct net_device *dev) struct chnl_net *priv; ASSERT_RTNL(); priv = netdev_priv(dev); - robust_list_del(&priv->list_field); + list_del_init(&priv->list_field); } static const struct net_device_ops netdev_ops = { @@ -541,7 +528,7 @@ static void __exit chnl_exit_module(void) rtnl_lock(); list_for_each_safe(list_node, _tmp, &chnl_net_list) { dev = list_entry(list_node, struct chnl_net, list_field); - list_del(list_node); + list_del_init(list_node); delete_device(dev); } rtnl_unlock(); From 8c31dd4f6ecd02d88165c9d2b8477dd09fa7dc90 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 13 Sep 2021 15:06:05 -0700 Subject: [PATCH 0842/5344] ptp: dp83640: don't define PAGE0 commit 7366c23ff492ad260776a3ee1aaabba9fc773a8b upstream. Building dp83640.c on arch/parisc/ produces a build warning for PAGE0 being redefined. Since the macro is not used in the dp83640 driver, just make it a comment for documentation purposes. In file included from ../drivers/net/phy/dp83640.c:23: ../drivers/net/phy/dp83640_reg.h:8: warning: "PAGE0" redefined 8 | #define PAGE0 0x0000 from ../drivers/net/phy/dp83640.c:11: ../arch/parisc/include/asm/page.h:187: note: this is the location of the previous definition 187 | #define PAGE0 ((struct zeropage *)__PAGE_OFFSET) Fixes: cb646e2b02b2 ("ptp: Added a clock driver for the National Semiconductor PHYTER.") Signed-off-by: Randy Dunlap Reported-by: Geert Uytterhoeven Cc: Richard Cochran Cc: John Stultz Cc: Heiner Kallweit Cc: Russell King Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20210913220605.19682-1-rdunlap@infradead.org Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/net/phy/dp83640_reg.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/dp83640_reg.h b/drivers/net/phy/dp83640_reg.h index 21aa24c741b96..daae7fa58fb82 100644 --- a/drivers/net/phy/dp83640_reg.h +++ b/drivers/net/phy/dp83640_reg.h @@ -5,7 +5,7 @@ #ifndef HAVE_DP83640_REGISTERS #define HAVE_DP83640_REGISTERS -#define PAGE0 0x0000 +/* #define PAGE0 0x0000 */ #define PHYCR2 0x001c /* PHY Control Register 2 */ #define PAGE4 0x0004 From 5b44a2381235b43a29ad27b65b10434b1e9208cf Mon Sep 17 00:00:00 2001 From: "Lin, Zhenpeng" Date: Wed, 8 Sep 2021 03:40:59 +0000 Subject: [PATCH 0843/5344] dccp: don't duplicate ccid when cloning dccp sock commit d9ea761fdd197351890418acd462c51f241014a7 upstream. Commit 2677d2067731 ("dccp: don't free ccid2_hc_tx_sock ...") fixed a UAF but reintroduced CVE-2017-6074. When the sock is cloned, two dccps_hc_tx_ccid will reference to the same ccid. So one can free the ccid object twice from two socks after cloning. This issue was found by "Hadar Manor" as well and assigned with CVE-2020-16119, which was fixed in Ubuntu's kernel. So here I port the patch from Ubuntu to fix it. The patch prevents cloned socks from referencing the same ccid. Fixes: 2677d2067731410 ("dccp: don't free ccid2_hc_tx_sock ...") Signed-off-by: Zhenpeng Lin Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/dccp/minisocks.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 25187528c308a..1f352d669c944 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -94,6 +94,8 @@ struct sock *dccp_create_openreq_child(const struct sock *sk, newdp->dccps_role = DCCP_ROLE_SERVER; newdp->dccps_hc_rx_ackvec = NULL; newdp->dccps_service_list = NULL; + newdp->dccps_hc_rx_ccid = NULL; + newdp->dccps_hc_tx_ccid = NULL; newdp->dccps_service = dreq->dreq_service; newdp->dccps_timestamp_echo = dreq->dreq_timestamp_echo; newdp->dccps_timestamp_time = dreq->dreq_timestamp_time; From bf5fa4d761622fdd51f871337aff16033b4666c6 Mon Sep 17 00:00:00 2001 From: Xiyu Yang Date: Thu, 9 Sep 2021 12:32:00 +0800 Subject: [PATCH 0844/5344] net/l2tp: Fix reference count leak in l2tp_udp_recv_core commit 9b6ff7eb666415e1558f1ba8a742f5db6a9954de upstream. The reference count leak issue may take place in an error handling path. If both conditions of tunnel->version == L2TP_HDR_VER_3 and the return value of l2tp_v3_ensure_opt_in_linear is nonzero, the function would directly jump to label invalid, without decrementing the reference count of the l2tp_session object session increased earlier by l2tp_tunnel_get_session(). This may result in refcount leaks. Fix this issue by decrease the reference count before jumping to the label invalid. Fixes: 4522a70db7aa ("l2tp: fix reading optional fields of L2TPv3") Signed-off-by: Xiyu Yang Signed-off-by: Xin Xiong Signed-off-by: Xin Tan Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/l2tp/l2tp_core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 95805a6331be2..421b2c89ce12a 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -886,8 +886,10 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) } if (tunnel->version == L2TP_HDR_VER_3 && - l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr)) + l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr)) { + l2tp_session_dec_refcount(session); goto error; + } l2tp_recv_common(session, skb, ptr, optr, hdrflags, length); l2tp_session_dec_refcount(session); From 528ffc50691d3e04ceee09528e51e767cb3473e2 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 9 Sep 2021 10:33:28 -0700 Subject: [PATCH 0845/5344] r6040: Restore MDIO clock frequency after MAC reset commit e3f0cc1a945fcefec0c7c9d9dfd028a51daa1846 upstream. A number of users have reported that they were not able to get the PHY to successfully link up, especially after commit c36757eb9dee ("net: phy: consider AN_RESTART status when reading link status") where we stopped reading just BMSR, but we also read BMCR to determine the link status. Andrius at NetBSD did a wonderful job at debugging the problem and found out that the MDIO bus clock frequency would be incorrectly set back to its default value which would prevent the MDIO bus controller from reading PHY registers properly. Back when we only read BMSR, if we read all 1s, we could falsely indicate a link status, though in general there is a cable plugged in, so this went unnoticed. After a second read of BMCR was added, a wrong read will lead to the inability to determine a link UP condition which is when it started to be visibly broken, even if it was long before that. The fix consists in restoring the value of the MD_CSR register that was set prior to the MAC reset. Link: http://gnats.netbsd.org/cgi-bin/query-pr-single.pl?number=53494 Fixes: 90f750a81a29 ("r6040: consolidate MAC reset to its own function") Reported-by: Andrius V Reported-by: Darek Strugacz Tested-by: Darek Strugacz Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/rdc/r6040.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/rdc/r6040.c b/drivers/net/ethernet/rdc/r6040.c index 274e5b4bc4ac8..f158fdf3aab2c 100644 --- a/drivers/net/ethernet/rdc/r6040.c +++ b/drivers/net/ethernet/rdc/r6040.c @@ -119,6 +119,8 @@ #define PHY_ST 0x8A /* PHY status register */ #define MAC_SM 0xAC /* MAC status machine */ #define MAC_SM_RST 0x0002 /* MAC status machine reset */ +#define MD_CSC 0xb6 /* MDC speed control register */ +#define MD_CSC_DEFAULT 0x0030 #define MAC_ID 0xBE /* Identifier register */ #define TX_DCNT 0x80 /* TX descriptor count */ @@ -354,8 +356,9 @@ static void r6040_reset_mac(struct r6040_private *lp) { void __iomem *ioaddr = lp->base; int limit = MAC_DEF_TIMEOUT; - u16 cmd; + u16 cmd, md_csc; + md_csc = ioread16(ioaddr + MD_CSC); iowrite16(MAC_RST, ioaddr + MCR1); while (limit--) { cmd = ioread16(ioaddr + MCR1); @@ -367,6 +370,10 @@ static void r6040_reset_mac(struct r6040_private *lp) iowrite16(MAC_SM_RST, ioaddr + MAC_SM); iowrite16(0, ioaddr + MAC_SM); mdelay(5); + + /* Restore MDIO clock frequency */ + if (md_csc != MD_CSC_DEFAULT) + iowrite16(md_csc, ioaddr + MD_CSC); } static void r6040_init_mac_regs(struct net_device *dev) From ba35cda224276e0fef1cffb0e49137d2d28d9415 Mon Sep 17 00:00:00 2001 From: Hoang Le Date: Mon, 13 Sep 2021 16:28:52 +0700 Subject: [PATCH 0846/5344] tipc: increase timeout in tipc_sk_enqueue() commit f4bb62e64c88c93060c051195d3bbba804e56945 upstream. In tipc_sk_enqueue() we use hardcoded 2 jiffies to extract socket buffer from generic queue to particular socket. The 2 jiffies is too short in case there are other high priority tasks get CPU cycles for multiple jiffies update. As result, no buffer could be enqueued to particular socket. To solve this, we switch to use constant timeout 20msecs. Then, the function will be expired between 2 jiffies (CONFIG_100HZ) and 20 jiffies (CONFIG_1000HZ). Fixes: c637c1035534 ("tipc: resolve race problem at unicast message reception") Acked-by: Jon Maloy Signed-off-by: Hoang Le Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/tipc/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 3d6e460d99718..fbbac9ba2862f 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2275,7 +2275,7 @@ static int tipc_sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, u32 dport, struct sk_buff_head *xmitq) { - unsigned long time_limit = jiffies + 2; + unsigned long time_limit = jiffies + usecs_to_jiffies(20000); struct sk_buff *skb; unsigned int lim; atomic_t *dcnt; From 3889c89180e5f29f2669d184c410979cee42a6e7 Mon Sep 17 00:00:00 2001 From: Michael Petlan Date: Mon, 19 Jul 2021 16:53:32 +0200 Subject: [PATCH 0847/5344] perf machine: Initialize srcline string member in add_location struct commit 57f0ff059e3daa4e70a811cb1d31a49968262d20 upstream. It's later supposed to be either a correct address or NULL. Without the initialization, it may contain an undefined value which results in the following segmentation fault: # perf top --sort comm -g --ignore-callees=do_idle terminates with: #0 0x00007ffff56b7685 in __strlen_avx2 () from /lib64/libc.so.6 #1 0x00007ffff55e3802 in strdup () from /lib64/libc.so.6 #2 0x00005555558cb139 in hist_entry__init (callchain_size=, sample_self=true, template=0x7fffde7fb110, he=0x7fffd801c250) at util/hist.c:489 #3 hist_entry__new (template=template@entry=0x7fffde7fb110, sample_self=sample_self@entry=true) at util/hist.c:564 #4 0x00005555558cb4ba in hists__findnew_entry (hists=hists@entry=0x5555561d9e38, entry=entry@entry=0x7fffde7fb110, al=al@entry=0x7fffde7fb420, sample_self=sample_self@entry=true) at util/hist.c:657 #5 0x00005555558cba1b in __hists__add_entry (hists=hists@entry=0x5555561d9e38, al=0x7fffde7fb420, sym_parent=, bi=bi@entry=0x0, mi=mi@entry=0x0, sample=sample@entry=0x7fffde7fb4b0, sample_self=true, ops=0x0, block_info=0x0) at util/hist.c:288 #6 0x00005555558cbb70 in hists__add_entry (sample_self=true, sample=0x7fffde7fb4b0, mi=0x0, bi=0x0, sym_parent=, al=, hists=0x5555561d9e38) at util/hist.c:1056 #7 iter_add_single_cumulative_entry (iter=0x7fffde7fb460, al=) at util/hist.c:1056 #8 0x00005555558cc8a4 in hist_entry_iter__add (iter=iter@entry=0x7fffde7fb460, al=al@entry=0x7fffde7fb420, max_stack_depth=, arg=arg@entry=0x7fffffff7db0) at util/hist.c:1231 #9 0x00005555557cdc9a in perf_event__process_sample (machine=, sample=0x7fffde7fb4b0, evsel=, event=, tool=0x7fffffff7db0) at builtin-top.c:842 #10 deliver_event (qe=, qevent=) at builtin-top.c:1202 #11 0x00005555558a9318 in do_flush (show_progress=false, oe=0x7fffffff80e0) at util/ordered-events.c:244 #12 __ordered_events__flush (oe=oe@entry=0x7fffffff80e0, how=how@entry=OE_FLUSH__TOP, timestamp=timestamp@entry=0) at util/ordered-events.c:323 #13 0x00005555558a9789 in __ordered_events__flush (timestamp=, how=, oe=) at util/ordered-events.c:339 #14 ordered_events__flush (how=OE_FLUSH__TOP, oe=0x7fffffff80e0) at util/ordered-events.c:341 #15 ordered_events__flush (oe=oe@entry=0x7fffffff80e0, how=how@entry=OE_FLUSH__TOP) at util/ordered-events.c:339 #16 0x00005555557cd631 in process_thread (arg=0x7fffffff7db0) at builtin-top.c:1114 #17 0x00007ffff7bb817a in start_thread () from /lib64/libpthread.so.0 #18 0x00007ffff5656dc3 in clone () from /lib64/libc.so.6 If you look at the frame #2, the code is: 488 if (he->srcline) { 489 he->srcline = strdup(he->srcline); 490 if (he->srcline == NULL) 491 goto err_rawdata; 492 } If he->srcline is not NULL (it is not NULL if it is uninitialized rubbish), it gets strdupped and strdupping a rubbish random string causes the problem. Also, if you look at the commit 1fb7d06a509e, it adds the srcline property into the struct, but not initializing it everywhere needed. Committer notes: Now I see, when using --ignore-callees=do_idle we end up here at line 2189 in add_callchain_ip(): 2181 if (al.sym != NULL) { 2182 if (perf_hpp_list.parent && !*parent && 2183 symbol__match_regex(al.sym, &parent_regex)) 2184 *parent = al.sym; 2185 else if (have_ignore_callees && root_al && 2186 symbol__match_regex(al.sym, &ignore_callees_regex)) { 2187 /* Treat this symbol as the root, 2188 forgetting its callees. */ 2189 *root_al = al; 2190 callchain_cursor_reset(cursor); 2191 } 2192 } And the al that doesn't have the ->srcline field initialized will be copied to the root_al, so then, back to: 1211 int hist_entry_iter__add(struct hist_entry_iter *iter, struct addr_location *al, 1212 int max_stack_depth, void *arg) 1213 { 1214 int err, err2; 1215 struct map *alm = NULL; 1216 1217 if (al) 1218 alm = map__get(al->map); 1219 1220 err = sample__resolve_callchain(iter->sample, &callchain_cursor, &iter->parent, 1221 iter->evsel, al, max_stack_depth); 1222 if (err) { 1223 map__put(alm); 1224 return err; 1225 } 1226 1227 err = iter->ops->prepare_entry(iter, al); 1228 if (err) 1229 goto out; 1230 1231 err = iter->ops->add_single_entry(iter, al); 1232 if (err) 1233 goto out; 1234 That al at line 1221 is what hist_entry_iter__add() (called from sample__resolve_callchain()) saw as 'root_al', and then: iter->ops->add_single_entry(iter, al); will go on with al->srcline with a bogus value, I'll add the above sequence to the cset and apply, thanks! Signed-off-by: Michael Petlan CC: Milian Wolff Cc: Jiri Olsa Fixes: 1fb7d06a509e ("perf report Use srcline from callchain for hist entries") Link: https //lore.kernel.org/r/20210719145332.29747-1-mpetlan@redhat.com Reported-by: Juri Lelli Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Greg Kroah-Hartman --- tools/perf/util/machine.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 767fe1bfd922c..8c3addc2e9e1e 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -2020,6 +2020,7 @@ static int add_callchain_ip(struct thread *thread, al.filtered = 0; al.sym = NULL; + al.srcline = NULL; if (!cpumode) { thread__find_cpumode_addr_location(thread, ip, &al); } else { From 965f911455fe116481788fddf384697a354853ab Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Wed, 18 Aug 2021 13:09:26 -0700 Subject: [PATCH 0848/5344] net/mlx5: FWTrace, cancel work on alloc pd error flow commit dfe6fd72b5f1878b16aa2c8603e031bbcd66b96d upstream. Handle error flow on mlx5_core_alloc_pd() failure, read_fw_strings_work must be canceled. Fixes: c71ad41ccb0c ("net/mlx5: FW tracer, events handling") Reported-by: Pavel Machek (CIP) Suggested-by: Pavel Machek (CIP) Signed-off-by: Saeed Mahameed Reviewed-by: Aya Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c index dc36b0db37222..97359417c6e7f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c @@ -1005,7 +1005,7 @@ int mlx5_fw_tracer_init(struct mlx5_fw_tracer *tracer) err = mlx5_core_alloc_pd(dev, &tracer->buff.pdn); if (err) { mlx5_core_warn(dev, "FWTracer: Failed to allocate PD %d\n", err); - return err; + goto err_cancel_work; } err = mlx5_fw_tracer_create_mkey(tracer); @@ -1029,6 +1029,7 @@ int mlx5_fw_tracer_init(struct mlx5_fw_tracer *tracer) mlx5_core_destroy_mkey(dev, &tracer->buff.mkey); err_dealloc_pd: mlx5_core_dealloc_pd(dev, tracer->buff.pdn); +err_cancel_work: cancel_work_sync(&tracer->read_fw_strings_work); return err; } From 4dab5c7e2106ea663e32c46f1101174c8222d36d Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Wed, 1 Sep 2021 11:48:13 +0300 Subject: [PATCH 0849/5344] net/mlx5: Fix potential sleeping in atomic context commit ee27e330a953595903979ffdb84926843595a9fe upstream. Fixes the below flow of sleeping in atomic context by releasing the RCU lock before calling to free_match_list. build_match_list() <- disables preempt -> free_match_list() -> tree_put_node() -> down_write_ref_node() <- take write lock Fixes: 693c6883bbc4 ("net/mlx5: Add hash table for flow groups in flow table") Reported-by: Dan Carpenter Signed-off-by: Maor Gottlieb Signed-off-by: Saeed Mahameed Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 0ae3846d75e91..0f7ab9bea2ac4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -1597,9 +1597,9 @@ static int build_match_list(struct match_list_head *match_head, curr_match = kmalloc(sizeof(*curr_match), GFP_ATOMIC); if (!curr_match) { + rcu_read_unlock(); free_match_list(match_head, ft_locked); - err = -ENOMEM; - goto out; + return -ENOMEM; } if (!tree_get_node(&g->node)) { kfree(curr_match); @@ -1608,7 +1608,6 @@ static int build_match_list(struct match_list_head *match_head, curr_match->g = g; list_add_tail(&curr_match->list, &match_head->list); } -out: rcu_read_unlock(); return err; } From e0c35bc76e8a0d94e9ad9da51eb3f0380580b471 Mon Sep 17 00:00:00 2001 From: Baptiste Lepers Date: Mon, 6 Sep 2021 11:53:10 +1000 Subject: [PATCH 0850/5344] events: Reuse value read using READ_ONCE instead of re-reading it commit b89a05b21f46150ac10a962aa50109250b56b03b upstream. In perf_event_addr_filters_apply, the task associated with the event (event->ctx->task) is read using READ_ONCE at the beginning of the function, checked, and then re-read from event->ctx->task, voiding all guarantees of the checks. Reuse the value that was read by READ_ONCE to ensure the consistency of the task struct throughout the function. Fixes: 375637bc52495 ("perf/core: Introduce address range filtering") Signed-off-by: Baptiste Lepers Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210906015310.12802-1-baptiste.lepers@gmail.com Signed-off-by: Greg Kroah-Hartman --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index c1e28439fc439..6ed10918668b8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9287,7 +9287,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event) return; if (ifh->nr_file_filters) { - mm = get_task_mm(event->ctx->task); + mm = get_task_mm(task); if (!mm) goto restart; From f162afac39f27432e125f5cb700747fa94d6cf30 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 8 Sep 2021 13:42:09 +0200 Subject: [PATCH 0851/5344] vhost_net: fix OoB on sendmsg() failure. commit 3c4cea8fa7f71f00c5279547043a84bc2a4d8b8c upstream. If the sendmsg() call in vhost_tx_batch() fails, both the 'batched_xdp' and 'done_idx' indexes are left unchanged. If such failure happens when batched_xdp == VHOST_NET_BATCH, the next call to vhost_net_build_xdp() will access and write memory outside the xdp buffers area. Since sendmsg() can only error with EBADFD, this change addresses the issue explicitly freeing the XDP buffers batch on error. Fixes: 0a0be13b8fe2 ("vhost_net: batch submitting XDP buffers to underlayer sockets") Suggested-by: Jason Wang Signed-off-by: Paolo Abeni Acked-by: Jason Wang Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/vhost/net.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 8b976b3e1bd15..0d77de0447ba9 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -466,7 +466,7 @@ static void vhost_tx_batch(struct vhost_net *net, .num = nvq->batched_xdp, .ptr = nvq->xdp, }; - int err; + int i, err; if (nvq->batched_xdp == 0) goto signal_used; @@ -475,6 +475,15 @@ static void vhost_tx_batch(struct vhost_net *net, err = sock->ops->sendmsg(sock, msghdr, 0); if (unlikely(err < 0)) { vq_err(&nvq->vq, "Fail to batch sending packets\n"); + + /* free pages owned by XDP; since this is an unlikely error path, + * keep it simple and avoid more complex bulk update for the + * used pages + */ + for (i = 0; i < nvq->batched_xdp; ++i) + put_page(virt_to_head_page(nvq->xdp[i].data)); + nvq->batched_xdp = 0; + nvq->done_idx = 0; return; } From 647cb9cb68408a69aaada5ab98c991e0ab3746cc Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 14 Sep 2021 16:43:31 +0300 Subject: [PATCH 0852/5344] net: dsa: destroy the phylink instance on any error in dsa_slave_phy_setup commit 6a52e73368038f47f6618623d75061dc263b26ae upstream. DSA supports connecting to a phy-handle, and has a fallback to a non-OF based method of connecting to an internal PHY on the switch's own MDIO bus, if no phy-handle and no fixed-link nodes were present. The -ENODEV error code from the first attempt (phylink_of_phy_connect) is what triggers the second attempt (phylink_connect_phy). However, when the first attempt returns a different error code than -ENODEV, this results in an unbalance of calls to phylink_create and phylink_destroy by the time we exit the function. The phylink instance has leaked. There are many other error codes that can be returned by phylink_of_phy_connect. For example, phylink_validate returns -EINVAL. So this is a practical issue too. Fixes: aab9c4067d23 ("net: dsa: Plug in PHYLINK support") Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Reviewed-by: Russell King (Oracle) Link: https://lore.kernel.org/r/20210914134331.2303380-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- net/dsa/slave.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 75b4cd4bcafb9..59759ceb426ac 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1327,13 +1327,11 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) * use the switch internal MDIO bus instead */ ret = dsa_slave_phy_connect(slave_dev, dp->index); - if (ret) { - netdev_err(slave_dev, - "failed to connect to port %d: %d\n", - dp->index, ret); - phylink_destroy(dp->pl); - return ret; - } + } + if (ret) { + netdev_err(slave_dev, "failed to connect to PHY: %pe\n", + ERR_PTR(ret)); + phylink_destroy(dp->pl); } return ret; From 8acb62162f2644e905da8156c80f1a26d5525dc3 Mon Sep 17 00:00:00 2001 From: zhenggy Date: Tue, 14 Sep 2021 09:51:15 +0800 Subject: [PATCH 0853/5344] tcp: fix tp->undo_retrans accounting in tcp_sacktag_one() commit 4f884f3962767877d7aabbc1ec124d2c307a4257 upstream. Commit 10d3be569243 ("tcp-tso: do not split TSO packets at retransmit time") may directly retrans a multiple segments TSO/GSO packet without split, Since this commit, we can no longer assume that a retransmitted packet is a single segment. This patch fixes the tp->undo_retrans accounting in tcp_sacktag_one() that use the actual segments(pcount) of the retransmitted packet. Before that commit (10d3be569243), the assumption underlying the tp->undo_retrans-- seems correct. Fixes: 10d3be569243 ("tcp-tso: do not split TSO packets at retransmit time") Signed-off-by: zhenggy Reviewed-by: Eric Dumazet Acked-by: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index be3d96c7750b5..1a8ebd9d815a9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1302,7 +1302,7 @@ static u8 tcp_sacktag_one(struct sock *sk, if (dup_sack && (sacked & TCPCB_RETRANS)) { if (tp->undo_marker && tp->undo_retrans > 0 && after(end_seq, tp->undo_marker)) - tp->undo_retrans--; + tp->undo_retrans = max_t(int, 0, tp->undo_retrans - pcount); if ((sacked & TCPCB_SACKED_ACKED) && before(start_seq, state->reord)) state->reord = start_seq; From d3f7ad2cd0a84452f1244018846b3988b46e917a Mon Sep 17 00:00:00 2001 From: Shai Malin Date: Fri, 10 Sep 2021 11:33:56 +0300 Subject: [PATCH 0854/5344] qed: Handle management FW error commit 20e100f52730cd0db609e559799c1712b5f27582 upstream. Handle MFW (management FW) error response in order to avoid a crash during recovery flows. Changes from v1: - Add "Fixes tag". Fixes: tag 5e7ba042fd05 ("qed: Fix reading stale configuration information") Signed-off-by: Ariel Elior Signed-off-by: Shai Malin Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/qlogic/qed/qed_mcp.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c index 9401b49275f0a..74aec5c2732a0 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c @@ -3162,6 +3162,7 @@ qed_mcp_get_nvm_image_att(struct qed_hwfn *p_hwfn, struct qed_nvm_image_att *p_image_att) { enum nvm_image_type type; + int rc; u32 i; /* Translate image_id into MFW definitions */ @@ -3187,7 +3188,10 @@ qed_mcp_get_nvm_image_att(struct qed_hwfn *p_hwfn, return -EINVAL; } - qed_mcp_nvm_info_populate(p_hwfn); + rc = qed_mcp_nvm_info_populate(p_hwfn); + if (rc) + return rc; + for (i = 0; i < p_hwfn->nvm_info.num_images; i++) if (type == p_hwfn->nvm_info.image_att[i].image_type) break; From 0e00fd1a132e6f9379d913c93722481cc13d2000 Mon Sep 17 00:00:00 2001 From: David Heidelberg Date: Sun, 12 Sep 2021 18:51:20 +0200 Subject: [PATCH 0855/5344] dt-bindings: arm: Fix Toradex compatible typo commit 55c21d57eafb7b379bb7b3e93baf9ca2695895b0 upstream. Fix board compatible typo reported by dtbs_check. Fixes: f4d1577e9bc6 ("dt-bindings: arm: Convert Tegra board/soc bindings to json-schema") Signed-off-by: David Heidelberg Link: https://lore.kernel.org/r/20210912165120.188490-1-david@ixit.cz Signed-off-by: Rob Herring Signed-off-by: Greg Kroah-Hartman --- Documentation/devicetree/bindings/arm/tegra.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/arm/tegra.yaml b/Documentation/devicetree/bindings/arm/tegra.yaml index 60b38eb5c61ab..56e1945911f1e 100644 --- a/Documentation/devicetree/bindings/arm/tegra.yaml +++ b/Documentation/devicetree/bindings/arm/tegra.yaml @@ -49,7 +49,7 @@ properties: - const: toradex,apalis_t30 - const: nvidia,tegra30 - items: - - const: toradex,apalis_t30-eval-v1.1 + - const: toradex,apalis_t30-v1.1-eval - const: toradex,apalis_t30-eval - const: toradex,apalis_t30-v1.1 - const: toradex,apalis_t30 From d20660193492f85060e49e8acd99a49bb058ca05 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Wed, 8 Sep 2021 09:58:20 -0700 Subject: [PATCH 0856/5344] ibmvnic: check failover_pending in login response commit 273c29e944bda9a20a30c26cfc34c9a3f363280b upstream. If a failover occurs before a login response is received, the login response buffer maybe undefined. Check that there was no failover before accessing the login response buffer. Fixes: 032c5e82847a ("Driver for IBM System i/p VNIC protocol") Signed-off-by: Sukadev Bhattiprolu Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/ibm/ibmvnic.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index ecfe588f330ef..cfe7229593ead 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -4277,6 +4277,14 @@ static int handle_login_rsp(union ibmvnic_crq *login_rsp_crq, return 0; } + if (adapter->failover_pending) { + adapter->init_done_rc = -EAGAIN; + netdev_dbg(netdev, "Failover pending, ignoring login response\n"); + complete(&adapter->init_done); + /* login response buffer will be released on reset */ + return 0; + } + netdev->mtu = adapter->req_mtu - ETH_HLEN; netdev_dbg(adapter->netdev, "Login Response Buffer:\n"); From 3d3b271ac21fb79163dc6ef18e9377d2e99af19f Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 8 Sep 2021 20:17:18 +1000 Subject: [PATCH 0857/5344] KVM: PPC: Book3S HV: Tolerate treclaim. in fake-suspend mode changing registers commit 267cdfa21385d78c794768233678756e32b39ead upstream. POWER9 DD2.2 and 2.3 hardware implements a "fake-suspend" mode where certain TM instructions executed in HV=0 mode cause softpatch interrupts so the hypervisor can emulate them and prevent problematic processor conditions. In this fake-suspend mode, the treclaim. instruction does not modify registers. Unfortunately the rfscv instruction executed by the guest do not generate softpatch interrupts, which can cause the hypervisor to lose track of the fake-suspend mode, and it can execute this treclaim. while not in fake-suspend mode. This modifies GPRs and crashes the hypervisor. It's not trivial to disable scv in the guest with HFSCR now, because they assume a POWER9 has scv available. So this fix saves and restores checkpointed registers across the treclaim. Fixes: 7854f7545bff ("KVM: PPC: Book3S: Rework TM save/restore code and make it C-callable") Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210908101718.118522-2-npiggin@gmail.com Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 36 +++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index c6fbbd29bd871..feaf6ca2e76c1 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -3137,7 +3137,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_P9_TM_HV_ASSIST) /* The following code handles the fake_suspend = 1 case */ mflr r0 std r0, PPC_LR_STKOFF(r1) - stdu r1, -PPC_MIN_STKFRM(r1) + stdu r1, -TM_FRAME_SIZE(r1) /* Turn on TM. */ mfmsr r8 @@ -3152,10 +3152,42 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG) nop + /* + * It's possible that treclaim. may modify registers, if we have lost + * track of fake-suspend state in the guest due to it using rfscv. + * Save and restore registers in case this occurs. + */ + mfspr r3, SPRN_DSCR + mfspr r4, SPRN_XER + mfspr r5, SPRN_AMR + /* SPRN_TAR would need to be saved here if the kernel ever used it */ + mfcr r12 + SAVE_NVGPRS(r1) + SAVE_GPR(2, r1) + SAVE_GPR(3, r1) + SAVE_GPR(4, r1) + SAVE_GPR(5, r1) + stw r12, 8(r1) + std r1, HSTATE_HOST_R1(r13) + /* We have to treclaim here because that's the only way to do S->N */ li r3, TM_CAUSE_KVM_RESCHED TRECLAIM(R3) + GET_PACA(r13) + ld r1, HSTATE_HOST_R1(r13) + REST_GPR(2, r1) + REST_GPR(3, r1) + REST_GPR(4, r1) + REST_GPR(5, r1) + lwz r12, 8(r1) + REST_NVGPRS(r1) + mtspr SPRN_DSCR, r3 + mtspr SPRN_XER, r4 + mtspr SPRN_AMR, r5 + mtcr r12 + HMT_MEDIUM + /* * We were in fake suspend, so we are not going to save the * register state as the guest checkpointed state (since @@ -3183,7 +3215,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG) std r5, VCPU_TFHAR(r9) std r6, VCPU_TFIAR(r9) - addi r1, r1, PPC_MIN_STKFRM + addi r1, r1, TM_FRAME_SIZE ld r0, PPC_LR_STKOFF(r1) mtlr r0 blr From de754d66c8b577e5f8699714cd47112188a90db8 Mon Sep 17 00:00:00 2001 From: Yufeng Mo Date: Mon, 13 Sep 2021 21:08:21 +0800 Subject: [PATCH 0858/5344] net: hns3: pad the short tunnel frame before sending to hardware commit d18e81183b1cb9c309266cbbce9acd3e0c528d04 upstream. The hardware cannot handle short tunnel frames below 65 bytes, and will cause vlan tag missing problem. So pads packet size to 65 bytes for tunnel frames to fix this bug. Fixes: 3db084d28dc0("net: hns3: Fix for vxlan tx checksum bug") Signed-off-by: Yufeng Mo Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index e64e175162068..db9c8f943811b 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -56,6 +56,7 @@ MODULE_PARM_DESC(debug, " Network interface message level setting"); #define HNS3_OUTER_VLAN_TAG 2 #define HNS3_MIN_TX_LEN 33U +#define HNS3_MIN_TUN_PKT_LEN 65U /* hns3_pci_tbl - PCI Device ID Table * @@ -931,8 +932,11 @@ static int hns3_set_l2l3l4(struct sk_buff *skb, u8 ol4_proto, l4.tcp->doff); break; case IPPROTO_UDP: - if (hns3_tunnel_csum_bug(skb)) - return skb_checksum_help(skb); + if (hns3_tunnel_csum_bug(skb)) { + int ret = skb_put_padto(skb, HNS3_MIN_TUN_PKT_LEN); + + return ret ? ret : skb_checksum_help(skb); + } hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4CS_B, 1); hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4T_S, From 100a482532577484c690f6f25156182dbd0929ee Mon Sep 17 00:00:00 2001 From: Yufeng Mo Date: Mon, 13 Sep 2021 21:08:22 +0800 Subject: [PATCH 0859/5344] net: hns3: change affinity_mask to numa node range commit 1dc839ec09d3ab2a4156dc98328b8bc3586f2b70 upstream. Currently, affinity_mask is set to a single cpu. As a result, irqbalance becomes invalid in SUBSET or EXACT mode. To solve this problem, change affinity_mask to numa node range. In this way, irqbalance can be performed on the cpu of the numa node. Fixes: 0812545487ec ("net: hns3: add interrupt affinity support for misc interrupt") Signed-off-by: Yufeng Mo Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index aa402e2671212..69bf78e5240cf 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -1328,9 +1328,10 @@ static void hclge_init_kdump_kernel_config(struct hclge_dev *hdev) static int hclge_configure(struct hclge_dev *hdev) { + const struct cpumask *cpumask = cpu_online_mask; struct hclge_cfg cfg; unsigned int i; - int ret; + int node, ret; ret = hclge_get_cfg(hdev, &cfg); if (ret) { @@ -1390,11 +1391,12 @@ static int hclge_configure(struct hclge_dev *hdev) hclge_init_kdump_kernel_config(hdev); - /* Set the init affinity based on pci func number */ - i = cpumask_weight(cpumask_of_node(dev_to_node(&hdev->pdev->dev))); - i = i ? PCI_FUNC(hdev->pdev->devfn) % i : 0; - cpumask_set_cpu(cpumask_local_spread(i, dev_to_node(&hdev->pdev->dev)), - &hdev->affinity_mask); + /* Set the affinity based on numa node */ + node = dev_to_node(&hdev->pdev->dev); + if (node != NUMA_NO_NODE) + cpumask = cpumask_of_node(node); + + cpumask_copy(&hdev->affinity_mask, cpumask); return ret; } From 6f63047fc5487b59b298a7c8470d8dacab33218d Mon Sep 17 00:00:00 2001 From: Yufeng Mo Date: Mon, 13 Sep 2021 21:08:23 +0800 Subject: [PATCH 0860/5344] net: hns3: disable mac in flr process commit b81d8948746520f989e86d66292ff72b5056114a upstream. The firmware will not disable mac in flr process. Therefore, the driver needs to proactively disable mac during flr, which is the same as the function reset. Fixes: 35d93a30040c ("net: hns3: adjust the process of PF reset") Signed-off-by: Yufeng Mo Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 69bf78e5240cf..f44e8401496b1 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -6685,11 +6685,12 @@ static void hclge_ae_stop(struct hnae3_handle *handle) hclge_clear_arfs_rules(handle); spin_unlock_bh(&hdev->fd_rule_lock); - /* If it is not PF reset, the firmware will disable the MAC, + /* If it is not PF reset or FLR, the firmware will disable the MAC, * so it only need to stop phy here. */ if (test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state) && - hdev->reset_type != HNAE3_FUNC_RESET) { + hdev->reset_type != HNAE3_FUNC_RESET && + hdev->reset_type != HNAE3_FLR_RESET) { hclge_mac_stop_phy(hdev); hclge_update_link_status(hdev); return; From 20d47294de76b279e8d1e807f250ba4da75bedfc Mon Sep 17 00:00:00 2001 From: Jiaran Zhang Date: Mon, 13 Sep 2021 21:08:25 +0800 Subject: [PATCH 0861/5344] net: hns3: fix the timing issue of VF clearing interrupt sources commit 427900d27d86b820c559037a984bd403f910860f upstream. Currently, the VF does not clear the interrupt source immediately after receiving the interrupt. As a result, if the second interrupt task is triggered when processing the first interrupt task, clearing the interrupt source before exiting will clear the interrupt sources of the two tasks at the same time. As a result, no interrupt is triggered for the second task. The VF detects the missed message only when the next interrupt is generated. Clearing it immediately after executing check_evt_cause ensures that: 1. Even if two interrupt tasks are triggered at the same time, they can be processed. 2. If the second task is triggered during the processing of the first task and the interrupt source is not cleared, the interrupt is reported after vector0 is enabled. Fixes: b90fcc5bd904 ("net: hns3: add reset handling for VF when doing Core/Global/IMP reset") Signed-off-by: Jiaran Zhang Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index ea348ebbbf2e9..db2e9dd5681eb 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -1956,6 +1956,8 @@ static irqreturn_t hclgevf_misc_irq_handle(int irq, void *data) hclgevf_enable_vector(&hdev->misc_vector, false); event_cause = hclgevf_check_evt_cause(hdev, &clearval); + if (event_cause != HCLGEVF_VECTOR0_EVENT_OTHER) + hclgevf_clear_event_cause(hdev, clearval); switch (event_cause) { case HCLGEVF_VECTOR0_EVENT_RST: @@ -1968,10 +1970,8 @@ static irqreturn_t hclgevf_misc_irq_handle(int irq, void *data) break; } - if (event_cause != HCLGEVF_VECTOR0_EVENT_OTHER) { - hclgevf_clear_event_cause(hdev, clearval); + if (event_cause != HCLGEVF_VECTOR0_EVENT_OTHER) hclgevf_enable_vector(&hdev->misc_vector, true); - } return IRQ_HANDLED; } From f62dbad4ee1c65ed9c7f3a7c49265c03329d3b9f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 7 Sep 2021 19:54:59 -0700 Subject: [PATCH 0862/5344] mm/memory_hotplug: use "unsigned long" for PFN in zone_for_pfn_range() commit 7cf209ba8a86410939a24cb1aeb279479a7e0ca6 upstream. Patch series "mm/memory_hotplug: preparatory patches for new online policy and memory" These are all cleanups and one fix previously sent as part of [1]: [PATCH v1 00/12] mm/memory_hotplug: "auto-movable" online policy and memory groups. These patches make sense even without the other series, therefore I pulled them out to make the other series easier to digest. [1] https://lkml.kernel.org/r/20210607195430.48228-1-david@redhat.com This patch (of 4): Checkpatch complained on a follow-up patch that we are using "unsigned" here, which defaults to "unsigned int" and checkpatch is correct. As we will search for a fitting zone using the wrong pfn, we might end up onlining memory to one of the special kernel zones, such as ZONE_DMA, which can end badly as the onlined memory does not satisfy properties of these zones. Use "unsigned long" instead, just as we do in other places when handling PFNs. This can bite us once we have physical addresses in the range of multiple TB. Link: https://lkml.kernel.org/r/20210712124052.26491-2-david@redhat.com Fixes: e5e689302633 ("mm, memory_hotplug: display allowed zones in the preferred ordering") Signed-off-by: David Hildenbrand Reviewed-by: Pankaj Gupta Reviewed-by: Muchun Song Reviewed-by: Oscar Salvador Cc: David Hildenbrand Cc: Vitaly Kuznetsov Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Cc: Wei Yang Cc: Michal Hocko Cc: Dan Williams Cc: Anshuman Khandual Cc: Dave Hansen Cc: Vlastimil Babka Cc: Mike Rapoport Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: Pavel Tatashin Cc: Heiko Carstens Cc: Michael Ellerman Cc: Catalin Marinas Cc: virtualization@lists.linux-foundation.org Cc: Andy Lutomirski Cc: "Aneesh Kumar K.V" Cc: Anton Blanchard Cc: Ard Biesheuvel Cc: Baoquan He Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Dave Jiang Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jia He Cc: Joe Perches Cc: Kefeng Wang Cc: Laurent Dufour Cc: Michel Lespinasse Cc: Nathan Lynch Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Pierre Morel Cc: "Rafael J. Wysocki" Cc: Rich Felker Cc: Scott Cheloha Cc: Sergei Trofimovich Cc: Thiago Jung Bauermann Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vishal Verma Cc: Will Deacon Cc: Yoshinori Sato Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: David Hildenbrand Signed-off-by: Greg Kroah-Hartman --- include/linux/memory_hotplug.h | 4 ++-- mm/memory_hotplug.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 3723308ac3d22..bc4753fd4d45f 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -331,6 +331,6 @@ extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type); -extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, - unsigned long nr_pages); +extern struct zone *zone_for_pfn_range(int online_type, int nid, + unsigned long start_pfn, unsigned long nr_pages); #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 7af0f6669f395..6710125384e2d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -659,8 +659,8 @@ static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn return movable_node_enabled ? movable_zone : kernel_zone; } -struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, - unsigned long nr_pages) +struct zone *zone_for_pfn_range(int online_type, int nid, + unsigned long start_pfn, unsigned long nr_pages) { if (online_type == MMOP_ONLINE_KERNEL) return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages); From 455581228c37831d273e5a42b19089b6f11eafdc Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 10 Jun 2021 16:39:45 +0200 Subject: [PATCH 0863/5344] dt-bindings: mtd: gpmc: Fix the ECC bytes vs. OOB bytes equation [ Upstream commit 778cb8e39f6ec252be50fc3850d66f3dcbd5dd5a ] "PAGESIZE / 512" is the number of ECC chunks. "ECC_BYTES" is the number of bytes needed to store a single ECC code. "2" is the space reserved by the bad block marker. "2 + (PAGESIZE / 512) * ECC_BYTES" should of course be lower or equal than the total number of OOB bytes, otherwise it won't fit. Fix the equation by substituting s/>=/<=/. Suggested-by: Ryan J. Barnett Signed-off-by: Miquel Raynal Acked-by: Rob Herring Link: https://lore.kernel.org/linux-mtd/20210610143945.3504781-1-miquel.raynal@bootlin.com Signed-off-by: Sasha Levin --- Documentation/devicetree/bindings/mtd/gpmc-nand.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/mtd/gpmc-nand.txt b/Documentation/devicetree/bindings/mtd/gpmc-nand.txt index 44919d48d2415..c459f169a9044 100644 --- a/Documentation/devicetree/bindings/mtd/gpmc-nand.txt +++ b/Documentation/devicetree/bindings/mtd/gpmc-nand.txt @@ -122,7 +122,7 @@ on various other factors also like; so the device should have enough free bytes available its OOB/Spare area to accommodate ECC for entire page. In general following expression helps in determining if given device can accommodate ECC syndrome: - "2 + (PAGESIZE / 512) * ECC_BYTES" >= OOBSIZE" + "2 + (PAGESIZE / 512) * ECC_BYTES" <= OOBSIZE" where OOBSIZE number of bytes in OOB/spare area PAGESIZE number of bytes in main-area of device page From 4f004bb7d453eb5fa8b8cdff4218aab4ed8eb574 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 2 Aug 2021 01:33:13 +0200 Subject: [PATCH 0864/5344] mfd: db8500-prcmu: Adjust map to reality [ Upstream commit ec343111c056ec3847800302f6dbc57281f833fa ] These are the actual frequencies reported by the PLL, so let's report these. The roundoffs are inappropriate, we should round to the frequency that the clock will later report. Drop some whitespace at the same time. Cc: phone-devel@vger.kernel.org Signed-off-by: Linus Walleij Signed-off-by: Lee Jones Signed-off-by: Sasha Levin --- drivers/mfd/db8500-prcmu.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/mfd/db8500-prcmu.c b/drivers/mfd/db8500-prcmu.c index dfac6afa82ca5..f1f2ad9ff0b34 100644 --- a/drivers/mfd/db8500-prcmu.c +++ b/drivers/mfd/db8500-prcmu.c @@ -1695,22 +1695,20 @@ static long round_clock_rate(u8 clock, unsigned long rate) } static const unsigned long db8500_armss_freqs[] = { - 200000000, - 400000000, - 800000000, + 199680000, + 399360000, + 798720000, 998400000 }; /* The DB8520 has slightly higher ARMSS max frequency */ static const unsigned long db8520_armss_freqs[] = { - 200000000, - 400000000, - 800000000, + 199680000, + 399360000, + 798720000, 1152000000 }; - - static long round_armss_rate(unsigned long rate) { unsigned long freq = 0; From aa8838d6ae099e4eae76c54081c0621793df4428 Mon Sep 17 00:00:00 2001 From: Wasim Khan Date: Thu, 29 Jul 2021 14:17:47 +0200 Subject: [PATCH 0865/5344] PCI: Add ACS quirks for NXP LX2xx0 and LX2xx2 platforms [ Upstream commit d08c8b855140e9f5240b3ffd1b8b9d435675e281 ] Root Ports in NXP LX2xx0 and LX2xx2, where each Root Port is a Root Complex with unique segment numbers, do provide isolation features to disable peer transactions and validate bus numbers in requests, but do not provide an actual PCIe ACS capability. Add ACS quirks for NXP LX2xx0 A/C/E/N and LX2xx2 A/C/E/N platforms. LX2xx0A : without security features + CAN-FD LX2160A (0x8d81) - 16 cores LX2120A (0x8da1) - 12 cores LX2080A (0x8d83) - 8 cores LX2xx0C : security features + CAN-FD LX2160C (0x8d80) - 16 cores LX2120C (0x8da0) - 12 cores LX2080C (0x8d82) - 8 cores LX2xx0E : security features + CAN LX2160E (0x8d90) - 16 cores LX2120E (0x8db0) - 12 cores LX2080E (0x8d92) - 8 cores LX2xx0N : without security features + CAN LX2160N (0x8d91) - 16 cores LX2120N (0x8db1) - 12 cores LX2080N (0x8d93) - 8 cores LX2xx2A : without security features + CAN-FD LX2162A (0x8d89) - 16 cores LX2122A (0x8da9) - 12 cores LX2082A (0x8d8b) - 8 cores LX2xx2C : security features + CAN-FD LX2162C (0x8d88) - 16 cores LX2122C (0x8da8) - 12 cores LX2082C (0x8d8a) - 8 cores LX2xx2E : security features + CAN LX2162E (0x8d98) - 16 cores LX2122E (0x8db8) - 12 cores LX2082E (0x8d9a) - 8 cores LX2xx2N : without security features + CAN LX2162N (0x8d99) - 16 cores LX2122N (0x8db9) - 12 cores LX2082N (0x8d9b) - 8 cores [bhelgaas: put PCI_VENDOR_ID_NXP definition next to PCI_VENDOR_ID_FREESCALE as a clue that they share the same Device ID namespace] Link: https://lore.kernel.org/r/20210729121747.1823086-1-wasim.khan@oss.nxp.com Link: https://lore.kernel.org/r/20210803180021.3252886-1-wasim.khan@oss.nxp.com Signed-off-by: Wasim Khan Signed-off-by: Bjorn Helgaas Signed-off-by: Sasha Levin --- drivers/pci/quirks.c | 45 +++++++++++++++++++++++++++++++++++++++++ include/linux/pci_ids.h | 3 ++- 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 34c68a7313db1..e230a7b5e70a3 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -4684,6 +4684,18 @@ static int pci_quirk_qcom_rp_acs(struct pci_dev *dev, u16 acs_flags) PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF); } +/* + * Each of these NXP Root Ports is in a Root Complex with a unique segment + * number and does provide isolation features to disable peer transactions + * and validate bus numbers in requests, but does not provide an ACS + * capability. + */ +static int pci_quirk_nxp_rp_acs(struct pci_dev *dev, u16 acs_flags) +{ + return pci_acs_ctrl_enabled(acs_flags, + PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF); +} + static int pci_quirk_al_acs(struct pci_dev *dev, u16 acs_flags) { if (pci_pcie_type(dev) != PCI_EXP_TYPE_ROOT_PORT) @@ -4930,6 +4942,39 @@ static const struct pci_dev_acs_enabled { { PCI_VENDOR_ID_ZHAOXIN, 0x3038, pci_quirk_mf_endpoint_acs }, { PCI_VENDOR_ID_ZHAOXIN, 0x3104, pci_quirk_mf_endpoint_acs }, { PCI_VENDOR_ID_ZHAOXIN, 0x9083, pci_quirk_mf_endpoint_acs }, + /* NXP root ports, xx=16, 12, or 08 cores */ + /* LX2xx0A : without security features + CAN-FD */ + { PCI_VENDOR_ID_NXP, 0x8d81, pci_quirk_nxp_rp_acs }, + { PCI_VENDOR_ID_NXP, 0x8da1, pci_quirk_nxp_rp_acs }, + { PCI_VENDOR_ID_NXP, 0x8d83, pci_quirk_nxp_rp_acs }, + /* LX2xx0C : security features + CAN-FD */ + { PCI_VENDOR_ID_NXP, 0x8d80, pci_quirk_nxp_rp_acs }, + { PCI_VENDOR_ID_NXP, 0x8da0, pci_quirk_nxp_rp_acs }, + { PCI_VENDOR_ID_NXP, 0x8d82, pci_quirk_nxp_rp_acs }, + /* LX2xx0E : security features + CAN */ + { PCI_VENDOR_ID_NXP, 0x8d90, pci_quirk_nxp_rp_acs }, + { PCI_VENDOR_ID_NXP, 0x8db0, pci_quirk_nxp_rp_acs }, + { PCI_VENDOR_ID_NXP, 0x8d92, pci_quirk_nxp_rp_acs }, + /* LX2xx0N : without security features + CAN */ + { PCI_VENDOR_ID_NXP, 0x8d91, pci_quirk_nxp_rp_acs }, + { PCI_VENDOR_ID_NXP, 0x8db1, pci_quirk_nxp_rp_acs }, + { PCI_VENDOR_ID_NXP, 0x8d93, pci_quirk_nxp_rp_acs }, + /* LX2xx2A : without security features + CAN-FD */ + { PCI_VENDOR_ID_NXP, 0x8d89, pci_quirk_nxp_rp_acs }, + { PCI_VENDOR_ID_NXP, 0x8da9, pci_quirk_nxp_rp_acs }, + { PCI_VENDOR_ID_NXP, 0x8d8b, pci_quirk_nxp_rp_acs }, + /* LX2xx2C : security features + CAN-FD */ + { PCI_VENDOR_ID_NXP, 0x8d88, pci_quirk_nxp_rp_acs }, + { PCI_VENDOR_ID_NXP, 0x8da8, pci_quirk_nxp_rp_acs }, + { PCI_VENDOR_ID_NXP, 0x8d8a, pci_quirk_nxp_rp_acs }, + /* LX2xx2E : security features + CAN */ + { PCI_VENDOR_ID_NXP, 0x8d98, pci_quirk_nxp_rp_acs }, + { PCI_VENDOR_ID_NXP, 0x8db8, pci_quirk_nxp_rp_acs }, + { PCI_VENDOR_ID_NXP, 0x8d9a, pci_quirk_nxp_rp_acs }, + /* LX2xx2N : without security features + CAN */ + { PCI_VENDOR_ID_NXP, 0x8d99, pci_quirk_nxp_rp_acs }, + { PCI_VENDOR_ID_NXP, 0x8db9, pci_quirk_nxp_rp_acs }, + { PCI_VENDOR_ID_NXP, 0x8d9b, pci_quirk_nxp_rp_acs }, /* Zhaoxin Root/Downstream Ports */ { PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs }, { 0 } diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 0ad57693f3926..42588645478d9 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2476,7 +2476,8 @@ #define PCI_VENDOR_ID_TDI 0x192E #define PCI_DEVICE_ID_TDI_EHCI 0x0101 -#define PCI_VENDOR_ID_FREESCALE 0x1957 +#define PCI_VENDOR_ID_FREESCALE 0x1957 /* duplicate: NXP */ +#define PCI_VENDOR_ID_NXP 0x1957 /* duplicate: FREESCALE */ #define PCI_DEVICE_ID_MPC8308 0xc006 #define PCI_DEVICE_ID_MPC8315E 0x00b4 #define PCI_DEVICE_ID_MPC8315 0x00b5 From 90c76aff91f0b2e53b7aa6b4753568e6ff822379 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 25 Jul 2021 19:07:54 +0100 Subject: [PATCH 0866/5344] mfd: Don't use irq_create_mapping() to resolve a mapping [ Upstream commit 9ff80e2de36d0554e3a6da18a171719fe8663c17 ] Although irq_create_mapping() is able to deal with duplicate mappings, it really isn't supposed to be a substitute for irq_find_mapping(), and can result in allocations that take place in atomic context if the mapping didn't exist. Fix the handful of MFD drivers that use irq_create_mapping() in interrupt context by using irq_find_mapping() instead. Cc: Linus Walleij Cc: Lee Jones Cc: Maxime Coquelin Cc: Alexandre Torgue Signed-off-by: Marc Zyngier Signed-off-by: Lee Jones Signed-off-by: Sasha Levin --- drivers/mfd/ab8500-core.c | 2 +- drivers/mfd/stmpe.c | 4 ++-- drivers/mfd/tc3589x.c | 2 +- drivers/mfd/wm8994-irq.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/mfd/ab8500-core.c b/drivers/mfd/ab8500-core.c index 3e9dc92cb467b..842de1f352dfc 100644 --- a/drivers/mfd/ab8500-core.c +++ b/drivers/mfd/ab8500-core.c @@ -493,7 +493,7 @@ static int ab8500_handle_hierarchical_line(struct ab8500 *ab8500, if (line == AB8540_INT_GPIO43F || line == AB8540_INT_GPIO44F) line += 1; - handle_nested_irq(irq_create_mapping(ab8500->domain, line)); + handle_nested_irq(irq_find_mapping(ab8500->domain, line)); } return 0; diff --git a/drivers/mfd/stmpe.c b/drivers/mfd/stmpe.c index 1aee3b3253fc9..508349399f8af 100644 --- a/drivers/mfd/stmpe.c +++ b/drivers/mfd/stmpe.c @@ -1091,7 +1091,7 @@ static irqreturn_t stmpe_irq(int irq, void *data) if (variant->id_val == STMPE801_ID || variant->id_val == STMPE1600_ID) { - int base = irq_create_mapping(stmpe->domain, 0); + int base = irq_find_mapping(stmpe->domain, 0); handle_nested_irq(base); return IRQ_HANDLED; @@ -1119,7 +1119,7 @@ static irqreturn_t stmpe_irq(int irq, void *data) while (status) { int bit = __ffs(status); int line = bank * 8 + bit; - int nestedirq = irq_create_mapping(stmpe->domain, line); + int nestedirq = irq_find_mapping(stmpe->domain, line); handle_nested_irq(nestedirq); status &= ~(1 << bit); diff --git a/drivers/mfd/tc3589x.c b/drivers/mfd/tc3589x.c index 67c9995bb1aa6..23cfbd050120d 100644 --- a/drivers/mfd/tc3589x.c +++ b/drivers/mfd/tc3589x.c @@ -187,7 +187,7 @@ static irqreturn_t tc3589x_irq(int irq, void *data) while (status) { int bit = __ffs(status); - int virq = irq_create_mapping(tc3589x->domain, bit); + int virq = irq_find_mapping(tc3589x->domain, bit); handle_nested_irq(virq); status &= ~(1 << bit); diff --git a/drivers/mfd/wm8994-irq.c b/drivers/mfd/wm8994-irq.c index 6c3a619e26286..651a028bc519a 100644 --- a/drivers/mfd/wm8994-irq.c +++ b/drivers/mfd/wm8994-irq.c @@ -154,7 +154,7 @@ static irqreturn_t wm8994_edge_irq(int irq, void *data) struct wm8994 *wm8994 = data; while (gpio_get_value_cansleep(wm8994->pdata.irq_gpio)) - handle_nested_irq(irq_create_mapping(wm8994->edge_irq, 0)); + handle_nested_irq(irq_find_mapping(wm8994->edge_irq, 0)); return IRQ_HANDLED; } From 79f3169336fa671488865340020b5421c8b560ec Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 19 Aug 2021 19:26:02 +0900 Subject: [PATCH 0867/5344] tracing/probes: Reject events which have the same name of existing one [ Upstream commit 8e242060c6a4947e8ae7d29794af6a581db08841 ] Since kprobe_events and uprobe_events only check whether the other same-type probe event has the same name or not, if the user gives the same name of the existing tracepoint event (or the other type of probe events), it silently fails to create the tracefs entry (but registered.) as below. /sys/kernel/tracing # ls events/task/task_rename enable filter format hist id trigger /sys/kernel/tracing # echo p:task/task_rename vfs_read >> kprobe_events [ 113.048508] Could not create tracefs 'task_rename' directory /sys/kernel/tracing # cat kprobe_events p:task/task_rename vfs_read To fix this issue, check whether the existing events have the same name or not in trace_probe_register_event_call(). If exists, it rejects to register the new event. Link: https://lkml.kernel.org/r/162936876189.187130.17558311387542061930.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Sasha Levin --- kernel/trace/trace_kprobe.c | 6 +++++- kernel/trace/trace_probe.c | 25 +++++++++++++++++++++++++ kernel/trace/trace_probe.h | 1 + kernel/trace/trace_uprobe.c | 6 +++++- 4 files changed, 36 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 6fef7233740c0..970f947d59415 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -646,7 +646,11 @@ static int register_trace_kprobe(struct trace_kprobe *tk) /* Register new event */ ret = register_kprobe_event(tk); if (ret) { - pr_warn("Failed to register probe event(%d)\n", ret); + if (ret == -EEXIST) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, EVENT_EXIST); + } else + pr_warn("Failed to register probe event(%d)\n", ret); goto end; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index f98d6d94cbbf7..23e85cb151346 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1029,11 +1029,36 @@ int trace_probe_init(struct trace_probe *tp, const char *event, return ret; } +static struct trace_event_call * +find_trace_event_call(const char *system, const char *event_name) +{ + struct trace_event_call *tp_event; + const char *name; + + list_for_each_entry(tp_event, &ftrace_events, list) { + if (!tp_event->class->system || + strcmp(system, tp_event->class->system)) + continue; + name = trace_event_name(tp_event); + if (!name || strcmp(event_name, name)) + continue; + return tp_event; + } + + return NULL; +} + int trace_probe_register_event_call(struct trace_probe *tp) { struct trace_event_call *call = trace_probe_event_call(tp); int ret; + lockdep_assert_held(&event_mutex); + + if (find_trace_event_call(trace_probe_group_name(tp), + trace_probe_name(tp))) + return -EEXIST; + ret = register_trace_event(&call->event); if (!ret) return -ENODEV; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index a0ff9e200ef6f..bab9e0dba9af2 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -410,6 +410,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(NO_EVENT_NAME, "Event name is not specified"), \ C(EVENT_TOO_LONG, "Event name is too long"), \ C(BAD_EVENT_NAME, "Event name must follow the same rules as C identifiers"), \ + C(EVENT_EXIST, "Given group/event name is already used by another event"), \ C(RETVAL_ON_PROBE, "$retval is not available on probe"), \ C(BAD_STACK_NUM, "Invalid stack number"), \ C(BAD_ARG_NUM, "Invalid argument number"), \ diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 5294843de6efd..b515db036becc 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -514,7 +514,11 @@ static int register_trace_uprobe(struct trace_uprobe *tu) ret = register_uprobe_event(tu); if (ret) { - pr_warn("Failed to register probe event(%d)\n", ret); + if (ret == -EEXIST) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, EVENT_EXIST); + } else + pr_warn("Failed to register probe event(%d)\n", ret); goto end; } From 83d2caf3dd4f9f191a0eb6e97fb3029654be9b5c Mon Sep 17 00:00:00 2001 From: George Cherian Date: Tue, 10 Aug 2021 17:54:25 +0530 Subject: [PATCH 0868/5344] PCI: Add ACS quirks for Cavium multi-function devices [ Upstream commit 32837d8a8f63eb95dcb9cd005524a27f06478832 ] Some Cavium endpoints are implemented as multi-function devices without ACS capability, but they actually don't support peer-to-peer transactions. Add ACS quirks to declare DMA isolation for the following devices: - BGX device found on Octeon-TX (8xxx) - CGX device found on Octeon-TX2 (9xxx) - RPM device found on Octeon-TX3 (10xxx) Link: https://lore.kernel.org/r/20210810122425.1115156-1-george.cherian@marvell.com Signed-off-by: George Cherian Signed-off-by: Bjorn Helgaas Signed-off-by: Sasha Levin --- drivers/pci/quirks.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index e230a7b5e70a3..686298c0f6cda 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -4922,6 +4922,10 @@ static const struct pci_dev_acs_enabled { { 0x10df, 0x720, pci_quirk_mf_endpoint_acs }, /* Emulex Skyhawk-R */ /* Cavium ThunderX */ { PCI_VENDOR_ID_CAVIUM, PCI_ANY_ID, pci_quirk_cavium_acs }, + /* Cavium multi-function devices */ + { PCI_VENDOR_ID_CAVIUM, 0xA026, pci_quirk_mf_endpoint_acs }, + { PCI_VENDOR_ID_CAVIUM, 0xA059, pci_quirk_mf_endpoint_acs }, + { PCI_VENDOR_ID_CAVIUM, 0xA060, pci_quirk_mf_endpoint_acs }, /* APM X-Gene */ { PCI_VENDOR_ID_AMCC, 0xE004, pci_quirk_xgene_acs }, /* Ampere Computing */ From e46016e09c143d48bcbc1abe5b6a4ff14b95afad Mon Sep 17 00:00:00 2001 From: Ryoga Saito Date: Thu, 2 Sep 2021 05:20:14 +0000 Subject: [PATCH 0869/5344] Set fc_nlinfo in nh_create_ipv4, nh_create_ipv6 [ Upstream commit 9aca491e0dccf8a9d84a5b478e5eee3c6ea7803b ] This patch fixes kernel NULL pointer dereference when creating nexthop which is bound with SRv6 decapsulation. In the creation of nexthop, __seg6_end_dt_vrf_build is called. __seg6_end_dt_vrf_build expects fc_lninfo in fib6_config is set correctly, but it isn't set in nh_create_ipv6, which causes kernel crash. Here is steps to reproduce kernel crash: 1. modprobe vrf 2. ip -6 nexthop add encap seg6local action End.DT4 vrftable 1 dev eth0 We got the following message: [ 901.370336] BUG: kernel NULL pointer dereference, address: 0000000000000ba0 [ 901.371658] #PF: supervisor read access in kernel mode [ 901.372672] #PF: error_code(0x0000) - not-present page [ 901.373672] PGD 0 P4D 0 [ 901.374248] Oops: 0000 [#1] SMP PTI [ 901.374944] CPU: 0 PID: 8593 Comm: ip Not tainted 5.14-051400-generic #202108310811-Ubuntu [ 901.376404] Hardware name: Red Hat KVM, BIOS 1.11.1-4.module_el8.2.0+320+13f867d7 04/01/2014 [ 901.377907] RIP: 0010:vrf_ifindex_lookup_by_table_id+0x19/0x90 [vrf] [ 901.379182] Code: c1 e9 72 ff ff ff e8 96 49 01 c2 66 0f 1f 44 00 00 0f 1f 44 00 00 55 48 89 e5 41 56 41 55 41 89 f5 41 54 53 8b 05 47 4c 00 00 <48> 8b 97 a0 0b 00 00 48 8b 1c c2 e8 57 27 53 c1 4c 8d a3 88 00 00 [ 901.382652] RSP: 0018:ffffbf2d02043590 EFLAGS: 00010282 [ 901.383746] RAX: 000000000000000b RBX: ffff990808255e70 RCX: ffffbf2d02043aa8 [ 901.385436] RDX: 0000000000000001 RSI: 0000000000000001 RDI: 0000000000000000 [ 901.386924] RBP: ffffbf2d020435b0 R08: 00000000000000c0 R09: ffff990808255e40 [ 901.388537] R10: ffffffff83b08c90 R11: 0000000000000009 R12: 0000000000000000 [ 901.389937] R13: 0000000000000001 R14: 0000000000000000 R15: 000000000000000b [ 901.391226] FS: 00007fe49381f740(0000) GS:ffff99087dc00000(0000) knlGS:0000000000000000 [ 901.392737] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 901.393803] CR2: 0000000000000ba0 CR3: 000000000e3e8003 CR4: 0000000000770ef0 [ 901.395122] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 901.396496] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 901.397833] PKRU: 55555554 [ 901.398578] Call Trace: [ 901.399144] l3mdev_ifindex_lookup_by_table_id+0x3b/0x70 [ 901.400179] __seg6_end_dt_vrf_build+0x34/0xd0 [ 901.401067] seg6_end_dt4_build+0x16/0x20 [ 901.401904] seg6_local_build_state+0x271/0x430 [ 901.402797] lwtunnel_build_state+0x81/0x130 [ 901.403645] fib_nh_common_init+0x82/0x100 [ 901.404465] ? sock_def_readable+0x4b/0x80 [ 901.405285] fib6_nh_init+0x115/0x7c0 [ 901.406033] nh_create_ipv6.isra.0+0xe1/0x140 [ 901.406932] rtm_new_nexthop+0x3b7/0xeb0 [ 901.407828] rtnetlink_rcv_msg+0x152/0x3a0 [ 901.408663] ? rtnl_calcit.isra.0+0x130/0x130 [ 901.409535] netlink_rcv_skb+0x55/0x100 [ 901.410319] rtnetlink_rcv+0x15/0x20 [ 901.411026] netlink_unicast+0x1a8/0x250 [ 901.411813] netlink_sendmsg+0x238/0x470 [ 901.412602] ? _copy_from_user+0x2b/0x60 [ 901.413394] sock_sendmsg+0x65/0x70 [ 901.414112] ____sys_sendmsg+0x218/0x290 [ 901.414929] ? copy_msghdr_from_user+0x5c/0x90 [ 901.415814] ___sys_sendmsg+0x81/0xc0 [ 901.416559] ? fsnotify_destroy_marks+0x27/0xf0 [ 901.417447] ? call_rcu+0xa4/0x230 [ 901.418153] ? kmem_cache_free+0x23f/0x410 [ 901.418972] ? dentry_free+0x37/0x70 [ 901.419705] ? mntput_no_expire+0x4c/0x260 [ 901.420574] __sys_sendmsg+0x62/0xb0 [ 901.421297] __x64_sys_sendmsg+0x1f/0x30 [ 901.422057] do_syscall_64+0x5c/0xc0 [ 901.422756] ? syscall_exit_to_user_mode+0x27/0x50 [ 901.423675] ? __x64_sys_close+0x12/0x40 [ 901.424462] ? do_syscall_64+0x69/0xc0 [ 901.425219] ? irqentry_exit_to_user_mode+0x9/0x20 [ 901.426149] ? irqentry_exit+0x19/0x30 [ 901.426901] ? exc_page_fault+0x89/0x160 [ 901.427709] ? asm_exc_page_fault+0x8/0x30 [ 901.428536] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 901.429514] RIP: 0033:0x7fe493945747 [ 901.430248] Code: 64 89 02 48 c7 c0 ff ff ff ff eb bb 0f 1f 80 00 00 00 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 89 54 24 1c 48 89 74 24 10 [ 901.433549] RSP: 002b:00007ffe9932cf68 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 901.434981] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fe493945747 [ 901.436303] RDX: 0000000000000000 RSI: 00007ffe9932cfe0 RDI: 0000000000000003 [ 901.437607] RBP: 00000000613053f7 R08: 0000000000000001 R09: 00007ffe9932d07c [ 901.438990] R10: 000055f4a903a010 R11: 0000000000000246 R12: 0000000000000001 [ 901.440340] R13: 0000000000000001 R14: 000055f4a802b163 R15: 000055f4a8042020 [ 901.441630] Modules linked in: vrf nls_utf8 isofs nls_iso8859_1 dm_multipath scsi_dh_rdac scsi_dh_emc scsi_dh_alua intel_rapl_msr intel_rapl_common isst_if_mbox_msr isst_if_common nfit rapl input_leds joydev serio_raw qemu_fw_cfg mac_hid sch_fq_codel drm virtio_rng ip_tables x_tables autofs4 btrfs blake2b_generic zstd_compress raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c raid1 raid0 multipath linear crct10dif_pclmul crc32_pclmul ghash_clmulni_intel aesni_intel crypto_simd virtio_net net_failover cryptd psmouse virtio_blk failover i2c_piix4 pata_acpi floppy [ 901.450808] CR2: 0000000000000ba0 [ 901.451514] ---[ end trace c27b934b99ade304 ]--- [ 901.452403] RIP: 0010:vrf_ifindex_lookup_by_table_id+0x19/0x90 [vrf] [ 901.453626] Code: c1 e9 72 ff ff ff e8 96 49 01 c2 66 0f 1f 44 00 00 0f 1f 44 00 00 55 48 89 e5 41 56 41 55 41 89 f5 41 54 53 8b 05 47 4c 00 00 <48> 8b 97 a0 0b 00 00 48 8b 1c c2 e8 57 27 53 c1 4c 8d a3 88 00 00 [ 901.456910] RSP: 0018:ffffbf2d02043590 EFLAGS: 00010282 [ 901.457912] RAX: 000000000000000b RBX: ffff990808255e70 RCX: ffffbf2d02043aa8 [ 901.459238] RDX: 0000000000000001 RSI: 0000000000000001 RDI: 0000000000000000 [ 901.460552] RBP: ffffbf2d020435b0 R08: 00000000000000c0 R09: ffff990808255e40 [ 901.461882] R10: ffffffff83b08c90 R11: 0000000000000009 R12: 0000000000000000 [ 901.463208] R13: 0000000000000001 R14: 0000000000000000 R15: 000000000000000b [ 901.464529] FS: 00007fe49381f740(0000) GS:ffff99087dc00000(0000) knlGS:0000000000000000 [ 901.466058] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 901.467189] CR2: 0000000000000ba0 CR3: 000000000e3e8003 CR4: 0000000000770ef0 [ 901.468515] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 901.469858] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 901.471139] PKRU: 55555554 Signed-off-by: Ryoga Saito Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/ipv4/nexthop.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index f5f4369c131c9..858bb10d8341e 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -1183,6 +1183,7 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh, .fc_gw4 = cfg->gw.ipv4, .fc_gw_family = cfg->gw.ipv4 ? AF_INET : 0, .fc_flags = cfg->nh_flags, + .fc_nlinfo = cfg->nlinfo, .fc_encap = cfg->nh_encap, .fc_encap_type = cfg->nh_encap_type, }; @@ -1218,6 +1219,7 @@ static int nh_create_ipv6(struct net *net, struct nexthop *nh, .fc_ifindex = cfg->nh_ifindex, .fc_gateway = cfg->gw.ipv6, .fc_flags = cfg->nh_flags, + .fc_nlinfo = cfg->nlinfo, .fc_encap = cfg->nh_encap, .fc_encap_type = cfg->nh_encap_type, }; From ffcbc4f826090e910591bc97b6c640a135b3b0c1 Mon Sep 17 00:00:00 2001 From: Daniele Palmas Date: Thu, 2 Sep 2021 12:51:22 +0200 Subject: [PATCH 0870/5344] net: usb: cdc_mbim: avoid altsetting toggling for Telit LN920 [ Upstream commit aabbdc67f3485b5db27ab4eba01e5fbf1ffea62c ] Add quirk CDC_MBIM_FLAG_AVOID_ALTSETTING_TOGGLE for Telit LN920 0x1061 composition in order to avoid bind error. Signed-off-by: Daniele Palmas Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/usb/cdc_mbim.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/usb/cdc_mbim.c b/drivers/net/usb/cdc_mbim.c index eb100eb33de3d..77ac5a721e7b6 100644 --- a/drivers/net/usb/cdc_mbim.c +++ b/drivers/net/usb/cdc_mbim.c @@ -653,6 +653,11 @@ static const struct usb_device_id mbim_devs[] = { .driver_info = (unsigned long)&cdc_mbim_info_avoid_altsetting_toggle, }, + /* Telit LN920 */ + { USB_DEVICE_AND_INTERFACE_INFO(0x1bc7, 0x1061, USB_CLASS_COMM, USB_CDC_SUBCLASS_MBIM, USB_CDC_PROTO_NONE), + .driver_info = (unsigned long)&cdc_mbim_info_avoid_altsetting_toggle, + }, + /* default entry */ { USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_MBIM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&cdc_mbim_info_zlp, From c01659a2fdbfe0327f07285eb9ddc20e51dfe134 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Mon, 2 Aug 2021 16:13:52 +0200 Subject: [PATCH 0871/5344] block, bfq: honor already-setup queue merges [ Upstream commit 2d52c58b9c9bdae0ca3df6a1eab5745ab3f7d80b ] The function bfq_setup_merge prepares the merging between two bfq_queues, say bfqq and new_bfqq. To this goal, it assigns bfqq->new_bfqq = new_bfqq. Then, each time some I/O for bfqq arrives, the process that generated that I/O is disassociated from bfqq and associated with new_bfqq (merging is actually a redirection). In this respect, bfq_setup_merge increases new_bfqq->ref in advance, adding the number of processes that are expected to be associated with new_bfqq. Unfortunately, the stable-merging mechanism interferes with this setup. After bfqq->new_bfqq has been set by bfq_setup_merge, and before all the expected processes have been associated with bfqq->new_bfqq, bfqq may happen to be stably merged with a different queue than the current bfqq->new_bfqq. In this case, bfqq->new_bfqq gets changed. So, some of the processes that have been already accounted for in the ref counter of the previous new_bfqq will not be associated with that queue. This creates an unbalance, because those references will never be decremented. This commit fixes this issue by reestablishing the previous, natural behaviour: once bfqq->new_bfqq has been set, it will not be changed until all expected redirections have occurred. Signed-off-by: Davide Zini Signed-off-by: Paolo Valente Link: https://lore.kernel.org/r/20210802141352.74353-2-paolo.valente@linaro.org Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- block/bfq-iosched.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index c5da1955b1c94..2acc814f9ad21 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2526,6 +2526,15 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) * are likely to increase the throughput. */ bfqq->new_bfqq = new_bfqq; + /* + * The above assignment schedules the following redirections: + * each time some I/O for bfqq arrives, the process that + * generated that I/O is disassociated from bfqq and + * associated with new_bfqq. Here we increases new_bfqq->ref + * in advance, adding the number of processes that are + * expected to be associated with new_bfqq as they happen to + * issue I/O. + */ new_bfqq->ref += process_refs; return new_bfqq; } @@ -2585,6 +2594,10 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, { struct bfq_queue *in_service_bfqq, *new_bfqq; + /* if a merge has already been setup, then proceed with that first */ + if (bfqq->new_bfqq) + return bfqq->new_bfqq; + /* * Do not perform queue merging if the device is non * rotational and performs internal queueing. In fact, such a @@ -2639,9 +2652,6 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (bfq_too_late_for_merging(bfqq)) return NULL; - if (bfqq->new_bfqq) - return bfqq->new_bfqq; - if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) return NULL; From bd82540cbb8dea245bcee8798984a4e438c304d8 Mon Sep 17 00:00:00 2001 From: Vishal Aslot Date: Wed, 18 Aug 2021 11:57:51 -0500 Subject: [PATCH 0872/5344] PCI: ibmphp: Fix double unmap of io_mem [ Upstream commit faa2e05ad0dccf37f995bcfbb8d1980d66c02c11 ] ebda_rsrc_controller() calls iounmap(io_mem) on the error path. Its caller, ibmphp_access_ebda(), also calls iounmap(io_mem) on good and error paths. Remove the iounmap(io_mem) invocation from ebda_rsrc_controller(). [bhelgaas: remove item from TODO] Link: https://lore.kernel.org/r/20210818165751.591185-1-os.vaslot@gmail.com Signed-off-by: Vishal Aslot Signed-off-by: Bjorn Helgaas Signed-off-by: Sasha Levin --- drivers/pci/hotplug/TODO | 3 --- drivers/pci/hotplug/ibmphp_ebda.c | 5 +---- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/pci/hotplug/TODO b/drivers/pci/hotplug/TODO index a32070be5adf9..cc6194aa24c15 100644 --- a/drivers/pci/hotplug/TODO +++ b/drivers/pci/hotplug/TODO @@ -40,9 +40,6 @@ ibmphp: * The return value of pci_hp_register() is not checked. -* iounmap(io_mem) is called in the error path of ebda_rsrc_controller() - and once more in the error path of its caller ibmphp_access_ebda(). - * The various slot data structures are difficult to follow and need to be simplified. A lot of functions are too large and too complex, they need to be broken up into smaller, manageable pieces. Negative examples are diff --git a/drivers/pci/hotplug/ibmphp_ebda.c b/drivers/pci/hotplug/ibmphp_ebda.c index 11a2661dc0627..7fb75401ad8a7 100644 --- a/drivers/pci/hotplug/ibmphp_ebda.c +++ b/drivers/pci/hotplug/ibmphp_ebda.c @@ -714,8 +714,7 @@ static int __init ebda_rsrc_controller(void) /* init hpc structure */ hpc_ptr = alloc_ebda_hpc(slot_num, bus_num); if (!hpc_ptr) { - rc = -ENOMEM; - goto error_no_hpc; + return -ENOMEM; } hpc_ptr->ctlr_id = ctlr_id; hpc_ptr->ctlr_relative_id = ctlr; @@ -910,8 +909,6 @@ static int __init ebda_rsrc_controller(void) kfree(tmp_slot); error_no_slot: free_ebda_hpc(hpc_ptr); -error_no_hpc: - iounmap(io_mem); return rc; } From 91e5ee0be354192eb627090572a8d9e555455a75 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 3 Sep 2021 14:42:33 +0800 Subject: [PATCH 0873/5344] ethtool: Fix an error code in cxgb2.c [ Upstream commit 7db8263a12155c7ae4ad97e850f1e499c73765fc ] When adapter->registered_device_map is NULL, the value of err is uncertain, we set err to -EINVAL to avoid ambiguity. Clean up smatch warning: drivers/net/ethernet/chelsio/cxgb/cxgb2.c:1114 init_one() warn: missing error code 'err' Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/chelsio/cxgb/cxgb2.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c index 0ccdde366ae17..540d99f59226e 100644 --- a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c +++ b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c @@ -1153,6 +1153,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) if (!adapter->registered_device_map) { pr_err("%s: could not register any net devices\n", pci_name(pdev)); + err = -EINVAL; goto out_release_adapter_res; } From c6e6f1a4d315c8f22c6305bd404ef767df06bc91 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Mon, 7 Jun 2021 13:56:20 +0800 Subject: [PATCH 0874/5344] NTB: Fix an error code in ntb_msit_probe() [ Upstream commit 319f83ac98d7afaabab84ce5281a819a358b9895 ] When the value of nm->isr_ctx is false, the value of ret is 0. So, we set ret to -ENOMEM to indicate this error. Clean up smatch warning: drivers/ntb/test/ntb_msi_test.c:373 ntb_msit_probe() warn: missing error code 'ret'. Reported-by: Abaci Robot Signed-off-by: Yang Li Reviewed-by: Logan Gunthorpe Signed-off-by: Jon Mason Signed-off-by: Sasha Levin --- drivers/ntb/test/ntb_msi_test.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/ntb/test/ntb_msi_test.c b/drivers/ntb/test/ntb_msi_test.c index 99d826ed9c341..662067dc9ce2c 100644 --- a/drivers/ntb/test/ntb_msi_test.c +++ b/drivers/ntb/test/ntb_msi_test.c @@ -372,8 +372,10 @@ static int ntb_msit_probe(struct ntb_client *client, struct ntb_dev *ntb) if (ret) goto remove_dbgfs; - if (!nm->isr_ctx) + if (!nm->isr_ctx) { + ret = -ENOMEM; goto remove_dbgfs; + } ntb_link_enable(ntb, NTB_SPEED_AUTO, NTB_WIDTH_AUTO); From b797ca908ccac66b3839eadd0ddbfc1656af1498 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Mon, 7 Jun 2021 16:40:36 +0800 Subject: [PATCH 0875/5344] NTB: perf: Fix an error code in perf_setup_inbuf() [ Upstream commit 0097ae5f7af5684f961a5f803ff7ad3e6f933668 ] When the function IS_ALIGNED() returns false, the value of ret is 0. So, we set ret to -EINVAL to indicate this error. Clean up smatch warning: drivers/ntb/test/ntb_perf.c:602 perf_setup_inbuf() warn: missing error code 'ret'. Reported-by: Abaci Robot Signed-off-by: Yang Li Reviewed-by: Serge Semin Signed-off-by: Jon Mason Signed-off-by: Sasha Levin --- drivers/ntb/test/ntb_perf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c index 5ce4766a6c9eb..251fe75798c13 100644 --- a/drivers/ntb/test/ntb_perf.c +++ b/drivers/ntb/test/ntb_perf.c @@ -597,6 +597,7 @@ static int perf_setup_inbuf(struct perf_peer *peer) return -ENOMEM; } if (!IS_ALIGNED(peer->inbuf_xlat, xlat_align)) { + ret = -EINVAL; dev_err(&perf->ntb->dev, "Unaligned inbuf allocated\n"); goto err_free_inbuf; } From 19255d758af2c95b989086a9697ee716b09ffe54 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 29 Jun 2021 19:12:39 +0200 Subject: [PATCH 0876/5344] mfd: axp20x: Update AXP288 volatile ranges [ Upstream commit f949a9ebce7a18005266b859a17f10c891bb13d7 ] On Cherry Trail devices with an AXP288 PMIC the external SD-card slot used the AXP's DLDO2 as card-voltage and either DLDO3 or GPIO1LDO (GPIO1 pin in low noise LDO mode) as signal-voltage. These regulators are turned on/off and in case of the signal-voltage also have their output-voltage changed by the _PS0 and _PS3 power- management ACPI methods on the MMC-controllers ACPI fwnode as well as by the _DSM ACPI method for changing the signal voltage. The AML code implementing these methods is directly accessing the PMIC through ACPI I2C OpRegion accesses, instead of using the special PMIC OpRegion handled by drivers/acpi/pmic/intel_pmic_xpower.c . This means that the contents of the involved PMIC registers can change without the change being made through the regmap interface, so regmap should not cache the contents of these registers. Mark the regulator power on/off, the regulator voltage control and the GPIO1 control registers as volatile, to avoid regmap caching them. Specifically this fixes an issue on some models where the i915 driver toggles another LDO using the same on/off register on/off through MIPI sequences (through intel_soc_pmic_exec_mipi_pmic_seq_element()) which then writes back a cached on/off register-value where the card-voltage is off causing the external sdcard slot to stop working when the screen goes blank, or comes back on again. The regulator register-range now marked volatile also includes the buck regulator control registers. This is done on purpose these are normally not touched by the AML code, but they are updated directly by the SoC's PUNIT which means that they may also change without going through regmap. Note the AXP288 PMIC is only used on Bay- and Cherry-Trail platforms, so even though this is an ACPI specific problem there is no need to make the new volatile ranges conditional since these platforms always use ACPI. Fixes: dc91c3b6fe66 ("mfd: axp20x: Mark AXP20X_VBUS_IPSOUT_MGMT as volatile") Fixes: cd53216625a0 ("mfd: axp20x: Fix axp288 volatile ranges") Reported-and-tested-by: Clamshell Signed-off-by: Hans de Goede Reviewed-by: Chen-Yu Tsai Signed-off-by: Lee Jones Signed-off-by: Sasha Levin --- drivers/mfd/axp20x.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/mfd/axp20x.c b/drivers/mfd/axp20x.c index aa59496e43768..9db1000944c34 100644 --- a/drivers/mfd/axp20x.c +++ b/drivers/mfd/axp20x.c @@ -125,12 +125,13 @@ static const struct regmap_range axp288_writeable_ranges[] = { static const struct regmap_range axp288_volatile_ranges[] = { regmap_reg_range(AXP20X_PWR_INPUT_STATUS, AXP288_POWER_REASON), + regmap_reg_range(AXP22X_PWR_OUT_CTRL1, AXP22X_ALDO3_V_OUT), regmap_reg_range(AXP288_BC_GLOBAL, AXP288_BC_GLOBAL), regmap_reg_range(AXP288_BC_DET_STAT, AXP20X_VBUS_IPSOUT_MGMT), regmap_reg_range(AXP20X_CHRG_BAK_CTRL, AXP20X_CHRG_BAK_CTRL), regmap_reg_range(AXP20X_IRQ1_EN, AXP20X_IPSOUT_V_HIGH_L), regmap_reg_range(AXP20X_TIMER_CTRL, AXP20X_TIMER_CTRL), - regmap_reg_range(AXP22X_GPIO_STATE, AXP22X_GPIO_STATE), + regmap_reg_range(AXP20X_GPIO1_CTRL, AXP22X_GPIO_STATE), regmap_reg_range(AXP288_RT_BATT_V_H, AXP288_RT_BATT_V_L), regmap_reg_range(AXP20X_FG_RES, AXP288_FG_CC_CAP_REG), }; From 043d18ccdb297f0b76d0bc8820427009566c79b8 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 12 Aug 2021 10:00:04 +0300 Subject: [PATCH 0877/5344] PCI: Fix pci_dev_str_match_path() alloc while atomic bug [ Upstream commit 7eb6ea4148579b85540a41d57bcec315b8af8ff8 ] pci_dev_str_match_path() is often called with a spinlock held so the allocation has to be atomic. The call tree is: pci_specified_resource_alignment() <-- takes spin_lock(); pci_dev_str_match() pci_dev_str_match_path() Fixes: 45db33709ccc ("PCI: Allow specifying devices using a base bus and path of devfns") Link: https://lore.kernel.org/r/20210812070004.GC31863@kili Signed-off-by: Dan Carpenter Signed-off-by: Bjorn Helgaas Reviewed-by: Logan Gunthorpe Signed-off-by: Sasha Levin --- drivers/pci/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 84f3e0e5dfe11..4636e5fc1ad21 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -224,7 +224,7 @@ static int pci_dev_str_match_path(struct pci_dev *dev, const char *path, *endptr = strchrnul(path, ';'); - wpath = kmemdup_nul(path, *endptr - path, GFP_KERNEL); + wpath = kmemdup_nul(path, *endptr - path, GFP_ATOMIC); if (!wpath) return -ENOMEM; From faa29e8c03c942cc6a8ecf04fa734aba98cddceb Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Fri, 16 Jul 2021 12:00:48 +0200 Subject: [PATCH 0878/5344] mfd: tqmx86: Clear GPIO IRQ resource when no IRQ is set [ Upstream commit a946506c48f3bd09363c9d2b0a178e55733bcbb6 ] The driver was registering IRQ 0 when no IRQ was set. This leads to warnings with newer kernels. Clear the resource flags, so no resource is registered at all in this case. Fixes: 2f17dd34ffed ("mfd: tqmx86: IO controller with I2C, Wachdog and GPIO") Signed-off-by: Matthias Schiffer Reviewed-by: Andrew Lunn Signed-off-by: Lee Jones Signed-off-by: Sasha Levin --- drivers/mfd/tqmx86.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/mfd/tqmx86.c b/drivers/mfd/tqmx86.c index 22d2f02d855c2..ccc5a9ac788c1 100644 --- a/drivers/mfd/tqmx86.c +++ b/drivers/mfd/tqmx86.c @@ -210,6 +210,8 @@ static int tqmx86_probe(struct platform_device *pdev) /* Assumes the IRQ resource is first. */ tqmx_gpio_resources[0].start = gpio_irq; + } else { + tqmx_gpio_resources[0].flags = 0; } ocores_platfom_data.clock_khz = tqmx86_board_id_to_clk_rate(board_id); From 90f4bd05dd98e509c0560eae7d8917d0ca03d73b Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 18 Aug 2021 20:21:31 +0000 Subject: [PATCH 0879/5344] KVM: arm64: Handle PSCI resets before userspace touches vCPU state [ Upstream commit 6826c6849b46aaa91300201213701eb861af4ba0 ] The CPU_ON PSCI call takes a payload that KVM uses to configure a destination vCPU to run. This payload is non-architectural state and not exposed through any existing UAPI. Effectively, we have a race between CPU_ON and userspace saving/restoring a guest: if the target vCPU isn't ran again before the VMM saves its state, the requested PC and context ID are lost. When restored, the target vCPU will be runnable and start executing at its old PC. We can avoid this race by making sure the reset payload is serviced before userspace can access a vCPU's state. Fixes: 358b28f09f0a ("arm/arm64: KVM: Allow a VCPU to fully reset itself") Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210818202133.1106786-3-oupton@google.com Signed-off-by: Sasha Levin --- virt/kvm/arm/arm.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 4af85605730e4..f7150fbeeb55e 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -1141,6 +1141,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (copy_from_user(®, argp, sizeof(reg))) break; + /* + * We could owe a reset due to PSCI. Handle the pending reset + * here to ensure userspace register accesses are ordered after + * the reset. + */ + if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu)) + kvm_reset_vcpu(vcpu); + if (ioctl == KVM_SET_ONE_REG) r = kvm_arm_set_reg(vcpu, ®); else From 2077e1073a8f8dcd236d2417ab92eec15e281afe Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 13 Aug 2021 18:36:19 +0300 Subject: [PATCH 0880/5344] PCI: Sync __pci_register_driver() stub for CONFIG_PCI=n [ Upstream commit 817f9916a6e96ae43acdd4e75459ef4f92d96eb1 ] The CONFIG_PCI=y case got a new parameter long time ago. Sync the stub as well. [bhelgaas: add parameter names] Fixes: 725522b5453d ("PCI: add the sysfs driver name to all modules") Link: https://lore.kernel.org/r/20210813153619.89574-1-andriy.shevchenko@linux.intel.com Reported-by: kernel test robot Signed-off-by: Andy Shevchenko Signed-off-by: Bjorn Helgaas Signed-off-by: Sasha Levin --- include/linux/pci.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/pci.h b/include/linux/pci.h index 96be27b359548..65399271f9202 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1702,8 +1702,9 @@ static inline int pci_enable_device(struct pci_dev *dev) { return -EIO; } static inline void pci_disable_device(struct pci_dev *dev) { } static inline int pci_assign_resource(struct pci_dev *dev, int i) { return -EBUSY; } -static inline int __pci_register_driver(struct pci_driver *drv, - struct module *owner) +static inline int __must_check __pci_register_driver(struct pci_driver *drv, + struct module *owner, + const char *mod_name) { return 0; } static inline int pci_register_driver(struct pci_driver *drv) { return 0; } From d1ecc82852d0582c3ef02562bc1cffd8c8238556 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 21 Aug 2021 09:58:45 +0200 Subject: [PATCH 0881/5344] mtd: rawnand: cafe: Fix a resource leak in the error handling path of 'cafe_nand_probe()' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 6b430c7595e4eb95fae8fb54adc3c3ce002e75ae ] A successful 'init_rs_non_canonical()' call should be balanced by a corresponding 'free_rs()' call in the error handling path of the probe, as already done in the remove function. Update the error handling path accordingly. Fixes: 8c61b7a7f4d4 ("[MTD] [NAND] Use rslib for CAFÉ ECC") Signed-off-by: Christophe JAILLET Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/fd313d3fb787458bcc73189e349f481133a2cdc9.1629532640.git.christophe.jaillet@wanadoo.fr Signed-off-by: Sasha Levin --- drivers/mtd/nand/raw/cafe_nand.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/mtd/nand/raw/cafe_nand.c b/drivers/mtd/nand/raw/cafe_nand.c index 2d1c22dc88c15..cc5009200cc23 100644 --- a/drivers/mtd/nand/raw/cafe_nand.c +++ b/drivers/mtd/nand/raw/cafe_nand.c @@ -757,7 +757,7 @@ static int cafe_nand_probe(struct pci_dev *pdev, "CAFE NAND", mtd); if (err) { dev_warn(&pdev->dev, "Could not register IRQ %d\n", pdev->irq); - goto out_ior; + goto out_free_rs; } /* Disable master reset, enable NAND clock */ @@ -801,6 +801,8 @@ static int cafe_nand_probe(struct pci_dev *pdev, /* Disable NAND IRQ in global IRQ mask register */ cafe_writel(cafe, ~1 & cafe_readl(cafe, GLOBAL_IRQ_MASK), GLOBAL_IRQ_MASK); free_irq(pdev->irq, mtd); + out_free_rs: + free_rs(cafe->rs); out_ior: pci_iounmap(pdev, cafe->mmio); out_free_mtd: From 350c14e5774a8fc104951a57e65f72f82bba8a87 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 16 Aug 2021 14:05:33 -0700 Subject: [PATCH 0882/5344] ARC: export clear_user_page() for modules [ Upstream commit 6b5ff0405e4190f23780362ea324b250bc495683 ] 0day bot reports a build error: ERROR: modpost: "clear_user_page" [drivers/media/v4l2-core/videobuf-dma-sg.ko] undefined! so export it in arch/arc/ to fix the build error. In most ARCHes, clear_user_page() is a macro. OTOH, in a few ARCHes it is a function and needs to be exported. PowerPC exported it in 2004. It looks like nds32 and nios2 still need to have it exported. Fixes: 4102b53392d63 ("ARC: [mm] Aliasing VIPT dcache support 2/4") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Guenter Roeck Cc: linux-snps-arc@lists.infradead.org Signed-off-by: Vineet Gupta Signed-off-by: Sasha Levin --- arch/arc/mm/cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c index a2fbea3ee07c7..102418ac5ff4a 100644 --- a/arch/arc/mm/cache.c +++ b/arch/arc/mm/cache.c @@ -1123,7 +1123,7 @@ void clear_user_page(void *to, unsigned long u_vaddr, struct page *page) clear_page(to); clear_bit(PG_dc_clean, &page->flags); } - +EXPORT_SYMBOL(clear_user_page); /********************************************************************** * Explicit Cache flush request from user space via syscall From a0e8627314c8397e95cfea2b8bb4599d6d5b4f5e Mon Sep 17 00:00:00 2001 From: Li Huafei Date: Mon, 23 Aug 2021 21:43:40 +0800 Subject: [PATCH 0883/5344] perf unwind: Do not overwrite FEATURE_CHECK_LDFLAGS-libunwind-{x86,aarch64} [ Upstream commit cdf32b44678c382a31dc183d9a767306915cda7b ] When setting LIBUNWIND_DIR, we first set FEATURE_CHECK_LDFLAGS-libunwind-{aarch64,x86} = -L$(LIBUNWIND_DIR)/lib. This happens a bit before, the overwritting, in: libunwind_arch_set_flags = $(eval $(libunwind_arch_set_flags_code)) define libunwind_arch_set_flags_code FEATURE_CHECK_CFLAGS-libunwind-$(1) = -I$(LIBUNWIND_DIR)/include FEATURE_CHECK_LDFLAGS-libunwind-$(1) = -L$(LIBUNWIND_DIR)/lib endef ifdef LIBUNWIND_DIR LIBUNWIND_CFLAGS = -I$(LIBUNWIND_DIR)/include LIBUNWIND_LDFLAGS = -L$(LIBUNWIND_DIR)/lib LIBUNWIND_ARCHS = x86 x86_64 arm aarch64 debug-frame-arm debug-frame-aarch64 $(foreach libunwind_arch,$(LIBUNWIND_ARCHS),$(call libunwind_arch_set_flags,$(libunwind_arch))) endif Look at that 'foreach' on all the LIBUNWIND_ARCHS. After commit 5c4d7c82c0dc ("perf unwind: Do not put libunwind-{x86,aarch64} in FEATURE_TESTS_BASIC"), FEATURE_CHECK_LDFLAGS-libunwind-{x86,aarch64} is overwritten. As a result, the remote libunwind libraries cannot be searched from $(LIBUNWIND_DIR)/lib directory during feature check tests. Fix it with variable appending. Before this patch: perf$ make VF=1 LIBUNWIND_DIR=/opt/libunwind_aarch64 BUILD: Doing 'make -j16' parallel build ... ... libopencsd: [ OFF ] ... libunwind-x86: [ OFF ] ... libunwind-x86_64: [ OFF ] ... libunwind-arm: [ OFF ] ... libunwind-aarch64: [ OFF ] ... libunwind-debug-frame: [ OFF ] ... libunwind-debug-frame-arm: [ OFF ] ... libunwind-debug-frame-aarch64: [ OFF ] ... cxx: [ OFF ] perf$ cat ../build/feature/test-libunwind-aarch64.make.output /usr/bin/ld: cannot find -lunwind-aarch64 /usr/bin/ld: cannot find -lunwind-aarch64 collect2: error: ld returned 1 exit status After this patch: perf$ make VF=1 LIBUNWIND_DIR=/opt/libunwind_aarch64 BUILD: Doing 'make -j16' parallel build ... libopencsd: [ OFF ] ... libunwind-x86: [ OFF ] ... libunwind-x86_64: [ OFF ] ... libunwind-arm: [ OFF ] ... libunwind-aarch64: [ on ] ... libunwind-debug-frame: [ OFF ] ... libunwind-debug-frame-arm: [ OFF ] ... libunwind-debug-frame-aarch64: [ OFF ] ... cxx: [ OFF ] perf$ cat ../build/feature/test-libunwind-aarch64.make.output perf$ ldd ./perf linux-vdso.so.1 (0x00007ffdf07da000) libpthread.so.0 => /lib/x86_64-linux-gnu/libpthread.so.0 (0x00007f30953dc000) librt.so.1 => /lib/x86_64-linux-gnu/librt.so.1 (0x00007f30951d4000) libm.so.6 => /lib/x86_64-linux-gnu/libm.so.6 (0x00007f3094e36000) libdl.so.2 => /lib/x86_64-linux-gnu/libdl.so.2 (0x00007f3094c32000) libelf.so.1 => /usr/lib/x86_64-linux-gnu/libelf.so.1 (0x00007f3094a18000) libdw.so.1 => /usr/lib/x86_64-linux-gnu/libdw.so.1 (0x00007f30947cc000) libunwind-x86_64.so.8 => /usr/lib/x86_64-linux-gnu/libunwind-x86_64.so.8 (0x00007f30945ad000) libunwind.so.8 => /usr/lib/x86_64-linux-gnu/libunwind.so.8 (0x00007f3094392000) liblzma.so.5 => /lib/x86_64-linux-gnu/liblzma.so.5 (0x00007f309416c000) libunwind-aarch64.so.8 => not found libslang.so.2 => /lib/x86_64-linux-gnu/libslang.so.2 (0x00007f3093c8a000) libpython2.7.so.1.0 => /usr/local/lib/libpython2.7.so.1.0 (0x00007f309386b000) libz.so.1 => /lib/x86_64-linux-gnu/libz.so.1 (0x00007f309364e000) libnuma.so.1 => /usr/lib/x86_64-linux-gnu/libnuma.so.1 (0x00007f3093443000) libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f3093052000) /lib64/ld-linux-x86-64.so.2 (0x00007f3096097000) libbz2.so.1.0 => /lib/x86_64-linux-gnu/libbz2.so.1.0 (0x00007f3092e42000) libutil.so.1 => /lib/x86_64-linux-gnu/libutil.so.1 (0x00007f3092c3f000) Fixes: 5c4d7c82c0dceccf ("perf unwind: Do not put libunwind-{x86,aarch64} in FEATURE_TESTS_BASIC") Signed-off-by: Li Huafei Cc: Alexander Shishkin Cc: He Kuang Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zhang Jinhao Link: http://lore.kernel.org/lkml/20210823134340.60955-1-lihuafei1@huawei.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Sasha Levin --- tools/perf/Makefile.config | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 9832affd5d54b..c75c9b03d6e77 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -118,10 +118,10 @@ FEATURE_CHECK_LDFLAGS-libunwind = $(LIBUNWIND_LDFLAGS) $(LIBUNWIND_LIBS) FEATURE_CHECK_CFLAGS-libunwind-debug-frame = $(LIBUNWIND_CFLAGS) FEATURE_CHECK_LDFLAGS-libunwind-debug-frame = $(LIBUNWIND_LDFLAGS) $(LIBUNWIND_LIBS) -FEATURE_CHECK_LDFLAGS-libunwind-arm = -lunwind -lunwind-arm -FEATURE_CHECK_LDFLAGS-libunwind-aarch64 = -lunwind -lunwind-aarch64 -FEATURE_CHECK_LDFLAGS-libunwind-x86 = -lunwind -llzma -lunwind-x86 -FEATURE_CHECK_LDFLAGS-libunwind-x86_64 = -lunwind -llzma -lunwind-x86_64 +FEATURE_CHECK_LDFLAGS-libunwind-arm += -lunwind -lunwind-arm +FEATURE_CHECK_LDFLAGS-libunwind-aarch64 += -lunwind -lunwind-aarch64 +FEATURE_CHECK_LDFLAGS-libunwind-x86 += -lunwind -llzma -lunwind-x86 +FEATURE_CHECK_LDFLAGS-libunwind-x86_64 += -lunwind -llzma -lunwind-x86_64 FEATURE_CHECK_LDFLAGS-libcrypto = -lcrypto From b4006b0a5583ec2ec97ca96d3b4be7b160629b32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Thu, 2 Sep 2021 10:30:50 +0200 Subject: [PATCH 0884/5344] net: dsa: b53: Fix calculating number of switch ports MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit cdb067d31c0fe4cce98b9d15f1f2ef525acaa094 ] It isn't true that CPU port is always the last one. Switches BCM5301x have 9 ports (port 6 being inactive) and they use port 5 as CPU by default (depending on design some other may be CPU ports too). A more reliable way of determining number of ports is to check for the last set bit in the "enabled_ports" bitfield. This fixes b53 internal state, it will allow providing accurate info to the DSA and is required to fix BCM5301x support. Fixes: 967dd82ffc52 ("net: dsa: b53: Add support for Broadcom RoboSwitch") Signed-off-by: Rafał Miłecki Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/dsa/b53/b53_common.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index e78b683f73052..825d840cdb8c3 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -2353,9 +2353,8 @@ static int b53_switch_init(struct b53_device *dev) dev->cpu_port = 5; } - /* cpu port is always last */ - dev->num_ports = dev->cpu_port + 1; dev->enabled_ports |= BIT(dev->cpu_port); + dev->num_ports = fls(dev->enabled_ports); /* Include non standard CPU port built-in PHYs to be probed */ if (is539x(dev) || is531x5(dev)) { From 1a3a2c02a75362a8cc262aa0041f262586ec0d4b Mon Sep 17 00:00:00 2001 From: Benjamin Hesmans Date: Fri, 3 Sep 2021 15:23:35 +0200 Subject: [PATCH 0885/5344] netfilter: socket: icmp6: fix use-after-scope [ Upstream commit 730affed24bffcd1eebd5903171960f5ff9f1f22 ] Bug reported by KASAN: BUG: KASAN: use-after-scope in inet6_ehashfn (net/ipv6/inet6_hashtables.c:40) Call Trace: (...) inet6_ehashfn (net/ipv6/inet6_hashtables.c:40) (...) nf_sk_lookup_slow_v6 (net/ipv6/netfilter/nf_socket_ipv6.c:91 net/ipv6/netfilter/nf_socket_ipv6.c:146) It seems that this bug has already been fixed by Eric Dumazet in the past in: commit 78296c97ca1f ("netfilter: xt_socket: fix a stack corruption bug") But a variant of the same issue has been introduced in commit d64d80a2cde9 ("netfilter: x_tables: don't extract flow keys on early demuxed sks in socket match") `daddr` and `saddr` potentially hold a reference to ipv6_var that is no longer in scope when the call to `nf_socket_get_sock_v6` is made. Fixes: d64d80a2cde9 ("netfilter: x_tables: don't extract flow keys on early demuxed sks in socket match") Acked-by: Matthieu Baerts Signed-off-by: Benjamin Hesmans Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/ipv6/netfilter/nf_socket_ipv6.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c index b9df879c48d3f..69c021704abd7 100644 --- a/net/ipv6/netfilter/nf_socket_ipv6.c +++ b/net/ipv6/netfilter/nf_socket_ipv6.c @@ -99,7 +99,7 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb, { __be16 uninitialized_var(dport), uninitialized_var(sport); const struct in6_addr *daddr = NULL, *saddr = NULL; - struct ipv6hdr *iph = ipv6_hdr(skb); + struct ipv6hdr *iph = ipv6_hdr(skb), ipv6_var; struct sk_buff *data_skb = NULL; int doff = 0; int thoff = 0, tproto; @@ -129,8 +129,6 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb, thoff + sizeof(*hp); } else if (tproto == IPPROTO_ICMPV6) { - struct ipv6hdr ipv6_var; - if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr, &sport, &dport, &ipv6_var)) return NULL; From 0ca671ab56474a225819c61a05c0da5870c5ac70 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 Sep 2021 15:03:43 -0700 Subject: [PATCH 0886/5344] fq_codel: reject silly quantum parameters [ Upstream commit c7c5e6ff533fe1f9afef7d2fa46678987a1335a7 ] syzbot found that forcing a big quantum attribute would crash hosts fast, essentially using this: tc qd replace dev eth0 root fq_codel quantum 4294967295 This is because fq_codel_dequeue() would have to loop ~2^31 times in : if (flow->deficit <= 0) { flow->deficit += q->quantum; list_move_tail(&flow->flowchain, &q->old_flows); goto begin; } SFQ max quantum is 2^19 (half a megabyte) Lets adopt a max quantum of one megabyte for FQ_CODEL. Fixes: 4b549a2ef4be ("fq_codel: Fair Queue Codel AQM") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/uapi/linux/pkt_sched.h | 2 ++ net/sched/sch_fq_codel.c | 12 ++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index edbbf4bfdd9e5..4a245d7a5c8d6 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -807,6 +807,8 @@ struct tc_codel_xstats { /* FQ_CODEL */ +#define FQ_CODEL_QUANTUM_MAX (1 << 20) + enum { TCA_FQ_CODEL_UNSPEC, TCA_FQ_CODEL_TARGET, diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 76d72c3f52eda..86fb2f953bd5b 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -370,6 +370,7 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, { struct fq_codel_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_FQ_CODEL_MAX + 1]; + u32 quantum = 0; int err; if (!opt) @@ -387,6 +388,13 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, q->flows_cnt > 65536) return -EINVAL; } + if (tb[TCA_FQ_CODEL_QUANTUM]) { + quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM])); + if (quantum > FQ_CODEL_QUANTUM_MAX) { + NL_SET_ERR_MSG(extack, "Invalid quantum"); + return -EINVAL; + } + } sch_tree_lock(sch); if (tb[TCA_FQ_CODEL_TARGET]) { @@ -413,8 +421,8 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_FQ_CODEL_ECN]) q->cparams.ecn = !!nla_get_u32(tb[TCA_FQ_CODEL_ECN]); - if (tb[TCA_FQ_CODEL_QUANTUM]) - q->quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM])); + if (quantum) + q->quantum = quantum; if (tb[TCA_FQ_CODEL_DROP_BATCH_SIZE]) q->drop_batch_size = max(1U, nla_get_u32(tb[TCA_FQ_CODEL_DROP_BATCH_SIZE])); From 51a64b251404614666faa03bb1dd4a075eef93be Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Fri, 3 Sep 2021 15:35:43 +0800 Subject: [PATCH 0887/5344] qlcnic: Remove redundant unlock in qlcnic_pinit_from_rom [ Upstream commit 9ddbc2a00d7f63fa9748f4278643193dac985f2d ] Previous commit 68233c583ab4 removes the qlcnic_rom_lock() in qlcnic_pinit_from_rom(), but remains its corresponding unlock function, which is odd. I'm not very sure whether the lock is missing, or the unlock is redundant. This bug is suggested by a static analysis tool, please advise. Fixes: 68233c583ab4 ("qlcnic: updated reset sequence") Signed-off-by: Dinghao Liu Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/qlogic/qlcnic/qlcnic_init.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_init.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_init.c index c48a0e2d4d7ef..6a009d51ec510 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_init.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_init.c @@ -440,7 +440,6 @@ int qlcnic_pinit_from_rom(struct qlcnic_adapter *adapter) QLCWR32(adapter, QLCNIC_CRB_PEG_NET_4 + 0x3c, 1); msleep(20); - qlcnic_rom_unlock(adapter); /* big hammer don't reset CAM block on reset */ QLCWR32(adapter, QLCNIC_ROMUSB_GLB_SW_RESET, 0xfeffffff); From 48c7326d094313b54c38f8db13eb5b0979175986 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sun, 5 Sep 2021 11:21:09 -0400 Subject: [PATCH 0888/5344] ip_gre: validate csum_start only on pull [ Upstream commit 8a0ed250f911da31a2aef52101bc707846a800ff ] The GRE tunnel device can pull existing outer headers in ipge_xmit. This is a rare path, apparently unique to this device. The below commit ensured that pulling does not move skb->data beyond csum_start. But it has a false positive if ip_summed is not CHECKSUM_PARTIAL and thus csum_start is irrelevant. Refine to exclude this. At the same time simplify and strengthen the test. Simplify, by moving the check next to the offending pull, making it more self documenting and removing an unnecessary branch from other code paths. Strengthen, by also ensuring that the transport header is correct and therefore the inner headers will be after skb_reset_inner_headers. The transport header is set to csum_start in skb_partial_csum_set. Link: https://lore.kernel.org/netdev/YS+h%2FtqCJJiQei+W@shredder/ Fixes: 1d011c4803c7 ("ip_gre: add validation for csum_start") Reported-by: Ido Schimmel Suggested-by: Alexander Duyck Signed-off-by: Willem de Bruijn Reviewed-by: Alexander Duyck Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/ipv4/ip_gre.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index fd8298b8b1c52..c4989e5903e43 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -446,8 +446,6 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, static int gre_handle_offloads(struct sk_buff *skb, bool csum) { - if (csum && skb_checksum_start(skb) < skb->data) - return -EINVAL; return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); } @@ -605,15 +603,20 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb, } if (dev->header_ops) { + const int pull_len = tunnel->hlen + sizeof(struct iphdr); + if (skb_cow_head(skb, 0)) goto free_skb; tnl_params = (const struct iphdr *)skb->data; + if (pull_len > skb_transport_offset(skb)) + goto free_skb; + /* Pull skb since ip_tunnel_xmit() needs skb->data pointing * to gre header. */ - skb_pull(skb, tunnel->hlen + sizeof(struct iphdr)); + skb_pull(skb, pull_len); skb_reset_mac_header(skb); } else { if (skb_cow_head(skb, dev->needed_headroom)) From 43d42a710f0848cf1a79fe165ab87fc1b35f0645 Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Tue, 7 Sep 2021 20:29:40 +0900 Subject: [PATCH 0889/5344] net: renesas: sh_eth: Fix freeing wrong tx descriptor [ Upstream commit 0341d5e3d1ee2a36dd5a49b5bef2ce4ad1cfa6b4 ] The cur_tx counter must be incremented after TACT bit of txdesc->status was set. However, a CPU is possible to reorder instructions and/or memory accesses between cur_tx and txdesc->status. And then, if TX interrupt happened at such a timing, the sh_eth_tx_free() may free the descriptor wrongly. So, add wmb() before cur_tx++. Otherwise NETDEV WATCHDOG timeout is possible to happen. Fixes: 86a74ff21a7a ("net: sh_eth: add support for Renesas SuperH Ethernet") Signed-off-by: Yoshihiro Shimoda Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/renesas/sh_eth.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index 931a44fe7afe8..50d85d0372302 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -2567,6 +2567,7 @@ static int sh_eth_start_xmit(struct sk_buff *skb, struct net_device *ndev) else txdesc->status |= cpu_to_le32(TD_TACT); + wmb(); /* cur_tx must be incremented after TACT bit was set */ mdp->cur_tx++; if (!(sh_eth_read(ndev, EDTRR) & mdp->cd->edtrr_trns)) From d97fbd85cd17f6bde4814e87469d75b589a9fc22 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Mon, 6 Sep 2021 15:04:14 +0200 Subject: [PATCH 0890/5344] s390/bpf: Fix optimizing out zero-extensions commit db7bee653859ef7179be933e7d1384644f795f26 upstream. Currently the JIT completely removes things like `reg32 += 0`, however, the BPF_ALU semantics requires the target register to be zero-extended in such cases. Fix by optimizing out only the arithmetic operation, but not the subsequent zero-extension. Reported-by: Johan Almbladh Fixes: 054623105728 ("s390/bpf: Add s390x eBPF JIT compiler backend") Reviewed-by: Heiko Carstens Signed-off-by: Ilya Leoshkevich Signed-off-by: Vasily Gorbik Signed-off-by: Greg Kroah-Hartman --- arch/s390/net/bpf_jit_comp.c | 58 +++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 3e6612d8b921c..9ae86f60dae55 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -569,10 +569,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4(0xb9080000, dst_reg, src_reg); break; case BPF_ALU | BPF_ADD | BPF_K: /* dst = (u32) dst + (u32) imm */ - if (!imm) - break; - /* alfi %dst,imm */ - EMIT6_IMM(0xc20b0000, dst_reg, imm); + if (imm != 0) { + /* alfi %dst,imm */ + EMIT6_IMM(0xc20b0000, dst_reg, imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_ADD | BPF_K: /* dst = dst + imm */ @@ -594,10 +594,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4(0xb9090000, dst_reg, src_reg); break; case BPF_ALU | BPF_SUB | BPF_K: /* dst = (u32) dst - (u32) imm */ - if (!imm) - break; - /* alfi %dst,-imm */ - EMIT6_IMM(0xc20b0000, dst_reg, -imm); + if (imm != 0) { + /* alfi %dst,-imm */ + EMIT6_IMM(0xc20b0000, dst_reg, -imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_SUB | BPF_K: /* dst = dst - imm */ @@ -619,10 +619,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4(0xb90c0000, dst_reg, src_reg); break; case BPF_ALU | BPF_MUL | BPF_K: /* dst = (u32) dst * (u32) imm */ - if (imm == 1) - break; - /* msfi %r5,imm */ - EMIT6_IMM(0xc2010000, dst_reg, imm); + if (imm != 1) { + /* msfi %r5,imm */ + EMIT6_IMM(0xc2010000, dst_reg, imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_MUL | BPF_K: /* dst = dst * imm */ @@ -675,6 +675,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, if (BPF_OP(insn->code) == BPF_MOD) /* lhgi %dst,0 */ EMIT4_IMM(0xa7090000, dst_reg, 0); + else + EMIT_ZERO(dst_reg); break; } /* lhi %w0,0 */ @@ -769,10 +771,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4(0xb9820000, dst_reg, src_reg); break; case BPF_ALU | BPF_XOR | BPF_K: /* dst = (u32) dst ^ (u32) imm */ - if (!imm) - break; - /* xilf %dst,imm */ - EMIT6_IMM(0xc0070000, dst_reg, imm); + if (imm != 0) { + /* xilf %dst,imm */ + EMIT6_IMM(0xc0070000, dst_reg, imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_XOR | BPF_K: /* dst = dst ^ imm */ @@ -793,10 +795,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT6_DISP_LH(0xeb000000, 0x000d, dst_reg, dst_reg, src_reg, 0); break; case BPF_ALU | BPF_LSH | BPF_K: /* dst = (u32) dst << (u32) imm */ - if (imm == 0) - break; - /* sll %dst,imm(%r0) */ - EMIT4_DISP(0x89000000, dst_reg, REG_0, imm); + if (imm != 0) { + /* sll %dst,imm(%r0) */ + EMIT4_DISP(0x89000000, dst_reg, REG_0, imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_LSH | BPF_K: /* dst = dst << imm */ @@ -818,10 +820,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT6_DISP_LH(0xeb000000, 0x000c, dst_reg, dst_reg, src_reg, 0); break; case BPF_ALU | BPF_RSH | BPF_K: /* dst = (u32) dst >> (u32) imm */ - if (imm == 0) - break; - /* srl %dst,imm(%r0) */ - EMIT4_DISP(0x88000000, dst_reg, REG_0, imm); + if (imm != 0) { + /* srl %dst,imm(%r0) */ + EMIT4_DISP(0x88000000, dst_reg, REG_0, imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_RSH | BPF_K: /* dst = dst >> imm */ @@ -843,10 +845,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT6_DISP_LH(0xeb000000, 0x000a, dst_reg, dst_reg, src_reg, 0); break; case BPF_ALU | BPF_ARSH | BPF_K: /* ((s32) dst >> imm */ - if (imm == 0) - break; - /* sra %dst,imm(%r0) */ - EMIT4_DISP(0x8a000000, dst_reg, REG_0, imm); + if (imm != 0) { + /* sra %dst,imm(%r0) */ + EMIT4_DISP(0x8a000000, dst_reg, REG_0, imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_ARSH | BPF_K: /* ((s64) dst) >>= imm */ From c5eec096c168c5c89ff05385f9bf31d2d52f2f07 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 7 Sep 2021 13:41:16 +0200 Subject: [PATCH 0891/5344] s390/bpf: Fix 64-bit subtraction of the -0x80000000 constant commit 6e61dc9da0b7a0d91d57c2e20b5ea4fd2d4e7e53 upstream. The JIT uses agfi for subtracting constants, but -(-0x80000000) cannot be represented as a 32-bit signed binary integer. Fix by using algfi in this particular case. Reported-by: Johan Almbladh Fixes: 054623105728 ("s390/bpf: Add s390x eBPF JIT compiler backend") Reviewed-by: Heiko Carstens Signed-off-by: Ilya Leoshkevich Signed-off-by: Vasily Gorbik Signed-off-by: Greg Kroah-Hartman --- arch/s390/net/bpf_jit_comp.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 9ae86f60dae55..2d29966276296 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -603,8 +603,13 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, case BPF_ALU64 | BPF_SUB | BPF_K: /* dst = dst - imm */ if (!imm) break; - /* agfi %dst,-imm */ - EMIT6_IMM(0xc2080000, dst_reg, -imm); + if (imm == -0x80000000) { + /* algfi %dst,0x80000000 */ + EMIT6_IMM(0xc20a0000, dst_reg, 0x80000000); + } else { + /* agfi %dst,-imm */ + EMIT6_IMM(0xc2080000, dst_reg, -imm); + } break; /* * BPF_MUL From f8b5556f398d2e5ed919918d62d04335c8e2d0cb Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 22 Sep 2021 12:26:46 +0200 Subject: [PATCH 0892/5344] Linux 5.4.148 Link: https://lore.kernel.org/r/20210920163931.123590023@linuxfoundation.org Tested-by: Florian Fainelli Tested-by: Jon Hunter Tested-by: Shuah Khan Tested-by: Sudip Mukherjee Tested-by: Guenter Roeck Tested-by: Hulk Robot Tested-by: Linux Kernel Functional Testing Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 98227dae34947..b84706c6d6248 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 PATCHLEVEL = 4 -SUBLEVEL = 147 +SUBLEVEL = 148 EXTRAVERSION = NAME = Kleptomaniac Octopus From f23b7a966bec2408a739a53c97594efea4d33f32 Mon Sep 17 00:00:00 2001 From: Grzegorz Jaszczyk Date: Tue, 16 Jul 2019 14:13:46 +0200 Subject: [PATCH 0893/5344] PCI: pci-bridge-emul: Fix big-endian support commit e0d9d30b73548fbfe5c024ed630169bdc9a08aee upstream. Perform conversion to little-endian before every write to configuration space and convert it back to CPU endianness on reads. Additionally, initialise every multiple byte field of config space with the cpu_to_le* macro, which is required since the structure describing config space of emulated bridge assumes little-endian convention. Signed-off-by: Grzegorz Jaszczyk Signed-off-by: Lorenzo Pieralisi Signed-off-by: Greg Kroah-Hartman --- drivers/pci/pci-bridge-emul.c | 25 +++++------ drivers/pci/pci-bridge-emul.h | 78 +++++++++++++++++------------------ 2 files changed, 52 insertions(+), 51 deletions(-) diff --git a/drivers/pci/pci-bridge-emul.c b/drivers/pci/pci-bridge-emul.c index d3b6b9a056185..06c800595e036 100644 --- a/drivers/pci/pci-bridge-emul.c +++ b/drivers/pci/pci-bridge-emul.c @@ -270,10 +270,10 @@ static const struct pci_bridge_reg_behavior pcie_cap_regs_behavior[] = { int pci_bridge_emul_init(struct pci_bridge_emul *bridge, unsigned int flags) { - bridge->conf.class_revision |= PCI_CLASS_BRIDGE_PCI << 16; + bridge->conf.class_revision |= cpu_to_le32(PCI_CLASS_BRIDGE_PCI << 16); bridge->conf.header_type = PCI_HEADER_TYPE_BRIDGE; bridge->conf.cache_line_size = 0x10; - bridge->conf.status = PCI_STATUS_CAP_LIST; + bridge->conf.status = cpu_to_le16(PCI_STATUS_CAP_LIST); bridge->pci_regs_behavior = kmemdup(pci_regs_behavior, sizeof(pci_regs_behavior), GFP_KERNEL); @@ -284,8 +284,9 @@ int pci_bridge_emul_init(struct pci_bridge_emul *bridge, bridge->conf.capabilities_pointer = PCI_CAP_PCIE_START; bridge->pcie_conf.cap_id = PCI_CAP_ID_EXP; /* Set PCIe v2, root port, slot support */ - bridge->pcie_conf.cap = PCI_EXP_TYPE_ROOT_PORT << 4 | 2 | - PCI_EXP_FLAGS_SLOT; + bridge->pcie_conf.cap = + cpu_to_le16(PCI_EXP_TYPE_ROOT_PORT << 4 | 2 | + PCI_EXP_FLAGS_SLOT); bridge->pcie_cap_regs_behavior = kmemdup(pcie_cap_regs_behavior, sizeof(pcie_cap_regs_behavior), @@ -327,7 +328,7 @@ int pci_bridge_emul_conf_read(struct pci_bridge_emul *bridge, int where, int reg = where & ~3; pci_bridge_emul_read_status_t (*read_op)(struct pci_bridge_emul *bridge, int reg, u32 *value); - u32 *cfgspace; + __le32 *cfgspace; const struct pci_bridge_reg_behavior *behavior; if (bridge->has_pcie && reg >= PCI_CAP_PCIE_END) { @@ -343,11 +344,11 @@ int pci_bridge_emul_conf_read(struct pci_bridge_emul *bridge, int where, if (bridge->has_pcie && reg >= PCI_CAP_PCIE_START) { reg -= PCI_CAP_PCIE_START; read_op = bridge->ops->read_pcie; - cfgspace = (u32 *) &bridge->pcie_conf; + cfgspace = (__le32 *) &bridge->pcie_conf; behavior = bridge->pcie_cap_regs_behavior; } else { read_op = bridge->ops->read_base; - cfgspace = (u32 *) &bridge->conf; + cfgspace = (__le32 *) &bridge->conf; behavior = bridge->pci_regs_behavior; } @@ -357,7 +358,7 @@ int pci_bridge_emul_conf_read(struct pci_bridge_emul *bridge, int where, ret = PCI_BRIDGE_EMUL_NOT_HANDLED; if (ret == PCI_BRIDGE_EMUL_NOT_HANDLED) - *value = cfgspace[reg / 4]; + *value = le32_to_cpu(cfgspace[reg / 4]); /* * Make sure we never return any reserved bit with a value @@ -387,7 +388,7 @@ int pci_bridge_emul_conf_write(struct pci_bridge_emul *bridge, int where, int mask, ret, old, new, shift; void (*write_op)(struct pci_bridge_emul *bridge, int reg, u32 old, u32 new, u32 mask); - u32 *cfgspace; + __le32 *cfgspace; const struct pci_bridge_reg_behavior *behavior; if (bridge->has_pcie && reg >= PCI_CAP_PCIE_END) @@ -414,11 +415,11 @@ int pci_bridge_emul_conf_write(struct pci_bridge_emul *bridge, int where, if (bridge->has_pcie && reg >= PCI_CAP_PCIE_START) { reg -= PCI_CAP_PCIE_START; write_op = bridge->ops->write_pcie; - cfgspace = (u32 *) &bridge->pcie_conf; + cfgspace = (__le32 *) &bridge->pcie_conf; behavior = bridge->pcie_cap_regs_behavior; } else { write_op = bridge->ops->write_base; - cfgspace = (u32 *) &bridge->conf; + cfgspace = (__le32 *) &bridge->conf; behavior = bridge->pci_regs_behavior; } @@ -431,7 +432,7 @@ int pci_bridge_emul_conf_write(struct pci_bridge_emul *bridge, int where, /* Clear the W1C bits */ new &= ~((value << shift) & (behavior[reg / 4].w1c & mask)); - cfgspace[reg / 4] = new; + cfgspace[reg / 4] = cpu_to_le32(new); if (write_op) write_op(bridge, reg, old, new, mask); diff --git a/drivers/pci/pci-bridge-emul.h b/drivers/pci/pci-bridge-emul.h index e65b1b79899d0..b31883022a8e6 100644 --- a/drivers/pci/pci-bridge-emul.h +++ b/drivers/pci/pci-bridge-emul.h @@ -6,65 +6,65 @@ /* PCI configuration space of a PCI-to-PCI bridge. */ struct pci_bridge_emul_conf { - u16 vendor; - u16 device; - u16 command; - u16 status; - u32 class_revision; + __le16 vendor; + __le16 device; + __le16 command; + __le16 status; + __le32 class_revision; u8 cache_line_size; u8 latency_timer; u8 header_type; u8 bist; - u32 bar[2]; + __le32 bar[2]; u8 primary_bus; u8 secondary_bus; u8 subordinate_bus; u8 secondary_latency_timer; u8 iobase; u8 iolimit; - u16 secondary_status; - u16 membase; - u16 memlimit; - u16 pref_mem_base; - u16 pref_mem_limit; - u32 prefbaseupper; - u32 preflimitupper; - u16 iobaseupper; - u16 iolimitupper; + __le16 secondary_status; + __le16 membase; + __le16 memlimit; + __le16 pref_mem_base; + __le16 pref_mem_limit; + __le32 prefbaseupper; + __le32 preflimitupper; + __le16 iobaseupper; + __le16 iolimitupper; u8 capabilities_pointer; u8 reserve[3]; - u32 romaddr; + __le32 romaddr; u8 intline; u8 intpin; - u16 bridgectrl; + __le16 bridgectrl; }; /* PCI configuration space of the PCIe capabilities */ struct pci_bridge_emul_pcie_conf { u8 cap_id; u8 next; - u16 cap; - u32 devcap; - u16 devctl; - u16 devsta; - u32 lnkcap; - u16 lnkctl; - u16 lnksta; - u32 slotcap; - u16 slotctl; - u16 slotsta; - u16 rootctl; - u16 rsvd; - u32 rootsta; - u32 devcap2; - u16 devctl2; - u16 devsta2; - u32 lnkcap2; - u16 lnkctl2; - u16 lnksta2; - u32 slotcap2; - u16 slotctl2; - u16 slotsta2; + __le16 cap; + __le32 devcap; + __le16 devctl; + __le16 devsta; + __le32 lnkcap; + __le16 lnkctl; + __le16 lnksta; + __le32 slotcap; + __le16 slotctl; + __le16 slotsta; + __le16 rootctl; + __le16 rsvd; + __le32 rootsta; + __le32 devcap2; + __le16 devctl2; + __le16 devsta2; + __le32 lnkcap2; + __le16 lnkctl2; + __le16 lnksta2; + __le32 slotcap2; + __le16 slotctl2; + __le16 slotsta2; }; struct pci_bridge_emul; From 5ac5f2baccd3828259714eb03d81a673425138f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Mon, 1 Jun 2020 15:03:15 +0200 Subject: [PATCH 0894/5344] PCI: aardvark: Indicate error in 'val' when config read fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit b1bd5714472cc72e14409f5659b154c765a76c65 upstream. Most callers of config read do not check for return value. But most of the ones that do, checks for error indication in 'val' variable. This patch updates error handling in advk_pcie_rd_conf() function. If PIO transfer fails then 'val' variable is set to 0xffffffff which indicates failture. Link: https://lore.kernel.org/r/20200528162604.GA323482@bjorn-Precision-5520 Link: https://lore.kernel.org/r/20200601130315.18895-1-pali@kernel.org Reported-by: Bjorn Helgaas Signed-off-by: Pali Rohár Signed-off-by: Lorenzo Pieralisi Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-aardvark.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 0538348ed843f..a957211930a81 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -664,8 +664,10 @@ static int advk_pcie_rd_conf(struct pci_bus *bus, u32 devfn, advk_writel(pcie, 1, PIO_START); ret = advk_pcie_wait_pio(pcie); - if (ret < 0) + if (ret < 0) { + *val = 0xffffffff; return PCIBIOS_SET_FAILED; + } /* Check PIO status and get the read result */ ret = advk_pcie_check_pio_status(pcie, val); From 236ae5d39583a9f55de901ce272f9737e5e98063 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 22 Jul 2021 16:40:40 +0200 Subject: [PATCH 0895/5344] PCI: pci-bridge-emul: Add PCIe Root Capabilities Register MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit e902bb7c24a7099d0eb0eb4cba06f2d91e9299f3 upstream. The 16-bit Root Capabilities register is at offset 0x1e in the PCIe Capability. Rename current 'rsvd' struct member to 'rootcap'. Link: https://lore.kernel.org/r/20210722144041.12661-4-pali@kernel.org Signed-off-by: Pali Rohár Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marek Behún Signed-off-by: Greg Kroah-Hartman --- drivers/pci/pci-bridge-emul.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/pci-bridge-emul.h b/drivers/pci/pci-bridge-emul.h index b31883022a8e6..49bbd37ee318a 100644 --- a/drivers/pci/pci-bridge-emul.h +++ b/drivers/pci/pci-bridge-emul.h @@ -54,7 +54,7 @@ struct pci_bridge_emul_pcie_conf { __le16 slotctl; __le16 slotsta; __le16 rootctl; - __le16 rsvd; + __le16 rootcap; __le32 rootsta; __le32 devcap2; __le16 devctl2; From a4daf1b3dd45ef509ff248773da58c795a65ac0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 22 Jul 2021 16:40:41 +0200 Subject: [PATCH 0896/5344] PCI: aardvark: Fix reporting CRS value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 43f5c77bcbd27cce70bf33c2b86d6726ce95dd66 upstream. Set CRSVIS flag in emulated root PCI bridge to indicate support for Completion Retry Status. Add check for CRSSVE flag from root PCI brige when issuing Configuration Read Request via PIO to correctly returns fabricated CRS value as it is required by PCIe spec. Link: https://lore.kernel.org/r/20210722144041.12661-5-pali@kernel.org Fixes: 8a3ebd8de328 ("PCI: aardvark: Implement emulated root PCI bridge config space") Signed-off-by: Pali Rohár Signed-off-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org # e0d9d30b7354 ("PCI: pci-bridge-emul: Fix big-endian support") Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-aardvark.c | 67 +++++++++++++++++++++++++-- 1 file changed, 63 insertions(+), 4 deletions(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index a957211930a81..18753fd218a31 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -188,6 +188,8 @@ #define MSI_IRQ_NUM 32 +#define CFG_RD_CRS_VAL 0xffff0001 + struct advk_pcie { struct platform_device *pdev; void __iomem *base; @@ -365,7 +367,7 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) advk_writel(pcie, reg, PCIE_CORE_CMD_STATUS_REG); } -static int advk_pcie_check_pio_status(struct advk_pcie *pcie, u32 *val) +static int advk_pcie_check_pio_status(struct advk_pcie *pcie, bool allow_crs, u32 *val) { struct device *dev = &pcie->pdev->dev; u32 reg; @@ -407,9 +409,30 @@ static int advk_pcie_check_pio_status(struct advk_pcie *pcie, u32 *val) strcomp_status = "UR"; break; case PIO_COMPLETION_STATUS_CRS: + if (allow_crs && val) { + /* PCIe r4.0, sec 2.3.2, says: + * If CRS Software Visibility is enabled: + * For a Configuration Read Request that includes both + * bytes of the Vendor ID field of a device Function's + * Configuration Space Header, the Root Complex must + * complete the Request to the host by returning a + * read-data value of 0001h for the Vendor ID field and + * all '1's for any additional bytes included in the + * request. + * + * So CRS in this case is not an error status. + */ + *val = CFG_RD_CRS_VAL; + strcomp_status = NULL; + break; + } /* PCIe r4.0, sec 2.3.2, says: * If CRS Software Visibility is not enabled, the Root Complex * must re-issue the Configuration Request as a new Request. + * If CRS Software Visibility is enabled: For a Configuration + * Write Request or for any other Configuration Read Request, + * the Root Complex must re-issue the Configuration Request as + * a new Request. * A Root Complex implementation may choose to limit the number * of Configuration Request/CRS Completion Status loops before * determining that something is wrong with the target of the @@ -478,6 +501,7 @@ advk_pci_bridge_emul_pcie_conf_read(struct pci_bridge_emul *bridge, case PCI_EXP_RTCTL: { u32 val = advk_readl(pcie, PCIE_ISR0_MASK_REG); *value = (val & PCIE_MSG_PM_PME_MASK) ? 0 : PCI_EXP_RTCTL_PMEIE; + *value |= PCI_EXP_RTCAP_CRSVIS << 16; return PCI_BRIDGE_EMUL_HANDLED; } @@ -559,6 +583,7 @@ static struct pci_bridge_emul_ops advk_pci_bridge_emul_ops = { static int advk_sw_pci_bridge_init(struct advk_pcie *pcie) { struct pci_bridge_emul *bridge = &pcie->bridge; + int ret; bridge->conf.vendor = advk_readl(pcie, PCIE_CORE_DEV_ID_REG) & 0xffff; bridge->conf.device = advk_readl(pcie, PCIE_CORE_DEV_ID_REG) >> 16; @@ -580,7 +605,15 @@ static int advk_sw_pci_bridge_init(struct advk_pcie *pcie) bridge->data = pcie; bridge->ops = &advk_pci_bridge_emul_ops; - return pci_bridge_emul_init(bridge, 0); + /* PCIe config space can be initialized after pci_bridge_emul_init() */ + ret = pci_bridge_emul_init(bridge, 0); + if (ret < 0) + return ret; + + /* Indicates supports for Completion Retry Status */ + bridge->pcie_conf.rootcap = cpu_to_le16(PCI_EXP_RTCAP_CRSVIS); + + return 0; } static bool advk_pcie_valid_device(struct advk_pcie *pcie, struct pci_bus *bus, @@ -625,6 +658,7 @@ static int advk_pcie_rd_conf(struct pci_bus *bus, u32 devfn, int where, int size, u32 *val) { struct advk_pcie *pcie = bus->sysdata; + bool allow_crs; u32 reg; int ret; @@ -637,7 +671,24 @@ static int advk_pcie_rd_conf(struct pci_bus *bus, u32 devfn, return pci_bridge_emul_conf_read(&pcie->bridge, where, size, val); + /* + * Completion Retry Status is possible to return only when reading all + * 4 bytes from PCI_VENDOR_ID and PCI_DEVICE_ID registers at once and + * CRSSVE flag on Root Bridge is enabled. + */ + allow_crs = (where == PCI_VENDOR_ID) && (size == 4) && + (le16_to_cpu(pcie->bridge.pcie_conf.rootctl) & + PCI_EXP_RTCTL_CRSSVE); + if (advk_pcie_pio_is_running(pcie)) { + /* + * If it is possible return Completion Retry Status so caller + * tries to issue the request again instead of failing. + */ + if (allow_crs) { + *val = CFG_RD_CRS_VAL; + return PCIBIOS_SUCCESSFUL; + } *val = 0xffffffff; return PCIBIOS_SET_FAILED; } @@ -665,12 +716,20 @@ static int advk_pcie_rd_conf(struct pci_bus *bus, u32 devfn, ret = advk_pcie_wait_pio(pcie); if (ret < 0) { + /* + * If it is possible return Completion Retry Status so caller + * tries to issue the request again instead of failing. + */ + if (allow_crs) { + *val = CFG_RD_CRS_VAL; + return PCIBIOS_SUCCESSFUL; + } *val = 0xffffffff; return PCIBIOS_SET_FAILED; } /* Check PIO status and get the read result */ - ret = advk_pcie_check_pio_status(pcie, val); + ret = advk_pcie_check_pio_status(pcie, allow_crs, val); if (ret < 0) { *val = 0xffffffff; return PCIBIOS_SET_FAILED; @@ -739,7 +798,7 @@ static int advk_pcie_wr_conf(struct pci_bus *bus, u32 devfn, if (ret < 0) return PCIBIOS_SET_FAILED; - ret = advk_pcie_check_pio_status(pcie, NULL); + ret = advk_pcie_check_pio_status(pcie, false, NULL); if (ret < 0) return PCIBIOS_SET_FAILED; From f7da4ff7e7d44b1086550c5c5fd93435374afae6 Mon Sep 17 00:00:00 2001 From: Tuan Phan Date: Thu, 6 Aug 2020 14:57:34 -0700 Subject: [PATCH 0897/5344] PCI/ACPI: Add Ampere Altra SOC MCFG quirk commit 877c1a5f79c6984bbe3f2924234c08e2f4f1acd5 upstream. Ampere Altra SOC supports only 32-bit ECAM reads. Add an MCFG quirk for the platform. Link: https://lore.kernel.org/r/1596751055-12316-1-git-send-email-tuanphan@os.amperecomputing.com Signed-off-by: Tuan Phan Signed-off-by: Bjorn Helgaas [ dannf: backport drops const qualifier from pci_32b_read_ops for consistency with the other quirks that weren't yet constified in v5.4 ] Signed-off-by: dann frazier Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/pci_mcfg.c | 20 ++++++++++++++++++++ drivers/pci/ecam.c | 10 ++++++++++ include/linux/pci-ecam.h | 1 + 3 files changed, 31 insertions(+) diff --git a/drivers/acpi/pci_mcfg.c b/drivers/acpi/pci_mcfg.c index 6b347d9920cc2..47e43c9498257 100644 --- a/drivers/acpi/pci_mcfg.c +++ b/drivers/acpi/pci_mcfg.c @@ -142,6 +142,26 @@ static struct mcfg_fixup mcfg_quirks[] = { XGENE_V2_ECAM_MCFG(4, 0), XGENE_V2_ECAM_MCFG(4, 1), XGENE_V2_ECAM_MCFG(4, 2), + +#define ALTRA_ECAM_QUIRK(rev, seg) \ + { "Ampere", "Altra ", rev, seg, MCFG_BUS_ANY, &pci_32b_read_ops } + + ALTRA_ECAM_QUIRK(1, 0), + ALTRA_ECAM_QUIRK(1, 1), + ALTRA_ECAM_QUIRK(1, 2), + ALTRA_ECAM_QUIRK(1, 3), + ALTRA_ECAM_QUIRK(1, 4), + ALTRA_ECAM_QUIRK(1, 5), + ALTRA_ECAM_QUIRK(1, 6), + ALTRA_ECAM_QUIRK(1, 7), + ALTRA_ECAM_QUIRK(1, 8), + ALTRA_ECAM_QUIRK(1, 9), + ALTRA_ECAM_QUIRK(1, 10), + ALTRA_ECAM_QUIRK(1, 11), + ALTRA_ECAM_QUIRK(1, 12), + ALTRA_ECAM_QUIRK(1, 13), + ALTRA_ECAM_QUIRK(1, 14), + ALTRA_ECAM_QUIRK(1, 15), }; static char mcfg_oem_id[ACPI_OEM_ID_SIZE]; diff --git a/drivers/pci/ecam.c b/drivers/pci/ecam.c index 1a81af0ba961a..9765d2eb79d21 100644 --- a/drivers/pci/ecam.c +++ b/drivers/pci/ecam.c @@ -164,4 +164,14 @@ struct pci_ecam_ops pci_32b_ops = { .write = pci_generic_config_write32, } }; + +/* ECAM ops for 32-bit read only (non-compliant) */ +struct pci_ecam_ops pci_32b_read_ops = { + .bus_shift = 20, + .pci_ops = { + .map_bus = pci_ecam_map_bus, + .read = pci_generic_config_read32, + .write = pci_generic_config_write, + } +}; #endif diff --git a/include/linux/pci-ecam.h b/include/linux/pci-ecam.h index a73164c85e78b..75456a66024a9 100644 --- a/include/linux/pci-ecam.h +++ b/include/linux/pci-ecam.h @@ -51,6 +51,7 @@ extern struct pci_ecam_ops pci_generic_ecam_ops; #if defined(CONFIG_ACPI) && defined(CONFIG_PCI_QUIRKS) extern struct pci_ecam_ops pci_32b_ops; /* 32-bit accesses only */ +extern struct pci_ecam_ops pci_32b_read_ops; /* 32-bit read only */ extern struct pci_ecam_ops hisi_pcie_ops; /* HiSilicon */ extern struct pci_ecam_ops thunder_pem_ecam_ops; /* Cavium ThunderX 1.x & 2.x */ extern struct pci_ecam_ops pci_thunder_ecam_ops; /* Cavium ThunderX 1.x */ From 34b32621b0e05124ad6fda6d9b4c03368ca68e2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Thu, 7 Nov 2019 07:53:42 -0500 Subject: [PATCH 0898/5344] KVM: remember position in kvm->vcpus array MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 8750e72a79dda2f665ce17b62049f4d62130d991 upstream. Fetching an index for any vcpu in kvm->vcpus array by traversing the entire array everytime is costly. This patch remembers the position of each vcpu in kvm->vcpus array by storing it in vcpus_idx under kvm_vcpu structure. Signed-off-by: Radim Krčmář Signed-off-by: Nitesh Narayan Lal Signed-off-by: Paolo Bonzini [borntraeger@de.ibm.com]: backport to 4.19 (also fits for 5.4) Signed-off-by: Christian Borntraeger Acked-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- include/linux/kvm_host.h | 11 +++-------- virt/kvm/kvm_main.c | 5 +++-- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index c3402920fd2f6..4168109bfa855 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -266,7 +266,8 @@ struct kvm_vcpu { struct preempt_notifier preempt_notifier; #endif int cpu; - int vcpu_id; + int vcpu_id; /* id given by userspace at creation */ + int vcpu_idx; /* index in kvm->vcpus array */ int srcu_idx; int mode; u64 requests; @@ -571,13 +572,7 @@ static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id) static inline int kvm_vcpu_get_idx(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *tmp; - int idx; - - kvm_for_each_vcpu(idx, tmp, vcpu->kvm) - if (tmp == vcpu) - return idx; - BUG(); + return vcpu->vcpu_idx; } #define kvm_for_each_memslot(memslot, slots) \ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 362350609fe76..c77a0ee76eaa0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2864,7 +2864,8 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) goto unlock_vcpu_destroy; } - BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); + vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus); + BUG_ON(kvm->vcpus[vcpu->vcpu_idx]); /* Now it's all set up, let userspace reach it */ kvm_get_kvm(kvm); @@ -2874,7 +2875,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) goto unlock_vcpu_destroy; } - kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu; + kvm->vcpus[vcpu->vcpu_idx] = vcpu; /* * Pairs with smp_rmb() in kvm_get_vcpu. Write kvm->vcpus From 8ffc680357f82b1ad36e628039e54db4e9942833 Mon Sep 17 00:00:00 2001 From: nick black Date: Mon, 30 Aug 2021 04:56:15 -0400 Subject: [PATCH 0899/5344] console: consume APC, DM, DCS commit 3a2b2eb55681158d3e3ef464fbf47574cf0c517c upstream. The Linux console's VT102 implementation already consumes OSC ("Operating System Command") sequences, probably because that's how palette changes are transmitted. In addition to OSC, there are three other major clases of ANSI control strings: APC ("Application Program Command"), PM ("Privacy Message"), and DCS ("Device Control String"). They are handled similarly to OSC in terms of termination. Source: vt100.net Add three new enumerated states, one for each of these types. All three are handled the same way right now--they simply consume input until terminated. I hope to expand upon this firmament in the future. Add new predicate ansi_control_string(), returning true for any of these states. Replace explicit checks against ESosc with calls to this function. Transition to these states appropriately from the escape initiation (ESesc) state. This was motivated by the following Notcurses bugs: https://github.com/dankamongmen/notcurses/issues/2050 https://github.com/dankamongmen/notcurses/issues/1828 https://github.com/dankamongmen/notcurses/issues/2069 where standard VT sequences are not consumed by the Linux console. It's not necessary that the Linux console *support* these sequences, but it ought *consume* these well-specified classes of sequences. Tested by sending a variety of escape sequences to the console, and verifying that they still worked, or were now properly consumed. Verified that the escapes were properly terminated at a generic level. Verified that the Notcurses tools continued to show expected output on the Linux console, except now without escape bleedthrough. Link: https://lore.kernel.org/lkml/YSydL0q8iaUfkphg@schwarzgerat.orthanc/ Signed-off-by: nick black Cc: Greg Kroah-Hartman Cc: Jiri Slaby Cc: Tetsuo Handa Cc: Daniel Vetter Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- drivers/tty/vt/vt.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c index 6f013d7f5bd0f..404b80dc06b87 100644 --- a/drivers/tty/vt/vt.c +++ b/drivers/tty/vt/vt.c @@ -2070,7 +2070,7 @@ static void restore_cur(struct vc_data *vc) enum { ESnormal, ESesc, ESsquare, ESgetpars, ESfunckey, EShash, ESsetG0, ESsetG1, ESpercent, EScsiignore, ESnonstd, - ESpalette, ESosc }; + ESpalette, ESosc, ESapc, ESpm, ESdcs }; /* console_lock is held (except via vc_init()) */ static void reset_terminal(struct vc_data *vc, int do_clear) @@ -2124,20 +2124,28 @@ static void reset_terminal(struct vc_data *vc, int do_clear) csi_J(vc, 2); } +/* is this state an ANSI control string? */ +static bool ansi_control_string(unsigned int state) +{ + if (state == ESosc || state == ESapc || state == ESpm || state == ESdcs) + return true; + return false; +} + /* console_lock is held */ static void do_con_trol(struct tty_struct *tty, struct vc_data *vc, int c) { /* * Control characters can be used in the _middle_ - * of an escape sequence. + * of an escape sequence, aside from ANSI control strings. */ - if (vc->vc_state == ESosc && c>=8 && c<=13) /* ... except for OSC */ + if (ansi_control_string(vc->vc_state) && c >= 8 && c <= 13) return; switch (c) { case 0: return; case 7: - if (vc->vc_state == ESosc) + if (ansi_control_string(vc->vc_state)) vc->vc_state = ESnormal; else if (vc->vc_bell_duration) kd_mksound(vc->vc_bell_pitch, vc->vc_bell_duration); @@ -2196,6 +2204,12 @@ static void do_con_trol(struct tty_struct *tty, struct vc_data *vc, int c) case ']': vc->vc_state = ESnonstd; return; + case '_': + vc->vc_state = ESapc; + return; + case '^': + vc->vc_state = ESpm; + return; case '%': vc->vc_state = ESpercent; return; @@ -2212,6 +2226,9 @@ static void do_con_trol(struct tty_struct *tty, struct vc_data *vc, int c) case 'H': vc->vc_tab_stop[7 & (vc->vc_x >> 5)] |= (1 << (vc->vc_x & 31)); return; + case 'P': + vc->vc_state = ESdcs; + return; case 'Z': respond_ID(tty); return; @@ -2531,8 +2548,14 @@ static void do_con_trol(struct tty_struct *tty, struct vc_data *vc, int c) vc->vc_translate = set_translate(vc->vc_G1_charset, vc); vc->vc_state = ESnormal; return; + case ESapc: + return; case ESosc: return; + case ESpm: + return; + case ESdcs: + return; default: vc->vc_state = ESnormal; } From a61c4b7f3b8e201d66a15b35bb998d42a1f26c3f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 9 Sep 2021 16:59:42 +0200 Subject: [PATCH 0900/5344] s390/pci_mmio: fully validate the VMA before calling follow_pte() commit a8b92b8c1eac8d655a97b1e90f4d83c25d9b9a18 upstream. We should not walk/touch page tables outside of VMA boundaries when holding only the mmap sem in read mode. Evil user space can modify the VMA layout just before this function runs and e.g., trigger races with page table removal code since commit dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in munmap"). find_vma() does not check if the address is >= the VMA start address; use vma_lookup() instead. Reviewed-by: Niklas Schnelle Reviewed-by: Liam R. Howlett Fixes: dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in munmap") Signed-off-by: David Hildenbrand Signed-off-by: Vasily Gorbik Signed-off-by: Greg Kroah-Hartman --- arch/s390/pci/pci_mmio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c index 38efa3e852c4f..a90535b3ef3aa 100644 --- a/arch/s390/pci/pci_mmio.c +++ b/arch/s390/pci/pci_mmio.c @@ -128,7 +128,7 @@ static long get_pfn(unsigned long user_addr, unsigned long access, mmap_read_lock(current->mm); ret = -EINVAL; vma = find_vma(current->mm, user_addr); - if (!vma) + if (!vma || user_addr < vma->vm_start) goto out; ret = -EACCES; if (!(vma->vm_flags & access)) From dde0a16e6fc082b4fbf9d17a9e33b2f46babf0a1 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 18 Mar 2021 21:03:33 -0700 Subject: [PATCH 0901/5344] ARM: Qualify enabling of swiotlb_init() commit fcf044891c84e38fc90eb736b818781bccf94e38 upstream. We do not need a SWIOTLB unless we have DRAM that is addressable beyond the arm_dma_limit. Compare max_pfn with arm_dma_pfn_limit to determine whether we do need a SWIOTLB to be initialized. Fixes: ad3c7b18c5b3 ("arm: use swiotlb for bounce buffering on LPAE configs") Signed-off-by: Florian Fainelli Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Greg Kroah-Hartman --- arch/arm/mm/init.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 0804a6af4a3b7..5a3641b5ec2cd 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -469,7 +469,11 @@ static void __init free_highpages(void) void __init mem_init(void) { #ifdef CONFIG_ARM_LPAE - swiotlb_init(1); + if (swiotlb_force == SWIOTLB_FORCE || + max_pfn > arm_dma_pfn_limit) + swiotlb_init(1); + else + swiotlb_force = SWIOTLB_NO_FORCE; #endif set_max_mapnr(pfn_to_page(max_pfn) - mem_map); From 4357405cc49e1c2e8d9714f92ff8577562840c52 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 15 Dec 2020 20:47:16 -0800 Subject: [PATCH 0902/5344] apparmor: remove duplicate macro list_entry_is_head() commit 9801ca279ad37f72f71234fa81722afd95a3f997 upstream. Strangely I hadn't had noticed the existence of the list_entry_is_head() in apparmor code when added the same one in the list.h. Luckily it's fully identical and didn't break builds. In any case we don't need a duplicate anymore, thus remove it from apparmor code. Link: https://lkml.kernel.org/r/20201208100639.88182-1-andriy.shevchenko@linux.intel.com Fixes: e130816164e244 ("include/linux/list.h: add a macro to test if entry is pointing to the head") Signed-off-by: Andy Shevchenko Acked-by: John Johansen Cc: James Morris Cc: "Serge E . Hallyn " Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Nobuhiro Iwamatsu (CIP) Signed-off-by: Greg Kroah-Hartman --- security/apparmor/apparmorfs.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 7c4238a040732..95da9f8a7516e 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -1960,9 +1960,6 @@ int __aafs_ns_mkdir(struct aa_ns *ns, struct dentry *parent, const char *name, return error; } - -#define list_entry_is_head(pos, head, member) (&pos->member == (head)) - /** * __next_ns - find the next namespace to list * @root: root namespace to stop search at (NOT NULL) From 80f2e2ce526919ce8799a20121572975a4813917 Mon Sep 17 00:00:00 2001 From: Alex Sverdlin Date: Wed, 22 Sep 2021 10:00:31 -0700 Subject: [PATCH 0903/5344] ARM: 9077/1: PLT: Move struct plt_entries definition to header commit 4e271701c17dee70c6e1351c4d7d42e70405c6a9 upstream No functional change, later it will be re-used in several files. Signed-off-by: Alexander Sverdlin Signed-off-by: Russell King Signed-off-by: Florian Fainelli Signed-off-by: Greg Kroah-Hartman --- arch/arm/include/asm/module.h | 9 +++++++++ arch/arm/kernel/module-plts.c | 9 --------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/arm/include/asm/module.h b/arch/arm/include/asm/module.h index 182163b55546c..78e4c16446287 100644 --- a/arch/arm/include/asm/module.h +++ b/arch/arm/include/asm/module.h @@ -19,6 +19,15 @@ enum { }; #endif +#define PLT_ENT_STRIDE L1_CACHE_BYTES +#define PLT_ENT_COUNT (PLT_ENT_STRIDE / sizeof(u32)) +#define PLT_ENT_SIZE (sizeof(struct plt_entries) / PLT_ENT_COUNT) + +struct plt_entries { + u32 ldr[PLT_ENT_COUNT]; + u32 lit[PLT_ENT_COUNT]; +}; + struct mod_plt_sec { struct elf32_shdr *plt; int plt_count; diff --git a/arch/arm/kernel/module-plts.c b/arch/arm/kernel/module-plts.c index b647741c0ab06..d1b76f3f3df98 100644 --- a/arch/arm/kernel/module-plts.c +++ b/arch/arm/kernel/module-plts.c @@ -11,10 +11,6 @@ #include #include -#define PLT_ENT_STRIDE L1_CACHE_BYTES -#define PLT_ENT_COUNT (PLT_ENT_STRIDE / sizeof(u32)) -#define PLT_ENT_SIZE (sizeof(struct plt_entries) / PLT_ENT_COUNT) - #ifdef CONFIG_THUMB2_KERNEL #define PLT_ENT_LDR __opcode_to_mem_thumb32(0xf8dff000 | \ (PLT_ENT_STRIDE - 4)) @@ -23,11 +19,6 @@ (PLT_ENT_STRIDE - 8)) #endif -struct plt_entries { - u32 ldr[PLT_ENT_COUNT]; - u32 lit[PLT_ENT_COUNT]; -}; - static bool in_init(const struct module *mod, unsigned long loc) { return loc - (u32)mod->init_layout.base < mod->init_layout.size; From 4c3aa7189e41c7c02b89e097c49cb1e9592dcb4b Mon Sep 17 00:00:00 2001 From: Alex Sverdlin Date: Wed, 22 Sep 2021 10:00:32 -0700 Subject: [PATCH 0904/5344] ARM: 9078/1: Add warn suppress parameter to arm_gen_branch_link() commit 890cb057a46d323fd8c77ebecb6485476614cd21 upstream Will be used in the following patch. No functional change. Signed-off-by: Alexander Sverdlin Signed-off-by: Russell King Signed-off-by: Florian Fainelli Signed-off-by: Greg Kroah-Hartman --- arch/arm/include/asm/insn.h | 8 ++++---- arch/arm/kernel/ftrace.c | 2 +- arch/arm/kernel/insn.c | 19 ++++++++++--------- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/arch/arm/include/asm/insn.h b/arch/arm/include/asm/insn.h index f20e08ac85aeb..5475cbf9fb6b4 100644 --- a/arch/arm/include/asm/insn.h +++ b/arch/arm/include/asm/insn.h @@ -13,18 +13,18 @@ arm_gen_nop(void) } unsigned long -__arm_gen_branch(unsigned long pc, unsigned long addr, bool link); +__arm_gen_branch(unsigned long pc, unsigned long addr, bool link, bool warn); static inline unsigned long arm_gen_branch(unsigned long pc, unsigned long addr) { - return __arm_gen_branch(pc, addr, false); + return __arm_gen_branch(pc, addr, false, true); } static inline unsigned long -arm_gen_branch_link(unsigned long pc, unsigned long addr) +arm_gen_branch_link(unsigned long pc, unsigned long addr, bool warn) { - return __arm_gen_branch(pc, addr, true); + return __arm_gen_branch(pc, addr, true, warn); } #endif diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c index bda949fd84e8b..f2073cee41024 100644 --- a/arch/arm/kernel/ftrace.c +++ b/arch/arm/kernel/ftrace.c @@ -73,7 +73,7 @@ int ftrace_arch_code_modify_post_process(void) static unsigned long ftrace_call_replace(unsigned long pc, unsigned long addr) { - return arm_gen_branch_link(pc, addr); + return arm_gen_branch_link(pc, addr, true); } static int ftrace_modify_code(unsigned long pc, unsigned long old, diff --git a/arch/arm/kernel/insn.c b/arch/arm/kernel/insn.c index 2e844b70386b3..db0acbb7d7a02 100644 --- a/arch/arm/kernel/insn.c +++ b/arch/arm/kernel/insn.c @@ -3,8 +3,9 @@ #include #include -static unsigned long -__arm_gen_branch_thumb2(unsigned long pc, unsigned long addr, bool link) +static unsigned long __arm_gen_branch_thumb2(unsigned long pc, + unsigned long addr, bool link, + bool warn) { unsigned long s, j1, j2, i1, i2, imm10, imm11; unsigned long first, second; @@ -12,7 +13,7 @@ __arm_gen_branch_thumb2(unsigned long pc, unsigned long addr, bool link) offset = (long)addr - (long)(pc + 4); if (offset < -16777216 || offset > 16777214) { - WARN_ON_ONCE(1); + WARN_ON_ONCE(warn); return 0; } @@ -33,8 +34,8 @@ __arm_gen_branch_thumb2(unsigned long pc, unsigned long addr, bool link) return __opcode_thumb32_compose(first, second); } -static unsigned long -__arm_gen_branch_arm(unsigned long pc, unsigned long addr, bool link) +static unsigned long __arm_gen_branch_arm(unsigned long pc, unsigned long addr, + bool link, bool warn) { unsigned long opcode = 0xea000000; long offset; @@ -44,7 +45,7 @@ __arm_gen_branch_arm(unsigned long pc, unsigned long addr, bool link) offset = (long)addr - (long)(pc + 8); if (unlikely(offset < -33554432 || offset > 33554428)) { - WARN_ON_ONCE(1); + WARN_ON_ONCE(warn); return 0; } @@ -54,10 +55,10 @@ __arm_gen_branch_arm(unsigned long pc, unsigned long addr, bool link) } unsigned long -__arm_gen_branch(unsigned long pc, unsigned long addr, bool link) +__arm_gen_branch(unsigned long pc, unsigned long addr, bool link, bool warn) { if (IS_ENABLED(CONFIG_THUMB2_KERNEL)) - return __arm_gen_branch_thumb2(pc, addr, link); + return __arm_gen_branch_thumb2(pc, addr, link, warn); else - return __arm_gen_branch_arm(pc, addr, link); + return __arm_gen_branch_arm(pc, addr, link, warn); } From a09c7a3d413b6d6c13f429fe9bb6b30d2ae1c7a0 Mon Sep 17 00:00:00 2001 From: Alex Sverdlin Date: Wed, 22 Sep 2021 10:00:33 -0700 Subject: [PATCH 0905/5344] ARM: 9079/1: ftrace: Add MODULE_PLTS support commit 79f32b221b18c15a98507b101ef4beb52444cc6f upstream Teach ftrace_make_call() and ftrace_make_nop() about PLTs. Teach PLT code about FTRACE and all its callbacks. Otherwise the following might happen: ------------[ cut here ]------------ WARNING: CPU: 14 PID: 2265 at .../arch/arm/kernel/insn.c:14 __arm_gen_branch+0x83/0x8c() ... Hardware name: LSI Axxia AXM55XX [] (unwind_backtrace) from [] (show_stack+0x11/0x14) [] (show_stack) from [] (dump_stack+0x81/0xa8) [] (dump_stack) from [] (warn_slowpath_common+0x69/0x90) [] (warn_slowpath_common) from [] (warn_slowpath_null+0x17/0x1c) [] (warn_slowpath_null) from [] (__arm_gen_branch+0x83/0x8c) [] (__arm_gen_branch) from [] (ftrace_make_nop+0xf/0x24) [] (ftrace_make_nop) from [] (ftrace_process_locs+0x27b/0x3e8) [] (ftrace_process_locs) from [] (load_module+0x11e9/0x1a44) [] (load_module) from [] (SyS_finit_module+0x59/0x84) [] (SyS_finit_module) from [] (ret_fast_syscall+0x1/0x18) ---[ end trace e1b64ced7a89adcc ]--- ------------[ cut here ]------------ WARNING: CPU: 14 PID: 2265 at .../kernel/trace/ftrace.c:1979 ftrace_bug+0x1b1/0x234() ... Hardware name: LSI Axxia AXM55XX [] (unwind_backtrace) from [] (show_stack+0x11/0x14) [] (show_stack) from [] (dump_stack+0x81/0xa8) [] (dump_stack) from [] (warn_slowpath_common+0x69/0x90) [] (warn_slowpath_common) from [] (warn_slowpath_null+0x17/0x1c) [] (warn_slowpath_null) from [] (ftrace_bug+0x1b1/0x234) [] (ftrace_bug) from [] (ftrace_process_locs+0x285/0x3e8) [] (ftrace_process_locs) from [] (load_module+0x11e9/0x1a44) [] (load_module) from [] (SyS_finit_module+0x59/0x84) [] (SyS_finit_module) from [] (ret_fast_syscall+0x1/0x18) ---[ end trace e1b64ced7a89adcd ]--- ftrace failed to modify [] 0xe9ef7006 actual: 02:f0:3b:fa ftrace record flags: 0 (0) expected tramp: c0314265 Signed-off-by: Alexander Sverdlin Signed-off-by: Russell King Signed-off-by: Florian Fainelli Signed-off-by: Greg Kroah-Hartman --- arch/arm/include/asm/ftrace.h | 3 +++ arch/arm/include/asm/module.h | 1 + arch/arm/kernel/ftrace.c | 46 +++++++++++++++++++++++++++++------ arch/arm/kernel/module-plts.c | 44 ++++++++++++++++++++++++++++++--- 4 files changed, 82 insertions(+), 12 deletions(-) diff --git a/arch/arm/include/asm/ftrace.h b/arch/arm/include/asm/ftrace.h index 18b0197f23848..15bd9af13497f 100644 --- a/arch/arm/include/asm/ftrace.h +++ b/arch/arm/include/asm/ftrace.h @@ -16,6 +16,9 @@ extern void __gnu_mcount_nc(void); #ifdef CONFIG_DYNAMIC_FTRACE struct dyn_arch_ftrace { +#ifdef CONFIG_ARM_MODULE_PLTS + struct module *mod; +#endif }; static inline unsigned long ftrace_call_adjust(unsigned long addr) diff --git a/arch/arm/include/asm/module.h b/arch/arm/include/asm/module.h index 78e4c16446287..961fedbd810ec 100644 --- a/arch/arm/include/asm/module.h +++ b/arch/arm/include/asm/module.h @@ -30,6 +30,7 @@ struct plt_entries { struct mod_plt_sec { struct elf32_shdr *plt; + struct plt_entries *plt_ent; int plt_count; }; diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c index f2073cee41024..12b6da56f88dd 100644 --- a/arch/arm/kernel/ftrace.c +++ b/arch/arm/kernel/ftrace.c @@ -71,9 +71,10 @@ int ftrace_arch_code_modify_post_process(void) return 0; } -static unsigned long ftrace_call_replace(unsigned long pc, unsigned long addr) +static unsigned long ftrace_call_replace(unsigned long pc, unsigned long addr, + bool warn) { - return arm_gen_branch_link(pc, addr, true); + return arm_gen_branch_link(pc, addr, warn); } static int ftrace_modify_code(unsigned long pc, unsigned long old, @@ -112,14 +113,14 @@ int ftrace_update_ftrace_func(ftrace_func_t func) int ret; pc = (unsigned long)&ftrace_call; - new = ftrace_call_replace(pc, (unsigned long)func); + new = ftrace_call_replace(pc, (unsigned long)func, true); ret = ftrace_modify_code(pc, 0, new, false); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS if (!ret) { pc = (unsigned long)&ftrace_regs_call; - new = ftrace_call_replace(pc, (unsigned long)func); + new = ftrace_call_replace(pc, (unsigned long)func, true); ret = ftrace_modify_code(pc, 0, new, false); } @@ -132,10 +133,22 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { unsigned long new, old; unsigned long ip = rec->ip; + unsigned long aaddr = adjust_address(rec, addr); + struct module *mod = NULL; + +#ifdef CONFIG_ARM_MODULE_PLTS + mod = rec->arch.mod; +#endif old = ftrace_nop_replace(rec); - new = ftrace_call_replace(ip, adjust_address(rec, addr)); + new = ftrace_call_replace(ip, aaddr, !mod); +#ifdef CONFIG_ARM_MODULE_PLTS + if (!new && mod) { + aaddr = get_module_plt(mod, ip, aaddr); + new = ftrace_call_replace(ip, aaddr, true); + } +#endif return ftrace_modify_code(rec->ip, old, new, true); } @@ -148,9 +161,9 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, unsigned long new, old; unsigned long ip = rec->ip; - old = ftrace_call_replace(ip, adjust_address(rec, old_addr)); + old = ftrace_call_replace(ip, adjust_address(rec, old_addr), true); - new = ftrace_call_replace(ip, adjust_address(rec, addr)); + new = ftrace_call_replace(ip, adjust_address(rec, addr), true); return ftrace_modify_code(rec->ip, old, new, true); } @@ -160,12 +173,29 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { + unsigned long aaddr = adjust_address(rec, addr); unsigned long ip = rec->ip; unsigned long old; unsigned long new; int ret; - old = ftrace_call_replace(ip, adjust_address(rec, addr)); +#ifdef CONFIG_ARM_MODULE_PLTS + /* mod is only supplied during module loading */ + if (!mod) + mod = rec->arch.mod; + else + rec->arch.mod = mod; +#endif + + old = ftrace_call_replace(ip, aaddr, + !IS_ENABLED(CONFIG_ARM_MODULE_PLTS) || !mod); +#ifdef CONFIG_ARM_MODULE_PLTS + if (!old && mod) { + aaddr = get_module_plt(mod, ip, aaddr); + old = ftrace_call_replace(ip, aaddr, true); + } +#endif + new = ftrace_nop_replace(rec); ret = ftrace_modify_code(ip, old, new, true); diff --git a/arch/arm/kernel/module-plts.c b/arch/arm/kernel/module-plts.c index d1b76f3f3df98..1285d6770fdf8 100644 --- a/arch/arm/kernel/module-plts.c +++ b/arch/arm/kernel/module-plts.c @@ -4,6 +4,7 @@ */ #include +#include #include #include #include @@ -19,19 +20,52 @@ (PLT_ENT_STRIDE - 8)) #endif +static const u32 fixed_plts[] = { +#ifdef CONFIG_FUNCTION_TRACER + FTRACE_ADDR, + MCOUNT_ADDR, +#endif +}; + static bool in_init(const struct module *mod, unsigned long loc) { return loc - (u32)mod->init_layout.base < mod->init_layout.size; } +static void prealloc_fixed(struct mod_plt_sec *pltsec, struct plt_entries *plt) +{ + int i; + + if (!ARRAY_SIZE(fixed_plts) || pltsec->plt_count) + return; + pltsec->plt_count = ARRAY_SIZE(fixed_plts); + + for (i = 0; i < ARRAY_SIZE(plt->ldr); ++i) + plt->ldr[i] = PLT_ENT_LDR; + + BUILD_BUG_ON(sizeof(fixed_plts) > sizeof(plt->lit)); + memcpy(plt->lit, fixed_plts, sizeof(fixed_plts)); +} + u32 get_module_plt(struct module *mod, unsigned long loc, Elf32_Addr val) { struct mod_plt_sec *pltsec = !in_init(mod, loc) ? &mod->arch.core : &mod->arch.init; + struct plt_entries *plt; + int idx; + + /* cache the address, ELF header is available only during module load */ + if (!pltsec->plt_ent) + pltsec->plt_ent = (struct plt_entries *)pltsec->plt->sh_addr; + plt = pltsec->plt_ent; - struct plt_entries *plt = (struct plt_entries *)pltsec->plt->sh_addr; - int idx = 0; + prealloc_fixed(pltsec, plt); + + for (idx = 0; idx < ARRAY_SIZE(fixed_plts); ++idx) + if (plt->lit[idx] == val) + return (u32)&plt->ldr[idx]; + idx = 0; /* * Look for an existing entry pointing to 'val'. Given that the * relocations are sorted, this will be the last entry we allocated. @@ -179,8 +213,8 @@ static unsigned int count_plts(const Elf32_Sym *syms, Elf32_Addr base, int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, char *secstrings, struct module *mod) { - unsigned long core_plts = 0; - unsigned long init_plts = 0; + unsigned long core_plts = ARRAY_SIZE(fixed_plts); + unsigned long init_plts = ARRAY_SIZE(fixed_plts); Elf32_Shdr *s, *sechdrs_end = sechdrs + ehdr->e_shnum; Elf32_Sym *syms = NULL; @@ -235,6 +269,7 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, mod->arch.core.plt->sh_size = round_up(core_plts * PLT_ENT_SIZE, sizeof(struct plt_entries)); mod->arch.core.plt_count = 0; + mod->arch.core.plt_ent = NULL; mod->arch.init.plt->sh_type = SHT_NOBITS; mod->arch.init.plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC; @@ -242,6 +277,7 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, mod->arch.init.plt->sh_size = round_up(init_plts * PLT_ENT_SIZE, sizeof(struct plt_entries)); mod->arch.init.plt_count = 0; + mod->arch.init.plt_ent = NULL; pr_debug("%s: plt=%x, init.plt=%x\n", __func__, mod->arch.core.plt->sh_size, mod->arch.init.plt->sh_size); From 49067e314ab0f66d760c3a996f21a91ca9829a69 Mon Sep 17 00:00:00 2001 From: Alex Sverdlin Date: Wed, 22 Sep 2021 10:00:34 -0700 Subject: [PATCH 0906/5344] ARM: 9098/1: ftrace: MODULE_PLT: Fix build problem without DYNAMIC_FTRACE commit 6fa630bf473827aee48cbf0efbbdf6f03134e890 upstream FTRACE_ADDR is only defined when CONFIG_DYNAMIC_FTRACE is defined, the latter is even stronger requirement than CONFIG_FUNCTION_TRACER (which is enough for MCOUNT_ADDR). Link: https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org/thread/ZUVCQBHDMFVR7CCB7JPESLJEWERZDJ3T/ Fixes: 1f12fb25c5c5d22f ("ARM: 9079/1: ftrace: Add MODULE_PLTS support") Reported-by: kernel test robot Signed-off-by: Alexander Sverdlin Signed-off-by: Russell King Signed-off-by: Florian Fainelli Signed-off-by: Greg Kroah-Hartman --- arch/arm/kernel/module-plts.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/kernel/module-plts.c b/arch/arm/kernel/module-plts.c index 1285d6770fdf8..d1c2d3bd55b64 100644 --- a/arch/arm/kernel/module-plts.c +++ b/arch/arm/kernel/module-plts.c @@ -21,7 +21,7 @@ #endif static const u32 fixed_plts[] = { -#ifdef CONFIG_FUNCTION_TRACER +#ifdef CONFIG_DYNAMIC_FTRACE FTRACE_ADDR, MCOUNT_ADDR, #endif From f5a145ac2333dd330f9431489027529178ad9128 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Mon, 28 Jun 2021 16:13:43 -0300 Subject: [PATCH 0907/5344] sctp: validate chunk size in __rcv_asconf_lookup commit b6ffe7671b24689c09faa5675dd58f93758a97ae upstream. In one of the fallbacks that SCTP has for identifying an association for an incoming packet, it looks for AddIp chunk (from ASCONF) and take a peek. Thing is, at this stage nothing was validating that the chunk actually had enough content for that, allowing the peek to happen over uninitialized memory. Similar check already exists in actual asconf handling in sctp_verify_asconf(). Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sctp/input.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/sctp/input.c b/net/sctp/input.c index db4f917aafd90..2aca37717ed1e 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -1168,6 +1168,9 @@ static struct sctp_association *__sctp_rcv_asconf_lookup( union sctp_addr_param *param; union sctp_addr paddr; + if (ntohs(ch->length) < sizeof(*asconf) + sizeof(struct sctp_paramhdr)) + return NULL; + /* Skip over the ADDIP header and find the Address parameter */ param = (union sctp_addr_param *)(asconf + 1); From 017ef1d4e4ceb5c29315d4786cd426562285722d Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Mon, 28 Jun 2021 16:13:44 -0300 Subject: [PATCH 0908/5344] sctp: add param size validation for SCTP_PARAM_SET_PRIMARY commit ef6c8d6ccf0c1dccdda092ebe8782777cd7803c9 upstream. When SCTP handles an INIT chunk, it calls for example: sctp_sf_do_5_1B_init sctp_verify_init sctp_verify_param sctp_process_init sctp_process_param handling of SCTP_PARAM_SET_PRIMARY sctp_verify_init() wasn't doing proper size validation and neither the later handling, allowing it to work over the chunk itself, possibly being uninitialized memory. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sctp/sm_make_chunk.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 38ca7ce8a44ed..000aa62281f46 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -2157,9 +2157,16 @@ static enum sctp_ierror sctp_verify_param(struct net *net, break; case SCTP_PARAM_SET_PRIMARY: - if (ep->asconf_enable) - break; - goto unhandled; + if (!ep->asconf_enable) + goto unhandled; + + if (ntohs(param.p->length) < sizeof(struct sctp_addip_param) + + sizeof(struct sctp_paramhdr)) { + sctp_process_inv_paramlength(asoc, param.p, + chunk, err_chunk); + retval = SCTP_IERROR_ABORT; + } + break; case SCTP_PARAM_HOST_NAME_ADDRESS: /* Tell the peer, we won't support this param. */ From 1fb8f7f547d28b66d7b9f23251c9dd7361a13a3b Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Sat, 14 Aug 2021 16:56:26 -0700 Subject: [PATCH 0909/5344] staging: rtl8192u: Fix bitwise vs logical operator in TranslateRxSignalStuff819xUsb() commit 099ec97ac92911abfb102bb5c68ed270fc12e0dd upstream. clang warns: drivers/staging/rtl8192u/r8192U_core.c:4268:20: warning: bitwise and of boolean expressions; did you mean logical and? [-Wbool-operation-and] bpacket_toself = bpacket_match_bssid & ^~~~~~~~~~~~~~~~~~~~~ && 1 warning generated. Replace the bitwise AND with a logical one to clear up the warning, as that is clearly what was intended. Fixes: 8fc8598e61f6 ("Staging: Added Realtek rtl8192u driver to staging") Signed-off-by: Nathan Chancellor Link: https://lore.kernel.org/r/20210814235625.1780033-1-nathan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/staging/rtl8192u/r8192U_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/rtl8192u/r8192U_core.c b/drivers/staging/rtl8192u/r8192U_core.c index 66cd43f963c9a..e739d1979c877 100644 --- a/drivers/staging/rtl8192u/r8192U_core.c +++ b/drivers/staging/rtl8192u/r8192U_core.c @@ -4338,7 +4338,7 @@ static void TranslateRxSignalStuff819xUsb(struct sk_buff *skb, bpacket_match_bssid = (type != IEEE80211_FTYPE_CTL) && (ether_addr_equal(priv->ieee80211->current_network.bssid, (fc & IEEE80211_FCTL_TODS) ? hdr->addr1 : (fc & IEEE80211_FCTL_FROMDS) ? hdr->addr2 : hdr->addr3)) && (!pstats->bHwError) && (!pstats->bCRC) && (!pstats->bICV); - bpacket_toself = bpacket_match_bssid & + bpacket_toself = bpacket_match_bssid && (ether_addr_equal(praddr, priv->ieee80211->dev->dev_addr)); if (WLAN_FC_GET_FRAMETYPE(fc) == IEEE80211_STYPE_BEACON) From 168354f656052d4c1aabaf14d9b61f67cd75b04b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 25 Jun 2021 10:34:37 +0200 Subject: [PATCH 0910/5344] um: virtio_uml: fix memory leak on init failures commit 7ad28e0df7ee9dbcb793bb88dd81d4d22bb9a10e upstream. If initialization fails, e.g. because the connection failed, we leak the 'vu_dev'. Fix that. Reported by smatch. Fixes: 5d38f324993f ("um: drivers: Add virtio vhost-user driver") Signed-off-by: Johannes Berg Acked-By: Anton Ivanov Signed-off-by: Richard Weinberger Signed-off-by: Greg Kroah-Hartman --- arch/um/drivers/virtio_uml.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c index e77d1c2a92f3e..69d38839dffa6 100644 --- a/arch/um/drivers/virtio_uml.c +++ b/arch/um/drivers/virtio_uml.c @@ -994,7 +994,7 @@ static int virtio_uml_probe(struct platform_device *pdev) rc = os_connect_socket(pdata->socket_path); } while (rc == -EINTR); if (rc < 0) - return rc; + goto error_free; vu_dev->sock = rc; rc = vhost_user_init(vu_dev); @@ -1010,6 +1010,8 @@ static int virtio_uml_probe(struct platform_device *pdev) error_init: os_close_file(vu_dev->sock); +error_free: + kfree(vu_dev); return rc; } From 13fde7179ab8d70ee44cbe21a47c947afafb64e1 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 30 Jul 2021 23:27:15 +0300 Subject: [PATCH 0911/5344] dmaengine: acpi: Avoid comparison GSI with Linux vIRQ commit 67db87dc8284070adb15b3c02c1c31d5cf51c5d6 upstream. Currently the CRST parsing relies on the fact that on most of x86 devices the IRQ mapping is 1:1 with Linux vIRQ. However, it may be not true for some. Fix this by converting GSI to Linux vIRQ before checking it. Fixes: ee8209fd026b ("dma: acpi-dma: parse CSRT to extract additional resources") Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210730202715.24375-1-andriy.shevchenko@linux.intel.com Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/acpi-dma.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/dma/acpi-dma.c b/drivers/dma/acpi-dma.c index dcbcb712de6e8..731f453ecb51f 100644 --- a/drivers/dma/acpi-dma.c +++ b/drivers/dma/acpi-dma.c @@ -70,10 +70,14 @@ static int acpi_dma_parse_resource_group(const struct acpi_csrt_group *grp, si = (const struct acpi_csrt_shared_info *)&grp[1]; - /* Match device by MMIO and IRQ */ + /* Match device by MMIO */ if (si->mmio_base_low != lower_32_bits(mem) || - si->mmio_base_high != upper_32_bits(mem) || - si->gsi_interrupt != irq) + si->mmio_base_high != upper_32_bits(mem)) + return 0; + + /* Match device by Linux vIRQ */ + ret = acpi_register_gsi(NULL, si->gsi_interrupt, si->interrupt_mode, si->interrupt_polarity); + if (ret != irq) return 0; dev_dbg(&adev->dev, "matches with %.4s%04X (rev %u)\n", From ceafb3ed6fa27f1157710fdb051645f7bc83556f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 10 Aug 2021 11:44:13 +0300 Subject: [PATCH 0912/5344] thermal/drivers/exynos: Fix an error code in exynos_tmu_probe() commit 02d438f62c05f0d055ceeedf12a2f8796b258c08 upstream. This error path return success but it should propagate the negative error code from devm_clk_get(). Fixes: 6c247393cfdd ("thermal: exynos: Add TMU support for Exynos7 SoC") Signed-off-by: Dan Carpenter Reviewed-by: Krzysztof Kozlowski Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210810084413.GA23810@kili Signed-off-by: Greg Kroah-Hartman --- drivers/thermal/samsung/exynos_tmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/thermal/samsung/exynos_tmu.c b/drivers/thermal/samsung/exynos_tmu.c index fb2c55123a99e..059e3d1610c98 100644 --- a/drivers/thermal/samsung/exynos_tmu.c +++ b/drivers/thermal/samsung/exynos_tmu.c @@ -1070,6 +1070,7 @@ static int exynos_tmu_probe(struct platform_device *pdev) data->sclk = devm_clk_get(&pdev->dev, "tmu_sclk"); if (IS_ERR(data->sclk)) { dev_err(&pdev->dev, "Failed to get sclk\n"); + ret = PTR_ERR(data->sclk); goto err_clk; } else { ret = clk_prepare_enable(data->sclk); From 7535725e592f51a815a0fefcdb6ee31474bbbea5 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Mon, 17 May 2021 16:35:57 +0800 Subject: [PATCH 0913/5344] 9p/trans_virtio: Remove sysfs file on probe failure commit f997ea3b7afc108eb9761f321b57de2d089c7c48 upstream. This ensures we don't leak the sysfs file if we failed to allocate chan->vc_wq during probe. Link: http://lkml.kernel.org/r/20210517083557.172-1-xieyongji@bytedance.com Fixes: 86c8437383ac ("net/9p: Add sysfs mount_tag file for virtio 9P device") Signed-off-by: Xie Yongji Signed-off-by: Dominique Martinet Signed-off-by: Greg Kroah-Hartman --- net/9p/trans_virtio.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index a3cd90a74012b..f582351d84ecb 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -605,7 +605,7 @@ static int p9_virtio_probe(struct virtio_device *vdev) chan->vc_wq = kmalloc(sizeof(wait_queue_head_t), GFP_KERNEL); if (!chan->vc_wq) { err = -ENOMEM; - goto out_free_tag; + goto out_remove_file; } init_waitqueue_head(chan->vc_wq); chan->ring_bufs_avail = 1; @@ -623,6 +623,8 @@ static int p9_virtio_probe(struct virtio_device *vdev) return 0; +out_remove_file: + sysfs_remove_file(&vdev->dev.kobj, &dev_attr_mount_tag.attr); out_free_tag: kfree(tag); out_free_vq: From dc2d79ae808867e1dca6fa6069b0043a85ebaaca Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 7 Sep 2021 20:00:41 -0700 Subject: [PATCH 0914/5344] prctl: allow to setup brk for et_dyn executables commit e1fbbd073137a9d63279f6bf363151a938347640 upstream. Keno Fischer reported that when a binray loaded via ld-linux-x the prctl(PR_SET_MM_MAP) doesn't allow to setup brk value because it lays before mm:end_data. For example a test program shows | # ~/t | | start_code 401000 | end_code 401a15 | start_stack 7ffce4577dd0 | start_data 403e10 | end_data 40408c | start_brk b5b000 | sbrk(0) b5b000 and when executed via ld-linux | # /lib64/ld-linux-x86-64.so.2 ~/t | | start_code 7fc25b0a4000 | end_code 7fc25b0c4524 | start_stack 7fffcc6b2400 | start_data 7fc25b0ce4c0 | end_data 7fc25b0cff98 | start_brk 55555710c000 | sbrk(0) 55555710c000 This of course prevent criu from restoring such programs. Looking into how kernel operates with brk/start_brk inside brk() syscall I don't see any problem if we allow to setup brk/start_brk without checking for end_data. Even if someone pass some weird address here on a purpose then the worst possible result will be an unexpected unmapping of existing vma (own vma, since prctl works with the callers memory) but test for RLIMIT_DATA is still valid and a user won't be able to gain more memory in case of expanding VMAs via new values shipped with prctl call. Link: https://lkml.kernel.org/r/20210121221207.GB2174@grain Fixes: bbdc6076d2e5 ("binfmt_elf: move brk out of mmap when doing direct loader exec") Signed-off-by: Cyrill Gorcunov Reported-by: Keno Fischer Acked-by: Andrey Vagin Tested-by: Andrey Vagin Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Kirill Tkhai Cc: Eric W. Biederman Cc: Pavel Tikhomirov Cc: Alexander Mikhalitsyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/sys.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/kernel/sys.c b/kernel/sys.c index 4fde189706deb..ae54afd442ad1 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1927,13 +1927,6 @@ static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map) error = -EINVAL; - /* - * @brk should be after @end_data in traditional maps. - */ - if (prctl_map->start_brk <= prctl_map->end_data || - prctl_map->brk <= prctl_map->end_data) - goto out; - /* * Neither we should allow to override limits if they set. */ From de9607ac093bdcd4600a723410a0946df5aa4599 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Tue, 7 Sep 2021 20:00:26 -0700 Subject: [PATCH 0915/5344] nilfs2: use refcount_dec_and_lock() to fix potential UAF commit 98e2e409e76ef7781d8511f997359e9c504a95c1 upstream. When the refcount is decreased to 0, the resource reclamation branch is entered. Before CPU0 reaches the race point (1), CPU1 may obtain the spinlock and traverse the rbtree to find 'root', see nilfs_lookup_root(). Although CPU1 will call refcount_inc() to increase the refcount, it is obviously too late. CPU0 will release 'root' directly, CPU1 then accesses 'root' and triggers UAF. Use refcount_dec_and_lock() to ensure that both the operations of decrease refcount to 0 and link deletion are lock protected eliminates this risk. CPU0 CPU1 nilfs_put_root(): <-------- (1) spin_lock(&nilfs->ns_cptree_lock); rb_erase(&root->rb_node, &nilfs->ns_cptree); spin_unlock(&nilfs->ns_cptree_lock); kfree(root); <-------- use-after-free refcount_t: underflow; use-after-free. WARNING: CPU: 2 PID: 9476 at lib/refcount.c:28 \ refcount_warn_saturate+0x1cf/0x210 lib/refcount.c:28 Modules linked in: CPU: 2 PID: 9476 Comm: syz-executor.0 Not tainted 5.10.45-rc1+ #3 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), ... RIP: 0010:refcount_warn_saturate+0x1cf/0x210 lib/refcount.c:28 ... ... Call Trace: __refcount_sub_and_test include/linux/refcount.h:283 [inline] __refcount_dec_and_test include/linux/refcount.h:315 [inline] refcount_dec_and_test include/linux/refcount.h:333 [inline] nilfs_put_root+0xc1/0xd0 fs/nilfs2/the_nilfs.c:795 nilfs_segctor_destroy fs/nilfs2/segment.c:2749 [inline] nilfs_detach_log_writer+0x3fa/0x570 fs/nilfs2/segment.c:2812 nilfs_put_super+0x2f/0xf0 fs/nilfs2/super.c:467 generic_shutdown_super+0xcd/0x1f0 fs/super.c:464 kill_block_super+0x4a/0x90 fs/super.c:1446 deactivate_locked_super+0x6a/0xb0 fs/super.c:335 deactivate_super+0x85/0x90 fs/super.c:366 cleanup_mnt+0x277/0x2e0 fs/namespace.c:1118 __cleanup_mnt+0x15/0x20 fs/namespace.c:1125 task_work_run+0x8e/0x110 kernel/task_work.c:151 tracehook_notify_resume include/linux/tracehook.h:188 [inline] exit_to_user_mode_loop kernel/entry/common.c:164 [inline] exit_to_user_mode_prepare+0x13c/0x170 kernel/entry/common.c:191 syscall_exit_to_user_mode+0x16/0x30 kernel/entry/common.c:266 do_syscall_64+0x45/0x80 arch/x86/entry/common.c:56 entry_SYSCALL_64_after_hwframe+0x44/0xa9 There is no reproduction program, and the above is only theoretical analysis. Link: https://lkml.kernel.org/r/1629859428-5906-1-git-send-email-konishi.ryusuke@gmail.com Fixes: ba65ae4729bf ("nilfs2: add checkpoint tree to nilfs object") Link: https://lkml.kernel.org/r/20210723012317.4146-1-thunder.leizhen@huawei.com Signed-off-by: Zhen Lei Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/nilfs2/the_nilfs.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 484785cdf96e2..931870768556c 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -797,14 +797,13 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno) void nilfs_put_root(struct nilfs_root *root) { - if (refcount_dec_and_test(&root->count)) { - struct the_nilfs *nilfs = root->nilfs; + struct the_nilfs *nilfs = root->nilfs; - nilfs_sysfs_delete_snapshot_group(root); - - spin_lock(&nilfs->ns_cptree_lock); + if (refcount_dec_and_lock(&root->count, &nilfs->ns_cptree_lock)) { rb_erase(&root->rb_node, &nilfs->ns_cptree); spin_unlock(&nilfs->ns_cptree_lock); + + nilfs_sysfs_delete_snapshot_group(root); iput(root->ifile); kfree(root); From 3763e623a76d5f4d541c5b6ffa868637b9e25238 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Tue, 7 Sep 2021 19:58:21 -0700 Subject: [PATCH 0916/5344] profiling: fix shift-out-of-bounds bugs commit 2d186afd04d669fe9c48b994c41a7405a3c9f16d upstream. Syzbot reported shift-out-of-bounds bug in profile_init(). The problem was in incorrect prof_shift. Since prof_shift value comes from userspace we need to clamp this value into [0, BITS_PER_LONG -1] boundaries. Second possible shiht-out-of-bounds was found by Tetsuo: sample_step local variable in read_profile() had "unsigned int" type, but prof_shift allows to make a BITS_PER_LONG shift. So, to prevent possible shiht-out-of-bounds sample_step type was changed to "unsigned long". Also, "unsigned short int" will be sufficient for storing [0, BITS_PER_LONG] value, that's why there is no need for "unsigned long" prof_shift. Link: https://lkml.kernel.org/r/20210813140022.5011-1-paskripkin@gmail.com Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-and-tested-by: syzbot+e68c89a9510c159d9684@syzkaller.appspotmail.com Suggested-by: Tetsuo Handa Signed-off-by: Pavel Skripkin Cc: Thomas Gleixner Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/profile.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/kernel/profile.c b/kernel/profile.c index af7c94bf5fa1d..e97e42aaf2023 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -41,7 +41,8 @@ struct profile_hit { #define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ) static atomic_t *prof_buffer; -static unsigned long prof_len, prof_shift; +static unsigned long prof_len; +static unsigned short int prof_shift; int prof_on __read_mostly; EXPORT_SYMBOL_GPL(prof_on); @@ -67,8 +68,8 @@ int profile_setup(char *str) if (str[strlen(sleepstr)] == ',') str += strlen(sleepstr) + 1; if (get_option(&str, &par)) - prof_shift = par; - pr_info("kernel sleep profiling enabled (shift: %ld)\n", + prof_shift = clamp(par, 0, BITS_PER_LONG - 1); + pr_info("kernel sleep profiling enabled (shift: %u)\n", prof_shift); #else pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n"); @@ -78,21 +79,21 @@ int profile_setup(char *str) if (str[strlen(schedstr)] == ',') str += strlen(schedstr) + 1; if (get_option(&str, &par)) - prof_shift = par; - pr_info("kernel schedule profiling enabled (shift: %ld)\n", + prof_shift = clamp(par, 0, BITS_PER_LONG - 1); + pr_info("kernel schedule profiling enabled (shift: %u)\n", prof_shift); } else if (!strncmp(str, kvmstr, strlen(kvmstr))) { prof_on = KVM_PROFILING; if (str[strlen(kvmstr)] == ',') str += strlen(kvmstr) + 1; if (get_option(&str, &par)) - prof_shift = par; - pr_info("kernel KVM profiling enabled (shift: %ld)\n", + prof_shift = clamp(par, 0, BITS_PER_LONG - 1); + pr_info("kernel KVM profiling enabled (shift: %u)\n", prof_shift); } else if (get_option(&str, &par)) { - prof_shift = par; + prof_shift = clamp(par, 0, BITS_PER_LONG - 1); prof_on = CPU_PROFILING; - pr_info("kernel profiling enabled (shift: %ld)\n", + pr_info("kernel profiling enabled (shift: %u)\n", prof_shift); } return 1; @@ -468,7 +469,7 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) unsigned long p = *ppos; ssize_t read; char *pnt; - unsigned int sample_step = 1 << prof_shift; + unsigned long sample_step = 1UL << prof_shift; profile_flip_buffers(); if (p >= (prof_len+1)*sizeof(unsigned int)) From 3d0f744e633206b20c723e3747f544a6e62c235a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 7 Jul 2021 18:27:49 +0200 Subject: [PATCH 0917/5344] pwm: lpc32xx: Don't modify HW state in .probe() after the PWM chip was registered MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 3d2813fb17e5fd0d73c1d1442ca0192bde4af10e upstream. This fixes a race condition: After pwmchip_add() is called there might already be a consumer and then modifying the hardware behind the consumer's back is bad. So set the default before. (Side-note: I don't know what this register setting actually does, if this modifies the polarity there is an inconsistency because the inversed polarity isn't considered if the PWM is already running during .probe().) Fixes: acfd92fdfb93 ("pwm: lpc32xx: Set PWM_PIN_LEVEL bit to default value") Cc: Sylvain Lemieux Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding Signed-off-by: Greg Kroah-Hartman --- drivers/pwm/pwm-lpc32xx.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/pwm/pwm-lpc32xx.c b/drivers/pwm/pwm-lpc32xx.c index 710d9a207d2b0..522f862eca526 100644 --- a/drivers/pwm/pwm-lpc32xx.c +++ b/drivers/pwm/pwm-lpc32xx.c @@ -120,17 +120,17 @@ static int lpc32xx_pwm_probe(struct platform_device *pdev) lpc32xx->chip.npwm = 1; lpc32xx->chip.base = -1; + /* If PWM is disabled, configure the output to the default value */ + val = readl(lpc32xx->base + (lpc32xx->chip.pwms[0].hwpwm << 2)); + val &= ~PWM_PIN_LEVEL; + writel(val, lpc32xx->base + (lpc32xx->chip.pwms[0].hwpwm << 2)); + ret = pwmchip_add(&lpc32xx->chip); if (ret < 0) { dev_err(&pdev->dev, "failed to add PWM chip, error %d\n", ret); return ret; } - /* When PWM is disable, configure the output to the default value */ - val = readl(lpc32xx->base + (lpc32xx->chip.pwms[0].hwpwm << 2)); - val &= ~PWM_PIN_LEVEL; - writel(val, lpc32xx->base + (lpc32xx->chip.pwms[0].hwpwm << 2)); - platform_set_drvdata(pdev, lpc32xx); return 0; From 75a1d8ea22cb9ff20cb6365e9e89980ae3fc335b Mon Sep 17 00:00:00 2001 From: Petr Oros Date: Tue, 18 Feb 2020 10:35:55 +0100 Subject: [PATCH 0918/5344] phy: avoid unnecessary link-up delay in polling mode commit e96bd2d3b1f83170d1d5c1a99e439b39a22a5b58 upstream. commit 93c0970493c71f ("net: phy: consider latched link-down status in polling mode") removed double-read of latched link-state register for polling mode from genphy_update_link(). This added extra ~1s delay into sequence link down->up. Following scenario: - After boot link goes up - phy_start() is called triggering an aneg restart, hence link goes down and link-down info is latched. - After aneg has finished link goes up. In phy_state_machine is checked link state but it is latched "link is down". The state machine is scheduled after one second and there is detected "link is up". This extra delay can be avoided when we keep link-state register double read in case when link was down previously. With this solution we don't miss a link-down event in polling mode and link-up is faster. Details about this quirky behavior on Realtek phy: Without patch: T0: aneg is started, link goes down, link-down status is latched T0+3s: state machine runs, up-to-date link-down is read T0+4s: state machine runs, aneg is finished (BMSR_ANEGCOMPLETE==1), here i read link-down (BMSR_LSTATUS==0), T0+5s: state machine runs, aneg is finished (BMSR_ANEGCOMPLETE==1), up-to-date link-up is read (BMSR_LSTATUS==1), phydev->link goes up, state change PHY_NOLINK to PHY_RUNNING With patch: T0: aneg is started, link goes down, link-down status is latched T0+3s: state machine runs, up-to-date link-down is read T0+4s: state machine runs, aneg is finished (BMSR_ANEGCOMPLETE==1), first BMSR read: BMSR_ANEGCOMPLETE==1 and BMSR_LSTATUS==0, second BMSR read: BMSR_ANEGCOMPLETE==1 and BMSR_LSTATUS==1, phydev->link goes up, state change PHY_NOLINK to PHY_RUNNING Signed-off-by: Petr Oros Reviewed-by: Heiner Kallweit Signed-off-by: David S. Miller Cc: Macpaul Lin Signed-off-by: Greg Kroah-Hartman --- drivers/net/phy/phy-c45.c | 5 +++-- drivers/net/phy/phy_device.c | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c index a1caeee122361..bceb0dcdecbd6 100644 --- a/drivers/net/phy/phy-c45.c +++ b/drivers/net/phy/phy-c45.c @@ -239,9 +239,10 @@ int genphy_c45_read_link(struct phy_device *phydev) /* The link state is latched low so that momentary link * drops can be detected. Do not double-read the status - * in polling mode to detect such short link drops. + * in polling mode to detect such short link drops except + * the link was already down. */ - if (!phy_polling_mode(phydev)) { + if (!phy_polling_mode(phydev) || !phydev->link) { val = phy_read_mmd(phydev, devad, MDIO_STAT1); if (val < 0) return val; diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 9d0a306f05623..35ade5d21de51 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1766,9 +1766,10 @@ int genphy_update_link(struct phy_device *phydev) /* The link state is latched low so that momentary link * drops can be detected. Do not double-read the status - * in polling mode to detect such short link drops. + * in polling mode to detect such short link drops except + * the link was already down. */ - if (!phy_polling_mode(phydev)) { + if (!phy_polling_mode(phydev) || !phydev->link) { status = phy_read(phydev, MII_BMSR); if (status < 0) return status; From 295b89abe18630fc568d507e15662d298e3f1321 Mon Sep 17 00:00:00 2001 From: Jongsung Kim Date: Fri, 6 Dec 2019 20:40:00 +0900 Subject: [PATCH 0919/5344] net: stmmac: reset Tx desc base address before restarting Tx commit f421031e3ff0dd288a6e1bbde9aa41a25bb814e6 upstream. Refer to the databook of DesignWare Cores Ethernet MAC Universal: 6.2.1.5 Register 4 (Transmit Descriptor List Address Register If this register is not changed when the ST bit is set to 0, then the DMA takes the descriptor address where it was stopped earlier. The stmmac_tx_err() does zero indices to Tx descriptors, but does not reset HW current Tx descriptor address. To fix inconsistency, the base address of the Tx descriptors should be rewritten before restarting Tx. Signed-off-by: Jongsung Kim Signed-off-by: David S. Miller Cc: Macpaul Lin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 10d28be73f456..4e7cfd3bfcd2e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1987,6 +1987,8 @@ static void stmmac_tx_err(struct stmmac_priv *priv, u32 chan) tx_q->cur_tx = 0; tx_q->mss = 0; netdev_tx_reset_queue(netdev_get_tx_queue(priv->dev, chan)); + stmmac_init_tx_chan(priv, priv->ioaddr, priv->plat->dma_cfg, + tx_q->dma_tx_phy, chan); stmmac_start_tx_dma(priv, chan); priv->dev->stats.tx_errors++; From 055d8fb20dc796abafceebd5bb8a5796815b31ef Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Tue, 7 Sep 2021 20:00:47 -0700 Subject: [PATCH 0920/5344] Kconfig.debug: drop selecting non-existing HARDLOCKUP_DETECTOR_ARCH [ Upstream commit 6fe26259b4884b657cbc233fb9cdade9d704976e ] Commit 05a4a9527931 ("kernel/watchdog: split up config options") adds a new config HARDLOCKUP_DETECTOR, which selects the non-existing config HARDLOCKUP_DETECTOR_ARCH. Hence, ./scripts/checkkconfigsymbols.py warns: HARDLOCKUP_DETECTOR_ARCH Referencing files: lib/Kconfig.debug Simply drop selecting the non-existing HARDLOCKUP_DETECTOR_ARCH. Link: https://lkml.kernel.org/r/20210806115618.22088-1-lukas.bulwahn@gmail.com Fixes: 05a4a9527931 ("kernel/watchdog: split up config options") Signed-off-by: Lukas Bulwahn Cc: Nicholas Piggin Cc: Masahiro Yamada Cc: Babu Moger Cc: Don Zickus Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- lib/Kconfig.debug | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index ea6628a9d76bd..0eb69113b411f 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -868,7 +868,6 @@ config HARDLOCKUP_DETECTOR depends on HAVE_HARDLOCKUP_DETECTOR_PERF || HAVE_HARDLOCKUP_DETECTOR_ARCH select LOCKUP_DETECTOR select HARDLOCKUP_DETECTOR_PERF if HAVE_HARDLOCKUP_DETECTOR_PERF - select HARDLOCKUP_DETECTOR_ARCH if HAVE_HARDLOCKUP_DETECTOR_ARCH help Say Y here to enable the kernel to act as a watchdog to detect hard lockups. From 9e26edd1c4a28e51639e09b6724aea18226e3e68 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 22 Jul 2021 11:06:44 +0200 Subject: [PATCH 0921/5344] thermal/core: Fix thermal_cooling_device_register() prototype [ Upstream commit fb83610762dd5927212aa62a468dd3b756b57a88 ] There are two pairs of declarations for thermal_cooling_device_register() and thermal_of_cooling_device_register(), and only one set was changed in a recent patch, so the other one now causes a compile-time warning: drivers/net/wireless/mediatek/mt76/mt7915/init.c: In function 'mt7915_thermal_init': drivers/net/wireless/mediatek/mt76/mt7915/init.c:134:48: error: passing argument 1 of 'thermal_cooling_device_register' discards 'const' qualifier from pointer target type [-Werror=discarded-qualifiers] 134 | cdev = thermal_cooling_device_register(wiphy_name(wiphy), phy, | ^~~~~~~~~~~~~~~~~ In file included from drivers/net/wireless/mediatek/mt76/mt7915/init.c:7: include/linux/thermal.h:407:39: note: expected 'char *' but argument is of type 'const char *' 407 | thermal_cooling_device_register(char *type, void *devdata, | ~~~~~~^~~~ Change the dummy helper functions to have the same arguments as the normal version. Fixes: f991de53a8ab ("thermal: make device_register's type argument const") Signed-off-by: Arnd Bergmann Reviewed-by: Jean-Francois Dagenais Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210722090717.1116748-1-arnd@kernel.org Signed-off-by: Sasha Levin --- include/linux/thermal.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/thermal.h b/include/linux/thermal.h index e45659c759209..a41378bdf27c7 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -501,12 +501,13 @@ static inline void thermal_zone_device_update(struct thermal_zone_device *tz, static inline void thermal_zone_set_trips(struct thermal_zone_device *tz) { } static inline struct thermal_cooling_device * -thermal_cooling_device_register(char *type, void *devdata, +thermal_cooling_device_register(const char *type, void *devdata, const struct thermal_cooling_device_ops *ops) { return ERR_PTR(-ENODEV); } static inline struct thermal_cooling_device * thermal_of_cooling_device_register(struct device_node *np, - char *type, void *devdata, const struct thermal_cooling_device_ops *ops) + const char *type, void *devdata, + const struct thermal_cooling_device_ops *ops) { return ERR_PTR(-ENODEV); } static inline struct thermal_cooling_device * devm_thermal_of_cooling_device_register(struct device *dev, From 05da4131da1833f9f22d7c5b03f710c4feffff95 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 31 Aug 2021 13:48:34 +0200 Subject: [PATCH 0922/5344] drivers: base: cacheinfo: Get rid of DEFINE_SMP_CALL_CACHE_FUNCTION() [ Upstream commit 4b92d4add5f6dcf21275185c997d6ecb800054cd ] DEFINE_SMP_CALL_CACHE_FUNCTION() was usefel before the CPU hotplug rework to ensure that the cache related functions are called on the upcoming CPU because the notifier itself could run on any online CPU. The hotplug state machine guarantees that the callbacks are invoked on the upcoming CPU. So there is no need to have this SMP function call obfuscation. That indirection was missed when the hotplug notifiers were converted. This also solves the problem of ARM64 init_cache_level() invoking ACPI functions which take a semaphore in that context. That's invalid as SMP function calls run with interrupts disabled. Running it just from the callback in context of the CPU hotplug thread solves this. Fixes: 8571890e1513 ("arm64: Add support for ACPI based firmware tables") Reported-by: Guenter Roeck Signed-off-by: Thomas Gleixner Tested-by: Guenter Roeck Acked-by: Will Deacon Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/871r69ersb.ffs@tglx Signed-off-by: Sasha Levin --- arch/arm64/kernel/cacheinfo.c | 7 ++----- arch/mips/kernel/cacheinfo.c | 7 ++----- arch/riscv/kernel/cacheinfo.c | 7 ++----- arch/x86/kernel/cpu/cacheinfo.c | 7 ++----- include/linux/cacheinfo.h | 18 ------------------ 5 files changed, 8 insertions(+), 38 deletions(-) diff --git a/arch/arm64/kernel/cacheinfo.c b/arch/arm64/kernel/cacheinfo.c index 7fa6828bb488a..587543c6c51cb 100644 --- a/arch/arm64/kernel/cacheinfo.c +++ b/arch/arm64/kernel/cacheinfo.c @@ -43,7 +43,7 @@ static void ci_leaf_init(struct cacheinfo *this_leaf, this_leaf->type = type; } -static int __init_cache_level(unsigned int cpu) +int init_cache_level(unsigned int cpu) { unsigned int ctype, level, leaves, fw_level; struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); @@ -78,7 +78,7 @@ static int __init_cache_level(unsigned int cpu) return 0; } -static int __populate_cache_leaves(unsigned int cpu) +int populate_cache_leaves(unsigned int cpu) { unsigned int level, idx; enum cache_type type; @@ -97,6 +97,3 @@ static int __populate_cache_leaves(unsigned int cpu) } return 0; } - -DEFINE_SMP_CALL_CACHE_FUNCTION(init_cache_level) -DEFINE_SMP_CALL_CACHE_FUNCTION(populate_cache_leaves) diff --git a/arch/mips/kernel/cacheinfo.c b/arch/mips/kernel/cacheinfo.c index 47312c5294102..529dab855aac9 100644 --- a/arch/mips/kernel/cacheinfo.c +++ b/arch/mips/kernel/cacheinfo.c @@ -17,7 +17,7 @@ do { \ leaf++; \ } while (0) -static int __init_cache_level(unsigned int cpu) +int init_cache_level(unsigned int cpu) { struct cpuinfo_mips *c = ¤t_cpu_data; struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); @@ -69,7 +69,7 @@ static void fill_cpumask_cluster(int cpu, cpumask_t *cpu_map) cpumask_set_cpu(cpu1, cpu_map); } -static int __populate_cache_leaves(unsigned int cpu) +int populate_cache_leaves(unsigned int cpu) { struct cpuinfo_mips *c = ¤t_cpu_data; struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); @@ -98,6 +98,3 @@ static int __populate_cache_leaves(unsigned int cpu) return 0; } - -DEFINE_SMP_CALL_CACHE_FUNCTION(init_cache_level) -DEFINE_SMP_CALL_CACHE_FUNCTION(populate_cache_leaves) diff --git a/arch/riscv/kernel/cacheinfo.c b/arch/riscv/kernel/cacheinfo.c index 4c90c07d8c39d..d930bd073b7b2 100644 --- a/arch/riscv/kernel/cacheinfo.c +++ b/arch/riscv/kernel/cacheinfo.c @@ -16,7 +16,7 @@ static void ci_leaf_init(struct cacheinfo *this_leaf, this_leaf->type = type; } -static int __init_cache_level(unsigned int cpu) +int init_cache_level(unsigned int cpu) { struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); struct device_node *np = of_cpu_device_node_get(cpu); @@ -58,7 +58,7 @@ static int __init_cache_level(unsigned int cpu) return 0; } -static int __populate_cache_leaves(unsigned int cpu) +int populate_cache_leaves(unsigned int cpu) { struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); struct cacheinfo *this_leaf = this_cpu_ci->info_list; @@ -95,6 +95,3 @@ static int __populate_cache_leaves(unsigned int cpu) return 0; } - -DEFINE_SMP_CALL_CACHE_FUNCTION(init_cache_level) -DEFINE_SMP_CALL_CACHE_FUNCTION(populate_cache_leaves) diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index a9a08c7eadf39..adf41dcca0746 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -985,7 +985,7 @@ static void ci_leaf_init(struct cacheinfo *this_leaf, this_leaf->priv = base->nb; } -static int __init_cache_level(unsigned int cpu) +int init_cache_level(unsigned int cpu) { struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); @@ -1014,7 +1014,7 @@ static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs) id4_regs->id = c->apicid >> index_msb; } -static int __populate_cache_leaves(unsigned int cpu) +int populate_cache_leaves(unsigned int cpu) { unsigned int idx, ret; struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); @@ -1033,6 +1033,3 @@ static int __populate_cache_leaves(unsigned int cpu) return 0; } - -DEFINE_SMP_CALL_CACHE_FUNCTION(init_cache_level) -DEFINE_SMP_CALL_CACHE_FUNCTION(populate_cache_leaves) diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index 46b92cd61d0c8..c8c71eea237d6 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -78,24 +78,6 @@ struct cpu_cacheinfo { bool cpu_map_populated; }; -/* - * Helpers to make sure "func" is executed on the cpu whose cache - * attributes are being detected - */ -#define DEFINE_SMP_CALL_CACHE_FUNCTION(func) \ -static inline void _##func(void *ret) \ -{ \ - int cpu = smp_processor_id(); \ - *(int *)ret = __##func(cpu); \ -} \ - \ -int func(unsigned int cpu) \ -{ \ - int ret; \ - smp_call_function_single(cpu, _##func, &ret, true); \ - return ret; \ -} - struct cpu_cacheinfo *get_cpu_cacheinfo(unsigned int cpu); int init_cache_level(unsigned int cpu); int populate_cache_leaves(unsigned int cpu); From 816f9c433f1ffa8c18b5db2b3f782a84e545278a Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Wed, 8 Sep 2021 08:30:41 -0700 Subject: [PATCH 0923/5344] parisc: Move pci_dev_is_behind_card_dino to where it is used [ Upstream commit 907872baa9f1538eed02ec737b8e89eba6c6e4b9 ] parisc build test images fail to compile with the following error. drivers/parisc/dino.c:160:12: error: 'pci_dev_is_behind_card_dino' defined but not used Move the function just ahead of its only caller to avoid the error. Fixes: 5fa1659105fa ("parisc: Disable HP HSC-PCI Cards to prevent kernel crash") Cc: Helge Deller Signed-off-by: Guenter Roeck Signed-off-by: Helge Deller Signed-off-by: Sasha Levin --- drivers/parisc/dino.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/parisc/dino.c b/drivers/parisc/dino.c index 2f1cac89ddf5c..544287e9f449b 100644 --- a/drivers/parisc/dino.c +++ b/drivers/parisc/dino.c @@ -156,15 +156,6 @@ static inline struct dino_device *DINO_DEV(struct pci_hba_data *hba) return container_of(hba, struct dino_device, hba); } -/* Check if PCI device is behind a Card-mode Dino. */ -static int pci_dev_is_behind_card_dino(struct pci_dev *dev) -{ - struct dino_device *dino_dev; - - dino_dev = DINO_DEV(parisc_walk_tree(dev->bus->bridge)); - return is_card_dino(&dino_dev->hba.dev->id); -} - /* * Dino Configuration Space Accessor Functions */ @@ -447,6 +438,15 @@ static void quirk_cirrus_cardbus(struct pci_dev *dev) DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_CIRRUS, PCI_DEVICE_ID_CIRRUS_6832, quirk_cirrus_cardbus ); #ifdef CONFIG_TULIP +/* Check if PCI device is behind a Card-mode Dino. */ +static int pci_dev_is_behind_card_dino(struct pci_dev *dev) +{ + struct dino_device *dino_dev; + + dino_dev = DINO_DEV(parisc_walk_tree(dev->bus->bridge)); + return is_card_dino(&dino_dev->hba.dev->id); +} + static void pci_fixup_tulip(struct pci_dev *dev) { if (!pci_dev_is_behind_card_dino(dev)) From 06a2fb23f746f2c907c0b4eb35434c8de43a34bc Mon Sep 17 00:00:00 2001 From: Zou Wei Date: Tue, 4 May 2021 10:22:57 +0800 Subject: [PATCH 0924/5344] dmaengine: sprd: Add missing MODULE_DEVICE_TABLE [ Upstream commit 4faee8b65ec32346f8096e64c5fa1d5a73121742 ] This patch adds missing MODULE_DEVICE_TABLE definition which generates correct modalias for automatic loading of this driver when it is built as an external module. Reported-by: Hulk Robot Signed-off-by: Zou Wei Reviewed-by: Baolin Wang Link: https://lore.kernel.org/r/1620094977-70146-1-git-send-email-zou_wei@huawei.com Signed-off-by: Vinod Koul Signed-off-by: Sasha Levin --- drivers/dma/sprd-dma.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/dma/sprd-dma.c b/drivers/dma/sprd-dma.c index 8546ad0347208..b966115bfad1d 100644 --- a/drivers/dma/sprd-dma.c +++ b/drivers/dma/sprd-dma.c @@ -1230,6 +1230,7 @@ static const struct of_device_id sprd_dma_match[] = { { .compatible = "sprd,sc9860-dma", }, {}, }; +MODULE_DEVICE_TABLE(of, sprd_dma_match); static int __maybe_unused sprd_dma_runtime_suspend(struct device *dev) { From 69f93b77e5f8c0d70a1bacd17b12298e41018f8a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 9 Aug 2021 11:24:09 +0200 Subject: [PATCH 0925/5344] dmaengine: ioat: depends on !UML [ Upstream commit bbac7a92a46f0876e588722ebe552ddfe6fd790f ] Now that UML has PCI support, this driver must depend also on !UML since it pokes at X86_64 architecture internals that don't exist on ARCH=um. Reported-by: Geert Uytterhoeven Signed-off-by: Johannes Berg Acked-by: Dave Jiang Link: https://lore.kernel.org/r/20210809112409.a3a0974874d2.I2ffe3d11ed37f735da2f39884a74c953b258b995@changeid Signed-off-by: Vinod Koul Signed-off-by: Sasha Levin --- drivers/dma/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index a32d0d7152475..1322461f1f3c5 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -276,7 +276,7 @@ config INTEL_IDMA64 config INTEL_IOATDMA tristate "Intel I/OAT DMA support" - depends on PCI && X86_64 + depends on PCI && X86_64 && !UML select DMA_ENGINE select DMA_ENGINE_RAID select DCA From 8450c2132ec1b16fc1323245daf8f4d94291d6a5 Mon Sep 17 00:00:00 2001 From: Radhey Shyam Pandey Date: Thu, 19 Aug 2021 14:28:48 +0530 Subject: [PATCH 0926/5344] dmaengine: xilinx_dma: Set DMA mask for coherent APIs [ Upstream commit aac6c0f90799d66b8989be1e056408f33fd99fe6 ] The xilinx dma driver uses the consistent allocations, so for correct operation also set the DMA mask for coherent APIs. It fixes the below kernel crash with dmatest client when DMA IP is configured with 64-bit address width and linux is booted from high (>4GB) memory. Call trace: [ 489.531257] dma_alloc_from_pool+0x8c/0x1c0 [ 489.535431] dma_direct_alloc+0x284/0x330 [ 489.539432] dma_alloc_attrs+0x80/0xf0 [ 489.543174] dma_pool_alloc+0x160/0x2c0 [ 489.547003] xilinx_cdma_prep_memcpy+0xa4/0x180 [ 489.551524] dmatest_func+0x3cc/0x114c [ 489.555266] kthread+0x124/0x130 [ 489.558486] ret_from_fork+0x10/0x3c [ 489.562051] ---[ end trace 248625b2d596a90a ]--- Signed-off-by: Radhey Shyam Pandey Reviewed-by: Harini Katakam Link: https://lore.kernel.org/r/1629363528-30347-1-git-send-email-radhey.shyam.pandey@xilinx.com Signed-off-by: Vinod Koul Signed-off-by: Sasha Levin --- drivers/dma/xilinx/xilinx_dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/xilinx/xilinx_dma.c b/drivers/dma/xilinx/xilinx_dma.c index ce18bca45ff27..7729b8d22553e 100644 --- a/drivers/dma/xilinx/xilinx_dma.c +++ b/drivers/dma/xilinx/xilinx_dma.c @@ -2703,7 +2703,7 @@ static int xilinx_dma_probe(struct platform_device *pdev) xdev->ext_addr = false; /* Set the dma mask bits */ - dma_set_mask(xdev->dev, DMA_BIT_MASK(addr_width)); + dma_set_mask_and_coherent(xdev->dev, DMA_BIT_MASK(addr_width)); /* Initialize the DMA engine */ xdev->common.dev = &pdev->dev; From 6668a3928c03dd4cfbbd5675908fa998f077ae4e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 11 Aug 2021 06:40:42 -0400 Subject: [PATCH 0927/5344] ceph: request Fw caps before updating the mtime in ceph_write_iter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit b11ed50346683a749632ea664959b28d524d7395 ] The current code will update the mtime and then try to get caps to handle the write. If we end up having to request caps from the MDS, then the mtime in the cap grant will clobber the updated mtime and it'll be lost. This is most noticable when two clients are alternately writing to the same file. Fw caps are continually being granted and revoked, and the mtime ends up stuck because the updated mtimes are always being overwritten with the old one. Fix this by changing the order of operations in ceph_write_iter to get the caps before updating the times. Also, make sure we check the pool full conditions before even getting any caps or uninlining. URL: https://tracker.ceph.com/issues/46574 Reported-by: Jozef Kováč Signed-off-by: Jeff Layton Reviewed-by: Xiubo Li Reviewed-by: Luis Henriques Signed-off-by: Ilya Dryomov Signed-off-by: Sasha Levin --- fs/ceph/file.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index a10711a6337af..34785a203461d 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1469,32 +1469,26 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) goto out; } - err = file_remove_privs(file); - if (err) + down_read(&osdc->lock); + map_flags = osdc->osdmap->flags; + pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id); + up_read(&osdc->lock); + if ((map_flags & CEPH_OSDMAP_FULL) || + (pool_flags & CEPH_POOL_FLAG_FULL)) { + err = -ENOSPC; goto out; + } - err = file_update_time(file); + err = file_remove_privs(file); if (err) goto out; - inode_inc_iversion_raw(inode); - if (ci->i_inline_version != CEPH_INLINE_NONE) { err = ceph_uninline_data(file, NULL); if (err < 0) goto out; } - down_read(&osdc->lock); - map_flags = osdc->osdmap->flags; - pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id); - up_read(&osdc->lock); - if ((map_flags & CEPH_OSDMAP_FULL) || - (pool_flags & CEPH_POOL_FLAG_FULL)) { - err = -ENOSPC; - goto out; - } - dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n", inode, ceph_vinop(inode), pos, count, i_size_read(inode)); if (fi->fmode & CEPH_FILE_MODE_LAZY) @@ -1507,6 +1501,12 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) if (err < 0) goto out; + err = file_update_time(file); + if (err) + goto out_caps; + + inode_inc_iversion_raw(inode); + dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n", inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); @@ -1590,6 +1590,8 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) } goto out_unlocked; +out_caps: + ceph_put_cap_refs(ci, got); out: if (direct_lock) ceph_end_io_direct(inode); From ed4d4f4a3f1401bb37fe1488fb1dd67ff920fa61 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 2 Sep 2021 08:31:03 -0400 Subject: [PATCH 0928/5344] ceph: lockdep annotations for try_nonblocking_invalidate [ Upstream commit 3eaf5aa1cfa8c97c72f5824e2e9263d6cc977b03 ] Signed-off-by: Jeff Layton Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov Signed-off-by: Sasha Levin --- fs/ceph/caps.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index a49bf1fbaea82..0fad044a5752b 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1775,6 +1775,8 @@ static u64 __mark_caps_flushing(struct inode *inode, * try to invalidate mapping pages without blocking. */ static int try_nonblocking_invalidate(struct inode *inode) + __releases(ci->i_ceph_lock) + __acquires(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); u32 invalidating_gen = ci->i_rdcache_gen; From 53866c20821c9d4d22b1b170959bb34370dade62 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 31 Aug 2021 09:21:28 +0800 Subject: [PATCH 0929/5344] btrfs: fix lockdep warning while mounting sprout fs [ Upstream commit c124706900c20dee70f921bb3a90492431561a0a ] Following test case reproduces lockdep warning. Test case: $ mkfs.btrfs -f $ btrfstune -S 1 $ mount $ btrfs device add -f $ umount $ mount $ umount The warning claims a possible ABBA deadlock between the threads initiated by [#1] btrfs device add and [#0] the mount. [ 540.743122] WARNING: possible circular locking dependency detected [ 540.743129] 5.11.0-rc7+ #5 Not tainted [ 540.743135] ------------------------------------------------------ [ 540.743142] mount/2515 is trying to acquire lock: [ 540.743149] ffffa0c5544c2ce0 (&fs_devs->device_list_mutex){+.+.}-{4:4}, at: clone_fs_devices+0x6d/0x210 [btrfs] [ 540.743458] but task is already holding lock: [ 540.743461] ffffa0c54a7932b8 (btrfs-chunk-00){++++}-{4:4}, at: __btrfs_tree_read_lock+0x32/0x200 [btrfs] [ 540.743541] which lock already depends on the new lock. [ 540.743543] the existing dependency chain (in reverse order) is: [ 540.743546] -> #1 (btrfs-chunk-00){++++}-{4:4}: [ 540.743566] down_read_nested+0x48/0x2b0 [ 540.743585] __btrfs_tree_read_lock+0x32/0x200 [btrfs] [ 540.743650] btrfs_read_lock_root_node+0x70/0x200 [btrfs] [ 540.743733] btrfs_search_slot+0x6c6/0xe00 [btrfs] [ 540.743785] btrfs_update_device+0x83/0x260 [btrfs] [ 540.743849] btrfs_finish_chunk_alloc+0x13f/0x660 [btrfs] <--- device_list_mutex [ 540.743911] btrfs_create_pending_block_groups+0x18d/0x3f0 [btrfs] [ 540.743982] btrfs_commit_transaction+0x86/0x1260 [btrfs] [ 540.744037] btrfs_init_new_device+0x1600/0x1dd0 [btrfs] [ 540.744101] btrfs_ioctl+0x1c77/0x24c0 [btrfs] [ 540.744166] __x64_sys_ioctl+0xe4/0x140 [ 540.744170] do_syscall_64+0x4b/0x80 [ 540.744174] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 540.744180] -> #0 (&fs_devs->device_list_mutex){+.+.}-{4:4}: [ 540.744184] __lock_acquire+0x155f/0x2360 [ 540.744188] lock_acquire+0x10b/0x5c0 [ 540.744190] __mutex_lock+0xb1/0xf80 [ 540.744193] mutex_lock_nested+0x27/0x30 [ 540.744196] clone_fs_devices+0x6d/0x210 [btrfs] [ 540.744270] btrfs_read_chunk_tree+0x3c7/0xbb0 [btrfs] [ 540.744336] open_ctree+0xf6e/0x2074 [btrfs] [ 540.744406] btrfs_mount_root.cold.72+0x16/0x127 [btrfs] [ 540.744472] legacy_get_tree+0x38/0x90 [ 540.744475] vfs_get_tree+0x30/0x140 [ 540.744478] fc_mount+0x16/0x60 [ 540.744482] vfs_kern_mount+0x91/0x100 [ 540.744484] btrfs_mount+0x1e6/0x670 [btrfs] [ 540.744536] legacy_get_tree+0x38/0x90 [ 540.744537] vfs_get_tree+0x30/0x140 [ 540.744539] path_mount+0x8d8/0x1070 [ 540.744541] do_mount+0x8d/0xc0 [ 540.744543] __x64_sys_mount+0x125/0x160 [ 540.744545] do_syscall_64+0x4b/0x80 [ 540.744547] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 540.744551] other info that might help us debug this: [ 540.744552] Possible unsafe locking scenario: [ 540.744553] CPU0 CPU1 [ 540.744554] ---- ---- [ 540.744555] lock(btrfs-chunk-00); [ 540.744557] lock(&fs_devs->device_list_mutex); [ 540.744560] lock(btrfs-chunk-00); [ 540.744562] lock(&fs_devs->device_list_mutex); [ 540.744564] *** DEADLOCK *** [ 540.744565] 3 locks held by mount/2515: [ 540.744567] #0: ffffa0c56bf7a0e0 (&type->s_umount_key#42/1){+.+.}-{4:4}, at: alloc_super.isra.16+0xdf/0x450 [ 540.744574] #1: ffffffffc05a9628 (uuid_mutex){+.+.}-{4:4}, at: btrfs_read_chunk_tree+0x63/0xbb0 [btrfs] [ 540.744640] #2: ffffa0c54a7932b8 (btrfs-chunk-00){++++}-{4:4}, at: __btrfs_tree_read_lock+0x32/0x200 [btrfs] [ 540.744708] stack backtrace: [ 540.744712] CPU: 2 PID: 2515 Comm: mount Not tainted 5.11.0-rc7+ #5 But the device_list_mutex in clone_fs_devices() is redundant, as explained below. Two threads [1] and [2] (below) could lead to clone_fs_device(). [1] open_ctree <== mount sprout fs btrfs_read_chunk_tree() mutex_lock(&uuid_mutex) <== global lock read_one_dev() open_seed_devices() clone_fs_devices() <== seed fs_devices mutex_lock(&orig->device_list_mutex) <== seed fs_devices [2] btrfs_init_new_device() <== sprouting mutex_lock(&uuid_mutex); <== global lock btrfs_prepare_sprout() lockdep_assert_held(&uuid_mutex) clone_fs_devices(seed_fs_device) <== seed fs_devices Both of these threads hold uuid_mutex which is sufficient to protect getting the seed device(s) freed while we are trying to clone it for sprouting [2] or mounting a sprout [1] (as above). A mounted seed device can not free/write/replace because it is read-only. An unmounted seed device can be freed by btrfs_free_stale_devices(), but it needs uuid_mutex. So this patch removes the unnecessary device_list_mutex in clone_fs_devices(). And adds a lockdep_assert_held(&uuid_mutex) in clone_fs_devices(). Reported-by: Su Yue Tested-by: Su Yue Signed-off-by: Anand Jain Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/volumes.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8deee49a6b3fa..f302bbb93f32c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -742,6 +742,8 @@ static int btrfs_free_stale_devices(const char *path, struct btrfs_device *device, *tmp_device; int ret = 0; + lockdep_assert_held(&uuid_mutex); + if (path) ret = -ENOENT; @@ -1181,11 +1183,12 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) struct btrfs_device *orig_dev; int ret = 0; + lockdep_assert_held(&uuid_mutex); + fs_devices = alloc_fs_devices(orig->fsid, NULL); if (IS_ERR(fs_devices)) return fs_devices; - mutex_lock(&orig->device_list_mutex); fs_devices->total_devices = orig->total_devices; list_for_each_entry(orig_dev, &orig->devices, dev_list) { @@ -1217,10 +1220,8 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) device->fs_devices = fs_devices; fs_devices->num_devices++; } - mutex_unlock(&orig->device_list_mutex); return fs_devices; error: - mutex_unlock(&orig->device_list_mutex); free_fs_devices(fs_devices); return ERR_PTR(ret); } From 5fe39761d97465bebb66bae061d8efdbce1d7ce6 Mon Sep 17 00:00:00 2001 From: Nanyong Sun Date: Tue, 7 Sep 2021 20:00:09 -0700 Subject: [PATCH 0930/5344] nilfs2: fix memory leak in nilfs_sysfs_create_device_group [ Upstream commit 5f5dec07aca7067216ed4c1342e464e7307a9197 ] Patch series "nilfs2: fix incorrect usage of kobject". This patchset from Nanyong Sun fixes memory leak issues and a NULL pointer dereference issue caused by incorrect usage of kboject in nilfs2 sysfs implementation. This patch (of 6): Reported by syzkaller: BUG: memory leak unreferenced object 0xffff888100ca8988 (size 8): comm "syz-executor.1", pid 1930, jiffies 4294745569 (age 18.052s) hex dump (first 8 bytes): 6c 6f 6f 70 31 00 ff ff loop1... backtrace: kstrdup+0x36/0x70 mm/util.c:60 kstrdup_const+0x35/0x60 mm/util.c:83 kvasprintf_const+0xf1/0x180 lib/kasprintf.c:48 kobject_set_name_vargs+0x56/0x150 lib/kobject.c:289 kobject_add_varg lib/kobject.c:384 [inline] kobject_init_and_add+0xc9/0x150 lib/kobject.c:473 nilfs_sysfs_create_device_group+0x150/0x7d0 fs/nilfs2/sysfs.c:986 init_nilfs+0xa21/0xea0 fs/nilfs2/the_nilfs.c:637 nilfs_fill_super fs/nilfs2/super.c:1046 [inline] nilfs_mount+0x7b4/0xe80 fs/nilfs2/super.c:1316 legacy_get_tree+0x105/0x210 fs/fs_context.c:592 vfs_get_tree+0x8e/0x2d0 fs/super.c:1498 do_new_mount fs/namespace.c:2905 [inline] path_mount+0xf9b/0x1990 fs/namespace.c:3235 do_mount+0xea/0x100 fs/namespace.c:3248 __do_sys_mount fs/namespace.c:3456 [inline] __se_sys_mount fs/namespace.c:3433 [inline] __x64_sys_mount+0x14b/0x1f0 fs/namespace.c:3433 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3b/0x90 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae If kobject_init_and_add return with error, then the cleanup of kobject is needed because memory may be allocated in kobject_init_and_add without freeing. And the place of cleanup_dev_kobject should use kobject_put to free the memory associated with the kobject. As the section "Kobject removal" of "Documentation/core-api/kobject.rst" says, kobject_del() just makes the kobject "invisible", but it is not cleaned up. And no more cleanup will do after cleanup_dev_kobject, so kobject_put is needed here. Link: https://lkml.kernel.org/r/1625651306-10829-1-git-send-email-konishi.ryusuke@gmail.com Link: https://lkml.kernel.org/r/1625651306-10829-2-git-send-email-konishi.ryusuke@gmail.com Reported-by: Hulk Robot Link: https://lkml.kernel.org/r/20210629022556.3985106-2-sunnanyong@huawei.com Signed-off-by: Nanyong Sun Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- fs/nilfs2/sysfs.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index c6c8a33c81d5e..cbfc132206e86 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -1000,7 +1000,7 @@ int nilfs_sysfs_create_device_group(struct super_block *sb) err = kobject_init_and_add(&nilfs->ns_dev_kobj, &nilfs_dev_ktype, NULL, "%s", sb->s_id); if (err) - goto free_dev_subgroups; + goto cleanup_dev_kobject; err = nilfs_sysfs_create_mounted_snapshots_group(nilfs); if (err) @@ -1037,9 +1037,7 @@ int nilfs_sysfs_create_device_group(struct super_block *sb) nilfs_sysfs_delete_mounted_snapshots_group(nilfs); cleanup_dev_kobject: - kobject_del(&nilfs->ns_dev_kobj); - -free_dev_subgroups: + kobject_put(&nilfs->ns_dev_kobj); kfree(nilfs->ns_dev_subgroups); failed_create_device_group: From 6ac092de6ea7deea4efa046ab5b79b4dd0a021ea Mon Sep 17 00:00:00 2001 From: Nanyong Sun Date: Tue, 7 Sep 2021 20:00:12 -0700 Subject: [PATCH 0931/5344] nilfs2: fix NULL pointer in nilfs_##name##_attr_release [ Upstream commit dbc6e7d44a514f231a64d9d5676e001b660b6448 ] In nilfs_##name##_attr_release, kobj->parent should not be referenced because it is a NULL pointer. The release() method of kobject is always called in kobject_put(kobj), in the implementation of kobject_put(), the kobj->parent will be assigned as NULL before call the release() method. So just use kobj to get the subgroups, which is more efficient and can fix a NULL pointer reference problem. Link: https://lkml.kernel.org/r/20210629022556.3985106-3-sunnanyong@huawei.com Link: https://lkml.kernel.org/r/1625651306-10829-3-git-send-email-konishi.ryusuke@gmail.com Signed-off-by: Nanyong Sun Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- fs/nilfs2/sysfs.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index cbfc132206e86..ca720d9583153 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -64,11 +64,9 @@ static const struct sysfs_ops nilfs_##name##_attr_ops = { \ #define NILFS_DEV_INT_GROUP_TYPE(name, parent_name) \ static void nilfs_##name##_attr_release(struct kobject *kobj) \ { \ - struct nilfs_sysfs_##parent_name##_subgroups *subgroups; \ - struct the_nilfs *nilfs = container_of(kobj->parent, \ - struct the_nilfs, \ - ns_##parent_name##_kobj); \ - subgroups = nilfs->ns_##parent_name##_subgroups; \ + struct nilfs_sysfs_##parent_name##_subgroups *subgroups = container_of(kobj, \ + struct nilfs_sysfs_##parent_name##_subgroups, \ + sg_##name##_kobj); \ complete(&subgroups->sg_##name##_kobj_unregister); \ } \ static struct kobj_type nilfs_##name##_ktype = { \ From 1c97baf887f34ef1335bdb73dce3c8a3f693ed87 Mon Sep 17 00:00:00 2001 From: Nanyong Sun Date: Tue, 7 Sep 2021 20:00:15 -0700 Subject: [PATCH 0932/5344] nilfs2: fix memory leak in nilfs_sysfs_create_##name##_group [ Upstream commit 24f8cb1ed057c840728167dab33b32e44147c86f ] If kobject_init_and_add return with error, kobject_put() is needed here to avoid memory leak, because kobject_init_and_add may return error without freeing the memory associated with the kobject it allocated. Link: https://lkml.kernel.org/r/20210629022556.3985106-4-sunnanyong@huawei.com Link: https://lkml.kernel.org/r/1625651306-10829-4-git-send-email-konishi.ryusuke@gmail.com Signed-off-by: Nanyong Sun Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- fs/nilfs2/sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index ca720d9583153..31d640a87b591 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -92,8 +92,8 @@ static int nilfs_sysfs_create_##name##_group(struct the_nilfs *nilfs) \ err = kobject_init_and_add(kobj, &nilfs_##name##_ktype, parent, \ #name); \ if (err) \ - return err; \ - return 0; \ + kobject_put(kobj); \ + return err; \ } \ static void nilfs_sysfs_delete_##name##_group(struct the_nilfs *nilfs) \ { \ From 54d76c9b6e304a7db9e61bcb3c0c3584cde1abe1 Mon Sep 17 00:00:00 2001 From: Nanyong Sun Date: Tue, 7 Sep 2021 20:00:18 -0700 Subject: [PATCH 0933/5344] nilfs2: fix memory leak in nilfs_sysfs_delete_##name##_group [ Upstream commit a3e181259ddd61fd378390977a1e4e2316853afa ] The kobject_put() should be used to cleanup the memory associated with the kobject instead of kobject_del. See the section "Kobject removal" of "Documentation/core-api/kobject.rst". Link: https://lkml.kernel.org/r/20210629022556.3985106-5-sunnanyong@huawei.com Link: https://lkml.kernel.org/r/1625651306-10829-5-git-send-email-konishi.ryusuke@gmail.com Signed-off-by: Nanyong Sun Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- fs/nilfs2/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index 31d640a87b591..195f42192a150 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -97,7 +97,7 @@ static int nilfs_sysfs_create_##name##_group(struct the_nilfs *nilfs) \ } \ static void nilfs_sysfs_delete_##name##_group(struct the_nilfs *nilfs) \ { \ - kobject_del(&nilfs->ns_##parent_name##_subgroups->sg_##name##_kobj); \ + kobject_put(&nilfs->ns_##parent_name##_subgroups->sg_##name##_kobj); \ } /************************************************************************ From 31442d940702f19ae37fa69daff8b26ffded489a Mon Sep 17 00:00:00 2001 From: Nanyong Sun Date: Tue, 7 Sep 2021 20:00:21 -0700 Subject: [PATCH 0934/5344] nilfs2: fix memory leak in nilfs_sysfs_create_snapshot_group [ Upstream commit b2fe39c248f3fa4bbb2a20759b4fdd83504190f7 ] If kobject_init_and_add returns with error, kobject_put() is needed here to avoid memory leak, because kobject_init_and_add may return error without freeing the memory associated with the kobject it allocated. Link: https://lkml.kernel.org/r/20210629022556.3985106-6-sunnanyong@huawei.com Link: https://lkml.kernel.org/r/1625651306-10829-6-git-send-email-konishi.ryusuke@gmail.com Signed-off-by: Nanyong Sun Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- fs/nilfs2/sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index 195f42192a150..6c92ac314b066 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -208,9 +208,9 @@ int nilfs_sysfs_create_snapshot_group(struct nilfs_root *root) } if (err) - return err; + kobject_put(&root->snapshot_kobj); - return 0; + return err; } void nilfs_sysfs_delete_snapshot_group(struct nilfs_root *root) From 52a865dcc6bcdd4f3e05042f470614d85298b7aa Mon Sep 17 00:00:00 2001 From: Nanyong Sun Date: Tue, 7 Sep 2021 20:00:23 -0700 Subject: [PATCH 0935/5344] nilfs2: fix memory leak in nilfs_sysfs_delete_snapshot_group [ Upstream commit 17243e1c3072b8417a5ebfc53065d0a87af7ca77 ] kobject_put() should be used to cleanup the memory associated with the kobject instead of kobject_del(). See the section "Kobject removal" of "Documentation/core-api/kobject.rst". Link: https://lkml.kernel.org/r/20210629022556.3985106-7-sunnanyong@huawei.com Link: https://lkml.kernel.org/r/1625651306-10829-7-git-send-email-konishi.ryusuke@gmail.com Signed-off-by: Nanyong Sun Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- fs/nilfs2/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index 6c92ac314b066..28a2db3b1787f 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -215,7 +215,7 @@ int nilfs_sysfs_create_snapshot_group(struct nilfs_root *root) void nilfs_sysfs_delete_snapshot_group(struct nilfs_root *root) { - kobject_del(&root->snapshot_kobj); + kobject_put(&root->snapshot_kobj); } /************************************************************************ From 8e5088cd095429369578dc57d0d37b8910dd8858 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 7 Jul 2021 18:27:51 +0200 Subject: [PATCH 0936/5344] pwm: img: Don't modify HW state in .remove() callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit c68eb29c8e9067c08175dd0414f6984f236f719d ] A consumer is expected to disable a PWM before calling pwm_put(). And if they didn't there is hopefully a good reason (or the consumer needs fixing). Also if disabling an enabled PWM was the right thing to do, this should better be done in the framework instead of in each low level driver. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding Signed-off-by: Sasha Levin --- drivers/pwm/pwm-img.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/drivers/pwm/pwm-img.c b/drivers/pwm/pwm-img.c index 22c002e685b34..37f9b688661d4 100644 --- a/drivers/pwm/pwm-img.c +++ b/drivers/pwm/pwm-img.c @@ -329,23 +329,7 @@ static int img_pwm_probe(struct platform_device *pdev) static int img_pwm_remove(struct platform_device *pdev) { struct img_pwm_chip *pwm_chip = platform_get_drvdata(pdev); - u32 val; - unsigned int i; - int ret; - - ret = pm_runtime_get_sync(&pdev->dev); - if (ret < 0) { - pm_runtime_put(&pdev->dev); - return ret; - } - - for (i = 0; i < pwm_chip->chip.npwm; i++) { - val = img_pwm_readl(pwm_chip, PWM_CTRL_CFG); - val &= ~BIT(i); - img_pwm_writel(pwm_chip, PWM_CTRL_CFG, val); - } - pm_runtime_put(&pdev->dev); pm_runtime_disable(&pdev->dev); if (!pm_runtime_status_suspended(&pdev->dev)) img_pwm_runtime_suspend(&pdev->dev); From d01276233f97cf7689853a4551a549d0bee08984 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 7 Jul 2021 18:27:52 +0200 Subject: [PATCH 0937/5344] pwm: rockchip: Don't modify HW state in .remove() callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 9d768cd7fd42bb0be16f36aec48548fca5260759 ] A consumer is expected to disable a PWM before calling pwm_put(). And if they didn't there is hopefully a good reason (or the consumer needs fixing). Also if disabling an enabled PWM was the right thing to do, this should better be done in the framework instead of in each low level driver. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding Signed-off-by: Sasha Levin --- drivers/pwm/pwm-rockchip.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/drivers/pwm/pwm-rockchip.c b/drivers/pwm/pwm-rockchip.c index 6ad6aad215cf1..8c0af705c5ae9 100644 --- a/drivers/pwm/pwm-rockchip.c +++ b/drivers/pwm/pwm-rockchip.c @@ -383,20 +383,6 @@ static int rockchip_pwm_remove(struct platform_device *pdev) { struct rockchip_pwm_chip *pc = platform_get_drvdata(pdev); - /* - * Disable the PWM clk before unpreparing it if the PWM device is still - * running. This should only happen when the last PWM user left it - * enabled, or when nobody requested a PWM that was previously enabled - * by the bootloader. - * - * FIXME: Maybe the core should disable all PWM devices in - * pwmchip_remove(). In this case we'd only have to call - * clk_unprepare() after pwmchip_remove(). - * - */ - if (pwm_is_enabled(pc->chip.pwms)) - clk_disable(pc->clk); - clk_unprepare(pc->pclk); clk_unprepare(pc->clk); From 6d7860c86a229381eedb44ff912848f1976eb288 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 7 Jul 2021 18:27:53 +0200 Subject: [PATCH 0938/5344] pwm: stm32-lp: Don't modify HW state in .remove() callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit d44084c93427bb0a9261432db1a8ca76a42d805e ] A consumer is expected to disable a PWM before calling pwm_put(). And if they didn't there is hopefully a good reason (or the consumer needs fixing). Also if disabling an enabled PWM was the right thing to do, this should better be done in the framework instead of in each low level driver. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding Signed-off-by: Sasha Levin --- drivers/pwm/pwm-stm32-lp.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/pwm/pwm-stm32-lp.c b/drivers/pwm/pwm-stm32-lp.c index 67fca62524dc2..05bb1f95a7739 100644 --- a/drivers/pwm/pwm-stm32-lp.c +++ b/drivers/pwm/pwm-stm32-lp.c @@ -225,8 +225,6 @@ static int stm32_pwm_lp_remove(struct platform_device *pdev) { struct stm32_pwm_lp *priv = platform_get_drvdata(pdev); - pwm_disable(&priv->chip.pwms[0]); - return pwmchip_remove(&priv->chip); } From e81a350ee7b2966741364a96ca554a29f99f9935 Mon Sep 17 00:00:00 2001 From: Yu-Tung Chang Date: Mon, 30 Aug 2021 13:25:32 +0800 Subject: [PATCH 0939/5344] rtc: rx8010: select REGMAP_I2C [ Upstream commit 0c45d3e24ef3d3d87c5e0077b8f38d1372af7176 ] The rtc-rx8010 uses the I2C regmap but doesn't select it in Kconfig so depending on the configuration the build may fail. Fix it. Signed-off-by: Yu-Tung Chang Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20210830052532.40356-1-mtwget@gmail.com Signed-off-by: Sasha Levin --- drivers/rtc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index 9ae7ce3f50696..0ad8d84aeb339 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -625,6 +625,7 @@ config RTC_DRV_FM3130 config RTC_DRV_RX8010 tristate "Epson RX8010SJ" + select REGMAP_I2C help If you say yes here you get support for the Epson RX8010SJ RTC chip. From 2ad443a862eeea555fdf6fbf10050b4efe065d53 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Wed, 8 Sep 2021 12:08:17 -0700 Subject: [PATCH 0940/5344] drm/nouveau/nvkm: Replace -ENOSYS with -ENODEV commit e8f71f89236ef82d449991bfbc237e3cb6ea584f upstream. nvkm test builds fail with the following error. drivers/gpu/drm/nouveau/nvkm/engine/device/ctrl.c: In function 'nvkm_control_mthd_pstate_info': drivers/gpu/drm/nouveau/nvkm/engine/device/ctrl.c:60:35: error: overflow in conversion from 'int' to '__s8' {aka 'signed char'} changes value from '-251' to '5' The code builds on most architectures, but fails on parisc where ENOSYS is defined as 251. Replace the error code with -ENODEV (-19). The actual error code does not really matter and is not passed to userspace - it just has to be negative. Fixes: 7238eca4cf18 ("drm/nouveau: expose pstate selection per-power source in sysfs") Signed-off-by: Guenter Roeck Cc: Ben Skeggs Cc: David Airlie Cc: Daniel Vetter Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/nouveau/nvkm/engine/device/ctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/ctrl.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/ctrl.c index b0ece71aefdee..ce774579c89d1 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/ctrl.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/ctrl.c @@ -57,7 +57,7 @@ nvkm_control_mthd_pstate_info(struct nvkm_control *ctrl, void *data, u32 size) args->v0.count = 0; args->v0.ustate_ac = NVIF_CONTROL_PSTATE_INFO_V0_USTATE_DISABLE; args->v0.ustate_dc = NVIF_CONTROL_PSTATE_INFO_V0_USTATE_DISABLE; - args->v0.pwrsrc = -ENOSYS; + args->v0.pwrsrc = -ENODEV; args->v0.pstate = NVIF_CONTROL_PSTATE_INFO_V0_PSTATE_UNKNOWN; } From 7a31a0d3026abd456049e3c9864d0b3bb6429b50 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 26 Sep 2021 14:07:14 +0200 Subject: [PATCH 0941/5344] Linux 5.4.149 Link: https://lore.kernel.org/r/20210924124332.229289734@linuxfoundation.org Tested-by: Jon Hunter Tested-by: Shuah Khan Tested-by: Florian Fainelli Link: https://lore.kernel.org/r/20210925120748.206179334@linuxfoundation.org Tested-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b84706c6d6248..1834f47fbaf61 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 PATCHLEVEL = 4 -SUBLEVEL = 148 +SUBLEVEL = 149 EXTRAVERSION = NAME = Kleptomaniac Octopus From 2362671b4fe1c668f9467bdd3a601a0c9e6ca90f Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Fri, 24 Sep 2021 15:43:35 -0700 Subject: [PATCH 0942/5344] ocfs2: drop acl cache for directories too commit 9c0f0a03e386f4e1df33db676401547e1b7800c6 upstream. ocfs2_data_convert_worker() is currently dropping any cached acl info for FILE before down-converting meta lock. It should also drop for DIRECTORY. Otherwise the second acl lookup returns the cached one (from VFS layer) which could be already stale. The problem we are seeing is that the acl changes on one node doesn't get refreshed on other nodes in the following case: Node 1 Node 2 -------------- ---------------- getfacl dir1 getfacl dir1 <-- this is OK setfacl -m u:user1:rwX dir1 getfacl dir1 <-- see the change for user1 getfacl dir1 <-- can't see change for user1 Link: https://lkml.kernel.org/r/20210903012631.6099-1-wen.gang.wang@oracle.com Signed-off-by: Wengang Wang Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/ocfs2/dlmglue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 50a863fc17792..207ec61569ea4 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3933,7 +3933,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, oi = OCFS2_I(inode); oi->ip_dir_lock_gen++; mlog(0, "generation: %u\n", oi->ip_dir_lock_gen); - goto out; + goto out_forget; } if (!S_ISREG(inode->i_mode)) @@ -3964,6 +3964,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, filemap_fdatawait(mapping); } +out_forget: forget_all_cached_acls(inode); out: From 158bd94fa14c5624c6dfcefc979fee764f29a56f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 6 Sep 2021 12:42:21 +0300 Subject: [PATCH 0943/5344] usb: gadget: r8a66597: fix a loop in set_feature() commit 17956b53ebff6a490baf580a836cbd3eae94892b upstream. This loop is supposed to loop until if reads something other than CS_IDST or until it times out after 30,000 attempts. But because of the || vs && bug, it will never time out and instead it will loop a minimum of 30,000 times. This bug is quite old but the code is only used in USB_DEVICE_TEST_MODE so it probably doesn't affect regular usage. Fixes: 96fe53ef5498 ("usb: gadget: r8a66597-udc: add support for TEST_MODE") Cc: stable Reviewed-by: Yoshihiro Shimoda Acked-by: Felipe Balbi Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/20210906094221.GA10957@kili Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/r8a66597-udc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/gadget/udc/r8a66597-udc.c b/drivers/usb/gadget/udc/r8a66597-udc.c index a766476fd742e..ca0aebb5bd0cc 100644 --- a/drivers/usb/gadget/udc/r8a66597-udc.c +++ b/drivers/usb/gadget/udc/r8a66597-udc.c @@ -1250,7 +1250,7 @@ static void set_feature(struct r8a66597 *r8a66597, struct usb_ctrlrequest *ctrl) do { tmp = r8a66597_read(r8a66597, INTSTS0) & CTSQ; udelay(1); - } while (tmp != CS_IDST || timeout-- > 0); + } while (tmp != CS_IDST && timeout-- > 0); if (tmp == CS_IDST) r8a66597_bset(r8a66597, From 3d2f96fc2c528b4f1bec1fdcbeae7415baf5a1db Mon Sep 17 00:00:00 2001 From: Minas Harutyunyan Date: Thu, 9 Sep 2021 14:45:15 +0400 Subject: [PATCH 0944/5344] usb: dwc2: gadget: Fix ISOC flow for BDMA and Slave commit 91bb163e1e4f88092f50dfaa5a816b658753e4b2 upstream. According USB spec each ISOC transaction should be performed in a designated for that transaction interval. On bus errors or delays in operating system scheduling of client software can result in no packet being transferred for a (micro)frame. An error indication should be returned as status to the client software in such a case. Current implementation in case of missed/dropped interval send same data in next possible interval instead of reporting missed isoc. This fix complete requests with -ENODATA if interval elapsed. HSOTG core in BDMA and Slave modes haven't HW support for (micro)frames tracking, this is why SW should care about tracking of (micro)frames. Because of that method and consider operating system scheduling delays, added few additional checking's of elapsed target (micro)frame: 1. Immediately before enabling EP to start transfer. 2. With any transfer completion interrupt. 3. With incomplete isoc in/out interrupt. 4. With EP disabled interrupt because of incomplete transfer. 5. With OUT token received while EP disabled interrupt (for OUT transfers). 6. With NAK replied to IN token interrupt (for IN transfers). As part of ISOC flow, additionally fixed 'current' and 'target' frame calculation functions. In HS mode SOF limits provided by DSTS register is 0x3fff, but in non HS mode this limit is 0x7ff. Tested by internal tool which also using for dwc3 testing. Signed-off-by: Minas Harutyunyan Cc: stable Link: https://lore.kernel.org/r/95d1423adf4b0f68187c9894820c4b7e964a3f7f.1631175721.git.Minas.Harutyunyan@synopsys.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc2/gadget.c | 189 +++++++++++++++++++++----------------- 1 file changed, 106 insertions(+), 83 deletions(-) diff --git a/drivers/usb/dwc2/gadget.c b/drivers/usb/dwc2/gadget.c index 66dfcdbd1e03a..d113f0ac0ea83 100644 --- a/drivers/usb/dwc2/gadget.c +++ b/drivers/usb/dwc2/gadget.c @@ -115,10 +115,16 @@ static inline bool using_desc_dma(struct dwc2_hsotg *hsotg) */ static inline void dwc2_gadget_incr_frame_num(struct dwc2_hsotg_ep *hs_ep) { + struct dwc2_hsotg *hsotg = hs_ep->parent; + u16 limit = DSTS_SOFFN_LIMIT; + + if (hsotg->gadget.speed != USB_SPEED_HIGH) + limit >>= 3; + hs_ep->target_frame += hs_ep->interval; - if (hs_ep->target_frame > DSTS_SOFFN_LIMIT) { + if (hs_ep->target_frame > limit) { hs_ep->frame_overrun = true; - hs_ep->target_frame &= DSTS_SOFFN_LIMIT; + hs_ep->target_frame &= limit; } else { hs_ep->frame_overrun = false; } @@ -136,10 +142,16 @@ static inline void dwc2_gadget_incr_frame_num(struct dwc2_hsotg_ep *hs_ep) */ static inline void dwc2_gadget_dec_frame_num_by_one(struct dwc2_hsotg_ep *hs_ep) { + struct dwc2_hsotg *hsotg = hs_ep->parent; + u16 limit = DSTS_SOFFN_LIMIT; + + if (hsotg->gadget.speed != USB_SPEED_HIGH) + limit >>= 3; + if (hs_ep->target_frame) hs_ep->target_frame -= 1; else - hs_ep->target_frame = DSTS_SOFFN_LIMIT; + hs_ep->target_frame = limit; } /** @@ -1018,6 +1030,12 @@ static void dwc2_gadget_start_isoc_ddma(struct dwc2_hsotg_ep *hs_ep) dwc2_writel(hsotg, ctrl, depctl); } +static bool dwc2_gadget_target_frame_elapsed(struct dwc2_hsotg_ep *hs_ep); +static void dwc2_hsotg_complete_request(struct dwc2_hsotg *hsotg, + struct dwc2_hsotg_ep *hs_ep, + struct dwc2_hsotg_req *hs_req, + int result); + /** * dwc2_hsotg_start_req - start a USB request from an endpoint's queue * @hsotg: The controller state. @@ -1170,14 +1188,19 @@ static void dwc2_hsotg_start_req(struct dwc2_hsotg *hsotg, } } - if (hs_ep->isochronous && hs_ep->interval == 1) { - hs_ep->target_frame = dwc2_hsotg_read_frameno(hsotg); - dwc2_gadget_incr_frame_num(hs_ep); - - if (hs_ep->target_frame & 0x1) - ctrl |= DXEPCTL_SETODDFR; - else - ctrl |= DXEPCTL_SETEVENFR; + if (hs_ep->isochronous) { + if (!dwc2_gadget_target_frame_elapsed(hs_ep)) { + if (hs_ep->interval == 1) { + if (hs_ep->target_frame & 0x1) + ctrl |= DXEPCTL_SETODDFR; + else + ctrl |= DXEPCTL_SETEVENFR; + } + ctrl |= DXEPCTL_CNAK; + } else { + dwc2_hsotg_complete_request(hsotg, hs_ep, hs_req, -ENODATA); + return; + } } ctrl |= DXEPCTL_EPENA; /* ensure ep enabled */ @@ -1325,12 +1348,16 @@ static bool dwc2_gadget_target_frame_elapsed(struct dwc2_hsotg_ep *hs_ep) u32 target_frame = hs_ep->target_frame; u32 current_frame = hsotg->frame_number; bool frame_overrun = hs_ep->frame_overrun; + u16 limit = DSTS_SOFFN_LIMIT; + + if (hsotg->gadget.speed != USB_SPEED_HIGH) + limit >>= 3; if (!frame_overrun && current_frame >= target_frame) return true; if (frame_overrun && current_frame >= target_frame && - ((current_frame - target_frame) < DSTS_SOFFN_LIMIT / 2)) + ((current_frame - target_frame) < limit / 2)) return true; return false; @@ -1712,11 +1739,9 @@ static struct dwc2_hsotg_req *get_ep_head(struct dwc2_hsotg_ep *hs_ep) */ static void dwc2_gadget_start_next_request(struct dwc2_hsotg_ep *hs_ep) { - u32 mask; struct dwc2_hsotg *hsotg = hs_ep->parent; int dir_in = hs_ep->dir_in; struct dwc2_hsotg_req *hs_req; - u32 epmsk_reg = dir_in ? DIEPMSK : DOEPMSK; if (!list_empty(&hs_ep->queue)) { hs_req = get_ep_head(hs_ep); @@ -1732,9 +1757,6 @@ static void dwc2_gadget_start_next_request(struct dwc2_hsotg_ep *hs_ep) } else { dev_dbg(hsotg->dev, "%s: No more ISOC-OUT requests\n", __func__); - mask = dwc2_readl(hsotg, epmsk_reg); - mask |= DOEPMSK_OUTTKNEPDISMSK; - dwc2_writel(hsotg, mask, epmsk_reg); } } @@ -2304,19 +2326,6 @@ static void dwc2_hsotg_ep0_zlp(struct dwc2_hsotg *hsotg, bool dir_in) dwc2_hsotg_program_zlp(hsotg, hsotg->eps_out[0]); } -static void dwc2_hsotg_change_ep_iso_parity(struct dwc2_hsotg *hsotg, - u32 epctl_reg) -{ - u32 ctrl; - - ctrl = dwc2_readl(hsotg, epctl_reg); - if (ctrl & DXEPCTL_EOFRNUM) - ctrl |= DXEPCTL_SETEVENFR; - else - ctrl |= DXEPCTL_SETODDFR; - dwc2_writel(hsotg, ctrl, epctl_reg); -} - /* * dwc2_gadget_get_xfersize_ddma - get transferred bytes amount from desc * @hs_ep - The endpoint on which transfer went @@ -2437,20 +2446,11 @@ static void dwc2_hsotg_handle_outdone(struct dwc2_hsotg *hsotg, int epnum) dwc2_hsotg_ep0_zlp(hsotg, true); } - /* - * Slave mode OUT transfers do not go through XferComplete so - * adjust the ISOC parity here. - */ - if (!using_dma(hsotg)) { - if (hs_ep->isochronous && hs_ep->interval == 1) - dwc2_hsotg_change_ep_iso_parity(hsotg, DOEPCTL(epnum)); - else if (hs_ep->isochronous && hs_ep->interval > 1) - dwc2_gadget_incr_frame_num(hs_ep); - } - /* Set actual frame number for completed transfers */ - if (!using_desc_dma(hsotg) && hs_ep->isochronous) - req->frame_number = hsotg->frame_number; + if (!using_desc_dma(hsotg) && hs_ep->isochronous) { + req->frame_number = hs_ep->target_frame; + dwc2_gadget_incr_frame_num(hs_ep); + } dwc2_hsotg_complete_request(hsotg, hs_ep, hs_req, result); } @@ -2764,6 +2764,12 @@ static void dwc2_hsotg_complete_in(struct dwc2_hsotg *hsotg, return; } + /* Set actual frame number for completed transfers */ + if (!using_desc_dma(hsotg) && hs_ep->isochronous) { + hs_req->req.frame_number = hs_ep->target_frame; + dwc2_gadget_incr_frame_num(hs_ep); + } + dwc2_hsotg_complete_request(hsotg, hs_ep, hs_req, 0); } @@ -2824,23 +2830,18 @@ static void dwc2_gadget_handle_ep_disabled(struct dwc2_hsotg_ep *hs_ep) dwc2_hsotg_txfifo_flush(hsotg, hs_ep->fifo_index); - if (hs_ep->isochronous) { - dwc2_hsotg_complete_in(hsotg, hs_ep); - return; - } - if ((epctl & DXEPCTL_STALL) && (epctl & DXEPCTL_EPTYPE_BULK)) { int dctl = dwc2_readl(hsotg, DCTL); dctl |= DCTL_CGNPINNAK; dwc2_writel(hsotg, dctl, DCTL); } - return; - } + } else { - if (dctl & DCTL_GOUTNAKSTS) { - dctl |= DCTL_CGOUTNAK; - dwc2_writel(hsotg, dctl, DCTL); + if (dctl & DCTL_GOUTNAKSTS) { + dctl |= DCTL_CGOUTNAK; + dwc2_writel(hsotg, dctl, DCTL); + } } if (!hs_ep->isochronous) @@ -2861,8 +2862,6 @@ static void dwc2_gadget_handle_ep_disabled(struct dwc2_hsotg_ep *hs_ep) /* Update current frame number value. */ hsotg->frame_number = dwc2_hsotg_read_frameno(hsotg); } while (dwc2_gadget_target_frame_elapsed(hs_ep)); - - dwc2_gadget_start_next_request(hs_ep); } /** @@ -2879,8 +2878,8 @@ static void dwc2_gadget_handle_ep_disabled(struct dwc2_hsotg_ep *hs_ep) static void dwc2_gadget_handle_out_token_ep_disabled(struct dwc2_hsotg_ep *ep) { struct dwc2_hsotg *hsotg = ep->parent; + struct dwc2_hsotg_req *hs_req; int dir_in = ep->dir_in; - u32 doepmsk; if (dir_in || !ep->isochronous) return; @@ -2894,28 +2893,39 @@ static void dwc2_gadget_handle_out_token_ep_disabled(struct dwc2_hsotg_ep *ep) return; } - if (ep->interval > 1 && - ep->target_frame == TARGET_FRAME_INITIAL) { + if (ep->target_frame == TARGET_FRAME_INITIAL) { u32 ctrl; ep->target_frame = hsotg->frame_number; - dwc2_gadget_incr_frame_num(ep); + if (ep->interval > 1) { + ctrl = dwc2_readl(hsotg, DOEPCTL(ep->index)); + if (ep->target_frame & 0x1) + ctrl |= DXEPCTL_SETODDFR; + else + ctrl |= DXEPCTL_SETEVENFR; - ctrl = dwc2_readl(hsotg, DOEPCTL(ep->index)); - if (ep->target_frame & 0x1) - ctrl |= DXEPCTL_SETODDFR; - else - ctrl |= DXEPCTL_SETEVENFR; + dwc2_writel(hsotg, ctrl, DOEPCTL(ep->index)); + } + } + + while (dwc2_gadget_target_frame_elapsed(ep)) { + hs_req = get_ep_head(ep); + if (hs_req) + dwc2_hsotg_complete_request(hsotg, ep, hs_req, -ENODATA); - dwc2_writel(hsotg, ctrl, DOEPCTL(ep->index)); + dwc2_gadget_incr_frame_num(ep); + /* Update current frame number value. */ + hsotg->frame_number = dwc2_hsotg_read_frameno(hsotg); } - dwc2_gadget_start_next_request(ep); - doepmsk = dwc2_readl(hsotg, DOEPMSK); - doepmsk &= ~DOEPMSK_OUTTKNEPDISMSK; - dwc2_writel(hsotg, doepmsk, DOEPMSK); + if (!ep->req) + dwc2_gadget_start_next_request(ep); + } +static void dwc2_hsotg_ep_stop_xfr(struct dwc2_hsotg *hsotg, + struct dwc2_hsotg_ep *hs_ep); + /** * dwc2_gadget_handle_nak - handle NAK interrupt * @hs_ep: The endpoint on which interrupt is asserted. @@ -2933,7 +2943,9 @@ static void dwc2_gadget_handle_out_token_ep_disabled(struct dwc2_hsotg_ep *ep) static void dwc2_gadget_handle_nak(struct dwc2_hsotg_ep *hs_ep) { struct dwc2_hsotg *hsotg = hs_ep->parent; + struct dwc2_hsotg_req *hs_req; int dir_in = hs_ep->dir_in; + u32 ctrl; if (!dir_in || !hs_ep->isochronous) return; @@ -2975,13 +2987,29 @@ static void dwc2_gadget_handle_nak(struct dwc2_hsotg_ep *hs_ep) dwc2_writel(hsotg, ctrl, DIEPCTL(hs_ep->index)); } - - dwc2_hsotg_complete_request(hsotg, hs_ep, - get_ep_head(hs_ep), 0); } - if (!using_desc_dma(hsotg)) + if (using_desc_dma(hsotg)) + return; + + ctrl = dwc2_readl(hsotg, DIEPCTL(hs_ep->index)); + if (ctrl & DXEPCTL_EPENA) + dwc2_hsotg_ep_stop_xfr(hsotg, hs_ep); + else + dwc2_hsotg_txfifo_flush(hsotg, hs_ep->fifo_index); + + while (dwc2_gadget_target_frame_elapsed(hs_ep)) { + hs_req = get_ep_head(hs_ep); + if (hs_req) + dwc2_hsotg_complete_request(hsotg, hs_ep, hs_req, -ENODATA); + dwc2_gadget_incr_frame_num(hs_ep); + /* Update current frame number value. */ + hsotg->frame_number = dwc2_hsotg_read_frameno(hsotg); + } + + if (!hs_ep->req) + dwc2_gadget_start_next_request(hs_ep); } /** @@ -3048,12 +3076,8 @@ static void dwc2_hsotg_epint(struct dwc2_hsotg *hsotg, unsigned int idx, * need to look at completing IN requests here * if operating slave mode */ - if (hs_ep->isochronous && hs_ep->interval > 1) - dwc2_gadget_incr_frame_num(hs_ep); - - dwc2_hsotg_complete_in(hsotg, hs_ep); - if (ints & DXEPINT_NAKINTRPT) - ints &= ~DXEPINT_NAKINTRPT; + if (!hs_ep->isochronous || !(ints & DXEPINT_NAKINTRPT)) + dwc2_hsotg_complete_in(hsotg, hs_ep); if (idx == 0 && !hs_ep->req) dwc2_hsotg_enqueue_setup(hsotg); @@ -3062,10 +3086,8 @@ static void dwc2_hsotg_epint(struct dwc2_hsotg *hsotg, unsigned int idx, * We're using DMA, we need to fire an OutDone here * as we ignore the RXFIFO. */ - if (hs_ep->isochronous && hs_ep->interval > 1) - dwc2_gadget_incr_frame_num(hs_ep); - - dwc2_hsotg_handle_outdone(hsotg, idx); + if (!hs_ep->isochronous || !(ints & DXEPINT_OUTTKNEPDIS)) + dwc2_hsotg_handle_outdone(hsotg, idx); } } @@ -4055,6 +4077,7 @@ static int dwc2_hsotg_ep_enable(struct usb_ep *ep, mask |= DIEPMSK_NAKMSK; dwc2_writel(hsotg, mask, DIEPMSK); } else { + epctrl |= DXEPCTL_SNAK; mask = dwc2_readl(hsotg, DOEPMSK); mask |= DOEPMSK_OUTTKNEPDISMSK; dwc2_writel(hsotg, mask, DOEPMSK); From 8f5c8e10cc45db17c81c790aee1709f48c529958 Mon Sep 17 00:00:00 2001 From: Minas Harutyunyan Date: Sat, 11 Sep 2021 22:58:30 +0400 Subject: [PATCH 0945/5344] usb: dwc2: gadget: Fix ISOC transfer complete handling for DDMA commit dbe2518b2d8eabffa74dbf7d9fdd7dacddab7fc0 upstream. When last descriptor in a descriptor list completed with XferComplete interrupt, core switching to handle next descriptor and assert BNA interrupt. Both these interrupts are set while dwc2_hsotg_epint() handler called. Each interrupt should be handled separately: first XferComplete interrupt then BNA interrupt, otherwise last completed transfer will not be giveback to function driver as completed request. Fixes: 729cac693eec ("usb: dwc2: Change ISOC DDMA flow") Cc: stable Signed-off-by: Minas Harutyunyan Link: https://lore.kernel.org/r/a36981accc26cd674c5d8f8da6164344b94ec1fe.1631386531.git.Minas.Harutyunyan@synopsys.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc2/gadget.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/usb/dwc2/gadget.c b/drivers/usb/dwc2/gadget.c index d113f0ac0ea83..e8b25dae09499 100644 --- a/drivers/usb/dwc2/gadget.c +++ b/drivers/usb/dwc2/gadget.c @@ -3067,9 +3067,7 @@ static void dwc2_hsotg_epint(struct dwc2_hsotg *hsotg, unsigned int idx, /* In DDMA handle isochronous requests separately */ if (using_desc_dma(hsotg) && hs_ep->isochronous) { - /* XferCompl set along with BNA */ - if (!(ints & DXEPINT_BNAINTR)) - dwc2_gadget_complete_isoc_request_ddma(hs_ep); + dwc2_gadget_complete_isoc_request_ddma(hs_ep); } else if (dir_in) { /* * We get OutDone from the FIFO, so we only From 32bc568078e64c737f839a2af73b4e1195336c94 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 16 Sep 2021 16:57:37 +0300 Subject: [PATCH 0946/5344] usb: musb: tusb6010: uninitialized data in tusb_fifo_write_unaligned() commit 517c7bf99bad3d6b9360558414aae634b7472d80 upstream. This is writing to the first 1 - 3 bytes of "val" and then writing all four bytes to musb_writel(). The last byte is always going to be garbage. Zero out the last bytes instead. Fixes: 550a7375fe72 ("USB: Add MUSB and TUSB support") Signed-off-by: Dan Carpenter Cc: stable Link: https://lore.kernel.org/r/20210916135737.GI25094@kili Signed-off-by: Greg Kroah-Hartman --- drivers/usb/musb/tusb6010.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/musb/tusb6010.c b/drivers/usb/musb/tusb6010.c index 39453287b5c36..4ecfbf6bb1fa8 100644 --- a/drivers/usb/musb/tusb6010.c +++ b/drivers/usb/musb/tusb6010.c @@ -190,6 +190,7 @@ tusb_fifo_write_unaligned(void __iomem *fifo, const u8 *buf, u16 len) } if (len > 0) { /* Write the rest 1 - 3 bytes to FIFO */ + val = 0; memcpy(&val, buf, len); musb_writel(fifo, 0, val); } From 4abc591d6ebff94327daaed9dce317bf7982857f Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 23 Sep 2021 19:18:37 -0500 Subject: [PATCH 0947/5344] cifs: fix incorrect check for null pointer in header_assemble commit 9ed38fd4a15417cac83967360cf20b853bfab9b6 upstream. Although very unlikely that the tlink pointer would be null in this case, get_next_mid function can in theory return null (but not an error) so need to check for null (not for IS_ERR, which can not be returned here). Address warning: fs/smbfs_client/connect.c:2392 cifs_match_super() warn: 'tlink' isn't an ERR_PTR Pointed out by Dan Carpenter via smatch code analysis tool CC: stable@vger.kernel.org Reported-by: Dan Carpenter Acked-by: Ronnie Sahlberg Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/connect.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index da0720f41ebcb..86bdebd2ece65 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3691,9 +3691,10 @@ cifs_match_super(struct super_block *sb, void *data) spin_lock(&cifs_tcp_ses_lock); cifs_sb = CIFS_SB(sb); tlink = cifs_get_tlink(cifs_sb_master_tlink(cifs_sb)); - if (IS_ERR(tlink)) { + if (tlink == NULL) { + /* can not match superblock if tlink were ever null */ spin_unlock(&cifs_tcp_ses_lock); - return rc; + return 0; } tcon = tlink_tcon(tlink); ses = tcon->ses; From 245df652418b5606b101c72f4cb71ce3a692fb3a Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 20 Sep 2021 18:15:11 +0200 Subject: [PATCH 0948/5344] xen/x86: fix PV trap handling on secondary processors commit 0594c58161b6e0f3da8efa9c6e3d4ba52b652717 upstream. The initial observation was that in PV mode under Xen 32-bit user space didn't work anymore. Attempts of system calls ended in #GP(0x402). All of the sudden the vector 0x80 handler was not in place anymore. As it turns out up to 5.13 redundant initialization did occur: Once from cpu_initialize_context() (through its VCPUOP_initialise hypercall) and a 2nd time while each CPU was brought fully up. This 2nd initialization is now gone, uncovering that the 1st one was flawed: Unlike for the set_trap_table hypercall, a full virtual IDT needs to be specified here; the "vector" fields of the individual entries are of no interest. With many (kernel) IDT entries still(?) (i.e. at that point at least) empty, the syscall vector 0x80 ended up in slot 0x20 of the virtual IDT, thus becoming the domain's handler for vector 0x20. Make xen_convert_trap_info() fit for either purpose, leveraging the fact that on the xen_copy_trap_info() path the table starts out zero-filled. This includes moving out the writing of the sentinel, which would also have lead to a buffer overrun in the xen_copy_trap_info() case if all (kernel) IDT entries were populated. Convert the writing of the sentinel to clearing of the entire table entry rather than just the address field. (I didn't bother trying to identify the commit which uncovered the issue in 5.14; the commit named below is the one which actually introduced the bad code.) Fixes: f87e4cac4f4e ("xen: SMP guest support") Cc: stable@vger.kernel.org Signed-off-by: Jan Beulich Reviewed-by: Boris Ostrovsky Link: https://lore.kernel.org/r/7a266932-092e-b68f-f2bb-1473b61adc6e@suse.com Signed-off-by: Juergen Gross Signed-off-by: Greg Kroah-Hartman --- arch/x86/xen/enlighten_pv.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index b99074ca5e686..65cf405cd9753 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -727,8 +727,8 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g) preempt_enable(); } -static void xen_convert_trap_info(const struct desc_ptr *desc, - struct trap_info *traps) +static unsigned xen_convert_trap_info(const struct desc_ptr *desc, + struct trap_info *traps, bool full) { unsigned in, out, count; @@ -738,17 +738,18 @@ static void xen_convert_trap_info(const struct desc_ptr *desc, for (in = out = 0; in < count; in++) { gate_desc *entry = (gate_desc *)(desc->address) + in; - if (cvt_gate_to_trap(in, entry, &traps[out])) + if (cvt_gate_to_trap(in, entry, &traps[out]) || full) out++; } - traps[out].address = 0; + + return out; } void xen_copy_trap_info(struct trap_info *traps) { const struct desc_ptr *desc = this_cpu_ptr(&idt_desc); - xen_convert_trap_info(desc, traps); + xen_convert_trap_info(desc, traps, true); } /* Load a new IDT into Xen. In principle this can be per-CPU, so we @@ -758,6 +759,7 @@ static void xen_load_idt(const struct desc_ptr *desc) { static DEFINE_SPINLOCK(lock); static struct trap_info traps[257]; + unsigned out; trace_xen_cpu_load_idt(desc); @@ -765,7 +767,8 @@ static void xen_load_idt(const struct desc_ptr *desc) memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc)); - xen_convert_trap_info(desc, traps); + out = xen_convert_trap_info(desc, traps, false); + memset(&traps[out], 0, sizeof(traps[0])); xen_mc_flush(); if (HYPERVISOR_set_trap_table(traps)) From 0bae135f0fba0c800945f0f51d90d2d53669e481 Mon Sep 17 00:00:00 2001 From: Ondrej Zary Date: Mon, 13 Sep 2021 23:01:06 +0200 Subject: [PATCH 0949/5344] usb-storage: Add quirk for ScanLogic SL11R-IDE older than 2.6c commit b55d37ef6b7db3eda9b4495a8d9b0a944ee8c67d upstream. ScanLogic SL11R-IDE with firmware older than 2.6c (the latest one) has broken tag handling, preventing the device from working at all: usb 1-1: new full-speed USB device number 2 using uhci_hcd usb 1-1: New USB device found, idVendor=04ce, idProduct=0002, bcdDevice= 2.60 usb 1-1: New USB device strings: Mfr=1, Product=1, SerialNumber=0 usb 1-1: Product: USB Device usb 1-1: Manufacturer: USB Device usb-storage 1-1:1.0: USB Mass Storage device detected scsi host2: usb-storage 1-1:1.0 usbcore: registered new interface driver usb-storage usb 1-1: reset full-speed USB device number 2 using uhci_hcd usb 1-1: reset full-speed USB device number 2 using uhci_hcd usb 1-1: reset full-speed USB device number 2 using uhci_hcd usb 1-1: reset full-speed USB device number 2 using uhci_hcd Add US_FL_BULK_IGNORE_TAG to fix it. Also update my e-mail address. 2.6c is the only firmware that claims Linux compatibility. The firmware can be upgraded using ezotgdbg utility: https://github.com/asciilifeform/ezotgdbg Acked-by: Alan Stern Signed-off-by: Ondrej Zary Cc: stable Link: https://lore.kernel.org/r/20210913210106.12717-1-linux@zary.sk Signed-off-by: Greg Kroah-Hartman --- drivers/usb/storage/unusual_devs.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h index 861153d294b67..7442793fe0502 100644 --- a/drivers/usb/storage/unusual_devs.h +++ b/drivers/usb/storage/unusual_devs.h @@ -416,9 +416,16 @@ UNUSUAL_DEV( 0x04cb, 0x0100, 0x0000, 0x2210, USB_SC_UFI, USB_PR_DEVICE, NULL, US_FL_FIX_INQUIRY | US_FL_SINGLE_LUN), /* - * Reported by Ondrej Zary + * Reported by Ondrej Zary * The device reports one sector more and breaks when that sector is accessed + * Firmwares older than 2.6c (the latest one and the only that claims Linux + * support) have also broken tag handling */ +UNUSUAL_DEV( 0x04ce, 0x0002, 0x0000, 0x026b, + "ScanLogic", + "SL11R-IDE", + USB_SC_DEVICE, USB_PR_DEVICE, NULL, + US_FL_FIX_CAPACITY | US_FL_BULK_IGNORE_TAG), UNUSUAL_DEV( 0x04ce, 0x0002, 0x026c, 0x026c, "ScanLogic", "SL11R-IDE", From 2f3bdcea4f34ac6c3a40f6e41f1801e6aaa0aeff Mon Sep 17 00:00:00 2001 From: Uwe Brandt Date: Tue, 21 Sep 2021 19:54:46 +0200 Subject: [PATCH 0950/5344] USB: serial: cp210x: add ID for GW Instek GDM-834x Digital Multimeter commit 3bd18ba7d859eb1fbef3beb1e80c24f6f7d7596c upstream. Add the USB serial device ID for the GW Instek GDM-834x Digital Multimeter. Signed-off-by: Uwe Brandt Link: https://lore.kernel.org/r/YUxFl3YUCPGJZd8Y@hovoldconsulting.com Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/cp210x.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c index 8e5878ec656d0..004b6598706b1 100644 --- a/drivers/usb/serial/cp210x.c +++ b/drivers/usb/serial/cp210x.c @@ -234,6 +234,7 @@ static const struct usb_device_id id_table[] = { { USB_DEVICE(0x1FB9, 0x0602) }, /* Lake Shore Model 648 Magnet Power Supply */ { USB_DEVICE(0x1FB9, 0x0700) }, /* Lake Shore Model 737 VSM Controller */ { USB_DEVICE(0x1FB9, 0x0701) }, /* Lake Shore Model 776 Hall Matrix */ + { USB_DEVICE(0x2184, 0x0030) }, /* GW Instek GDM-834x Digital Multimeter */ { USB_DEVICE(0x2626, 0xEA60) }, /* Aruba Networks 7xxx USB Serial Console */ { USB_DEVICE(0x3195, 0xF190) }, /* Link Instruments MSO-19 */ { USB_DEVICE(0x3195, 0xF280) }, /* Link Instruments MSO-28 */ From e32ab38d1e9e96dfe24b2b2d2cbb63c4e6b6db2b Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 7 Sep 2021 10:23:18 +0200 Subject: [PATCH 0951/5344] USB: cdc-acm: fix minor-number release commit 91fac0741d4817945c6ee0a17591421e7f5ecb86 upstream. If the driver runs out of minor numbers it would release minor 0 and allow another device to claim the minor while still in use. Fortunately, registering the tty class device of the second device would fail (with a stack dump) due to the sysfs name collision so no memory is leaked. Fixes: cae2bc768d17 ("usb: cdc-acm: Decrement tty port's refcount if probe() fail") Cc: stable@vger.kernel.org # 4.19 Cc: Jaejoong Kim Acked-by: Oliver Neukum Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20210907082318.7757-1-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-acm.c | 7 +++++-- drivers/usb/class/cdc-acm.h | 2 ++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index 10cfa7d089e9f..5dc8827ede7e8 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -733,7 +733,8 @@ static void acm_port_destruct(struct tty_port *port) { struct acm *acm = container_of(port, struct acm, port); - acm_release_minor(acm); + if (acm->minor != ACM_MINOR_INVALID) + acm_release_minor(acm); usb_put_intf(acm->control); kfree(acm->country_codes); kfree(acm); @@ -1364,8 +1365,10 @@ static int acm_probe(struct usb_interface *intf, usb_get_intf(acm->control); /* undone in destruct() */ minor = acm_alloc_minor(acm); - if (minor < 0) + if (minor < 0) { + acm->minor = ACM_MINOR_INVALID; goto alloc_fail1; + } acm->minor = minor; acm->dev = usb_dev; diff --git a/drivers/usb/class/cdc-acm.h b/drivers/usb/class/cdc-acm.h index b95ff769072e7..ef7fe5eacff69 100644 --- a/drivers/usb/class/cdc-acm.h +++ b/drivers/usb/class/cdc-acm.h @@ -22,6 +22,8 @@ #define ACM_TTY_MAJOR 166 #define ACM_TTY_MINORS 256 +#define ACM_MINOR_INVALID ACM_TTY_MINORS + /* * Requests. */ From b238ff9c44b9d8c4d02e44472d5b3965d28f5472 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Mon, 30 Aug 2021 12:51:46 -0700 Subject: [PATCH 0952/5344] binder: make sure fd closes complete commit 5fdb55c1ac9585eb23bb2541d5819224429e103d upstream. During BC_FREE_BUFFER processing, the BINDER_TYPE_FDA object cleanup may close 1 or more fds. The close operations are completed using the task work mechanism -- which means the thread needs to return to userspace or the file object may never be dereferenced -- which can lead to hung processes. Force the binder thread back to userspace if an fd is closed during BC_FREE_BUFFER handling. Fixes: 80cd795630d6 ("binder: fix use-after-free due to ksys_close() during fdget()") Cc: stable Reviewed-by: Martijn Coenen Acked-by: Christian Brauner Signed-off-by: Todd Kjos Link: https://lore.kernel.org/r/20210830195146.587206-1-tkjos@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 9316a7910389f..721218a0f7bac 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2241,6 +2241,7 @@ static void binder_deferred_fd_close(int fd) } static void binder_transaction_buffer_release(struct binder_proc *proc, + struct binder_thread *thread, struct binder_buffer *buffer, binder_size_t failed_at, bool is_failure) @@ -2400,8 +2401,16 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, &proc->alloc, &fd, buffer, offset, sizeof(fd)); WARN_ON(err); - if (!err) + if (!err) { binder_deferred_fd_close(fd); + /* + * Need to make sure the thread goes + * back to userspace to complete the + * deferred close + */ + if (thread) + thread->looper_need_return = true; + } } } break; default: @@ -3471,7 +3480,7 @@ static void binder_transaction(struct binder_proc *proc, err_copy_data_failed: binder_free_txn_fixups(t); trace_binder_transaction_failed_buffer_release(t->buffer); - binder_transaction_buffer_release(target_proc, t->buffer, + binder_transaction_buffer_release(target_proc, NULL, t->buffer, buffer_offset, true); if (target_node) binder_dec_node_tmpref(target_node); @@ -3548,7 +3557,9 @@ static void binder_transaction(struct binder_proc *proc, * Cleanup buffer and free it. */ static void -binder_free_buf(struct binder_proc *proc, struct binder_buffer *buffer) +binder_free_buf(struct binder_proc *proc, + struct binder_thread *thread, + struct binder_buffer *buffer) { binder_inner_proc_lock(proc); if (buffer->transaction) { @@ -3576,7 +3587,7 @@ binder_free_buf(struct binder_proc *proc, struct binder_buffer *buffer) binder_node_inner_unlock(buf_node); } trace_binder_transaction_buffer_release(buffer); - binder_transaction_buffer_release(proc, buffer, 0, false); + binder_transaction_buffer_release(proc, thread, buffer, 0, false); binder_alloc_free_buf(&proc->alloc, buffer); } @@ -3777,7 +3788,7 @@ static int binder_thread_write(struct binder_proc *proc, proc->pid, thread->pid, (u64)data_ptr, buffer->debug_id, buffer->transaction ? "active" : "finished"); - binder_free_buf(proc, buffer); + binder_free_buf(proc, thread, buffer); break; } @@ -4465,7 +4476,7 @@ static int binder_thread_read(struct binder_proc *proc, buffer->transaction = NULL; binder_cleanup_transaction(t, "fd fixups failed", BR_FAILED_REPLY); - binder_free_buf(proc, buffer); + binder_free_buf(proc, thread, buffer); binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, "%d:%d %stransaction %d fd fixups failed %d/%d, line %d\n", proc->pid, thread->pid, From 78100e81fb0f025762708d2c81946bee52102d3b Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 6 Sep 2021 14:45:38 +0200 Subject: [PATCH 0953/5344] staging: greybus: uart: fix tty use after free commit 92dc0b1f46e12cfabd28d709bb34f7a39431b44f upstream. User space can hold a tty open indefinitely and tty drivers must not release the underlying structures until the last user is gone. Switch to using the tty-port reference counter to manage the life time of the greybus tty state to avoid use after free after a disconnect. Fixes: a18e15175708 ("greybus: more uart work") Cc: stable@vger.kernel.org # 4.9 Reviewed-by: Alex Elder Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20210906124538.22358-1-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/staging/greybus/uart.c | 62 ++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 30 deletions(-) diff --git a/drivers/staging/greybus/uart.c b/drivers/staging/greybus/uart.c index fc1bd68889c91..0e6bebc20695d 100644 --- a/drivers/staging/greybus/uart.c +++ b/drivers/staging/greybus/uart.c @@ -789,6 +789,17 @@ static void gb_tty_port_shutdown(struct tty_port *port) gbphy_runtime_put_autosuspend(gb_tty->gbphy_dev); } +static void gb_tty_port_destruct(struct tty_port *port) +{ + struct gb_tty *gb_tty = container_of(port, struct gb_tty, port); + + if (gb_tty->minor != GB_NUM_MINORS) + release_minor(gb_tty); + kfifo_free(&gb_tty->write_fifo); + kfree(gb_tty->buffer); + kfree(gb_tty); +} + static const struct tty_operations gb_ops = { .install = gb_tty_install, .open = gb_tty_open, @@ -814,6 +825,7 @@ static const struct tty_port_operations gb_port_ops = { .dtr_rts = gb_tty_dtr_rts, .activate = gb_tty_port_activate, .shutdown = gb_tty_port_shutdown, + .destruct = gb_tty_port_destruct, }; static int gb_uart_probe(struct gbphy_device *gbphy_dev, @@ -826,17 +838,11 @@ static int gb_uart_probe(struct gbphy_device *gbphy_dev, int retval; int minor; - gb_tty = kzalloc(sizeof(*gb_tty), GFP_KERNEL); - if (!gb_tty) - return -ENOMEM; - connection = gb_connection_create(gbphy_dev->bundle, le16_to_cpu(gbphy_dev->cport_desc->id), gb_uart_request_handler); - if (IS_ERR(connection)) { - retval = PTR_ERR(connection); - goto exit_tty_free; - } + if (IS_ERR(connection)) + return PTR_ERR(connection); max_payload = gb_operation_get_payload_size_max(connection); if (max_payload < sizeof(struct gb_uart_send_data_request)) { @@ -844,13 +850,23 @@ static int gb_uart_probe(struct gbphy_device *gbphy_dev, goto exit_connection_destroy; } + gb_tty = kzalloc(sizeof(*gb_tty), GFP_KERNEL); + if (!gb_tty) { + retval = -ENOMEM; + goto exit_connection_destroy; + } + + tty_port_init(&gb_tty->port); + gb_tty->port.ops = &gb_port_ops; + gb_tty->minor = GB_NUM_MINORS; + gb_tty->buffer_payload_max = max_payload - sizeof(struct gb_uart_send_data_request); gb_tty->buffer = kzalloc(gb_tty->buffer_payload_max, GFP_KERNEL); if (!gb_tty->buffer) { retval = -ENOMEM; - goto exit_connection_destroy; + goto exit_put_port; } INIT_WORK(&gb_tty->tx_work, gb_uart_tx_write_work); @@ -858,7 +874,7 @@ static int gb_uart_probe(struct gbphy_device *gbphy_dev, retval = kfifo_alloc(&gb_tty->write_fifo, GB_UART_WRITE_FIFO_SIZE, GFP_KERNEL); if (retval) - goto exit_buf_free; + goto exit_put_port; gb_tty->credits = GB_UART_FIRMWARE_CREDITS; init_completion(&gb_tty->credits_complete); @@ -872,7 +888,7 @@ static int gb_uart_probe(struct gbphy_device *gbphy_dev, } else { retval = minor; } - goto exit_kfifo_free; + goto exit_put_port; } gb_tty->minor = minor; @@ -881,9 +897,6 @@ static int gb_uart_probe(struct gbphy_device *gbphy_dev, init_waitqueue_head(&gb_tty->wioctl); mutex_init(&gb_tty->mutex); - tty_port_init(&gb_tty->port); - gb_tty->port.ops = &gb_port_ops; - gb_tty->connection = connection; gb_tty->gbphy_dev = gbphy_dev; gb_connection_set_data(connection, gb_tty); @@ -891,7 +904,7 @@ static int gb_uart_probe(struct gbphy_device *gbphy_dev, retval = gb_connection_enable_tx(connection); if (retval) - goto exit_release_minor; + goto exit_put_port; send_control(gb_tty, gb_tty->ctrlout); @@ -918,16 +931,10 @@ static int gb_uart_probe(struct gbphy_device *gbphy_dev, exit_connection_disable: gb_connection_disable(connection); -exit_release_minor: - release_minor(gb_tty); -exit_kfifo_free: - kfifo_free(&gb_tty->write_fifo); -exit_buf_free: - kfree(gb_tty->buffer); +exit_put_port: + tty_port_put(&gb_tty->port); exit_connection_destroy: gb_connection_destroy(connection); -exit_tty_free: - kfree(gb_tty); return retval; } @@ -958,15 +965,10 @@ static void gb_uart_remove(struct gbphy_device *gbphy_dev) gb_connection_disable_rx(connection); tty_unregister_device(gb_tty_driver, gb_tty->minor); - /* FIXME - free transmit / receive buffers */ - gb_connection_disable(connection); - tty_port_destroy(&gb_tty->port); gb_connection_destroy(connection); - release_minor(gb_tty); - kfifo_free(&gb_tty->write_fifo); - kfree(gb_tty->buffer); - kfree(gb_tty); + + tty_port_put(&gb_tty->port); } static int gb_tty_init(void) From 53bb1c2375ea3dd04dde3decf47f8bbbf5463703 Mon Sep 17 00:00:00 2001 From: Julian Sikorski Date: Mon, 13 Sep 2021 20:14:55 +0200 Subject: [PATCH 0954/5344] Re-enable UAS for LaCie Rugged USB3-FW with fk quirk commit ce1c42b4dacfe7d71c852d8bf3371067ccba865c upstream. Further testing has revealed that LaCie Rugged USB3-FW does work with uas as long as US_FL_NO_REPORT_OPCODES and US_FL_NO_SAME are enabled. Link: https://lore.kernel.org/linux-usb/2167ea48-e273-a336-a4e0-10a4e883e75e@redhat.com/ Cc: stable Suggested-by: Hans de Goede Reviewed-by: Hans de Goede Acked-by: Oliver Neukum Signed-off-by: Julian Sikorski Link: https://lore.kernel.org/r/20210913181454.7365-1-belegdol+github@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/storage/unusual_uas.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/storage/unusual_uas.h b/drivers/usb/storage/unusual_uas.h index c7db6c943ba51..2f72753c3e225 100644 --- a/drivers/usb/storage/unusual_uas.h +++ b/drivers/usb/storage/unusual_uas.h @@ -50,7 +50,7 @@ UNUSUAL_DEV(0x059f, 0x1061, 0x0000, 0x9999, "LaCie", "Rugged USB3-FW", USB_SC_DEVICE, USB_PR_DEVICE, NULL, - US_FL_IGNORE_UAS), + US_FL_NO_REPORT_OPCODES | US_FL_NO_SAME), /* * Apricorn USB3 dongle sometimes returns "USBSUSBSUSBS" in response to SCSI From 364bbcde8333049749d24ef57e8292155c3178dd Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Thu, 9 Sep 2021 12:11:58 +0530 Subject: [PATCH 0955/5344] usb: core: hcd: Add support for deferring roothub registration commit 58877b0824da15698bd85a0a9dbfa8c354e6ecb7 upstream. It has been observed with certain PCIe USB cards (like Inateck connected to AM64 EVM or J7200 EVM) that as soon as the primary roothub is registered, port status change is handled even before xHC is running leading to cold plug USB devices not detected. For such cases, registering both the root hubs along with the second HCD is required. Add support for deferring roothub registration in usb_add_hcd(), so that both primary and secondary roothubs are registered along with the second HCD. CC: stable@vger.kernel.org # 5.4+ Suggested-by: Mathias Nyman Tested-by: Chris Chiu Acked-by: Alan Stern Signed-off-by: Kishon Vijay Abraham I Link: https://lore.kernel.org/r/20210909064200.16216-2-kishon@ti.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hcd.c | 29 +++++++++++++++++++++++------ include/linux/usb/hcd.h | 2 ++ 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index d0f45600b6698..48ff9c66ae46d 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -2636,6 +2636,7 @@ int usb_add_hcd(struct usb_hcd *hcd, { int retval; struct usb_device *rhdev; + struct usb_hcd *shared_hcd; if (!hcd->skip_phy_initialization && usb_hcd_is_primary_hcd(hcd)) { hcd->phy_roothub = usb_phy_roothub_alloc(hcd->self.sysdev); @@ -2792,13 +2793,26 @@ int usb_add_hcd(struct usb_hcd *hcd, goto err_hcd_driver_start; } + /* starting here, usbcore will pay attention to the shared HCD roothub */ + shared_hcd = hcd->shared_hcd; + if (!usb_hcd_is_primary_hcd(hcd) && shared_hcd && HCD_DEFER_RH_REGISTER(shared_hcd)) { + retval = register_root_hub(shared_hcd); + if (retval != 0) + goto err_register_root_hub; + + if (shared_hcd->uses_new_polling && HCD_POLL_RH(shared_hcd)) + usb_hcd_poll_rh_status(shared_hcd); + } + /* starting here, usbcore will pay attention to this root hub */ - retval = register_root_hub(hcd); - if (retval != 0) - goto err_register_root_hub; + if (!HCD_DEFER_RH_REGISTER(hcd)) { + retval = register_root_hub(hcd); + if (retval != 0) + goto err_register_root_hub; - if (hcd->uses_new_polling && HCD_POLL_RH(hcd)) - usb_hcd_poll_rh_status(hcd); + if (hcd->uses_new_polling && HCD_POLL_RH(hcd)) + usb_hcd_poll_rh_status(hcd); + } return retval; @@ -2841,6 +2855,7 @@ EXPORT_SYMBOL_GPL(usb_add_hcd); void usb_remove_hcd(struct usb_hcd *hcd) { struct usb_device *rhdev = hcd->self.root_hub; + bool rh_registered; dev_info(hcd->self.controller, "remove, state %x\n", hcd->state); @@ -2851,6 +2866,7 @@ void usb_remove_hcd(struct usb_hcd *hcd) dev_dbg(hcd->self.controller, "roothub graceful disconnect\n"); spin_lock_irq (&hcd_root_hub_lock); + rh_registered = hcd->rh_registered; hcd->rh_registered = 0; spin_unlock_irq (&hcd_root_hub_lock); @@ -2860,7 +2876,8 @@ void usb_remove_hcd(struct usb_hcd *hcd) cancel_work_sync(&hcd->died_work); mutex_lock(&usb_bus_idr_lock); - usb_disconnect(&rhdev); /* Sets rhdev to NULL */ + if (rh_registered) + usb_disconnect(&rhdev); /* Sets rhdev to NULL */ mutex_unlock(&usb_bus_idr_lock); /* diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index 712b2a603645f..c0eb85b2981e0 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -124,6 +124,7 @@ struct usb_hcd { #define HCD_FLAG_RH_RUNNING 5 /* root hub is running? */ #define HCD_FLAG_DEAD 6 /* controller has died? */ #define HCD_FLAG_INTF_AUTHORIZED 7 /* authorize interfaces? */ +#define HCD_FLAG_DEFER_RH_REGISTER 8 /* Defer roothub registration */ /* The flags can be tested using these macros; they are likely to * be slightly faster than test_bit(). @@ -134,6 +135,7 @@ struct usb_hcd { #define HCD_WAKEUP_PENDING(hcd) ((hcd)->flags & (1U << HCD_FLAG_WAKEUP_PENDING)) #define HCD_RH_RUNNING(hcd) ((hcd)->flags & (1U << HCD_FLAG_RH_RUNNING)) #define HCD_DEAD(hcd) ((hcd)->flags & (1U << HCD_FLAG_DEAD)) +#define HCD_DEFER_RH_REGISTER(hcd) ((hcd)->flags & (1U << HCD_FLAG_DEFER_RH_REGISTER)) /* * Specifies if interfaces are authorized by default From 7d8db355efa918bfc10e92bd3cb040176f8b35f6 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 17 Sep 2021 11:18:47 +0200 Subject: [PATCH 0956/5344] USB: serial: mos7840: remove duplicated 0xac24 device ID commit 211f323768a25b30c106fd38f15a0f62c7c2b5f4 upstream. 0xac24 device ID is already defined and used via BANDB_DEVICE_ID_USO9ML2_4. Remove the duplicate from the list. Fixes: 27f1281d5f72 ("USB: serial: Extra device/vendor ID for mos7840 driver") Signed-off-by: Krzysztof Kozlowski Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/mos7840.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/usb/serial/mos7840.c b/drivers/usb/serial/mos7840.c index 2b8a0d4b66fce..84b6093ed1d28 100644 --- a/drivers/usb/serial/mos7840.c +++ b/drivers/usb/serial/mos7840.c @@ -114,7 +114,6 @@ #define BANDB_DEVICE_ID_USOPTL4_2P 0xBC02 #define BANDB_DEVICE_ID_USOPTL4_4 0xAC44 #define BANDB_DEVICE_ID_USOPTL4_4P 0xBC03 -#define BANDB_DEVICE_ID_USOPTL2_4 0xAC24 /* This driver also supports * ATEN UC2324 device using Moschip MCS7840 @@ -196,7 +195,6 @@ static const struct usb_device_id id_table[] = { {USB_DEVICE(USB_VENDOR_ID_BANDB, BANDB_DEVICE_ID_USOPTL4_2P)}, {USB_DEVICE(USB_VENDOR_ID_BANDB, BANDB_DEVICE_ID_USOPTL4_4)}, {USB_DEVICE(USB_VENDOR_ID_BANDB, BANDB_DEVICE_ID_USOPTL4_4P)}, - {USB_DEVICE(USB_VENDOR_ID_BANDB, BANDB_DEVICE_ID_USOPTL2_4)}, {USB_DEVICE(USB_VENDOR_ID_ATENINTL, ATENINTL_DEVICE_ID_UC2324)}, {USB_DEVICE(USB_VENDOR_ID_ATENINTL, ATENINTL_DEVICE_ID_UC2322)}, {USB_DEVICE(USB_VENDOR_ID_MOXA, MOXA_DEVICE_ID_2210)}, From b9b020025421ed936a03a9cb78b186be5158dfa8 Mon Sep 17 00:00:00 2001 From: Carlo Lobrano Date: Fri, 3 Sep 2021 14:39:13 +0200 Subject: [PATCH 0957/5344] USB: serial: option: add Telit LN920 compositions commit 7bb057134d609b9c038a00b6876cf0d37d0118ce upstream. This patch adds the following Telit LN920 compositions: 0x1060: tty, adb, rmnet, tty, tty, tty, tty 0x1061: tty, adb, mbim, tty, tty, tty, tty 0x1062: rndis, tty, adb, tty, tty, tty, tty 0x1063: tty, adb, ecm, tty, tty, tty, tty Signed-off-by: Carlo Lobrano Link: https://lore.kernel.org/r/20210903123913.1086513-1-c.lobrano@gmail.com Reviewed-by: Daniele Palmas Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/option.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index d42ca13569965..3e3a5ffc5a41a 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1205,6 +1205,14 @@ static const struct usb_device_id option_ids[] = { .driver_info = NCTRL(0) | RSVD(1) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1056, 0xff), /* Telit FD980 */ .driver_info = NCTRL(2) | RSVD(3) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1060, 0xff), /* Telit LN920 (rmnet) */ + .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1061, 0xff), /* Telit LN920 (MBIM) */ + .driver_info = NCTRL(0) | RSVD(1) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1062, 0xff), /* Telit LN920 (RNDIS) */ + .driver_info = NCTRL(2) | RSVD(3) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1063, 0xff), /* Telit LN920 (ECM) */ + .driver_info = NCTRL(0) | RSVD(1) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910), .driver_info = NCTRL(0) | RSVD(1) | RSVD(3) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910_DUAL_MODEM), From 37711c3da659630b53cdede5c0c5ba9e8697e398 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 17 Sep 2021 11:18:48 +0200 Subject: [PATCH 0958/5344] USB: serial: option: remove duplicate USB device ID commit 1ca200a8c6f079950a04ea3c3380fe8cf78e95a2 upstream. The device ZTE 0x0094 is already on the list. Signed-off-by: Krzysztof Kozlowski Fixes: b9e44fe5ecda ("USB: option: cleanup zte 3g-dongle's pid in option.c") Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/option.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 3e3a5ffc5a41a..5965957e7663c 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1658,7 +1658,6 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0060, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0070, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0073, 0xff, 0xff, 0xff) }, - { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0094, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0130, 0xff, 0xff, 0xff), .driver_info = RSVD(1) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0133, 0xff, 0xff, 0xff), From 574f754c327acf978af84fd4a9cc8d885b526f3a Mon Sep 17 00:00:00 2001 From: Slark Xiao Date: Fri, 17 Sep 2021 19:01:06 +0800 Subject: [PATCH 0959/5344] USB: serial: option: add device id for Foxconn T99W265 commit 9e3eed534f8235a4a596a9dae5b8a6425d81ea1a upstream. Adding support for Foxconn device T99W265 for enumeration with PID 0xe0db. usb-devices output for 0xe0db T: Bus=04 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 19 Spd=5000 MxCh= 0 D: Ver= 3.20 Cls=ef(misc ) Sub=02 Prot=01 MxPS= 9 #Cfgs= 1 P: Vendor=0489 ProdID=e0db Rev=05.04 S: Manufacturer=Microsoft S: Product=Generic Mobile Broadband Adapter S: SerialNumber=6c50f452 C: #Ifs= 5 Cfg#= 1 Atr=a0 MxPwr=896mA I: If#=0x0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=0e Prot=00 Driver=cdc_mbim I: If#=0x1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim I: If#=0x2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option I: If#=0x3 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=ff Driver=(none) I: If#=0x4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option if0/1: MBIM, if2:Diag, if3:GNSS, if4: Modem Signed-off-by: Slark Xiao Link: https://lore.kernel.org/r/20210917110106.9852-1-slark_xiao@163.com [ johan: use USB_DEVICE_INTERFACE_CLASS(), amend comment ] Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/option.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 5965957e7663c..82016d9781460 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -2075,6 +2075,8 @@ static const struct usb_device_id option_ids[] = { .driver_info = RSVD(0) | RSVD(1) | RSVD(6) }, { USB_DEVICE(0x0489, 0xe0b5), /* Foxconn T77W968 ESIM */ .driver_info = RSVD(0) | RSVD(1) | RSVD(6) }, + { USB_DEVICE_INTERFACE_CLASS(0x0489, 0xe0db, 0xff), /* Foxconn T99W265 MBIM */ + .driver_info = RSVD(3) }, { USB_DEVICE(0x1508, 0x1001), /* Fibocom NL668 (IOT version) */ .driver_info = RSVD(4) | RSVD(5) | RSVD(6) }, { USB_DEVICE(0x2cb7, 0x0104), /* Fibocom NL678 series */ From a925008df03095bd0ad30cec66d93f80cc994a56 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 6 Sep 2021 21:35:48 +0900 Subject: [PATCH 0960/5344] mcb: fix error handling in mcb_alloc_bus() commit 25a1433216489de4abc889910f744e952cb6dbae upstream. There are two bugs: 1) If ida_simple_get() fails then this code calls put_device(carrier) but we haven't yet called get_device(carrier) and probably that leads to a use after free. 2) After device_initialize() then we need to use put_device() to release the bus. This will free the internal resources tied to the device and call mcb_free_bus() which will free the rest. Fixes: 5d9e2ab9fea4 ("mcb: Implement bus->dev.release callback") Fixes: 18d288198099 ("mcb: Correctly initialize the bus's device") Cc: stable@vger.kernel.org Signed-off-by: Dan Carpenter Signed-off-by: Johannes Thumshirn Link: https://lore.kernel.org/r/32e160cf6864ce77f9d62948338e24db9fd8ead9.1630931319.git.johannes.thumshirn@wdc.com Signed-off-by: Greg Kroah-Hartman --- drivers/mcb/mcb-core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/mcb/mcb-core.c b/drivers/mcb/mcb-core.c index b72e82efaee52..c799bb81ab03d 100644 --- a/drivers/mcb/mcb-core.c +++ b/drivers/mcb/mcb-core.c @@ -277,8 +277,8 @@ struct mcb_bus *mcb_alloc_bus(struct device *carrier) bus_nr = ida_simple_get(&mcb_ida, 0, 0, GFP_KERNEL); if (bus_nr < 0) { - rc = bus_nr; - goto err_free; + kfree(bus); + return ERR_PTR(bus_nr); } bus->bus_nr = bus_nr; @@ -293,12 +293,12 @@ struct mcb_bus *mcb_alloc_bus(struct device *carrier) dev_set_name(&bus->dev, "mcb:%d", bus_nr); rc = device_add(&bus->dev); if (rc) - goto err_free; + goto err_put; return bus; -err_free: - put_device(carrier); - kfree(bus); + +err_put: + put_device(&bus->dev); return ERR_PTR(rc); } EXPORT_SYMBOL_GPL(mcb_alloc_bus); From 9c8ea171796dbf55b9947064ca9b08e1ec66cfbb Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 21 Sep 2021 22:35:30 +0800 Subject: [PATCH 0961/5344] erofs: fix up erofs_lookup tracepoint commit 93368aab0efc87288cac65e99c9ed2e0ffc9e7d0 upstream. Fix up a misuse that the filename pointer isn't always valid in the ring buffer, and we should copy the content instead. Link: https://lore.kernel.org/r/20210921143531.81356-1-hsiangkao@linux.alibaba.com Fixes: 13f06f48f7bf ("staging: erofs: support tracepoint") Cc: stable@vger.kernel.org # 4.19+ Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Signed-off-by: Greg Kroah-Hartman --- include/trace/events/erofs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/trace/events/erofs.h b/include/trace/events/erofs.h index 27f5caa6299a3..ecfad7641096e 100644 --- a/include/trace/events/erofs.h +++ b/include/trace/events/erofs.h @@ -35,20 +35,20 @@ TRACE_EVENT(erofs_lookup, TP_STRUCT__entry( __field(dev_t, dev ) __field(erofs_nid_t, nid ) - __field(const char *, name ) + __string(name, dentry->d_name.name ) __field(unsigned int, flags ) ), TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->nid = EROFS_I(dir)->nid; - __entry->name = dentry->d_name.name; + __assign_str(name, dentry->d_name.name); __entry->flags = flags; ), TP_printk("dev = (%d,%d), pnid = %llu, name:%s, flags:%x", show_dev_nid(__entry), - __entry->name, + __get_str(name), __entry->flags) ); From 7499c26ce3811108ae0c2e1c0e1ba3c748b5b669 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 16 Sep 2021 20:43:29 +0800 Subject: [PATCH 0962/5344] btrfs: prevent __btrfs_dump_space_info() to underflow its free space commit 0619b7901473c380abc05d45cf9c70bee0707db3 upstream. It's not uncommon where __btrfs_dump_space_info() gets called under over-commit situations. In that case free space would underflow as total allocated space is not enough to handle all the over-committed space. Such underflow values can sometimes cause confusion for users enabled enospc_debug mount option, and takes some seconds for developers to convert the underflow value to signed result. Just output the free space as s64 to avoid such problem. Reported-by: Eli V Link: https://lore.kernel.org/linux-btrfs/CAJtFHUSy4zgyhf-4d9T+KdJp9w=UgzC2A0V=VtmaeEpcGgm1-Q@mail.gmail.com/ CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Anand Jain Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/space-info.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 90500b6c41fc6..1cd39f6a9c3ad 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -262,9 +262,10 @@ static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info, { lockdep_assert_held(&info->lock); - btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull", + /* The free space could be negative in case of overcommit */ + btrfs_info(fs_info, "space_info %llu has %lld free, is %sfull", info->flags, - info->total_bytes - btrfs_space_info_used(info, true), + (s64)(info->total_bytes - btrfs_space_info_used(info, true)), info->full ? "" : "not "); btrfs_info(fs_info, "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu", From a16d6cefe001c0265fccf51f886ef52a54c98481 Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Thu, 9 Sep 2021 12:11:59 +0530 Subject: [PATCH 0963/5344] xhci: Set HCD flag to defer primary roothub registration commit b7a0a792f864583207c593b50fd1b752ed89f4c1 upstream. Set "HCD_FLAG_DEFER_RH_REGISTER" to hcd->flags in xhci_run() to defer registering primary roothub in usb_add_hcd(). This will make sure both primary roothub and secondary roothub will be registered along with the second HCD. This is required for cold plugged USB devices to be detected in certain PCIe USB cards (like Inateck USB card connected to AM64 EVM or J7200 EVM). CC: stable@vger.kernel.org # 5.4+ Suggested-by: Mathias Nyman Tested-by: Chris Chiu Signed-off-by: Kishon Vijay Abraham I Link: https://lore.kernel.org/r/20210909064200.16216-3-kishon@ti.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index 505da4999e208..02a2afd130eb6 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -693,6 +693,7 @@ int xhci_run(struct usb_hcd *hcd) if (ret) xhci_free_command(xhci, command); } + set_bit(HCD_FLAG_DEFER_RH_REGISTER, &hcd->flags); xhci_dbg_trace(xhci, trace_xhci_dbg_init, "Finished xhci_run for USB2 roothub"); From c418c52f3ab3700229f7cc65dee82fcbef40d70b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Sat, 11 Sep 2021 15:20:17 +0200 Subject: [PATCH 0964/5344] serial: mvebu-uart: fix driver's tx_empty callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 74e1eb3b4a1ef2e564b4bdeb6e92afe844e900de upstream. Driver's tx_empty callback should signal when the transmit shift register is empty. So when the last character has been sent. STAT_TX_FIFO_EMP bit signals only that HW transmit FIFO is empty, which happens when the last byte is loaded into transmit shift register. STAT_TX_EMP bit signals when the both HW transmit FIFO and transmit shift register are empty. So replace STAT_TX_FIFO_EMP check by STAT_TX_EMP in mvebu_uart_tx_empty() callback function. Fixes: 30530791a7a0 ("serial: mvebu-uart: initial support for Armada-3700 serial port") Cc: stable Signed-off-by: Pali Rohár Link: https://lore.kernel.org/r/20210911132017.25505-1-pali@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/mvebu-uart.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/mvebu-uart.c b/drivers/tty/serial/mvebu-uart.c index 51b4d8d1dcaca..13db15118cb94 100644 --- a/drivers/tty/serial/mvebu-uart.c +++ b/drivers/tty/serial/mvebu-uart.c @@ -164,7 +164,7 @@ static unsigned int mvebu_uart_tx_empty(struct uart_port *port) st = readl(port->membase + UART_STAT); spin_unlock_irqrestore(&port->lock, flags); - return (st & STAT_TX_FIFO_EMP) ? TIOCSER_TEMT : 0; + return (st & STAT_TX_EMP) ? TIOCSER_TEMT : 0; } static unsigned int mvebu_uart_get_mctrl(struct uart_port *port) From 76a9d31992a530725dffd77e8d4151b4e38f101e Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 17 Sep 2021 12:12:04 +0200 Subject: [PATCH 0965/5344] net: hso: fix muxed tty registration commit e8f69b16ee776da88589b5271e3f46020efc8f6c upstream. If resource allocation and registration fail for a muxed tty device (e.g. if there are no more minor numbers) the driver should not try to deregister the never-registered (or already-deregistered) tty. Fix up the error handling to avoid dereferencing a NULL pointer when attempting to remove the character device. Fixes: 72dc1c096c70 ("HSO: add option hso driver") Cc: stable@vger.kernel.org # 2.6.27 Signed-off-by: Johan Hovold Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/usb/hso.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/net/usb/hso.c b/drivers/net/usb/hso.c index 22450c4a92251..9ad1f093c4ae1 100644 --- a/drivers/net/usb/hso.c +++ b/drivers/net/usb/hso.c @@ -2704,14 +2704,14 @@ struct hso_device *hso_create_mux_serial_device(struct usb_interface *interface, serial = kzalloc(sizeof(*serial), GFP_KERNEL); if (!serial) - goto exit; + goto err_free_dev; hso_dev->port_data.dev_serial = serial; serial->parent = hso_dev; if (hso_serial_common_create (serial, 1, CTRL_URB_RX_SIZE, CTRL_URB_TX_SIZE)) - goto exit; + goto err_free_serial; serial->tx_data_length--; serial->write_data = hso_mux_serial_write_data; @@ -2727,11 +2727,9 @@ struct hso_device *hso_create_mux_serial_device(struct usb_interface *interface, /* done, return it */ return hso_dev; -exit: - if (serial) { - tty_unregister_device(tty_drv, serial->minor); - kfree(serial); - } +err_free_serial: + kfree(serial); +err_free_dev: kfree(hso_dev); return NULL; From 5c5e7092ac571a38bd9ffac4239d972b17ea2c7e Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 1 Sep 2021 18:11:33 +0100 Subject: [PATCH 0966/5344] afs: Fix incorrect triggering of sillyrename on 3rd-party invalidation [ Upstream commit 63d49d843ef5fffeea069e0ffdfbd2bf40ba01c6 ] The AFS filesystem is currently triggering the silly-rename cleanup from afs_d_revalidate() when it sees that a dentry has been changed by a third party[1]. It should not be doing this as the cleanup includes deleting the silly-rename target file on iput. Fix this by removing the places in the d_revalidate handling that validate anything other than the directory and the dirent. It probably should not be looking to validate the target inode of the dentry also. This includes removing the point in afs_d_revalidate() where the inode that a dentry used to point to was marked as being deleted (AFS_VNODE_DELETED). We don't know it got deleted. It could have been renamed or it could have hard links remaining. This was reproduced by cloning a git repo onto an afs volume on one machine, switching to another machine and doing "git status", then switching back to the first and doing "git status". The second status would show weird output due to ".git/index" getting deleted by the above mentioned mechanism. A simpler way to do it is to do: machine 1: touch a machine 2: touch b; mv -f b a machine 1: stat a on an afs volume. The bug shows up as the stat failing with ENOENT and the file server log showing that machine 1 deleted "a". Fixes: 79ddbfa500b3 ("afs: Implement sillyrename for unlink and rename") Reported-by: Markus Suvanto Signed-off-by: David Howells Tested-by: Markus Suvanto cc: linux-afs@lists.infradead.org Link: https://bugzilla.kernel.org/show_bug.cgi?id=214217#c4 [1] Link: https://lore.kernel.org/r/163111668100.283156.3851669884664475428.stgit@warthog.procyon.org.uk/ Signed-off-by: Sasha Levin --- fs/afs/dir.c | 46 +++++++--------------------------------------- 1 file changed, 7 insertions(+), 39 deletions(-) diff --git a/fs/afs/dir.c b/fs/afs/dir.c index e7494cd49ce7b..8c39533d122a5 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -977,9 +977,9 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, */ static int afs_d_revalidate_rcu(struct dentry *dentry) { - struct afs_vnode *dvnode, *vnode; + struct afs_vnode *dvnode; struct dentry *parent; - struct inode *dir, *inode; + struct inode *dir; long dir_version, de_version; _enter("%p", dentry); @@ -1009,18 +1009,6 @@ static int afs_d_revalidate_rcu(struct dentry *dentry) return -ECHILD; } - /* Check to see if the vnode referred to by the dentry still - * has a callback. - */ - if (d_really_is_positive(dentry)) { - inode = d_inode_rcu(dentry); - if (inode) { - vnode = AFS_FS_I(inode); - if (!afs_check_validity(vnode)) - return -ECHILD; - } - } - return 1; /* Still valid */ } @@ -1056,17 +1044,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) if (IS_ERR(key)) key = NULL; - if (d_really_is_positive(dentry)) { - inode = d_inode(dentry); - if (inode) { - vnode = AFS_FS_I(inode); - afs_validate(vnode, key); - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) - goto out_bad; - } - } - - /* lock down the parent dentry so we can peer at it */ + /* Hold the parent dentry so we can peer at it */ parent = dget_parent(dentry); dir = AFS_FS_I(d_inode(parent)); @@ -1075,7 +1053,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) if (test_bit(AFS_VNODE_DELETED, &dir->flags)) { _debug("%pd: parent dir deleted", dentry); - goto out_bad_parent; + goto not_found; } /* We only need to invalidate a dentry if the server's copy changed @@ -1101,12 +1079,12 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) case 0: /* the filename maps to something */ if (d_really_is_negative(dentry)) - goto out_bad_parent; + goto not_found; inode = d_inode(dentry); if (is_bad_inode(inode)) { printk("kAFS: afs_d_revalidate: %pd2 has bad inode\n", dentry); - goto out_bad_parent; + goto not_found; } vnode = AFS_FS_I(inode); @@ -1128,9 +1106,6 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) dentry, fid.unique, vnode->fid.unique, vnode->vfs_inode.i_generation); - write_seqlock(&vnode->cb_lock); - set_bit(AFS_VNODE_DELETED, &vnode->flags); - write_sequnlock(&vnode->cb_lock); goto not_found; } goto out_valid; @@ -1145,7 +1120,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) default: _debug("failed to iterate dir %pd: %d", parent, ret); - goto out_bad_parent; + goto not_found; } out_valid: @@ -1156,16 +1131,9 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) _leave(" = 1 [valid]"); return 1; - /* the dirent, if it exists, now points to a different vnode */ not_found: - spin_lock(&dentry->d_lock); - dentry->d_flags |= DCACHE_NFSFS_RENAMED; - spin_unlock(&dentry->d_lock); - -out_bad_parent: _debug("dropping dentry %pd2", dentry); dput(parent); -out_bad: key_put(key); _leave(" = 0 [bad]"); From aaeaa8dba959d42939274e7e0a9f0d8168a51c25 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 27 Aug 2021 17:53:10 +0300 Subject: [PATCH 0967/5344] platform/x86/intel: punit_ipc: Drop wrong use of ACPI_PTR() [ Upstream commit 349bff48ae0f5f8aa2075d0bdc2091a30bd634f6 ] ACPI_PTR() is more harmful than helpful. For example, in this case if CONFIG_ACPI=n, the ID table left unused which is not what we want. Instead of adding ifdeffery here and there, drop ACPI_PTR() and unused acpi.h. Fixes: fdca4f16f57d ("platform:x86: add Intel P-Unit mailbox IPC driver") Reported-by: kernel test robot Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210827145310.76239-1-andriy.shevchenko@linux.intel.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Signed-off-by: Sasha Levin --- drivers/platform/x86/intel_punit_ipc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/platform/x86/intel_punit_ipc.c b/drivers/platform/x86/intel_punit_ipc.c index ccb44f2eb2407..af3a28623e869 100644 --- a/drivers/platform/x86/intel_punit_ipc.c +++ b/drivers/platform/x86/intel_punit_ipc.c @@ -8,7 +8,6 @@ * which provide mailbox interface for power management usage. */ -#include #include #include #include @@ -335,7 +334,7 @@ static struct platform_driver intel_punit_ipc_driver = { .remove = intel_punit_ipc_remove, .driver = { .name = "intel_punit_ipc", - .acpi_match_table = ACPI_PTR(punit_ipc_acpi_ids), + .acpi_match_table = punit_ipc_acpi_ids, }, }; From acb4f8b7b2c045138263c29ee04da2bfbc1e6fb4 Mon Sep 17 00:00:00 2001 From: Claudiu Manoil Date: Fri, 17 Sep 2021 13:22:05 +0300 Subject: [PATCH 0968/5344] enetc: Fix illegal access when reading affinity_hint [ Upstream commit 7237a494decfa17d0b9d0076e6cee3235719de90 ] irq_set_affinity_hit() stores a reference to the cpumask_t parameter in the irq descriptor, and that reference can be accessed later from irq_affinity_hint_proc_show(). Since the cpu_mask parameter passed to irq_set_affinity_hit() has only temporary storage (it's on the stack memory), later accesses to it are illegal. Thus reads from the corresponding procfs affinity_hint file can result in paging request oops. The issue is fixed by the get_cpu_mask() helper, which provides a permanent storage for the cpumask_t parameter. Fixes: d4fd0404c1c9 ("enetc: Introduce basic PF and VF ENETC ethernet drivers") Signed-off-by: Claudiu Manoil Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/freescale/enetc/enetc.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index b77eaf31bd4ed..cee77326e7e85 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -1222,7 +1222,6 @@ static void enetc_clear_bdrs(struct enetc_ndev_priv *priv) static int enetc_setup_irqs(struct enetc_ndev_priv *priv) { struct pci_dev *pdev = priv->si->pdev; - cpumask_t cpu_mask; int i, j, err; for (i = 0; i < priv->bdr_int_num; i++) { @@ -1249,9 +1248,7 @@ static int enetc_setup_irqs(struct enetc_ndev_priv *priv) enetc_wr(hw, ENETC_SIMSITRV(idx), entry); } - cpumask_clear(&cpu_mask); - cpumask_set_cpu(i % num_online_cpus(), &cpu_mask); - irq_set_affinity_hint(irq, &cpu_mask); + irq_set_affinity_hint(irq, get_cpu_mask(i % num_online_cpus())); } return 0; From 1a189c9c10ccfa11884ac1ebb154bab7251e16fe Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Mon, 20 Sep 2021 02:51:52 -0400 Subject: [PATCH 0969/5344] bnxt_en: Fix TX timeout when TX ring size is set to the smallest [ Upstream commit 5bed8b0704c9ecccc8f4a2c377d7c8e21090a82e ] The smallest TX ring size we support must fit a TX SKB with MAX_SKB_FRAGS + 1. Because the first TX BD for a packet is always a long TX BD, we need an extra TX BD to fit this packet. Define BNXT_MIN_TX_DESC_CNT with this value to make this more clear. The current code uses a minimum that is off by 1. Fix it using this constant. The tx_wake_thresh to determine when to wake up the TX queue is half the ring size but we must have at least BNXT_MIN_TX_DESC_CNT for the next packet which may have maximum fragments. So the comparison of the available TX BDs with tx_wake_thresh should be >= instead of > in the current code. Otherwise, at the smallest ring size, we will never wake up the TX queue and will cause TX timeout. Fixes: c0c050c58d84 ("bnxt_en: New Broadcom ethernet driver.") Reviewed-by: Pavan Chebbi Signed-off-by: Michael Chan Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 8 ++++---- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 5 +++++ drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 2 +- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index e67f07faca789..7f590a9e3af79 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -372,7 +372,7 @@ static bool bnxt_txr_netif_try_stop_queue(struct bnxt *bp, * netif_tx_queue_stopped(). */ smp_mb(); - if (bnxt_tx_avail(bp, txr) > bp->tx_wake_thresh) { + if (bnxt_tx_avail(bp, txr) >= bp->tx_wake_thresh) { netif_tx_wake_queue(txq); return false; } @@ -701,7 +701,7 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int nr_pkts) smp_mb(); if (unlikely(netif_tx_queue_stopped(txq)) && - bnxt_tx_avail(bp, txr) > bp->tx_wake_thresh && + bnxt_tx_avail(bp, txr) >= bp->tx_wake_thresh && READ_ONCE(txr->dev_state) != BNXT_DEV_STATE_CLOSING) netif_tx_wake_queue(txq); } @@ -2206,7 +2206,7 @@ static int __bnxt_poll_work(struct bnxt *bp, struct bnxt_cp_ring_info *cpr, if (TX_CMP_TYPE(txcmp) == CMP_TYPE_TX_L2_CMP) { tx_pkts++; /* return full budget so NAPI will complete. */ - if (unlikely(tx_pkts > bp->tx_wake_thresh)) { + if (unlikely(tx_pkts >= bp->tx_wake_thresh)) { rx_pkts = budget; raw_cons = NEXT_RAW_CMP(raw_cons); if (budget) @@ -3329,7 +3329,7 @@ static int bnxt_init_tx_rings(struct bnxt *bp) u16 i; bp->tx_wake_thresh = max_t(int, bp->tx_ring_size / 2, - MAX_SKB_FRAGS + 1); + BNXT_MIN_TX_DESC_CNT); for (i = 0; i < bp->tx_nr_rings; i++) { struct bnxt_tx_ring_info *txr = &bp->tx_ring[i]; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 510ff01bdad8c..8ba369c0100b4 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -601,6 +601,11 @@ struct nqe_cn { #define BNXT_MAX_RX_JUM_DESC_CNT (RX_DESC_CNT * MAX_RX_AGG_PAGES - 1) #define BNXT_MAX_TX_DESC_CNT (TX_DESC_CNT * MAX_TX_PAGES - 1) +/* Minimum TX BDs for a TX packet with MAX_SKB_FRAGS + 1. We need one extra + * BD because the first TX BD is always a long BD. + */ +#define BNXT_MIN_TX_DESC_CNT (MAX_SKB_FRAGS + 2) + #define RX_RING(x) (((x) & ~(RX_DESC_CNT - 1)) >> (BNXT_PAGE_SHIFT - 4)) #define RX_IDX(x) ((x) & (RX_DESC_CNT - 1)) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 2118523782246..97aff84fd1d17 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -744,7 +744,7 @@ static int bnxt_set_ringparam(struct net_device *dev, if ((ering->rx_pending > BNXT_MAX_RX_DESC_CNT) || (ering->tx_pending > BNXT_MAX_TX_DESC_CNT) || - (ering->tx_pending <= MAX_SKB_FRAGS)) + (ering->tx_pending < BNXT_MIN_TX_DESC_CNT)) return -EINVAL; if (netif_running(dev)) From 7a8748bdb642e703270bbcabc60d926ac46925f4 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Mon, 20 Sep 2021 21:18:14 +0200 Subject: [PATCH 0970/5344] net/smc: add missing error check in smc_clc_prfx_set() [ Upstream commit 6c90731980655280ea07ce4b21eb97457bf86286 ] Coverity stumbled over a missing error check in smc_clc_prfx_set(): *** CID 1475954: Error handling issues (CHECKED_RETURN) /net/smc/smc_clc.c: 233 in smc_clc_prfx_set() >>> CID 1475954: Error handling issues (CHECKED_RETURN) >>> Calling "kernel_getsockname" without checking return value (as is done elsewhere 8 out of 10 times). 233 kernel_getsockname(clcsock, (struct sockaddr *)&addrs); Add the return code check in smc_clc_prfx_set(). Fixes: c246d942eabc ("net/smc: restructure netinfo for CLC proposal msgs") Reported-by: Julian Wiedmann Signed-off-by: Karsten Graul Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/smc/smc_clc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index aee9ccfa99c22..ade1232699bbf 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -164,7 +164,8 @@ static int smc_clc_prfx_set(struct socket *clcsock, goto out_rel; } /* get address to which the internal TCP socket is bound */ - kernel_getsockname(clcsock, (struct sockaddr *)&addrs); + if (kernel_getsockname(clcsock, (struct sockaddr *)&addrs) < 0) + goto out_rel; /* analyze IP specific data of net_device belonging to TCP socket */ addr6 = (struct sockaddr_in6 *)&addrs; rcu_read_lock(); From a5ca32eb6b57cc4255167d00abccce56db266e8e Mon Sep 17 00:00:00 2001 From: Kunihiko Hayashi Date: Thu, 16 Sep 2021 20:19:35 +0900 Subject: [PATCH 0971/5344] gpio: uniphier: Fix void functions to remove return value [ Upstream commit 2dd824cca3407bc9a2bd11b00f6e117b66fcfcf1 ] The return type of irq_chip.irq_mask() and irq_chip.irq_unmask() should be void. Fixes: dbe776c2ca54 ("gpio: uniphier: add UniPhier GPIO controller driver") Signed-off-by: Kunihiko Hayashi Signed-off-by: Bartosz Golaszewski Signed-off-by: Sasha Levin --- drivers/gpio/gpio-uniphier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpio-uniphier.c b/drivers/gpio/gpio-uniphier.c index 93cdcc41e9fbc..0f1cf50b4dcea 100644 --- a/drivers/gpio/gpio-uniphier.c +++ b/drivers/gpio/gpio-uniphier.c @@ -188,7 +188,7 @@ static void uniphier_gpio_irq_mask(struct irq_data *data) uniphier_gpio_reg_update(priv, UNIPHIER_GPIO_IRQ_EN, mask, 0); - return irq_chip_mask_parent(data); + irq_chip_mask_parent(data); } static void uniphier_gpio_irq_unmask(struct irq_data *data) @@ -198,7 +198,7 @@ static void uniphier_gpio_irq_unmask(struct irq_data *data) uniphier_gpio_reg_update(priv, UNIPHIER_GPIO_IRQ_EN, mask, mask); - return irq_chip_unmask_parent(data); + irq_chip_unmask_parent(data); } static int uniphier_gpio_irq_set_type(struct irq_data *data, unsigned int type) From a99f20f1fd469e7be365a7b7f9501af940670b75 Mon Sep 17 00:00:00 2001 From: Shai Malin Date: Wed, 22 Sep 2021 13:53:26 +0300 Subject: [PATCH 0972/5344] qed: rdma - don't wait for resources under hw error recovery flow [ Upstream commit 1ea7812326004afd2803cc968a4776ae5120a597 ] If the HW device is during recovery, the HW resources will never return, hence we shouldn't wait for the CID (HW context ID) bitmaps to clear. This fix speeds up the error recovery flow. Fixes: 64515dc899df ("qed: Add infrastructure for error detection and recovery") Signed-off-by: Michal Kalderon Signed-off-by: Ariel Elior Signed-off-by: Shai Malin Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/qlogic/qed/qed_iwarp.c | 8 ++++++++ drivers/net/ethernet/qlogic/qed/qed_roce.c | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c index 9adbaccd0c5ed..934740d604709 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c @@ -1307,6 +1307,14 @@ qed_iwarp_wait_cid_map_cleared(struct qed_hwfn *p_hwfn, struct qed_bmap *bmap) prev_weight = weight; while (weight) { + /* If the HW device is during recovery, all resources are + * immediately reset without receiving a per-cid indication + * from HW. In this case we don't expect the cid_map to be + * cleared. + */ + if (p_hwfn->cdev->recov_in_prog) + return 0; + msleep(QED_IWARP_MAX_CID_CLEAN_TIME); weight = bitmap_weight(bmap->bitmap, bmap->max_count); diff --git a/drivers/net/ethernet/qlogic/qed/qed_roce.c b/drivers/net/ethernet/qlogic/qed/qed_roce.c index 83817bb50e9fa..6e6563b51d688 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_roce.c +++ b/drivers/net/ethernet/qlogic/qed/qed_roce.c @@ -107,6 +107,14 @@ void qed_roce_stop(struct qed_hwfn *p_hwfn) * Beyond the added delay we clear the bitmap anyway. */ while (bitmap_weight(rcid_map->bitmap, rcid_map->max_count)) { + /* If the HW device is during recovery, all resources are + * immediately reset without receiving a per-cid indication + * from HW. In this case we don't expect the cid bitmap to be + * cleared. + */ + if (p_hwfn->cdev->recov_in_prog) + return; + msleep(100); if (wait_count++ > 20) { DP_NOTICE(p_hwfn, "cid bitmap wait timed out\n"); From 482e87c0cbb2a5a1053576a0737cf6cbfd264646 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Thu, 23 Sep 2021 09:51:45 +0300 Subject: [PATCH 0973/5344] net/mlx4_en: Don't allow aRFS for encapsulated packets [ Upstream commit fdbccea419dc782079ce5881d2705cc9e3881480 ] Driver doesn't support aRFS for encapsulated packets, return early error in such a case. Fixes: 1eb8c695bda9 ("net/mlx4_en: Add accelerated RFS support") Signed-off-by: Aya Levin Signed-off-by: Tariq Toukan Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index b3e3c4d903c9d..91334229c1205 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -371,6 +371,9 @@ mlx4_en_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb, int nhoff = skb_network_offset(skb); int ret = 0; + if (skb->encapsulation) + return -EPROTONOSUPPORT; + if (skb->protocol != htons(ETH_P_IP)) return -EPROTONOSUPPORT; From e2c77d0e9fb43880a16ff8fc9aade7a26490fdf4 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Tue, 2 Mar 2021 07:22:09 +0100 Subject: [PATCH 0974/5344] tty: synclink_gt, drop unneeded forward declarations [ Upstream commit b9b90fe655c0bd816847ac1bcbf179cfa2981ecb ] Forward declarations make the code larger and rewrites harder. Harder as they are often omitted from global changes. Remove forward declarations which are not really needed, i.e. the definition of the function is before its first use. Signed-off-by: Jiri Slaby Link: https://lore.kernel.org/r/20210302062214.29627-39-jslaby@suse.cz Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/tty/synclink_gt.c | 57 +-------------------------------------- 1 file changed, 1 insertion(+), 56 deletions(-) diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c index 36f1a4d870eb1..4ef84ed54ea51 100644 --- a/drivers/tty/synclink_gt.c +++ b/drivers/tty/synclink_gt.c @@ -137,37 +137,14 @@ MODULE_PARM_DESC(maxframe, "Maximum frame size used by device (4096 to 65535)"); */ static struct tty_driver *serial_driver; -static int open(struct tty_struct *tty, struct file * filp); -static void close(struct tty_struct *tty, struct file * filp); -static void hangup(struct tty_struct *tty); -static void set_termios(struct tty_struct *tty, struct ktermios *old_termios); - -static int write(struct tty_struct *tty, const unsigned char *buf, int count); -static int put_char(struct tty_struct *tty, unsigned char ch); -static void send_xchar(struct tty_struct *tty, char ch); static void wait_until_sent(struct tty_struct *tty, int timeout); -static int write_room(struct tty_struct *tty); -static void flush_chars(struct tty_struct *tty); static void flush_buffer(struct tty_struct *tty); -static void tx_hold(struct tty_struct *tty); static void tx_release(struct tty_struct *tty); -static int ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg); -static int chars_in_buffer(struct tty_struct *tty); -static void throttle(struct tty_struct * tty); -static void unthrottle(struct tty_struct * tty); -static int set_break(struct tty_struct *tty, int break_state); - /* - * generic HDLC support and callbacks + * generic HDLC support */ -#if SYNCLINK_GENERIC_HDLC #define dev_to_port(D) (dev_to_hdlc(D)->priv) -static void hdlcdev_tx_done(struct slgt_info *info); -static void hdlcdev_rx(struct slgt_info *info, char *buf, int size); -static int hdlcdev_init(struct slgt_info *info); -static void hdlcdev_exit(struct slgt_info *info); -#endif /* @@ -186,9 +163,6 @@ struct cond_wait { wait_queue_entry_t wait; unsigned int data; }; -static void init_cond_wait(struct cond_wait *w, unsigned int data); -static void add_cond_wait(struct cond_wait **head, struct cond_wait *w); -static void remove_cond_wait(struct cond_wait **head, struct cond_wait *w); static void flush_cond_wait(struct cond_wait **head); /* @@ -443,12 +417,8 @@ static void shutdown(struct slgt_info *info); static void program_hw(struct slgt_info *info); static void change_params(struct slgt_info *info); -static int register_test(struct slgt_info *info); -static int irq_test(struct slgt_info *info); -static int loopback_test(struct slgt_info *info); static int adapter_test(struct slgt_info *info); -static void reset_adapter(struct slgt_info *info); static void reset_port(struct slgt_info *info); static void async_mode(struct slgt_info *info); static void sync_mode(struct slgt_info *info); @@ -457,14 +427,12 @@ static void rx_stop(struct slgt_info *info); static void rx_start(struct slgt_info *info); static void reset_rbufs(struct slgt_info *info); static void free_rbufs(struct slgt_info *info, unsigned int first, unsigned int last); -static void rdma_reset(struct slgt_info *info); static bool rx_get_frame(struct slgt_info *info); static bool rx_get_buf(struct slgt_info *info); static void tx_start(struct slgt_info *info); static void tx_stop(struct slgt_info *info); static void tx_set_idle(struct slgt_info *info); -static unsigned int free_tbuf_count(struct slgt_info *info); static unsigned int tbuf_bytes(struct slgt_info *info); static void reset_tbufs(struct slgt_info *info); static void tdma_reset(struct slgt_info *info); @@ -472,26 +440,10 @@ static bool tx_load(struct slgt_info *info, const char *buf, unsigned int count) static void get_signals(struct slgt_info *info); static void set_signals(struct slgt_info *info); -static void enable_loopback(struct slgt_info *info); static void set_rate(struct slgt_info *info, u32 data_rate); -static int bh_action(struct slgt_info *info); -static void bh_handler(struct work_struct *work); static void bh_transmit(struct slgt_info *info); -static void isr_serial(struct slgt_info *info); -static void isr_rdma(struct slgt_info *info); static void isr_txeom(struct slgt_info *info, unsigned short status); -static void isr_tdma(struct slgt_info *info); - -static int alloc_dma_bufs(struct slgt_info *info); -static void free_dma_bufs(struct slgt_info *info); -static int alloc_desc(struct slgt_info *info); -static void free_desc(struct slgt_info *info); -static int alloc_bufs(struct slgt_info *info, struct slgt_desc *bufs, int count); -static void free_bufs(struct slgt_info *info, struct slgt_desc *bufs, int count); - -static int alloc_tmp_rbuf(struct slgt_info *info); -static void free_tmp_rbuf(struct slgt_info *info); static void tx_timeout(struct timer_list *t); static void rx_timeout(struct timer_list *t); @@ -509,10 +461,6 @@ static int tx_abort(struct slgt_info *info); static int rx_enable(struct slgt_info *info, int enable); static int modem_input_wait(struct slgt_info *info,int arg); static int wait_mgsl_event(struct slgt_info *info, int __user *mask_ptr); -static int tiocmget(struct tty_struct *tty); -static int tiocmset(struct tty_struct *tty, - unsigned int set, unsigned int clear); -static int set_break(struct tty_struct *tty, int break_state); static int get_interface(struct slgt_info *info, int __user *if_mode); static int set_interface(struct slgt_info *info, int if_mode); static int set_gpio(struct slgt_info *info, struct gpio_desc __user *gpio); @@ -526,9 +474,6 @@ static int set_xctrl(struct slgt_info *info, int if_mode); /* * driver functions */ -static void add_device(struct slgt_info *info); -static void device_init(int adapter_num, struct pci_dev *pdev); -static int claim_resources(struct slgt_info *info); static void release_resources(struct slgt_info *info); /* From 45fedf5703093c05fd22088973afa8062e7a7688 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 1 Sep 2021 17:38:06 -0700 Subject: [PATCH 0975/5344] tty: synclink_gt: rename a conflicting function name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 06e49073dfba24df4b1073a068631b13a0039c34 ] 'set_signals()' in synclink_gt.c conflicts with an exported symbol in arch/um/, so change set_signals() to set_gtsignals(). Keep the function names similar by also changing get_signals() to get_gtsignals(). ../drivers/tty/synclink_gt.c:442:13: error: conflicting types for ‘set_signals’ static void set_signals(struct slgt_info *info); ^~~~~~~~~~~ In file included from ../include/linux/irqflags.h:16:0, from ../include/linux/spinlock.h:58, from ../include/linux/mm_types.h:9, from ../include/linux/buildid.h:5, from ../include/linux/module.h:14, from ../drivers/tty/synclink_gt.c:46: ../arch/um/include/asm/irqflags.h:6:5: note: previous declaration of ‘set_signals’ was here int set_signals(int enable); ^~~~~~~~~~~ Fixes: 705b6c7b34f2 ("[PATCH] new driver synclink_gt") Cc: Greg Kroah-Hartman Cc: Jiri Slaby Cc: Paul Fulghum Signed-off-by: Randy Dunlap Link: https://lore.kernel.org/r/20210902003806.17054-1-rdunlap@infradead.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/tty/synclink_gt.c | 44 +++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c index 4ef84ed54ea51..ff345a8e0fcc6 100644 --- a/drivers/tty/synclink_gt.c +++ b/drivers/tty/synclink_gt.c @@ -438,8 +438,8 @@ static void reset_tbufs(struct slgt_info *info); static void tdma_reset(struct slgt_info *info); static bool tx_load(struct slgt_info *info, const char *buf, unsigned int count); -static void get_signals(struct slgt_info *info); -static void set_signals(struct slgt_info *info); +static void get_gtsignals(struct slgt_info *info); +static void set_gtsignals(struct slgt_info *info); static void set_rate(struct slgt_info *info, u32 data_rate); static void bh_transmit(struct slgt_info *info); @@ -721,7 +721,7 @@ static void set_termios(struct tty_struct *tty, struct ktermios *old_termios) if ((old_termios->c_cflag & CBAUD) && !C_BAUD(tty)) { info->signals &= ~(SerialSignal_RTS | SerialSignal_DTR); spin_lock_irqsave(&info->lock,flags); - set_signals(info); + set_gtsignals(info); spin_unlock_irqrestore(&info->lock,flags); } @@ -731,7 +731,7 @@ static void set_termios(struct tty_struct *tty, struct ktermios *old_termios) if (!C_CRTSCTS(tty) || !tty_throttled(tty)) info->signals |= SerialSignal_RTS; spin_lock_irqsave(&info->lock,flags); - set_signals(info); + set_gtsignals(info); spin_unlock_irqrestore(&info->lock,flags); } @@ -1182,7 +1182,7 @@ static inline void line_info(struct seq_file *m, struct slgt_info *info) /* output current serial signal states */ spin_lock_irqsave(&info->lock,flags); - get_signals(info); + get_gtsignals(info); spin_unlock_irqrestore(&info->lock,flags); stat_buf[0] = 0; @@ -1282,7 +1282,7 @@ static void throttle(struct tty_struct * tty) if (C_CRTSCTS(tty)) { spin_lock_irqsave(&info->lock,flags); info->signals &= ~SerialSignal_RTS; - set_signals(info); + set_gtsignals(info); spin_unlock_irqrestore(&info->lock,flags); } } @@ -1307,7 +1307,7 @@ static void unthrottle(struct tty_struct * tty) if (C_CRTSCTS(tty)) { spin_lock_irqsave(&info->lock,flags); info->signals |= SerialSignal_RTS; - set_signals(info); + set_gtsignals(info); spin_unlock_irqrestore(&info->lock,flags); } } @@ -1479,7 +1479,7 @@ static int hdlcdev_open(struct net_device *dev) /* inform generic HDLC layer of current DCD status */ spin_lock_irqsave(&info->lock, flags); - get_signals(info); + get_gtsignals(info); spin_unlock_irqrestore(&info->lock, flags); if (info->signals & SerialSignal_DCD) netif_carrier_on(dev); @@ -2235,7 +2235,7 @@ static void isr_txeom(struct slgt_info *info, unsigned short status) if (info->params.mode != MGSL_MODE_ASYNC && info->drop_rts_on_tx_done) { info->signals &= ~SerialSignal_RTS; info->drop_rts_on_tx_done = false; - set_signals(info); + set_gtsignals(info); } #if SYNCLINK_GENERIC_HDLC @@ -2400,7 +2400,7 @@ static void shutdown(struct slgt_info *info) if (!info->port.tty || info->port.tty->termios.c_cflag & HUPCL) { info->signals &= ~(SerialSignal_RTS | SerialSignal_DTR); - set_signals(info); + set_gtsignals(info); } flush_cond_wait(&info->gpio_wait_q); @@ -2428,7 +2428,7 @@ static void program_hw(struct slgt_info *info) else async_mode(info); - set_signals(info); + set_gtsignals(info); info->dcd_chkcount = 0; info->cts_chkcount = 0; @@ -2436,7 +2436,7 @@ static void program_hw(struct slgt_info *info) info->dsr_chkcount = 0; slgt_irq_on(info, IRQ_DCD | IRQ_CTS | IRQ_DSR | IRQ_RI); - get_signals(info); + get_gtsignals(info); if (info->netcount || (info->port.tty && info->port.tty->termios.c_cflag & CREAD)) @@ -2680,7 +2680,7 @@ static int wait_mgsl_event(struct slgt_info *info, int __user *mask_ptr) spin_lock_irqsave(&info->lock,flags); /* return immediately if state matches requested events */ - get_signals(info); + get_gtsignals(info); s = info->signals; events = mask & @@ -3098,7 +3098,7 @@ static int tiocmget(struct tty_struct *tty) unsigned long flags; spin_lock_irqsave(&info->lock,flags); - get_signals(info); + get_gtsignals(info); spin_unlock_irqrestore(&info->lock,flags); result = ((info->signals & SerialSignal_RTS) ? TIOCM_RTS:0) + @@ -3137,7 +3137,7 @@ static int tiocmset(struct tty_struct *tty, info->signals &= ~SerialSignal_DTR; spin_lock_irqsave(&info->lock,flags); - set_signals(info); + set_gtsignals(info); spin_unlock_irqrestore(&info->lock,flags); return 0; } @@ -3148,7 +3148,7 @@ static int carrier_raised(struct tty_port *port) struct slgt_info *info = container_of(port, struct slgt_info, port); spin_lock_irqsave(&info->lock,flags); - get_signals(info); + get_gtsignals(info); spin_unlock_irqrestore(&info->lock,flags); return (info->signals & SerialSignal_DCD) ? 1 : 0; } @@ -3163,7 +3163,7 @@ static void dtr_rts(struct tty_port *port, int on) info->signals |= SerialSignal_RTS | SerialSignal_DTR; else info->signals &= ~(SerialSignal_RTS | SerialSignal_DTR); - set_signals(info); + set_gtsignals(info); spin_unlock_irqrestore(&info->lock,flags); } @@ -3962,10 +3962,10 @@ static void tx_start(struct slgt_info *info) if (info->params.mode != MGSL_MODE_ASYNC) { if (info->params.flags & HDLC_FLAG_AUTO_RTS) { - get_signals(info); + get_gtsignals(info); if (!(info->signals & SerialSignal_RTS)) { info->signals |= SerialSignal_RTS; - set_signals(info); + set_gtsignals(info); info->drop_rts_on_tx_done = true; } } @@ -4019,7 +4019,7 @@ static void reset_port(struct slgt_info *info) rx_stop(info); info->signals &= ~(SerialSignal_RTS | SerialSignal_DTR); - set_signals(info); + set_gtsignals(info); slgt_irq_off(info, IRQ_ALL | IRQ_MASTER); } @@ -4441,7 +4441,7 @@ static void tx_set_idle(struct slgt_info *info) /* * get state of V24 status (input) signals */ -static void get_signals(struct slgt_info *info) +static void get_gtsignals(struct slgt_info *info) { unsigned short status = rd_reg16(info, SSR); @@ -4503,7 +4503,7 @@ static void msc_set_vcr(struct slgt_info *info) /* * set state of V24 control (output) signals */ -static void set_signals(struct slgt_info *info) +static void set_gtsignals(struct slgt_info *info) { unsigned char val = rd_reg8(info, VCR); if (info->signals & SerialSignal_DTR) From cc32362ec823f0f88deb0a25fbc372e67d5ea66f Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Tue, 10 Aug 2021 09:40:36 -0700 Subject: [PATCH 0976/5344] fpga: machxo2-spi: Return an error on failure [ Upstream commit 34331739e19fd6a293d488add28832ad49c9fc54 ] Earlier successes leave 'ret' in a non error state, so these errors are not reported. Set ret to -EINVAL before going to the error handler. This addresses two issues reported by smatch: drivers/fpga/machxo2-spi.c:229 machxo2_write_init() warn: missing error code 'ret' drivers/fpga/machxo2-spi.c:316 machxo2_write_complete() warn: missing error code 'ret' [mdf@kernel.org: Reworded commit message] Fixes: 88fb3a002330 ("fpga: lattice machxo2: Add Lattice MachXO2 support") Reported-by: Dan Carpenter Signed-off-by: Tom Rix Signed-off-by: Moritz Fischer Signed-off-by: Sasha Levin --- drivers/fpga/machxo2-spi.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/fpga/machxo2-spi.c b/drivers/fpga/machxo2-spi.c index 4d8a876415874..2fd097c3994b3 100644 --- a/drivers/fpga/machxo2-spi.c +++ b/drivers/fpga/machxo2-spi.c @@ -223,8 +223,10 @@ static int machxo2_write_init(struct fpga_manager *mgr, goto fail; get_status(spi, &status); - if (test_bit(FAIL, &status)) + if (test_bit(FAIL, &status)) { + ret = -EINVAL; goto fail; + } dump_status_reg(&status); spi_message_init(&msg); @@ -310,6 +312,7 @@ static int machxo2_write_complete(struct fpga_manager *mgr, dump_status_reg(&status); if (!test_bit(DONE, &status)) { machxo2_cleanup(mgr); + ret = -EINVAL; goto fail; } From 24eaea3d07ab3ae81b8de6b804c591dbea508c5c Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 13 Aug 2021 14:40:42 +0800 Subject: [PATCH 0977/5344] fpga: machxo2-spi: Fix missing error code in machxo2_write_complete() [ Upstream commit a1e4470823d99e75b596748086e120dea169ed3c ] The error code is missing in this code scenario, add the error code '-EINVAL' to the return value 'ret'. Eliminate the follow smatch warning: drivers/fpga/machxo2-spi.c:341 machxo2_write_complete() warn: missing error code 'ret'. [mdf@kernel.org: Reworded commit message] Fixes: 88fb3a002330 ("fpga: lattice machxo2: Add Lattice MachXO2 support") Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Moritz Fischer Signed-off-by: Sasha Levin --- drivers/fpga/machxo2-spi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/fpga/machxo2-spi.c b/drivers/fpga/machxo2-spi.c index 2fd097c3994b3..37e54e375528e 100644 --- a/drivers/fpga/machxo2-spi.c +++ b/drivers/fpga/machxo2-spi.c @@ -334,6 +334,7 @@ static int machxo2_write_complete(struct fpga_manager *mgr, break; if (++refreshloop == MACHXO2_MAX_REFRESH_LOOP) { machxo2_cleanup(mgr); + ret = -EINVAL; goto fail; } } while (1); From 536c111692bd89930ebdc3b1d101f5f19b90d754 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 16 Sep 2021 16:13:42 +0300 Subject: [PATCH 0978/5344] thermal/core: Potential buffer overflow in thermal_build_list_of_policies() [ Upstream commit 1bb30b20b49773369c299d4d6c65227201328663 ] After printing the list of thermal governors, then this function prints a newline character. The problem is that "size" has not been updated after printing the last governor. This means that it can write one character (the NUL terminator) beyond the end of the buffer. Get rid of the "size" variable and just use "PAGE_SIZE - count" directly. Fixes: 1b4f48494eb2 ("thermal: core: group functions related to governor handling") Signed-off-by: Dan Carpenter Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210916131342.GB25094@kili Signed-off-by: Sasha Levin --- drivers/thermal/thermal_core.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index f526ce31f5a2f..20eab56b02cb9 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -228,15 +228,14 @@ int thermal_build_list_of_policies(char *buf) { struct thermal_governor *pos; ssize_t count = 0; - ssize_t size = PAGE_SIZE; mutex_lock(&thermal_governor_lock); list_for_each_entry(pos, &thermal_governor_list, governor_list) { - size = PAGE_SIZE - count; - count += scnprintf(buf + count, size, "%s ", pos->name); + count += scnprintf(buf + count, PAGE_SIZE - count, "%s ", + pos->name); } - count += scnprintf(buf + count, size, "\n"); + count += scnprintf(buf + count, PAGE_SIZE - count, "\n"); mutex_unlock(&thermal_governor_lock); From b87ce1d767e493f085d0f3be0c9bd56406b68b8a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 21 Sep 2021 23:33:35 +0300 Subject: [PATCH 0979/5344] cifs: fix a sign extension bug [ Upstream commit e946d3c887a9dc33aa82a349c6284f4a084163f4 ] The problem is the mismatched types between "ctx->total_len" which is an unsigned int, "rc" which is an int, and "ctx->rc" which is a ssize_t. The code does: ctx->rc = (rc == 0) ? ctx->total_len : rc; We want "ctx->rc" to store the negative "rc" error code. But what happens is that "rc" is type promoted to a high unsigned int and 'ctx->rc" will store the high positive value instead of a negative value. The fix is to change "rc" from an int to a ssize_t. Fixes: c610c4b619e5 ("CIFS: Add asynchronous write support through kernel AIO") Signed-off-by: Dan Carpenter Signed-off-by: Steve French Signed-off-by: Sasha Levin --- fs/cifs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 1aac8d38f887d..a9746af5a44db 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2989,7 +2989,7 @@ static void collect_uncached_write_data(struct cifs_aio_ctx *ctx) struct cifs_tcon *tcon; struct cifs_sb_info *cifs_sb; struct dentry *dentry = ctx->cfile->dentry; - int rc; + ssize_t rc; tcon = tlink_tcon(ctx->cfile->tlink); cifs_sb = CIFS_SB(dentry->d_sb); From 11e96913e0bf3d553cda905cc21d3ca401273156 Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Wed, 15 Sep 2021 18:32:39 +0300 Subject: [PATCH 0980/5344] scsi: qla2xxx: Restore initiator in dual mode [ Upstream commit 5f8579038842d77e6ce05e1df6bf9dd493b0e3ef ] In dual mode in case of disabling the target, the whole port goes offline and initiator is turned off too. Fix restoring initiator mode after disabling target in dual mode. Link: https://lore.kernel.org/r/20210915153239.8035-1-d.bogdanov@yadro.com Fixes: 0645cb8350cd ("scsi: qla2xxx: Add mode control for each physical port") Reviewed-by: Himanshu Madhani Signed-off-by: Dmitry Bogdanov Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/qla2xxx/qla_init.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c index e59fb9429033c..c9391eb1e3e93 100644 --- a/drivers/scsi/qla2xxx/qla_init.c +++ b/drivers/scsi/qla2xxx/qla_init.c @@ -6813,7 +6813,8 @@ qla2x00_abort_isp(scsi_qla_host_t *vha) return 0; break; case QLA2XXX_INI_MODE_DUAL: - if (!qla_dual_mode_enabled(vha)) + if (!qla_dual_mode_enabled(vha) && + !qla_ini_mode_enabled(vha)) return 0; break; case QLA2XXX_INI_MODE_ENABLED: From d190948efbb8ba159371e0e0e3a0ffd5e0f38979 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 16 Sep 2021 16:23:31 +0300 Subject: [PATCH 0981/5344] scsi: lpfc: Use correct scnprintf() limit [ Upstream commit 6dacc371b77f473770ec646e220303a84fe96c11 ] The limit should be "PAGE_SIZE - len" instead of "PAGE_SIZE". We're not going to hit the limit so this fix will not affect runtime. Link: https://lore.kernel.org/r/20210916132331.GE25094@kili Fixes: 5b9e70b22cc5 ("scsi: lpfc: raise sg count for nvme to use available sg resources") Reviewed-by: James Smart Signed-off-by: Dan Carpenter Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/lpfc/lpfc_attr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c index 45db19e31b348..f0ecfe565660a 100644 --- a/drivers/scsi/lpfc/lpfc_attr.c +++ b/drivers/scsi/lpfc/lpfc_attr.c @@ -5881,7 +5881,8 @@ lpfc_sg_seg_cnt_show(struct device *dev, struct device_attribute *attr, len = scnprintf(buf, PAGE_SIZE, "SGL sz: %d total SGEs: %d\n", phba->cfg_sg_dma_buf_size, phba->cfg_total_seg_cnt); - len += scnprintf(buf + len, PAGE_SIZE, "Cfg: %d SCSI: %d NVME: %d\n", + len += scnprintf(buf + len, PAGE_SIZE - len, + "Cfg: %d SCSI: %d NVME: %d\n", phba->cfg_sg_seg_cnt, phba->cfg_scsi_seg_cnt, phba->cfg_nvme_seg_cnt); return len; From 6a0d4becd25223c2aa6fada9abe0215946cbd9e8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 5 Sep 2021 09:25:19 -0700 Subject: [PATCH 0982/5344] irqchip/goldfish-pic: Select GENERIC_IRQ_CHIP to fix build [ Upstream commit 969ac78db78c723a24e9410666b457cc1b0cb3c3 ] irq-goldfish-pic uses GENERIC_IRQ_CHIP interfaces so select that symbol to fix build errors. Fixes these build errors: mips-linux-ld: drivers/irqchip/irq-goldfish-pic.o: in function `goldfish_pic_of_init': irq-goldfish-pic.c:(.init.text+0xc0): undefined reference to `irq_alloc_generic_chip' mips-linux-ld: irq-goldfish-pic.c:(.init.text+0xf4): undefined reference to `irq_gc_unmask_enable_reg' mips-linux-ld: irq-goldfish-pic.c:(.init.text+0xf8): undefined reference to `irq_gc_unmask_enable_reg' mips-linux-ld: irq-goldfish-pic.c:(.init.text+0x100): undefined reference to `irq_gc_mask_disable_reg' mips-linux-ld: irq-goldfish-pic.c:(.init.text+0x104): undefined reference to `irq_gc_mask_disable_reg' mips-linux-ld: irq-goldfish-pic.c:(.init.text+0x11c): undefined reference to `irq_setup_generic_chip' mips-linux-ld: irq-goldfish-pic.c:(.init.text+0x168): undefined reference to `irq_remove_generic_chip' Fixes: 4235ff50cf98 ("irqchip/irq-goldfish-pic: Add Goldfish PIC driver") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Miodrag Dinic Cc: Geert Uytterhoeven Cc: Bartosz Golaszewski Cc: Thomas Gleixner Cc: Marc Zyngier Cc: Goran Ferenc Cc: Aleksandar Markovic Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210905162519.21507-1-rdunlap@infradead.org Signed-off-by: Sasha Levin --- drivers/irqchip/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 97f9c001d8ff7..20f44ef9c4c9b 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -415,6 +415,7 @@ config MESON_IRQ_GPIO config GOLDFISH_PIC bool "Goldfish programmable interrupt controller" depends on MIPS && (GOLDFISH || COMPILE_TEST) + select GENERIC_IRQ_CHIP select IRQ_DOMAIN help Say yes here to enable Goldfish interrupt controller driver used From 4ff10255471425bfce3bff13383485dc5a58ecdd Mon Sep 17 00:00:00 2001 From: Kaige Fu Date: Wed, 15 Sep 2021 10:20:55 +0800 Subject: [PATCH 0983/5344] irqchip/gic-v3-its: Fix potential VPE leak on error [ Upstream commit 280bef512933b2dda01d681d8cbe499b98fc5bdd ] In its_vpe_irq_domain_alloc, when its_vpe_init() returns an error, there is an off-by-one in the number of VPEs to be freed. Fix it by simply passing the number of VPEs allocated, which is the index of the loop iterating over the VPEs. Fixes: 7d75bbb4bc1a ("irqchip/gic-v3-its: Add VPE irq domain allocation/teardown") Signed-off-by: Kaige Fu [maz: fixed commit message] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/d9e36dee512e63670287ed9eff884a5d8d6d27f2.1631672311.git.kaige.fu@linux.alibaba.com Signed-off-by: Sasha Levin --- drivers/irqchip/irq-gic-v3-its.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index f298313b87ac7..398c54387988a 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -3123,7 +3123,7 @@ static int its_vpe_irq_domain_alloc(struct irq_domain *domain, unsigned int virq if (err) { if (i > 0) - its_vpe_irq_domain_free(domain, virq, i - 1); + its_vpe_irq_domain_free(domain, virq, i); its_lpi_free(bitmap, base, nr_ids); its_free_prop_table(vprop_page); From 0d2d29e6c828de8c1d839b629be676dc74049135 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 1 Sep 2021 13:38:29 +0200 Subject: [PATCH 0984/5344] md: fix a lock order reversal in md_alloc [ Upstream commit 7df835a32a8bedf7ce88efcfa7c9b245b52ff139 ] Commit b0140891a8cea3 ("md: Fix race when creating a new md device.") not only moved assigning mddev->gendisk before calling add_disk, which fixes the races described in the commit log, but also added a mddev->open_mutex critical section over add_disk and creation of the md kobj. Adding a kobject after add_disk is racy vs deleting the gendisk right after adding it, but md already prevents against that by holding a mddev->active reference. On the other hand taking this lock added a lock order reversal with what is not disk->open_mutex (used to be bdev->bd_mutex when the commit was added) for partition devices, which need that lock for the internal open for the partition scan, and a recent commit also takes it for non-partitioned devices, leading to further lockdep splatter. Fixes: b0140891a8ce ("md: Fix race when creating a new md device.") Fixes: d62633873590 ("block: support delayed holder registration") Reported-by: syzbot+fadc0aaf497e6a493b9f@syzkaller.appspotmail.com Signed-off-by: Christoph Hellwig Tested-by: syzbot+fadc0aaf497e6a493b9f@syzkaller.appspotmail.com Reviewed-by: NeilBrown Signed-off-by: Song Liu Signed-off-by: Sasha Levin --- drivers/md/md.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 761d43829b2b7..c178b2f406de3 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5535,10 +5535,6 @@ static int md_alloc(dev_t dev, char *name) */ disk->flags |= GENHD_FL_EXT_DEVT; mddev->gendisk = disk; - /* As soon as we call add_disk(), another thread could get - * through to md_open, so make sure it doesn't get too far - */ - mutex_lock(&mddev->open_mutex); add_disk(disk); error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md"); @@ -5553,7 +5549,6 @@ static int md_alloc(dev_t dev, char *name) if (mddev->kobj.sd && sysfs_create_group(&mddev->kobj, &md_bitmap_group)) pr_debug("pointless warning\n"); - mutex_unlock(&mddev->open_mutex); abort: mutex_unlock(&disks_mutex); if (!error && mddev->kobj.sd) { From 00db2e7dfd213b2850f479fd5e20978bda704f4a Mon Sep 17 00:00:00 2001 From: Tong Zhang Date: Wed, 8 Sep 2021 12:02:32 -0700 Subject: [PATCH 0985/5344] net: macb: fix use after free on rmmod [ Upstream commit d82d5303c4c539db86588ffb5dc5b26c3f1513e8 ] plat_dev->dev->platform_data is released by platform_device_unregister(), use of pclk and hclk is a use-after-free. Since device unregister won't need a clk device we adjust the function call sequence to fix this issue. [ 31.261225] BUG: KASAN: use-after-free in macb_remove+0x77/0xc6 [macb_pci] [ 31.275563] Freed by task 306: [ 30.276782] platform_device_release+0x25/0x80 Suggested-by: Nicolas Ferre Signed-off-by: Tong Zhang Acked-by: Nicolas Ferre Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/cadence/macb_pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cadence/macb_pci.c b/drivers/net/ethernet/cadence/macb_pci.c index 617b3b728dd04..94f3babfad309 100644 --- a/drivers/net/ethernet/cadence/macb_pci.c +++ b/drivers/net/ethernet/cadence/macb_pci.c @@ -112,9 +112,9 @@ static void macb_remove(struct pci_dev *pdev) struct platform_device *plat_dev = pci_get_drvdata(pdev); struct macb_platform_data *plat_data = dev_get_platdata(&plat_dev->dev); - platform_device_unregister(plat_dev); clk_unregister(plat_data->pclk); clk_unregister(plat_data->hclk); + platform_device_unregister(plat_dev); } static const struct pci_device_id dev_id_table[] = { From 40a836fc5dc1a41d22085e6f5ce2dcd6aa30370c Mon Sep 17 00:00:00 2001 From: Jesper Nilsson Date: Fri, 10 Sep 2021 21:55:34 +0200 Subject: [PATCH 0986/5344] net: stmmac: allow CSR clock of 300MHz [ Upstream commit 08dad2f4d541fcfe5e7bfda72cc6314bbfd2802f ] The Synopsys Ethernet IP uses the CSR clock as a base clock for MDC. The divisor used is set in the MAC_MDIO_Address register field CR (Clock Rate) The divisor is there to change the CSR clock into a clock that falls below the IEEE 802.3 specified max frequency of 2.5MHz. If the CSR clock is 300MHz, the code falls back to using the reset value in the MAC_MDIO_Address register, as described in the comment above this code. However, 300MHz is actually an allowed value and the proper divider can be estimated quite easily (it's just 1Hz difference!) A CSR frequency of 300MHz with the maximum clock rate value of 0x5 (STMMAC_CSR_250_300M, a divisor of 124) gives somewhere around ~2.42MHz which is below the IEEE 802.3 specified maximum. For the ARTPEC-8 SoC, the CSR clock is this problematic 300MHz, and unfortunately, the reset-value of the MAC_MDIO_Address CR field is 0x0. This leads to a clock rate of zero and a divisor of 42, and gives an MDC frequency of ~7.14MHz. Allow CSR clock of 300MHz by making the comparison inclusive. Signed-off-by: Jesper Nilsson Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 4e7cfd3bfcd2e..e09851c7da9b8 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -225,7 +225,7 @@ static void stmmac_clk_csr_set(struct stmmac_priv *priv) priv->clk_csr = STMMAC_CSR_100_150M; else if ((clk_rate >= CSR_F_150M) && (clk_rate < CSR_F_250M)) priv->clk_csr = STMMAC_CSR_150_250M; - else if ((clk_rate >= CSR_F_250M) && (clk_rate < CSR_F_300M)) + else if ((clk_rate >= CSR_F_250M) && (clk_rate <= CSR_F_300M)) priv->clk_csr = STMMAC_CSR_250_300M; } From 059baa650a92d5b402cfb64b92bebb1958fc29e6 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Mon, 6 Sep 2021 23:07:29 -0700 Subject: [PATCH 0987/5344] m68k: Double cast io functions to unsigned long [ Upstream commit b1a89856fbf63fffde6a4771d8f1ac21df549e50 ] m68k builds fail widely with errors such as arch/m68k/include/asm/raw_io.h:20:19: error: cast to pointer from integer of different size arch/m68k/include/asm/raw_io.h:30:32: error: cast to pointer from integer of different size [-Werror=int-to-p On m68k, io functions are defined as macros. The problem is seen if the macro parameter variable size differs from the size of a pointer. Cast the parameter of all io macros to unsigned long before casting it to a pointer to fix the problem. Signed-off-by: Guenter Roeck Link: https://lore.kernel.org/r/20210907060729.2391992-1-linux@roeck-us.net Signed-off-by: Geert Uytterhoeven Signed-off-by: Sasha Levin --- arch/m68k/include/asm/raw_io.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/m68k/include/asm/raw_io.h b/arch/m68k/include/asm/raw_io.h index 8a6dc6e5a279c..8ab3c350bd530 100644 --- a/arch/m68k/include/asm/raw_io.h +++ b/arch/m68k/include/asm/raw_io.h @@ -17,21 +17,21 @@ * two accesses to memory, which may be undesirable for some devices. */ #define in_8(addr) \ - ({ u8 __v = (*(__force volatile u8 *) (addr)); __v; }) + ({ u8 __v = (*(__force volatile u8 *) (unsigned long)(addr)); __v; }) #define in_be16(addr) \ - ({ u16 __v = (*(__force volatile u16 *) (addr)); __v; }) + ({ u16 __v = (*(__force volatile u16 *) (unsigned long)(addr)); __v; }) #define in_be32(addr) \ - ({ u32 __v = (*(__force volatile u32 *) (addr)); __v; }) + ({ u32 __v = (*(__force volatile u32 *) (unsigned long)(addr)); __v; }) #define in_le16(addr) \ - ({ u16 __v = le16_to_cpu(*(__force volatile __le16 *) (addr)); __v; }) + ({ u16 __v = le16_to_cpu(*(__force volatile __le16 *) (unsigned long)(addr)); __v; }) #define in_le32(addr) \ - ({ u32 __v = le32_to_cpu(*(__force volatile __le32 *) (addr)); __v; }) + ({ u32 __v = le32_to_cpu(*(__force volatile __le32 *) (unsigned long)(addr)); __v; }) -#define out_8(addr,b) (void)((*(__force volatile u8 *) (addr)) = (b)) -#define out_be16(addr,w) (void)((*(__force volatile u16 *) (addr)) = (w)) -#define out_be32(addr,l) (void)((*(__force volatile u32 *) (addr)) = (l)) -#define out_le16(addr,w) (void)((*(__force volatile __le16 *) (addr)) = cpu_to_le16(w)) -#define out_le32(addr,l) (void)((*(__force volatile __le32 *) (addr)) = cpu_to_le32(l)) +#define out_8(addr,b) (void)((*(__force volatile u8 *) (unsigned long)(addr)) = (b)) +#define out_be16(addr,w) (void)((*(__force volatile u16 *) (unsigned long)(addr)) = (w)) +#define out_be32(addr,l) (void)((*(__force volatile u32 *) (unsigned long)(addr)) = (l)) +#define out_le16(addr,w) (void)((*(__force volatile __le16 *) (unsigned long)(addr)) = cpu_to_le16(w)) +#define out_le32(addr,l) (void)((*(__force volatile __le32 *) (unsigned long)(addr)) = cpu_to_le32(l)) #define raw_inb in_8 #define raw_inw in_be16 From d04590367c982031cf70a34f9f50bb72b66def01 Mon Sep 17 00:00:00 2001 From: zhang kai Date: Thu, 9 Sep 2021 16:39:18 +0800 Subject: [PATCH 0988/5344] ipv6: delay fib6_sernum increase in fib6_add [ Upstream commit e87b5052271e39d62337ade531992b7e5d8c2cfa ] only increase fib6_sernum in net namespace after add fib6_info successfully. Signed-off-by: zhang kai Reviewed-by: David Ahern Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/ipv6/ip6_fib.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index bb68290ad68d8..9a6f66e0e9a27 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1310,7 +1310,6 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, int err = -ENOMEM; int allow_create = 1; int replace_required = 0; - int sernum = fib6_new_sernum(info->nl_net); if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) @@ -1410,7 +1409,7 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, if (!err) { if (rt->nh) list_add(&rt->nh_list, &rt->nh->f6i_list); - __fib6_update_sernum_upto_root(rt, sernum); + __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net)); fib6_start_gc(info->nl_net, rt); } From 6a266cb90469f1977a13289c38ae7771af3c5d92 Mon Sep 17 00:00:00 2001 From: Bixuan Cui Date: Sat, 11 Sep 2021 08:55:57 +0800 Subject: [PATCH 0989/5344] bpf: Add oversize check before call kvcalloc() [ Upstream commit 0e6491b559704da720f6da09dd0a52c4df44c514 ] Commit 7661809d493b ("mm: don't allow oversized kvmalloc() calls") add the oversize check. When the allocation is larger than what kmalloc() supports, the following warning triggered: WARNING: CPU: 0 PID: 8408 at mm/util.c:597 kvmalloc_node+0x108/0x110 mm/util.c:597 Modules linked in: CPU: 0 PID: 8408 Comm: syz-executor221 Not tainted 5.14.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:kvmalloc_node+0x108/0x110 mm/util.c:597 Call Trace: kvmalloc include/linux/mm.h:806 [inline] kvmalloc_array include/linux/mm.h:824 [inline] kvcalloc include/linux/mm.h:829 [inline] check_btf_line kernel/bpf/verifier.c:9925 [inline] check_btf_info kernel/bpf/verifier.c:10049 [inline] bpf_check+0xd634/0x150d0 kernel/bpf/verifier.c:13759 bpf_prog_load kernel/bpf/syscall.c:2301 [inline] __sys_bpf+0x11181/0x126e0 kernel/bpf/syscall.c:4587 __do_sys_bpf kernel/bpf/syscall.c:4691 [inline] __se_sys_bpf kernel/bpf/syscall.c:4689 [inline] __x64_sys_bpf+0x78/0x90 kernel/bpf/syscall.c:4689 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3d/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae Reported-by: syzbot+f3e749d4c662818ae439@syzkaller.appspotmail.com Signed-off-by: Bixuan Cui Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210911005557.45518-1-cuibixuan@huawei.com Signed-off-by: Sasha Levin --- kernel/bpf/verifier.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 78c283802e048..f4953aa64ea2b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6845,6 +6845,8 @@ static int check_btf_line(struct bpf_verifier_env *env, nr_linfo = attr->line_info_cnt; if (!nr_linfo) return 0; + if (nr_linfo > INT_MAX / sizeof(struct bpf_line_info)) + return -EINVAL; rec_size = attr->line_info_rec_size; if (rec_size < MIN_BPF_LINEINFO_SIZE || From 9f83dba0e0872942853da15fcbac02ab36b6b277 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 27 Aug 2021 14:32:06 +0200 Subject: [PATCH 0990/5344] xen/balloon: use a kernel thread instead a workqueue [ Upstream commit 8480ed9c2bbd56fc86524998e5f2e3e22f5038f6 ] Today the Xen ballooning is done via delayed work in a workqueue. This might result in workqueue hangups being reported in case of large amounts of memory are being ballooned in one go (here 16GB): BUG: workqueue lockup - pool cpus=6 node=0 flags=0x0 nice=0 stuck for 64s! Showing busy workqueues and worker pools: workqueue events: flags=0x0 pwq 12: cpus=6 node=0 flags=0x0 nice=0 active=2/256 refcnt=3 in-flight: 229:balloon_process pending: cache_reap workqueue events_freezable_power_: flags=0x84 pwq 12: cpus=6 node=0 flags=0x0 nice=0 active=1/256 refcnt=2 pending: disk_events_workfn workqueue mm_percpu_wq: flags=0x8 pwq 12: cpus=6 node=0 flags=0x0 nice=0 active=1/256 refcnt=2 pending: vmstat_update pool 12: cpus=6 node=0 flags=0x0 nice=0 hung=64s workers=3 idle: 2222 43 This can easily be avoided by using a dedicated kernel thread for doing the ballooning work. Reported-by: Jan Beulich Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Link: https://lore.kernel.org/r/20210827123206.15429-1-jgross@suse.com Signed-off-by: Juergen Gross Signed-off-by: Sasha Levin --- drivers/xen/balloon.c | 62 +++++++++++++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 17 deletions(-) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index ebb05517b6aa1..2762d246991b6 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -43,6 +43,8 @@ #include #include #include +#include +#include #include #include #include @@ -117,7 +119,7 @@ static struct ctl_table xen_root[] = { #define EXTENT_ORDER (fls(XEN_PFN_PER_PAGE) - 1) /* - * balloon_process() state: + * balloon_thread() state: * * BP_DONE: done or nothing to do, * BP_WAIT: wait to be rescheduled, @@ -132,6 +134,8 @@ enum bp_state { BP_ECANCELED }; +/* Main waiting point for xen-balloon thread. */ +static DECLARE_WAIT_QUEUE_HEAD(balloon_thread_wq); static DEFINE_MUTEX(balloon_mutex); @@ -146,10 +150,6 @@ static xen_pfn_t frame_list[PAGE_SIZE / sizeof(xen_pfn_t)]; static LIST_HEAD(ballooned_pages); static DECLARE_WAIT_QUEUE_HEAD(balloon_wq); -/* Main work function, always executed in process context. */ -static void balloon_process(struct work_struct *work); -static DECLARE_DELAYED_WORK(balloon_worker, balloon_process); - /* When ballooning out (allocating memory to return to Xen) we don't really want the kernel to try too hard since that can trigger the oom killer. */ #define GFP_BALLOON \ @@ -383,7 +383,7 @@ static void xen_online_page(struct page *page, unsigned int order) static int xen_memory_notifier(struct notifier_block *nb, unsigned long val, void *v) { if (val == MEM_ONLINE) - schedule_delayed_work(&balloon_worker, 0); + wake_up(&balloon_thread_wq); return NOTIFY_OK; } @@ -508,18 +508,43 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) } /* - * As this is a work item it is guaranteed to run as a single instance only. + * Stop waiting if either state is not BP_EAGAIN and ballooning action is + * needed, or if the credit has changed while state is BP_EAGAIN. + */ +static bool balloon_thread_cond(enum bp_state state, long credit) +{ + if (state != BP_EAGAIN) + credit = 0; + + return current_credit() != credit || kthread_should_stop(); +} + +/* + * As this is a kthread it is guaranteed to run as a single instance only. * We may of course race updates of the target counts (which are protected * by the balloon lock), or with changes to the Xen hard limit, but we will * recover from these in time. */ -static void balloon_process(struct work_struct *work) +static int balloon_thread(void *unused) { enum bp_state state = BP_DONE; long credit; + unsigned long timeout; + + set_freezable(); + for (;;) { + if (state == BP_EAGAIN) + timeout = balloon_stats.schedule_delay * HZ; + else + timeout = 3600 * HZ; + credit = current_credit(); + wait_event_interruptible_timeout(balloon_thread_wq, + balloon_thread_cond(state, credit), timeout); + + if (kthread_should_stop()) + return 0; - do { mutex_lock(&balloon_mutex); credit = current_credit(); @@ -546,12 +571,7 @@ static void balloon_process(struct work_struct *work) mutex_unlock(&balloon_mutex); cond_resched(); - - } while (credit && state == BP_DONE); - - /* Schedule more work if there is some still to be done. */ - if (state == BP_EAGAIN) - schedule_delayed_work(&balloon_worker, balloon_stats.schedule_delay * HZ); + } } /* Resets the Xen limit, sets new target, and kicks off processing. */ @@ -559,7 +579,7 @@ void balloon_set_new_target(unsigned long target) { /* No need for lock. Not read-modify-write updates. */ balloon_stats.target_pages = target; - schedule_delayed_work(&balloon_worker, 0); + wake_up(&balloon_thread_wq); } EXPORT_SYMBOL_GPL(balloon_set_new_target); @@ -664,7 +684,7 @@ void free_xenballooned_pages(int nr_pages, struct page **pages) /* The balloon may be too large now. Shrink it if needed. */ if (current_credit()) - schedule_delayed_work(&balloon_worker, 0); + wake_up(&balloon_thread_wq); mutex_unlock(&balloon_mutex); } @@ -696,6 +716,8 @@ static void __init balloon_add_region(unsigned long start_pfn, static int __init balloon_init(void) { + struct task_struct *task; + if (!xen_domain()) return -ENODEV; @@ -739,6 +761,12 @@ static int __init balloon_init(void) } #endif + task = kthread_run(balloon_thread, NULL, "xen-balloon"); + if (IS_ERR(task)) { + pr_err("xen-balloon thread could not be started, ballooning will not work!\n"); + return PTR_ERR(task); + } + /* Init the xen-balloon driver. */ xen_balloon_init(); From 969ea96721cd3323b8adb576a204a6cf23d3fef1 Mon Sep 17 00:00:00 2001 From: Anton Eidelman Date: Sun, 12 Sep 2021 12:54:57 -0600 Subject: [PATCH 0991/5344] nvme-multipath: fix ANA state updates when a namespace is not present [ Upstream commit 79f528afa93918519574773ea49a444c104bc1bd ] nvme_update_ana_state() has a deficiency that results in a failure to properly update the ana state for a namespace in the following case: NSIDs in ctrl->namespaces: 1, 3, 4 NSIDs in desc->nsids: 1, 2, 3, 4 Loop iteration 0: ns index = 0, n = 0, ns->head->ns_id = 1, nsid = 1, MATCH. Loop iteration 1: ns index = 1, n = 1, ns->head->ns_id = 3, nsid = 2, NO MATCH. Loop iteration 2: ns index = 2, n = 2, ns->head->ns_id = 4, nsid = 4, MATCH. Where the update to the ANA state of NSID 3 is missed. To fix this increment n and retry the update with the same ns when ns->head->ns_id is higher than nsid, Signed-off-by: Anton Eidelman Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Sasha Levin --- drivers/nvme/host/multipath.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 590b040e90a34..016a67fd41989 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -522,14 +522,17 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl, down_read(&ctrl->namespaces_rwsem); list_for_each_entry(ns, &ctrl->namespaces, list) { - unsigned nsid = le32_to_cpu(desc->nsids[n]); - + unsigned nsid; +again: + nsid = le32_to_cpu(desc->nsids[n]); if (ns->head->ns_id < nsid) continue; if (ns->head->ns_id == nsid) nvme_update_ns_ana_state(desc, ns); if (++n == nr_nsids) break; + if (ns->head->ns_id > nsid) + goto again; } up_read(&ctrl->namespaces_rwsem); return 0; From 9fb693c4766f78424f02120855aadbe9ad7d1da8 Mon Sep 17 00:00:00 2001 From: Andreas Larsson Date: Wed, 8 Sep 2021 09:48:22 +0200 Subject: [PATCH 0992/5344] sparc32: page align size in arch_dma_alloc [ Upstream commit 59583f747664046aaae5588d56d5954fab66cce8 ] Commit 53b7670e5735 ("sparc: factor the dma coherent mapping into helper") lost the page align for the calls to dma_make_coherent and srmmu_unmapiorange. The latter cannot handle a non page aligned len argument. Signed-off-by: Andreas Larsson Reviewed-by: Sam Ravnborg Signed-off-by: Christoph Hellwig Signed-off-by: Sasha Levin --- arch/sparc/kernel/ioport.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c index f89603855f1ec..b87e0002131dd 100644 --- a/arch/sparc/kernel/ioport.c +++ b/arch/sparc/kernel/ioport.c @@ -356,7 +356,9 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, void arch_dma_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) { - if (!sparc_dma_free_resource(cpu_addr, PAGE_ALIGN(size))) + size = PAGE_ALIGN(size); + + if (!sparc_dma_free_resource(cpu_addr, size)) return; dma_make_coherent(dma_addr, size); From 2259f82666556a6bfbb4a0a4a03673e5cd60fa69 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 14 Sep 2021 20:52:24 -0700 Subject: [PATCH 0993/5344] compiler.h: Introduce absolute_pointer macro [ Upstream commit f6b5f1a56987de837f8e25cd560847106b8632a8 ] absolute_pointer() disassociates a pointer from its originating symbol type and context. Use it to prevent compiler warnings/errors such as drivers/net/ethernet/i825xx/82596.c: In function 'i82596_probe': arch/m68k/include/asm/string.h:72:25: error: '__builtin_memcpy' reading 6 bytes from a region of size 0 [-Werror=stringop-overread] Such warnings may be reported by gcc 11.x for string and memory operations on fixed addresses. Suggested-by: Linus Torvalds Signed-off-by: Guenter Roeck Reviewed-by: Geert Uytterhoeven Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- include/linux/compiler.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 9446e8fbe55c5..bce983406aaf3 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -233,6 +233,8 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, (typeof(ptr)) (__ptr + (off)); }) #endif +#define absolute_pointer(val) RELOC_HIDE((void *)(val), 0) + #ifndef OPTIMIZER_HIDE_VAR /* Make the optimizer believe the variable can be manipulated arbitrarily. */ #define OPTIMIZER_HIDE_VAR(var) \ From 5a0dc974119d0eb483497c9b00727bfa80dabd09 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 14 Sep 2021 20:52:25 -0700 Subject: [PATCH 0994/5344] net: i825xx: Use absolute_pointer for memcpy from fixed memory location [ Upstream commit dff2d13114f0beec448da9b3716204eb34b0cf41 ] gcc 11.x reports the following compiler warning/error. drivers/net/ethernet/i825xx/82596.c: In function 'i82596_probe': arch/m68k/include/asm/string.h:72:25: error: '__builtin_memcpy' reading 6 bytes from a region of size 0 [-Werror=stringop-overread] Use absolute_pointer() to work around the problem. Cc: Geert Uytterhoeven Signed-off-by: Guenter Roeck Reviewed-by: Geert Uytterhoeven Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- drivers/net/ethernet/i825xx/82596.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/i825xx/82596.c b/drivers/net/ethernet/i825xx/82596.c index 92929750f8325..54d5b402b0e8d 100644 --- a/drivers/net/ethernet/i825xx/82596.c +++ b/drivers/net/ethernet/i825xx/82596.c @@ -1155,7 +1155,7 @@ struct net_device * __init i82596_probe(int unit) err = -ENODEV; goto out; } - memcpy(eth_addr, (void *) 0xfffc1f2c, ETH_ALEN); /* YUCK! Get addr from NOVRAM */ + memcpy(eth_addr, absolute_pointer(0xfffc1f2c), ETH_ALEN); /* YUCK! Get addr from NOVRAM */ dev->base_addr = MVME_I596_BASE; dev->irq = (unsigned) MVME16x_IRQ_I596; goto found; From 025020a56865ef94d7f88a6c5980c8d300915c79 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 6 Sep 2021 16:06:04 -0700 Subject: [PATCH 0995/5344] sparc: avoid stringop-overread errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit fc7c028dcdbfe981bca75d2a7b95f363eb691ef3 ] The sparc mdesc code does pointer games with 'struct mdesc_hdr', but didn't describe to the compiler how that header is then followed by the data that the header describes. As a result, gcc is now unhappy since it does stricter pointer range tracking, and doesn't understand about how these things work. This results in various errors like: arch/sparc/kernel/mdesc.c: In function ‘mdesc_node_by_name’: arch/sparc/kernel/mdesc.c:647:22: error: ‘strcmp’ reading 1 or more bytes from a region of size 0 [-Werror=stringop-overread] 647 | if (!strcmp(names + ep[ret].name_offset, name)) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ which are easily avoided by just describing 'struct mdesc_hdr' better, and making the node_block() helper function look into that unsized data[] that follows the header. This makes the sparc64 build happy again at least for my cross-compiler version (gcc version 11.2.1). Link: https://lore.kernel.org/lkml/CAHk-=wi4NW3NC0xWykkw=6LnjQD6D_rtRtxY9g8gQAJXtQMi8A@mail.gmail.com/ Cc: Guenter Roeck Cc: David S. Miller Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- arch/sparc/kernel/mdesc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/sparc/kernel/mdesc.c b/arch/sparc/kernel/mdesc.c index 8e645ddac58e2..30f171b7b00c2 100644 --- a/arch/sparc/kernel/mdesc.c +++ b/arch/sparc/kernel/mdesc.c @@ -39,6 +39,7 @@ struct mdesc_hdr { u32 node_sz; /* node block size */ u32 name_sz; /* name block size */ u32 data_sz; /* data block size */ + char data[]; } __attribute__((aligned(16))); struct mdesc_elem { @@ -612,7 +613,7 @@ EXPORT_SYMBOL(mdesc_get_node_info); static struct mdesc_elem *node_block(struct mdesc_hdr *mdesc) { - return (struct mdesc_elem *) (mdesc + 1); + return (struct mdesc_elem *) mdesc->data; } static void *name_block(struct mdesc_hdr *mdesc) From bb058054dcb34d498a66cfbf78dbbfa6f303aa81 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 15 Sep 2021 13:56:37 -0700 Subject: [PATCH 0996/5344] qnx4: avoid stringop-overread errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit b7213ffa0e585feb1aee3e7173e965e66ee0abaa ] The qnx4 directory entries are 64-byte blocks that have different contents depending on the a status byte that is in the last byte of the block. In particular, a directory entry can be either a "link info" entry with a 48-byte name and pointers to the real inode information, or an "inode entry" with a smaller 16-byte name and the full inode information. But the code was written to always just treat the directory name as if it was part of that "inode entry", and just extend the name to the longer case if the status byte said it was a link entry. That work just fine and gives the right results, but now that gcc is tracking data structure accesses much more, the code can trigger a compiler error about using up to 48 bytes (the long name) in a structure that only has that shorter name in it: fs/qnx4/dir.c: In function ‘qnx4_readdir’: fs/qnx4/dir.c:51:32: error: ‘strnlen’ specified bound 48 exceeds source size 16 [-Werror=stringop-overread] 51 | size = strnlen(de->di_fname, size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~ In file included from fs/qnx4/qnx4.h:3, from fs/qnx4/dir.c:16: include/uapi/linux/qnx4_fs.h:45:25: note: source object declared here 45 | char di_fname[QNX4_SHORT_NAME_MAX]; | ^~~~~~~~ which is because the source code doesn't really make this whole "one of two different types" explicit. Fix this by introducing a very explicit union of the two types, and basically explaining to the compiler what is really going on. Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- fs/qnx4/dir.c | 51 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c index a6ee23aadd283..2a66844b7ff87 100644 --- a/fs/qnx4/dir.c +++ b/fs/qnx4/dir.c @@ -15,13 +15,27 @@ #include #include "qnx4.h" +/* + * A qnx4 directory entry is an inode entry or link info + * depending on the status field in the last byte. The + * first byte is where the name start either way, and a + * zero means it's empty. + */ +union qnx4_directory_entry { + struct { + char de_name; + char de_pad[62]; + char de_status; + }; + struct qnx4_inode_entry inode; + struct qnx4_link_info link; +}; + static int qnx4_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); unsigned int offset; struct buffer_head *bh; - struct qnx4_inode_entry *de; - struct qnx4_link_info *le; unsigned long blknum; int ix, ino; int size; @@ -38,27 +52,30 @@ static int qnx4_readdir(struct file *file, struct dir_context *ctx) } ix = (ctx->pos >> QNX4_DIR_ENTRY_SIZE_BITS) % QNX4_INODES_PER_BLOCK; for (; ix < QNX4_INODES_PER_BLOCK; ix++, ctx->pos += QNX4_DIR_ENTRY_SIZE) { + union qnx4_directory_entry *de; + const char *name; + offset = ix * QNX4_DIR_ENTRY_SIZE; - de = (struct qnx4_inode_entry *) (bh->b_data + offset); - if (!de->di_fname[0]) + de = (union qnx4_directory_entry *) (bh->b_data + offset); + + if (!de->de_name) continue; - if (!(de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK))) + if (!(de->de_status & (QNX4_FILE_USED|QNX4_FILE_LINK))) continue; - if (!(de->di_status & QNX4_FILE_LINK)) - size = QNX4_SHORT_NAME_MAX; - else - size = QNX4_NAME_MAX; - size = strnlen(de->di_fname, size); - QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, de->di_fname)); - if (!(de->di_status & QNX4_FILE_LINK)) + if (!(de->de_status & QNX4_FILE_LINK)) { + size = sizeof(de->inode.di_fname); + name = de->inode.di_fname; ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1; - else { - le = (struct qnx4_link_info*)de; - ino = ( le32_to_cpu(le->dl_inode_blk) - 1 ) * + } else { + size = sizeof(de->link.dl_fname); + name = de->link.dl_fname; + ino = ( le32_to_cpu(de->link.dl_inode_blk) - 1 ) * QNX4_INODES_PER_BLOCK + - le->dl_inode_ndx; + de->link.dl_inode_ndx; } - if (!dir_emit(ctx, de->di_fname, size, ino, DT_UNKNOWN)) { + size = strnlen(name, size); + QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, name)); + if (!dir_emit(ctx, name, size, ino, DT_UNKNOWN)) { brelse(bh); return 0; } From 54cb0f0b8aa565240d1595d7b56b20d3805cfa66 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 16 Sep 2021 08:35:42 +0200 Subject: [PATCH 0997/5344] parisc: Use absolute_pointer() to define PAGE0 [ Upstream commit 90cc7bed1ed19f869ae7221a6b41887fe762a6a3 ] Use absolute_pointer() wrapper for PAGE0 to avoid this compiler warning: arch/parisc/kernel/setup.c: In function 'start_parisc': error: '__builtin_memcmp_eq' specified bound 8 exceeds source size 0 Signed-off-by: Helge Deller Co-Developed-by: Guenter Roeck Suggested-by: Linus Torvalds Signed-off-by: Sasha Levin --- arch/parisc/include/asm/page.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/parisc/include/asm/page.h b/arch/parisc/include/asm/page.h index 93caf17ac5e2f..9ebf3b0413d5f 100644 --- a/arch/parisc/include/asm/page.h +++ b/arch/parisc/include/asm/page.h @@ -181,7 +181,7 @@ extern int npmem_ranges; #include #include -#define PAGE0 ((struct zeropage *)__PAGE_OFFSET) +#define PAGE0 ((struct zeropage *)absolute_pointer(__PAGE_OFFSET)) /* DEFINITION OF THE ZERO-PAGE (PAG0) */ /* based on work by Jason Eckhardt (jason@equator.com) */ From b98e3dda814ad52bcea8664b6a4bdd467879c107 Mon Sep 17 00:00:00 2001 From: Dan Li Date: Tue, 14 Sep 2021 17:44:02 +0800 Subject: [PATCH 0998/5344] arm64: Mark __stack_chk_guard as __ro_after_init [ Upstream commit 9fcb2e93f41c07a400885325e7dbdfceba6efaec ] __stack_chk_guard is setup once while init stage and never changed after that. Although the modification of this variable at runtime will usually cause the kernel to crash (so does the attacker), it should be marked as __ro_after_init, and it should not affect performance if it is placed in the ro_after_init section. Signed-off-by: Dan Li Acked-by: Mark Rutland Link: https://lore.kernel.org/r/1631612642-102881-1-git-send-email-ashimida@linux.alibaba.com Signed-off-by: Catalin Marinas Signed-off-by: Sasha Levin --- arch/arm64/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 09974216384fd..43b82d9e81e77 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -56,7 +56,7 @@ #if defined(CONFIG_STACKPROTECTOR) && !defined(CONFIG_STACKPROTECTOR_PER_TASK) #include -unsigned long __stack_chk_guard __read_mostly; +unsigned long __stack_chk_guard __ro_after_init; EXPORT_SYMBOL(__stack_chk_guard); #endif From 9ae8825a4ace9f5210b20d11f49f29e4da84c012 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Wed, 8 Sep 2021 22:00:33 -0700 Subject: [PATCH 0999/5344] alpha: Declare virt_to_phys and virt_to_bus parameter as pointer to volatile [ Upstream commit 35a3f4ef0ab543daa1725b0c963eb8c05e3376f8 ] Some drivers pass a pointer to volatile data to virt_to_bus() and virt_to_phys(), and that works fine. One exception is alpha. This results in a number of compile errors such as drivers/net/wan/lmc/lmc_main.c: In function 'lmc_softreset': drivers/net/wan/lmc/lmc_main.c:1782:50: error: passing argument 1 of 'virt_to_bus' discards 'volatile' qualifier from pointer target type drivers/atm/ambassador.c: In function 'do_loader_command': drivers/atm/ambassador.c:1747:58: error: passing argument 1 of 'virt_to_bus' discards 'volatile' qualifier from pointer target type Declare the parameter of virt_to_phys and virt_to_bus as pointer to volatile to fix the problem. Signed-off-by: Guenter Roeck Acked-by: Arnd Bergmann Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- arch/alpha/include/asm/io.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/alpha/include/asm/io.h b/arch/alpha/include/asm/io.h index 103270d5a9fc6..66a384a4ddbad 100644 --- a/arch/alpha/include/asm/io.h +++ b/arch/alpha/include/asm/io.h @@ -61,7 +61,7 @@ extern inline void set_hae(unsigned long new_hae) * Change virtual addresses to physical addresses and vv. */ #ifdef USE_48_BIT_KSEG -static inline unsigned long virt_to_phys(void *address) +static inline unsigned long virt_to_phys(volatile void *address) { return (unsigned long)address - IDENT_ADDR; } @@ -71,7 +71,7 @@ static inline void * phys_to_virt(unsigned long address) return (void *) (address + IDENT_ADDR); } #else -static inline unsigned long virt_to_phys(void *address) +static inline unsigned long virt_to_phys(volatile void *address) { unsigned long phys = (unsigned long)address; @@ -107,7 +107,7 @@ static inline void * phys_to_virt(unsigned long address) extern unsigned long __direct_map_base; extern unsigned long __direct_map_size; -static inline unsigned long __deprecated virt_to_bus(void *address) +static inline unsigned long __deprecated virt_to_bus(volatile void *address) { unsigned long phys = virt_to_phys(address); unsigned long bus = phys + __direct_map_base; From f3c4abe8a6c2f4aa653859a60aa612b79497d37c Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Wed, 8 Sep 2021 20:57:43 -0700 Subject: [PATCH 1000/5344] net: 6pack: Fix tx timeout and slot time [ Upstream commit 3c0d2a46c0141913dc6fd126c57d0615677d946e ] tx timeout and slot time are currently specified in units of HZ. On Alpha, HZ is defined as 1024. When building alpha:allmodconfig, this results in the following error message. drivers/net/hamradio/6pack.c: In function 'sixpack_open': drivers/net/hamradio/6pack.c:71:41: error: unsigned conversion from 'int' to 'unsigned char' changes value from '256' to '0' In the 6PACK protocol, tx timeout is specified in units of 10 ms and transmitted over the wire: https://www.linux-ax25.org/wiki/6PACK Defining a value dependent on HZ doesn't really make sense, and presumably comes from the (very historical) situation where HZ was originally 100. Note that the SIXP_SLOTTIME use explicitly is about 10ms granularity: mod_timer(&sp->tx_t, jiffies + ((when + 1) * HZ) / 100); and the SIXP_TXDELAY walue is sent as a byte over the wire. Signed-off-by: Guenter Roeck Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- drivers/net/hamradio/6pack.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c index da13683d52d1a..bd0beb16d68a9 100644 --- a/drivers/net/hamradio/6pack.c +++ b/drivers/net/hamradio/6pack.c @@ -68,9 +68,9 @@ #define SIXP_DAMA_OFF 0 /* default level 2 parameters */ -#define SIXP_TXDELAY (HZ/4) /* in 1 s */ +#define SIXP_TXDELAY 25 /* 250 ms */ #define SIXP_PERSIST 50 /* in 256ths */ -#define SIXP_SLOTTIME (HZ/10) /* in 1 s */ +#define SIXP_SLOTTIME 10 /* 100 ms */ #define SIXP_INIT_RESYNC_TIMEOUT (3*HZ/2) /* in 1 s */ #define SIXP_RESYNC_TIMEOUT 5*HZ /* in 1 s */ From 6f80d15e51e60333ebce7025c0a7285c646355e7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 18 Sep 2021 10:05:06 -0700 Subject: [PATCH 1001/5344] spi: Fix tegra20 build with CONFIG_PM=n MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit efafec27c5658ed987e720130772f8933c685e87 ] Without CONFIG_PM enabled, the SET_RUNTIME_PM_OPS() macro ends up being empty, and the only use of tegra_slink_runtime_{resume,suspend} goes away, resulting in drivers/spi/spi-tegra20-slink.c:1200:12: error: ‘tegra_slink_runtime_resume’ defined but not used [-Werror=unused-function] 1200 | static int tegra_slink_runtime_resume(struct device *dev) | ^~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/spi/spi-tegra20-slink.c:1188:12: error: ‘tegra_slink_runtime_suspend’ defined but not used [-Werror=unused-function] 1188 | static int tegra_slink_runtime_suspend(struct device *dev) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~ mark the functions __maybe_unused to make the build happy. This hits the alpha allmodconfig build (and others). Reported-by: Guenter Roeck Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- drivers/spi/spi-tegra20-slink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-tegra20-slink.c b/drivers/spi/spi-tegra20-slink.c index 2a1905c43a0b7..9b59539c87359 100644 --- a/drivers/spi/spi-tegra20-slink.c +++ b/drivers/spi/spi-tegra20-slink.c @@ -1205,7 +1205,7 @@ static int tegra_slink_resume(struct device *dev) } #endif -static int tegra_slink_runtime_suspend(struct device *dev) +static int __maybe_unused tegra_slink_runtime_suspend(struct device *dev) { struct spi_master *master = dev_get_drvdata(dev); struct tegra_slink_data *tspi = spi_master_get_devdata(master); @@ -1217,7 +1217,7 @@ static int tegra_slink_runtime_suspend(struct device *dev) return 0; } -static int tegra_slink_runtime_resume(struct device *dev) +static int __maybe_unused tegra_slink_runtime_resume(struct device *dev) { struct spi_master *master = dev_get_drvdata(dev); struct tegra_slink_data *tspi = spi_master_get_devdata(master); From fa6ea44915d449566e30ffbff23137a54f2d6309 Mon Sep 17 00:00:00 2001 From: Sai Krishna Potthuri Date: Wed, 18 Aug 2021 12:53:14 +0530 Subject: [PATCH 1002/5344] EDAC/synopsys: Fix wrong value type assignment for edac_mode commit 5297cfa6bdf93e3889f78f9b482e2a595a376083 upstream. dimm->edac_mode contains values of type enum edac_type - not the corresponding capability flags. Fix that. Issue caught by Coverity check "enumerated type mixed with another type." [ bp: Rewrite commit message, add tags. ] Fixes: ae9b56e3996d ("EDAC, synps: Add EDAC support for zynq ddr ecc controller") Signed-off-by: Sai Krishna Potthuri Signed-off-by: Shubhrajyoti Datta Signed-off-by: Borislav Petkov Cc: Link: https://lkml.kernel.org/r/20210818072315.15149-1-shubhrajyoti.datta@xilinx.com Signed-off-by: Greg Kroah-Hartman --- drivers/edac/synopsys_edac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/edac/synopsys_edac.c b/drivers/edac/synopsys_edac.c index 880ffd8337187..6becf3363ad57 100644 --- a/drivers/edac/synopsys_edac.c +++ b/drivers/edac/synopsys_edac.c @@ -782,7 +782,7 @@ static void init_csrows(struct mem_ctl_info *mci) for (j = 0; j < csi->nr_channels; j++) { dimm = csi->channels[j]->dimm; - dimm->edac_mode = EDAC_FLAG_SECDED; + dimm->edac_mode = EDAC_SECDED; dimm->mtype = p_data->get_mtype(priv->baseaddr); dimm->nr_pages = (size >> PAGE_SHIFT) / csi->nr_channels; dimm->grain = SYNPS_EDAC_ERR_GRAIN; From 39a5535b5c779d603708bcd41c91f61bc44b645c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 24 Jun 2021 23:55:46 +0200 Subject: [PATCH 1003/5344] arm64: dts: marvell: armada-37xx: Extend PCIe MEM space MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 514ef1e62d6521c2199d192b1c71b79d2aa21d5a upstream. Current PCIe MEM space of size 16 MB is not enough for some combination of PCIe cards (e.g. NVMe disk together with ath11k wifi card). ARM Trusted Firmware for Armada 3700 platform already assigns 128 MB for PCIe window, so extend PCIe MEM space to the end of 128 MB PCIe window which allows to allocate more PCIe BARs for more PCIe cards. Without this change some combination of PCIe cards cannot be used and kernel show error messages in dmesg during initialization: pci 0000:00:00.0: BAR 8: no space for [mem size 0x01800000] pci 0000:00:00.0: BAR 8: failed to assign [mem size 0x01800000] pci 0000:00:00.0: BAR 6: assigned [mem 0xe8000000-0xe80007ff pref] pci 0000:01:00.0: BAR 8: no space for [mem size 0x01800000] pci 0000:01:00.0: BAR 8: failed to assign [mem size 0x01800000] pci 0000:02:03.0: BAR 8: no space for [mem size 0x01000000] pci 0000:02:03.0: BAR 8: failed to assign [mem size 0x01000000] pci 0000:02:07.0: BAR 8: no space for [mem size 0x00100000] pci 0000:02:07.0: BAR 8: failed to assign [mem size 0x00100000] pci 0000:03:00.0: BAR 0: no space for [mem size 0x01000000 64bit] pci 0000:03:00.0: BAR 0: failed to assign [mem size 0x01000000 64bit] Due to bugs in U-Boot port for Turris Mox, the second range in Turris Mox kernel DTS file for PCIe must start at 16 MB offset. Otherwise U-Boot crashes during loading of kernel DTB file. This bug is present only in U-Boot code for Turris Mox and therefore other Armada 3700 devices are not affected by this bug. Bug is fixed in U-Boot version 2021.07. To not break booting new kernels on existing versions of U-Boot on Turris Mox, use first 16 MB range for IO and second range with rest of PCIe window for MEM. Signed-off-by: Pali Rohár Fixes: 76f6386b25cc ("arm64: dts: marvell: Add Aardvark PCIe support for Armada 3700") Signed-off-by: Gregory CLEMENT Signed-off-by: Greg Kroah-Hartman --- .../boot/dts/marvell/armada-3720-turris-mox.dts | 17 +++++++++++++++++ arch/arm64/boot/dts/marvell/armada-37xx.dtsi | 11 +++++++++-- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts b/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts index 025e02d23da9b..de0eabff29353 100644 --- a/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts +++ b/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts @@ -138,6 +138,23 @@ max-link-speed = <2>; reset-gpios = <&gpiosb 3 GPIO_ACTIVE_LOW>; phys = <&comphy1 0>; + /* + * U-Boot port for Turris Mox has a bug which always expects that "ranges" DT property + * contains exactly 2 ranges with 3 (child) address cells, 2 (parent) address cells and + * 2 size cells and also expects that the second range starts at 16 MB offset. If these + * conditions are not met then U-Boot crashes during loading kernel DTB file. PCIe address + * space is 128 MB long, so the best split between MEM and IO is to use fixed 16 MB window + * for IO and the rest 112 MB (64+32+16) for MEM, despite that maximal IO size is just 64 kB. + * This bug is not present in U-Boot ports for other Armada 3700 devices and is fixed in + * U-Boot version 2021.07. See relevant U-Boot commits (the last one contains fix): + * https://source.denx.de/u-boot/u-boot/-/commit/cb2ddb291ee6fcbddd6d8f4ff49089dfe580f5d7 + * https://source.denx.de/u-boot/u-boot/-/commit/c64ac3b3185aeb3846297ad7391fc6df8ecd73bf + * https://source.denx.de/u-boot/u-boot/-/commit/4a82fca8e330157081fc132a591ebd99ba02ee33 + */ + #address-cells = <3>; + #size-cells = <2>; + ranges = <0x81000000 0 0xe8000000 0 0xe8000000 0 0x01000000 /* Port 0 IO */ + 0x82000000 0 0xe9000000 0 0xe9000000 0 0x07000000>; /* Port 0 MEM */ /* enabled by U-Boot if PCIe module is present */ status = "disabled"; diff --git a/arch/arm64/boot/dts/marvell/armada-37xx.dtsi b/arch/arm64/boot/dts/marvell/armada-37xx.dtsi index 52767037e0494..c28611c1c251a 100644 --- a/arch/arm64/boot/dts/marvell/armada-37xx.dtsi +++ b/arch/arm64/boot/dts/marvell/armada-37xx.dtsi @@ -487,8 +487,15 @@ #interrupt-cells = <1>; msi-parent = <&pcie0>; msi-controller; - ranges = <0x82000000 0 0xe8000000 0 0xe8000000 0 0x1000000 /* Port 0 MEM */ - 0x81000000 0 0xe9000000 0 0xe9000000 0 0x10000>; /* Port 0 IO*/ + /* + * The 128 MiB address range [0xe8000000-0xf0000000] is + * dedicated for PCIe and can be assigned to 8 windows + * with size a power of two. Use one 64 KiB window for + * IO at the end and the remaining seven windows + * (totaling 127 MiB) for MEM. + */ + ranges = <0x82000000 0 0xe8000000 0 0xe8000000 0 0x07f00000 /* Port 0 MEM */ + 0x81000000 0 0xefff0000 0 0xefff0000 0 0x00010000>; /* Port 0 IO */ interrupt-map-mask = <0 0 0 7>; interrupt-map = <0 0 0 1 &pcie_intc 0>, <0 0 0 2 &pcie_intc 1>, From ccc00740b65c5afa93e11ea010a49c15bf0b27e6 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 20 Sep 2021 12:03:45 +0200 Subject: [PATCH 1004/5344] xen/balloon: fix balloon kthread freezing commit 96f5bd03e1be606987644b71899ea56a8d05f825 upstream. Commit 8480ed9c2bbd56 ("xen/balloon: use a kernel thread instead a workqueue") switched the Xen balloon driver to use a kernel thread. Unfortunately the patch omitted to call try_to_freeze() or to use wait_event_freezable_timeout(), causing a system suspend to fail. Fixes: 8480ed9c2bbd56 ("xen/balloon: use a kernel thread instead a workqueue") Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Link: https://lore.kernel.org/r/20210920100345.21939-1-jgross@suse.com Signed-off-by: Juergen Gross Signed-off-by: Greg Kroah-Hartman --- drivers/xen/balloon.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 2762d246991b6..be31c296eed4c 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -539,8 +539,8 @@ static int balloon_thread(void *unused) timeout = 3600 * HZ; credit = current_credit(); - wait_event_interruptible_timeout(balloon_thread_wq, - balloon_thread_cond(state, credit), timeout); + wait_event_freezable_timeout(balloon_thread_wq, + balloon_thread_cond(state, credit), timeout); if (kthread_should_stop()) return 0; From 802837ace3eb0016e352fc75d9f632c67c2e1124 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 20 Sep 2021 10:26:21 -0700 Subject: [PATCH 1005/5344] qnx4: work around gcc false positive warning bug commit d5f6545934c47e97c0b48a645418e877b452a992 upstream. In commit b7213ffa0e58 ("qnx4: avoid stringop-overread errors") I tried to teach gcc about how the directory entry structure can be two different things depending on a status flag. It made the code clearer, and it seemed to make gcc happy. However, Arnd points to a gcc bug, where despite using two different members of a union, gcc then gets confused, and uses the size of one of the members to decide if a string overrun happens. And not necessarily the rigth one. End result: with some configurations, gcc-11 will still complain about the source buffer size being overread: fs/qnx4/dir.c: In function 'qnx4_readdir': fs/qnx4/dir.c:76:32: error: 'strnlen' specified bound [16, 48] exceeds source size 1 [-Werror=stringop-overread] 76 | size = strnlen(name, size); | ^~~~~~~~~~~~~~~~~~~ fs/qnx4/dir.c:26:22: note: source object declared here 26 | char de_name; | ^~~~~~~ because gcc will get confused about which union member entry is actually getting accessed, even when the source code is very clear about it. Gcc internally will have combined two "redundant" pointers (pointing to different union elements that are at the same offset), and takes the size checking from one or the other - not necessarily the right one. This is clearly a gcc bug, but we can work around it fairly easily. The biggest thing here is the big honking comment about why we do what we do. Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99578#c6 Reported-and-tested-by: Arnd Bergmann Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/qnx4/dir.c | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c index 2a66844b7ff87..66645a5a35f30 100644 --- a/fs/qnx4/dir.c +++ b/fs/qnx4/dir.c @@ -20,12 +20,33 @@ * depending on the status field in the last byte. The * first byte is where the name start either way, and a * zero means it's empty. + * + * Also, due to a bug in gcc, we don't want to use the + * real (differently sized) name arrays in the inode and + * link entries, but always the 'de_name[]' one in the + * fake struct entry. + * + * See + * + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99578#c6 + * + * for details, but basically gcc will take the size of the + * 'name' array from one of the used union entries randomly. + * + * This use of 'de_name[]' (48 bytes) avoids the false positive + * warnings that would happen if gcc decides to use 'inode.di_name' + * (16 bytes) even when the pointer and size were to come from + * 'link.dl_name' (48 bytes). + * + * In all cases the actual name pointer itself is the same, it's + * only the gcc internal 'what is the size of this field' logic + * that can get confused. */ union qnx4_directory_entry { struct { - char de_name; - char de_pad[62]; - char de_status; + const char de_name[48]; + u8 de_pad[15]; + u8 de_status; }; struct qnx4_inode_entry inode; struct qnx4_link_info link; @@ -53,29 +74,26 @@ static int qnx4_readdir(struct file *file, struct dir_context *ctx) ix = (ctx->pos >> QNX4_DIR_ENTRY_SIZE_BITS) % QNX4_INODES_PER_BLOCK; for (; ix < QNX4_INODES_PER_BLOCK; ix++, ctx->pos += QNX4_DIR_ENTRY_SIZE) { union qnx4_directory_entry *de; - const char *name; offset = ix * QNX4_DIR_ENTRY_SIZE; de = (union qnx4_directory_entry *) (bh->b_data + offset); - if (!de->de_name) + if (!de->de_name[0]) continue; if (!(de->de_status & (QNX4_FILE_USED|QNX4_FILE_LINK))) continue; if (!(de->de_status & QNX4_FILE_LINK)) { size = sizeof(de->inode.di_fname); - name = de->inode.di_fname; ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1; } else { size = sizeof(de->link.dl_fname); - name = de->link.dl_fname; ino = ( le32_to_cpu(de->link.dl_inode_blk) - 1 ) * QNX4_INODES_PER_BLOCK + de->link.dl_inode_ndx; } - size = strnlen(name, size); + size = strnlen(de->de_name, size); QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, name)); - if (!dir_emit(ctx, name, size, ino, DT_UNKNOWN)) { + if (!dir_emit(ctx, de->de_name, size, ino, DT_UNKNOWN)) { brelse(bh); return 0; } From ee8409b311d4e73d051e89114189bc65afb2ce49 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 30 Sep 2021 10:09:26 +0200 Subject: [PATCH 1006/5344] Linux 5.4.150 Link: https://lore.kernel.org/r/20210927170219.901812470@linuxfoundation.org Tested-by: Florian Fainelli Tested-by: Shuah Khan Tested-by: Jon Hunter Tested-by: Linux Kernel Functional Testing Tested-by: Sudip Mukherjee Tested-by: Hulk Robot Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 1834f47fbaf61..c6b3a3d62f6ca 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 PATCHLEVEL = 4 -SUBLEVEL = 149 +SUBLEVEL = 150 EXTRAVERSION = NAME = Kleptomaniac Octopus From dabe8e2742a3e4b4a731d8d0a14f76acc189a1bd Mon Sep 17 00:00:00 2001 From: Igor Matheus Andrade Torrente Date: Mon, 28 Jun 2021 10:45:09 -0300 Subject: [PATCH 1007/5344] tty: Fix out-of-bound vmalloc access in imageblit [ Upstream commit 3b0c406124719b625b1aba431659f5cdc24a982c ] This issue happens when a userspace program does an ioctl FBIOPUT_VSCREENINFO passing the fb_var_screeninfo struct containing only the fields xres, yres, and bits_per_pixel with values. If this struct is the same as the previous ioctl, the vc_resize() detects it and doesn't call the resize_screen(), leaving the fb_var_screeninfo incomplete. And this leads to the updatescrollmode() calculates a wrong value to fbcon_display->vrows, which makes the real_y() return a wrong value of y, and that value, eventually, causes the imageblit to access an out-of-bound address value. To solve this issue I made the resize_screen() be called even if the screen does not need any resizing, so it will "fix and fill" the fb_var_screeninfo independently. Cc: stable # after 5.15-rc2 is out, give it time to bake Reported-and-tested-by: syzbot+858dc7a2f7ef07c2c219@syzkaller.appspotmail.com Signed-off-by: Igor Matheus Andrade Torrente Link: https://lore.kernel.org/r/20210628134509.15895-1-igormtorrente@gmail.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/tty/vt/vt.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c index 404b80dc06b87..d1ab8561e2581 100644 --- a/drivers/tty/vt/vt.c +++ b/drivers/tty/vt/vt.c @@ -1215,8 +1215,25 @@ static int vc_do_resize(struct tty_struct *tty, struct vc_data *vc, new_row_size = new_cols << 1; new_screen_size = new_row_size * new_rows; - if (new_cols == vc->vc_cols && new_rows == vc->vc_rows) - return 0; + if (new_cols == vc->vc_cols && new_rows == vc->vc_rows) { + /* + * This function is being called here to cover the case + * where the userspace calls the FBIOPUT_VSCREENINFO twice, + * passing the same fb_var_screeninfo containing the fields + * yres/xres equal to a number non-multiple of vc_font.height + * and yres_virtual/xres_virtual equal to number lesser than the + * vc_font.height and yres/xres. + * In the second call, the struct fb_var_screeninfo isn't + * being modified by the underlying driver because of the + * if above, and this causes the fbcon_display->vrows to become + * negative and it eventually leads to out-of-bound + * access by the imageblit function. + * To give the correct values to the struct and to not have + * to deal with possible errors from the code below, we call + * the resize_screen here as well. + */ + return resize_screen(vc, new_cols, new_rows, user); + } if (new_screen_size > KMALLOC_MAX_SIZE || !new_screen_size) return -EINVAL; From 2a81cb5ca296ac3f8a9fa2875898fc055c5fb367 Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Thu, 5 Aug 2021 15:29:17 +0800 Subject: [PATCH 1008/5344] cpufreq: schedutil: Use kobject release() method to free sugov_tunables [ Upstream commit e5c6b312ce3cc97e90ea159446e6bfa06645364d ] The struct sugov_tunables is protected by the kobject, so we can't free it directly. Otherwise we would get a call trace like this: ODEBUG: free active (active state 0) object type: timer_list hint: delayed_work_timer_fn+0x0/0x30 WARNING: CPU: 3 PID: 720 at lib/debugobjects.c:505 debug_print_object+0xb8/0x100 Modules linked in: CPU: 3 PID: 720 Comm: a.sh Tainted: G W 5.14.0-rc1-next-20210715-yocto-standard+ #507 Hardware name: Marvell OcteonTX CN96XX board (DT) pstate: 40400009 (nZcv daif +PAN -UAO -TCO BTYPE=--) pc : debug_print_object+0xb8/0x100 lr : debug_print_object+0xb8/0x100 sp : ffff80001ecaf910 x29: ffff80001ecaf910 x28: ffff00011b10b8d0 x27: ffff800011043d80 x26: ffff00011a8f0000 x25: ffff800013cb3ff0 x24: 0000000000000000 x23: ffff80001142aa68 x22: ffff800011043d80 x21: ffff00010de46f20 x20: ffff800013c0c520 x19: ffff800011d8f5b0 x18: 0000000000000010 x17: 6e6968207473696c x16: 5f72656d6974203a x15: 6570797420746365 x14: 6a626f2029302065 x13: 303378302f307830 x12: 2b6e665f72656d69 x11: ffff8000124b1560 x10: ffff800012331520 x9 : ffff8000100ca6b0 x8 : 000000000017ffe8 x7 : c0000000fffeffff x6 : 0000000000000001 x5 : ffff800011d8c000 x4 : ffff800011d8c740 x3 : 0000000000000000 x2 : ffff0001108301c0 x1 : ab3c90eedf9c0f00 x0 : 0000000000000000 Call trace: debug_print_object+0xb8/0x100 __debug_check_no_obj_freed+0x1c0/0x230 debug_check_no_obj_freed+0x20/0x88 slab_free_freelist_hook+0x154/0x1c8 kfree+0x114/0x5d0 sugov_exit+0xbc/0xc0 cpufreq_exit_governor+0x44/0x90 cpufreq_set_policy+0x268/0x4a8 store_scaling_governor+0xe0/0x128 store+0xc0/0xf0 sysfs_kf_write+0x54/0x80 kernfs_fop_write_iter+0x128/0x1c0 new_sync_write+0xf0/0x190 vfs_write+0x2d4/0x478 ksys_write+0x74/0x100 __arm64_sys_write+0x24/0x30 invoke_syscall.constprop.0+0x54/0xe0 do_el0_svc+0x64/0x158 el0_svc+0x2c/0xb0 el0t_64_sync_handler+0xb0/0xb8 el0t_64_sync+0x198/0x19c irq event stamp: 5518 hardirqs last enabled at (5517): [] console_unlock+0x554/0x6c8 hardirqs last disabled at (5518): [] el1_dbg+0x28/0xa0 softirqs last enabled at (5504): [] __do_softirq+0x4d0/0x6c0 softirqs last disabled at (5483): [] irq_exit+0x1b0/0x1b8 So split the original sugov_tunables_free() into two functions, sugov_clear_global_tunables() is just used to clear the global_tunables and the new sugov_tunables_free() is used as kobj_type::release to release the sugov_tunables safely. Fixes: 9bdcb44e391d ("cpufreq: schedutil: New governor based on scheduler utilization data") Cc: 4.7+ # 4.7+ Signed-off-by: Kevin Hao Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki Signed-off-by: Sasha Levin --- kernel/sched/cpufreq_schedutil.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 4cb80e6042c4f..831fee509404e 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -624,9 +624,17 @@ static struct attribute *sugov_attrs[] = { }; ATTRIBUTE_GROUPS(sugov); +static void sugov_tunables_free(struct kobject *kobj) +{ + struct gov_attr_set *attr_set = container_of(kobj, struct gov_attr_set, kobj); + + kfree(to_sugov_tunables(attr_set)); +} + static struct kobj_type sugov_tunables_ktype = { .default_groups = sugov_groups, .sysfs_ops = &governor_sysfs_ops, + .release = &sugov_tunables_free, }; /********************** cpufreq governor interface *********************/ @@ -726,12 +734,10 @@ static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_polic return tunables; } -static void sugov_tunables_free(struct sugov_tunables *tunables) +static void sugov_clear_global_tunables(void) { if (!have_governor_per_policy()) global_tunables = NULL; - - kfree(tunables); } static int sugov_init(struct cpufreq_policy *policy) @@ -794,7 +800,7 @@ static int sugov_init(struct cpufreq_policy *policy) fail: kobject_put(&tunables->attr_set.kobj); policy->governor_data = NULL; - sugov_tunables_free(tunables); + sugov_clear_global_tunables(); stop_kthread: sugov_kthread_stop(sg_policy); @@ -821,7 +827,7 @@ static void sugov_exit(struct cpufreq_policy *policy) count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook); policy->governor_data = NULL; if (!count) - sugov_tunables_free(tunables); + sugov_clear_global_tunables(); mutex_unlock(&global_tunables_lock); From 6764092c67d3a450731a1897dddb95ef5e264401 Mon Sep 17 00:00:00 2001 From: Pawel Laszczak Date: Thu, 30 Sep 2021 11:42:17 +0200 Subject: [PATCH 1009/5344] usb: cdns3: fix race condition before setting doorbell commit b69ec50b3e55c4b2a85c8bc46763eaf33060584 upstream For DEV_VER_V3 version there exist race condition between clearing ep_sts.EP_STS_TRBERR and setting ep_cmd.EP_CMD_DRDY bit. Setting EP_CMD_DRDY will be ignored by controller when EP_STS_TRBERR is set. So, between these two instructions we have a small time gap in which the EP_STS_TRBERR can be set. In such case the transfer will not start after setting doorbell. Fixes: 7733f6c32e36 ("usb: cdns3: Add Cadence USB3 DRD Driver") cc: # 5.4.x Tested-by: Aswath Govindraju Reviewed-by: Aswath Govindraju Signed-off-by: Pawel Laszczak Signed-off-by: Sasha Levin --- drivers/usb/cdns3/gadget.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/usb/cdns3/gadget.c b/drivers/usb/cdns3/gadget.c index c3bf54cc530f9..296f2ee1b6803 100644 --- a/drivers/usb/cdns3/gadget.c +++ b/drivers/usb/cdns3/gadget.c @@ -807,6 +807,19 @@ static void cdns3_wa1_tray_restore_cycle_bit(struct cdns3_device *priv_dev, cdns3_wa1_restore_cycle_bit(priv_ep); } +static void cdns3_rearm_drdy_if_needed(struct cdns3_endpoint *priv_ep) +{ + struct cdns3_device *priv_dev = priv_ep->cdns3_dev; + + if (priv_dev->dev_ver < DEV_VER_V3) + return; + + if (readl(&priv_dev->regs->ep_sts) & EP_STS_TRBERR) { + writel(EP_STS_TRBERR, &priv_dev->regs->ep_sts); + writel(EP_CMD_DRDY, &priv_dev->regs->ep_cmd); + } +} + /** * cdns3_ep_run_transfer - start transfer on no-default endpoint hardware * @priv_ep: endpoint object @@ -1003,6 +1016,7 @@ int cdns3_ep_run_transfer(struct cdns3_endpoint *priv_ep, /*clearing TRBERR and EP_STS_DESCMIS before seting DRDY*/ writel(EP_STS_TRBERR | EP_STS_DESCMIS, &priv_dev->regs->ep_sts); writel(EP_CMD_DRDY, &priv_dev->regs->ep_cmd); + cdns3_rearm_drdy_if_needed(priv_ep); trace_cdns3_doorbell_epx(priv_ep->name, readl(&priv_dev->regs->ep_traddr)); } From 316909224453d2bc2403eaffd462e1554abd9da7 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 16 Sep 2021 13:34:24 -0700 Subject: [PATCH 1010/5344] fs-verity: fix signed integer overflow with i_size near S64_MAX commit 80f6e3080bfcf865062a926817b3ca6c4a137a57 upstream. If the file size is almost S64_MAX, the calculated number of Merkle tree levels exceeds FS_VERITY_MAX_LEVELS, causing FS_IOC_ENABLE_VERITY to fail. This is unintentional, since as the comment above the definition of FS_VERITY_MAX_LEVELS states, it is enough for over U64_MAX bytes of data using SHA-256 and 4K blocks. (Specifically, 4096*128**8 >= 2**64.) The bug is actually that when the number of blocks in the first level is calculated from i_size, there is a signed integer overflow due to i_size being signed. Fix this by treating i_size as unsigned. This was found by the new test "generic: test fs-verity EFBIG scenarios" (https://lkml.kernel.org/r/b1d116cd4d0ea74b9cd86f349c672021e005a75c.1631558495.git.boris@bur.io). This didn't affect ext4 or f2fs since those have a smaller maximum file size, but it did affect btrfs which allows files up to S64_MAX bytes. Reported-by: Boris Burkov Fixes: 3fda4c617e84 ("fs-verity: implement FS_IOC_ENABLE_VERITY ioctl") Fixes: fd2d1acfcadf ("fs-verity: add the hook for file ->open()") Cc: # v5.4+ Reviewed-by: Boris Burkov Link: https://lore.kernel.org/r/20210916203424.113376-1-ebiggers@kernel.org Signed-off-by: Eric Biggers Signed-off-by: Greg Kroah-Hartman --- fs/verity/enable.c | 2 +- fs/verity/open.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/verity/enable.c b/fs/verity/enable.c index eabc6ac199064..1370bfd17e870 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -136,7 +136,7 @@ static int build_merkle_tree(struct inode *inode, * (level 0) and ascending to the root node (level 'num_levels - 1'). * Then at the end (level 'num_levels'), calculate the root hash. */ - blocks = (inode->i_size + params->block_size - 1) >> + blocks = ((u64)inode->i_size + params->block_size - 1) >> params->log_blocksize; for (level = 0; level <= params->num_levels; level++) { err = build_merkle_tree_level(inode, level, blocks, params, diff --git a/fs/verity/open.c b/fs/verity/open.c index 63d1004b688cb..6200826107c24 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -89,7 +89,7 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, */ /* Compute number of levels and the number of blocks in each level */ - blocks = (inode->i_size + params->block_size - 1) >> log_blocksize; + blocks = ((u64)inode->i_size + params->block_size - 1) >> log_blocksize; pr_debug("Data is %lld bytes (%llu blocks)\n", inode->i_size, blocks); while (blocks > 1) { if (params->num_levels >= FS_VERITY_MAX_LEVELS) { From 1664ee9217fde523436173aabc1055ec670dc07f Mon Sep 17 00:00:00 2001 From: Nadezda Lutovinova Date: Tue, 21 Sep 2021 18:51:53 +0300 Subject: [PATCH 1011/5344] hwmon: (w83793) Fix NULL pointer dereference by removing unnecessary structure field commit dd4d747ef05addab887dc8ff0d6ab9860bbcd783 upstream. If driver read tmp value sufficient for (tmp & 0x08) && (!(tmp & 0x80)) && ((tmp & 0x7) == ((tmp >> 4) & 0x7)) from device then Null pointer dereference occurs. (It is possible if tmp = 0b0xyz1xyz, where same literals mean same numbers) Also lm75[] does not serve a purpose anymore after switching to devm_i2c_new_dummy_device() in w83791d_detect_subclients(). The patch fixes possible NULL pointer dereference by removing lm75[]. Found by Linux Driver Verification project (linuxtesting.org). Cc: stable@vger.kernel.org Signed-off-by: Nadezda Lutovinova Link: https://lore.kernel.org/r/20210921155153.28098-3-lutovinova@ispras.ru [groeck: Dropped unnecessary continuation lines, fixed multi-line alignments] Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- drivers/hwmon/w83793.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/drivers/hwmon/w83793.c b/drivers/hwmon/w83793.c index 9df48b70c70c7..640330a3a8dc4 100644 --- a/drivers/hwmon/w83793.c +++ b/drivers/hwmon/w83793.c @@ -202,7 +202,6 @@ static inline s8 TEMP_TO_REG(long val, s8 min, s8 max) } struct w83793_data { - struct i2c_client *lm75[2]; struct device *hwmon_dev; struct mutex update_lock; char valid; /* !=0 if following fields are valid */ @@ -1566,7 +1565,6 @@ w83793_detect_subclients(struct i2c_client *client) int address = client->addr; u8 tmp; struct i2c_adapter *adapter = client->adapter; - struct w83793_data *data = i2c_get_clientdata(client); id = i2c_adapter_id(adapter); if (force_subclients[0] == id && force_subclients[1] == address) { @@ -1586,21 +1584,19 @@ w83793_detect_subclients(struct i2c_client *client) } tmp = w83793_read_value(client, W83793_REG_I2C_SUBADDR); - if (!(tmp & 0x08)) - data->lm75[0] = devm_i2c_new_dummy_device(&client->dev, adapter, - 0x48 + (tmp & 0x7)); - if (!(tmp & 0x80)) { - if (!IS_ERR(data->lm75[0]) - && ((tmp & 0x7) == ((tmp >> 4) & 0x7))) { - dev_err(&client->dev, - "duplicate addresses 0x%x, " - "use force_subclients\n", data->lm75[0]->addr); - return -ENODEV; - } - data->lm75[1] = devm_i2c_new_dummy_device(&client->dev, adapter, - 0x48 + ((tmp >> 4) & 0x7)); + + if (!(tmp & 0x88) && (tmp & 0x7) == ((tmp >> 4) & 0x7)) { + dev_err(&client->dev, + "duplicate addresses 0x%x, use force_subclient\n", 0x48 + (tmp & 0x7)); + return -ENODEV; } + if (!(tmp & 0x08)) + devm_i2c_new_dummy_device(&client->dev, adapter, 0x48 + (tmp & 0x7)); + + if (!(tmp & 0x80)) + devm_i2c_new_dummy_device(&client->dev, adapter, 0x48 + ((tmp >> 4) & 0x7)); + return 0; } From f4118b830c4295f158651b465073947de385d3af Mon Sep 17 00:00:00 2001 From: Nadezda Lutovinova Date: Tue, 21 Sep 2021 18:51:52 +0300 Subject: [PATCH 1012/5344] hwmon: (w83792d) Fix NULL pointer dereference by removing unnecessary structure field commit 0f36b88173f028e372668ae040ab1a496834d278 upstream. If driver read val value sufficient for (val & 0x08) && (!(val & 0x80)) && ((val & 0x7) == ((val >> 4) & 0x7)) from device then Null pointer dereference occurs. (It is possible if tmp = 0b0xyz1xyz, where same literals mean same numbers) Also lm75[] does not serve a purpose anymore after switching to devm_i2c_new_dummy_device() in w83791d_detect_subclients(). The patch fixes possible NULL pointer dereference by removing lm75[]. Found by Linux Driver Verification project (linuxtesting.org). Cc: stable@vger.kernel.org Signed-off-by: Nadezda Lutovinova Link: https://lore.kernel.org/r/20210921155153.28098-2-lutovinova@ispras.ru [groeck: Dropped unnecessary continuation lines, fixed multipline alignment] Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- drivers/hwmon/w83792d.c | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/drivers/hwmon/w83792d.c b/drivers/hwmon/w83792d.c index 7fc8a1160c8f9..f91db4be2fc79 100644 --- a/drivers/hwmon/w83792d.c +++ b/drivers/hwmon/w83792d.c @@ -264,9 +264,6 @@ struct w83792d_data { char valid; /* !=0 if following fields are valid */ unsigned long last_updated; /* In jiffies */ - /* array of 2 pointers to subclients */ - struct i2c_client *lm75[2]; - u8 in[9]; /* Register value */ u8 in_max[9]; /* Register value */ u8 in_min[9]; /* Register value */ @@ -928,7 +925,6 @@ w83792d_detect_subclients(struct i2c_client *new_client) int address = new_client->addr; u8 val; struct i2c_adapter *adapter = new_client->adapter; - struct w83792d_data *data = i2c_get_clientdata(new_client); id = i2c_adapter_id(adapter); if (force_subclients[0] == id && force_subclients[1] == address) { @@ -947,21 +943,19 @@ w83792d_detect_subclients(struct i2c_client *new_client) } val = w83792d_read_value(new_client, W83792D_REG_I2C_SUBADDR); - if (!(val & 0x08)) - data->lm75[0] = devm_i2c_new_dummy_device(&new_client->dev, adapter, - 0x48 + (val & 0x7)); - if (!(val & 0x80)) { - if (!IS_ERR(data->lm75[0]) && - ((val & 0x7) == ((val >> 4) & 0x7))) { - dev_err(&new_client->dev, - "duplicate addresses 0x%x, use force_subclient\n", - data->lm75[0]->addr); - return -ENODEV; - } - data->lm75[1] = devm_i2c_new_dummy_device(&new_client->dev, adapter, - 0x48 + ((val >> 4) & 0x7)); + + if (!(val & 0x88) && (val & 0x7) == ((val >> 4) & 0x7)) { + dev_err(&new_client->dev, + "duplicate addresses 0x%x, use force_subclient\n", 0x48 + (val & 0x7)); + return -ENODEV; } + if (!(val & 0x08)) + devm_i2c_new_dummy_device(&new_client->dev, adapter, 0x48 + (val & 0x7)); + + if (!(val & 0x80)) + devm_i2c_new_dummy_device(&new_client->dev, adapter, 0x48 + ((val >> 4) & 0x7)); + return 0; } From b4927c79de9ac07f1d4bc80d9f29b94cacef61e1 Mon Sep 17 00:00:00 2001 From: Nadezda Lutovinova Date: Tue, 21 Sep 2021 18:51:51 +0300 Subject: [PATCH 1013/5344] hwmon: (w83791d) Fix NULL pointer dereference by removing unnecessary structure field commit 943c15ac1b84d378da26bba41c83c67e16499ac4 upstream. If driver read val value sufficient for (val & 0x08) && (!(val & 0x80)) && ((val & 0x7) == ((val >> 4) & 0x7)) from device then Null pointer dereference occurs. (It is possible if tmp = 0b0xyz1xyz, where same literals mean same numbers) Also lm75[] does not serve a purpose anymore after switching to devm_i2c_new_dummy_device() in w83791d_detect_subclients(). The patch fixes possible NULL pointer dereference by removing lm75[]. Found by Linux Driver Verification project (linuxtesting.org). Cc: stable@vger.kernel.org Signed-off-by: Nadezda Lutovinova Link: https://lore.kernel.org/r/20210921155153.28098-1-lutovinova@ispras.ru [groeck: Dropped unnecessary continuation lines, fixed multi-line alignment] Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- drivers/hwmon/w83791d.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/drivers/hwmon/w83791d.c b/drivers/hwmon/w83791d.c index aad8d4da5802b..b586e917e6ede 100644 --- a/drivers/hwmon/w83791d.c +++ b/drivers/hwmon/w83791d.c @@ -273,9 +273,6 @@ struct w83791d_data { char valid; /* !=0 if following fields are valid */ unsigned long last_updated; /* In jiffies */ - /* array of 2 pointers to subclients */ - struct i2c_client *lm75[2]; - /* volts */ u8 in[NUMBER_OF_VIN]; /* Register value */ u8 in_max[NUMBER_OF_VIN]; /* Register value */ @@ -1258,7 +1255,6 @@ static const struct attribute_group w83791d_group_fanpwm45 = { static int w83791d_detect_subclients(struct i2c_client *client) { struct i2c_adapter *adapter = client->adapter; - struct w83791d_data *data = i2c_get_clientdata(client); int address = client->addr; int i, id; u8 val; @@ -1281,22 +1277,19 @@ static int w83791d_detect_subclients(struct i2c_client *client) } val = w83791d_read(client, W83791D_REG_I2C_SUBADDR); - if (!(val & 0x08)) - data->lm75[0] = devm_i2c_new_dummy_device(&client->dev, adapter, - 0x48 + (val & 0x7)); - if (!(val & 0x80)) { - if (!IS_ERR(data->lm75[0]) && - ((val & 0x7) == ((val >> 4) & 0x7))) { - dev_err(&client->dev, - "duplicate addresses 0x%x, " - "use force_subclient\n", - data->lm75[0]->addr); - return -ENODEV; - } - data->lm75[1] = devm_i2c_new_dummy_device(&client->dev, adapter, - 0x48 + ((val >> 4) & 0x7)); + + if (!(val & 0x88) && (val & 0x7) == ((val >> 4) & 0x7)) { + dev_err(&client->dev, + "duplicate addresses 0x%x, use force_subclient\n", 0x48 + (val & 0x7)); + return -ENODEV; } + if (!(val & 0x08)) + devm_i2c_new_dummy_device(&client->dev, adapter, 0x48 + (val & 0x7)); + + if (!(val & 0x80)) + devm_i2c_new_dummy_device(&client->dev, adapter, 0x48 + ((val >> 4) & 0x7)); + return 0; } From 83a2df2e55208053697cfe04acc1ff91d5b6ac12 Mon Sep 17 00:00:00 2001 From: Jonathan Hsu Date: Fri, 24 Sep 2021 16:58:48 +0800 Subject: [PATCH 1014/5344] scsi: ufs: Fix illegal offset in UPIU event trace commit e8c2da7e329ce004fee748b921e4c765dc2fa338 upstream. Fix incorrect index for UTMRD reference in ufshcd_add_tm_upiu_trace(). Link: https://lore.kernel.org/r/20210924085848.25500-1-jonathan.hsu@mediatek.com Fixes: 4b42d557a8ad ("scsi: ufs: core: Fix wrong Task Tag used in task management request UPIUs") Cc: stable@vger.kernel.org Reviewed-by: Stanley Chu Reviewed-by: Bart Van Assche Signed-off-by: Jonathan Hsu Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/ufs/ufshcd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 0429ba5d7d23c..24396f4d5f2d3 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -320,8 +320,7 @@ static void ufshcd_add_query_upiu_trace(struct ufs_hba *hba, unsigned int tag, static void ufshcd_add_tm_upiu_trace(struct ufs_hba *hba, unsigned int tag, const char *str) { - int off = (int)tag - hba->nutrs; - struct utp_task_req_desc *descp = &hba->utmrdl_base_addr[off]; + struct utp_task_req_desc *descp = &hba->utmrdl_base_addr[tag]; trace_ufshcd_upiu(dev_name(hba->dev), str, &descp->req_header, &descp->input_param1); From f31f4ef71b027673df7e2d49b8a169b60931d243 Mon Sep 17 00:00:00 2001 From: Zelin Deng Date: Wed, 29 Sep 2021 13:13:48 +0800 Subject: [PATCH 1015/5344] x86/kvmclock: Move this_cpu_pvti into kvmclock.h commit ad9af930680bb396c87582edc172b3a7cf2a3fbf upstream. There're other modules might use hv_clock_per_cpu variable like ptp_kvm, so move it into kvmclock.h and export the symbol to make it visiable to other modules. Signed-off-by: Zelin Deng Cc: Message-Id: <1632892429-101194-2-git-send-email-zelin.deng@linux.alibaba.com> Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/kvmclock.h | 14 ++++++++++++++ arch/x86/kernel/kvmclock.c | 13 ++----------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/kvmclock.h b/arch/x86/include/asm/kvmclock.h index eceea92990974..6c57651921028 100644 --- a/arch/x86/include/asm/kvmclock.h +++ b/arch/x86/include/asm/kvmclock.h @@ -2,6 +2,20 @@ #ifndef _ASM_X86_KVM_CLOCK_H #define _ASM_X86_KVM_CLOCK_H +#include + extern struct clocksource kvm_clock; +DECLARE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); + +static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void) +{ + return &this_cpu_read(hv_clock_per_cpu)->pvti; +} + +static inline struct pvclock_vsyscall_time_info *this_cpu_hvclock(void) +{ + return this_cpu_read(hv_clock_per_cpu); +} + #endif /* _ASM_X86_KVM_CLOCK_H */ diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 4a0802af2e3e0..d81e34e614e00 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -50,18 +50,9 @@ early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); static struct pvclock_vsyscall_time_info hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __bss_decrypted __aligned(PAGE_SIZE); static struct pvclock_wall_clock wall_clock __bss_decrypted; -static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); static struct pvclock_vsyscall_time_info *hvclock_mem; - -static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void) -{ - return &this_cpu_read(hv_clock_per_cpu)->pvti; -} - -static inline struct pvclock_vsyscall_time_info *this_cpu_hvclock(void) -{ - return this_cpu_read(hv_clock_per_cpu); -} +DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); +EXPORT_PER_CPU_SYMBOL_GPL(hv_clock_per_cpu); /* * The wallclock is the time of day when we booted. Since then, some time may From 859ba2afff8ca82309d2b89ac19d66835bda8f96 Mon Sep 17 00:00:00 2001 From: Charlene Liu Date: Mon, 20 Sep 2021 14:30:02 -0400 Subject: [PATCH 1016/5344] drm/amd/display: Pass PCI deviceid into DC commit d942856865c733ff60450de9691af796ad71d7bc upstream. [why] pci deviceid not passed to dal dc, without proper break, dcn2.x falls into dcn3.x code path [how] pass in pci deviceid, and break once dal_version initialized. Reviewed-by: Zhan Liu Acked-by: Anson Jacob Signed-off-by: Charlene Liu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 0dc60fe22aefc..8e4d863c7570b 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -664,6 +664,7 @@ static int amdgpu_dm_init(struct amdgpu_device *adev) init_data.asic_id.pci_revision_id = adev->rev_id; init_data.asic_id.hw_internal_rev = adev->external_rev_id; + init_data.asic_id.chip_id = adev->pdev->device; init_data.asic_id.vram_width = adev->gmc.vram_width; /* TODO: initialize init_data.asic_id.vram_type here!!!! */ From 41855d6d52d54c336b48a7424204d41c382f3bcc Mon Sep 17 00:00:00 2001 From: Andrea Claudi Date: Fri, 10 Sep 2021 18:08:39 +0200 Subject: [PATCH 1017/5344] ipvs: check that ip_vs_conn_tab_bits is between 8 and 20 [ Upstream commit 69e73dbfda14fbfe748d3812da1244cce2928dcb ] ip_vs_conn_tab_bits may be provided by the user through the conn_tab_bits module parameter. If this value is greater than 31, or less than 0, the shift operator used to derive tab_size causes undefined behaviour. Fix this checking ip_vs_conn_tab_bits value to be in the range specified in ipvs Kconfig. If not, simply use default value. Fixes: 6f7edb4881bf ("IPVS: Allow boot time change of hash size") Reported-by: Yi Chen Signed-off-by: Andrea Claudi Acked-by: Julian Anastasov Acked-by: Simon Horman Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/ipvs/ip_vs_conn.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index a90b8eac16ac2..2d1e228707065 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1456,6 +1456,10 @@ int __init ip_vs_conn_init(void) int idx; /* Compute size and mask */ + if (ip_vs_conn_tab_bits < 8 || ip_vs_conn_tab_bits > 20) { + pr_info("conn_tab_bits not in [8, 20]. Using default value\n"); + ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; + } ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; From 1927ea3c8514ef6caee3363a0155c770b61ff668 Mon Sep 17 00:00:00 2001 From: Vadim Pasternak Date: Thu, 16 Sep 2021 21:31:51 +0300 Subject: [PATCH 1018/5344] hwmon: (mlxreg-fan) Return non-zero value when fan current state is enforced from sysfs [ Upstream commit e6fab7af6ba1bc77c78713a83876f60ca7a4a064 ] Fan speed minimum can be enforced from sysfs. For example, setting current fan speed to 20 is used to enforce fan speed to be at 100% speed, 19 - to be not below 90% speed, etcetera. This feature provides ability to limit fan speed according to some system wise considerations, like absence of some replaceable units or high system ambient temperature. Request for changing fan minimum speed is configuration request and can be set only through 'sysfs' write procedure. In this situation value of argument 'state' is above nominal fan speed maximum. Return non-zero code in this case to avoid thermal_cooling_device_stats_update() call, because in this case statistics update violates thermal statistics table range. The issues is observed in case kernel is configured with option CONFIG_THERMAL_STATISTICS. Here is the trace from KASAN: [ 159.506659] BUG: KASAN: slab-out-of-bounds in thermal_cooling_device_stats_update+0x7d/0xb0 [ 159.516016] Read of size 4 at addr ffff888116163840 by task hw-management.s/7444 [ 159.545625] Call Trace: [ 159.548366] dump_stack+0x92/0xc1 [ 159.552084] ? thermal_cooling_device_stats_update+0x7d/0xb0 [ 159.635869] thermal_zone_device_update+0x345/0x780 [ 159.688711] thermal_zone_device_set_mode+0x7d/0xc0 [ 159.694174] mlxsw_thermal_modules_init+0x48f/0x590 [mlxsw_core] [ 159.700972] ? mlxsw_thermal_set_cur_state+0x5a0/0x5a0 [mlxsw_core] [ 159.731827] mlxsw_thermal_init+0x763/0x880 [mlxsw_core] [ 160.070233] RIP: 0033:0x7fd995909970 [ 160.074239] Code: 73 01 c3 48 8b 0d 28 d5 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d 99 2d 2c 00 00 75 10 b8 01 00 00 00 0f 05 <48> 3d 01 f0 ff .. [ 160.095242] RSP: 002b:00007fff54f5d938 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 160.103722] RAX: ffffffffffffffda RBX: 0000000000000013 RCX: 00007fd995909970 [ 160.111710] RDX: 0000000000000013 RSI: 0000000001906008 RDI: 0000000000000001 [ 160.119699] RBP: 0000000001906008 R08: 00007fd995bc9760 R09: 00007fd996210700 [ 160.127687] R10: 0000000000000073 R11: 0000000000000246 R12: 0000000000000013 [ 160.135673] R13: 0000000000000001 R14: 00007fd995bc8600 R15: 0000000000000013 [ 160.143671] [ 160.145338] Allocated by task 2924: [ 160.149242] kasan_save_stack+0x19/0x40 [ 160.153541] __kasan_kmalloc+0x7f/0xa0 [ 160.157743] __kmalloc+0x1a2/0x2b0 [ 160.161552] thermal_cooling_device_setup_sysfs+0xf9/0x1a0 [ 160.167687] __thermal_cooling_device_register+0x1b5/0x500 [ 160.173833] devm_thermal_of_cooling_device_register+0x60/0xa0 [ 160.180356] mlxreg_fan_probe+0x474/0x5e0 [mlxreg_fan] [ 160.248140] [ 160.249807] The buggy address belongs to the object at ffff888116163400 [ 160.249807] which belongs to the cache kmalloc-1k of size 1024 [ 160.263814] The buggy address is located 64 bytes to the right of [ 160.263814] 1024-byte region [ffff888116163400, ffff888116163800) [ 160.277536] The buggy address belongs to the page: [ 160.282898] page:0000000012275840 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff888116167000 pfn:0x116160 [ 160.294872] head:0000000012275840 order:3 compound_mapcount:0 compound_pincount:0 [ 160.303251] flags: 0x200000000010200(slab|head|node=0|zone=2) [ 160.309694] raw: 0200000000010200 ffffea00046f7208 ffffea0004928208 ffff88810004dbc0 [ 160.318367] raw: ffff888116167000 00000000000a0006 00000001ffffffff 0000000000000000 [ 160.327033] page dumped because: kasan: bad access detected [ 160.333270] [ 160.334937] Memory state around the buggy address: [ 160.356469] >ffff888116163800: fc .. Fixes: 65afb4c8e7e4 ("hwmon: (mlxreg-fan) Add support for Mellanox FAN driver") Signed-off-by: Vadim Pasternak Link: https://lore.kernel.org/r/20210916183151.869427-1-vadimp@nvidia.com Signed-off-by: Guenter Roeck Signed-off-by: Sasha Levin --- drivers/hwmon/mlxreg-fan.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/hwmon/mlxreg-fan.c b/drivers/hwmon/mlxreg-fan.c index ed8d59d4eecb3..bd8f5a3aaad9c 100644 --- a/drivers/hwmon/mlxreg-fan.c +++ b/drivers/hwmon/mlxreg-fan.c @@ -291,8 +291,8 @@ static int mlxreg_fan_set_cur_state(struct thermal_cooling_device *cdev, { struct mlxreg_fan *fan = cdev->devdata; unsigned long cur_state; + int i, config = 0; u32 regval; - int i; int err; /* @@ -305,6 +305,12 @@ static int mlxreg_fan_set_cur_state(struct thermal_cooling_device *cdev, * overwritten. */ if (state >= MLXREG_FAN_SPEED_MIN && state <= MLXREG_FAN_SPEED_MAX) { + /* + * This is configuration change, which is only supported through sysfs. + * For configuration non-zero value is to be returned to avoid thermal + * statistics update. + */ + config = 1; state -= MLXREG_FAN_MAX_STATE; for (i = 0; i < state; i++) fan->cooling_levels[i] = state; @@ -319,7 +325,7 @@ static int mlxreg_fan_set_cur_state(struct thermal_cooling_device *cdev, cur_state = MLXREG_FAN_PWM_DUTY2STATE(regval); if (state < cur_state) - return 0; + return config; state = cur_state; } @@ -335,7 +341,7 @@ static int mlxreg_fan_set_cur_state(struct thermal_cooling_device *cdev, dev_err(fan->dev, "Failed to write PWM duty\n"); return err; } - return 0; + return config; } static const struct thermal_cooling_device_ops mlxreg_fan_cooling_ops = { From cf8a0a77d2be946d0c329b12f19cfe7dcc25c2ae Mon Sep 17 00:00:00 2001 From: Chih-Kang Chang Date: Mon, 30 Aug 2021 15:32:40 +0800 Subject: [PATCH 1019/5344] mac80211: Fix ieee80211_amsdu_aggregate frag_tail bug [ Upstream commit fe94bac626d9c1c5bc98ab32707be8a9d7f8adba ] In ieee80211_amsdu_aggregate() set a pointer frag_tail point to the end of skb_shinfo(head)->frag_list, and use it to bind other skb in the end of this function. But when execute ieee80211_amsdu_aggregate() ->ieee80211_amsdu_realloc_pad()->pskb_expand_head(), the address of skb_shinfo(head)->frag_list will be changed. However, the ieee80211_amsdu_aggregate() not update frag_tail after call pskb_expand_head(). That will cause the second skb can't bind to the head skb appropriately.So we update the address of frag_tail to fix it. Fixes: 6e0456b54545 ("mac80211: add A-MSDU tx support") Signed-off-by: Chih-Kang Chang Signed-off-by: Zong-Zhe Yang Signed-off-by: Ping-Ke Shih Link: https://lore.kernel.org/r/20210830073240.12736-1-pkshih@realtek.com [reword comment] Signed-off-by: Johannes Berg Signed-off-by: Sasha Levin --- net/mac80211/tx.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 4dfac7a25e5ad..eb87ed0146d12 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -3325,6 +3325,14 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, if (!ieee80211_amsdu_prepare_head(sdata, fast_tx, head)) goto out; + /* If n == 2, the "while (*frag_tail)" loop above didn't execute + * and frag_tail should be &skb_shinfo(head)->frag_list. + * However, ieee80211_amsdu_prepare_head() can reallocate it. + * Reload frag_tail to have it pointing to the correct place. + */ + if (n == 2) + frag_tail = &skb_shinfo(head)->frag_list; + /* * Pad out the previous subframe to a multiple of 4 by adding the * padding to the next one, that's being added. Note that head->len From a8b156e71c119e026be402ee4f61de4aaf9ad937 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 20 Sep 2021 14:45:22 +0200 Subject: [PATCH 1020/5344] mac80211: limit injected vht mcs/nss in ieee80211_parse_tx_radiotap [ Upstream commit 13cb6d826e0ac0d144b0d48191ff1a111d32f0c6 ] Limit max values for vht mcs and nss in ieee80211_parse_tx_radiotap routine in order to fix the following warning reported by syzbot: WARNING: CPU: 0 PID: 10717 at include/net/mac80211.h:989 ieee80211_rate_set_vht include/net/mac80211.h:989 [inline] WARNING: CPU: 0 PID: 10717 at include/net/mac80211.h:989 ieee80211_parse_tx_radiotap+0x101e/0x12d0 net/mac80211/tx.c:2244 Modules linked in: CPU: 0 PID: 10717 Comm: syz-executor.5 Not tainted 5.14.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:ieee80211_rate_set_vht include/net/mac80211.h:989 [inline] RIP: 0010:ieee80211_parse_tx_radiotap+0x101e/0x12d0 net/mac80211/tx.c:2244 RSP: 0018:ffffc9000186f3e8 EFLAGS: 00010216 RAX: 0000000000000618 RBX: ffff88804ef76500 RCX: ffffc900143a5000 RDX: 0000000000040000 RSI: ffffffff888f478e RDI: 0000000000000003 RBP: 00000000ffffffff R08: 0000000000000000 R09: 0000000000000100 R10: ffffffff888f46f9 R11: 0000000000000000 R12: 00000000fffffff8 R13: ffff88804ef7653c R14: 0000000000000001 R15: 0000000000000004 FS: 00007fbf5718f700(0000) GS:ffff8880b9c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b2de23000 CR3: 000000006a671000 CR4: 00000000001506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000600 Call Trace: ieee80211_monitor_select_queue+0xa6/0x250 net/mac80211/iface.c:740 netdev_core_pick_tx+0x169/0x2e0 net/core/dev.c:4089 __dev_queue_xmit+0x6f9/0x3710 net/core/dev.c:4165 __bpf_tx_skb net/core/filter.c:2114 [inline] __bpf_redirect_no_mac net/core/filter.c:2139 [inline] __bpf_redirect+0x5ba/0xd20 net/core/filter.c:2162 ____bpf_clone_redirect net/core/filter.c:2429 [inline] bpf_clone_redirect+0x2ae/0x420 net/core/filter.c:2401 bpf_prog_eeb6f53a69e5c6a2+0x59/0x234 bpf_dispatcher_nop_func include/linux/bpf.h:717 [inline] __bpf_prog_run include/linux/filter.h:624 [inline] bpf_prog_run include/linux/filter.h:631 [inline] bpf_test_run+0x381/0xa30 net/bpf/test_run.c:119 bpf_prog_test_run_skb+0xb84/0x1ee0 net/bpf/test_run.c:663 bpf_prog_test_run kernel/bpf/syscall.c:3307 [inline] __sys_bpf+0x2137/0x5df0 kernel/bpf/syscall.c:4605 __do_sys_bpf kernel/bpf/syscall.c:4691 [inline] __se_sys_bpf kernel/bpf/syscall.c:4689 [inline] __x64_sys_bpf+0x75/0xb0 kernel/bpf/syscall.c:4689 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x4665f9 Reported-by: syzbot+0196ac871673f0c20f68@syzkaller.appspotmail.com Fixes: 646e76bb5daf4 ("mac80211: parse VHT info in injected frames") Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/c26c3f02dcb38ab63b2f2534cb463d95ee81bb13.1632141760.git.lorenzo@kernel.org Signed-off-by: Johannes Berg Signed-off-by: Sasha Levin --- net/mac80211/tx.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index eb87ed0146d12..d82d22b6a2a94 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2156,7 +2156,11 @@ static bool ieee80211_parse_tx_radiotap(struct ieee80211_local *local, } vht_mcs = iterator.this_arg[4] >> 4; + if (vht_mcs > 11) + vht_mcs = 0; vht_nss = iterator.this_arg[4] & 0xF; + if (!vht_nss || vht_nss > 8) + vht_nss = 1; break; /* From 5fcd254a9e10f488adf5e116c55fa7d62d0c6535 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 20 Sep 2021 15:40:05 +0200 Subject: [PATCH 1021/5344] mac80211: mesh: fix potentially unaligned access [ Upstream commit b9731062ce8afd35cf723bf3a8ad55d208f915a5 ] The pointer here points directly into the frame, so the access is potentially unaligned. Use get_unaligned_le16 to avoid that. Fixes: 3f52b7e328c5 ("mac80211: mesh power save basics") Link: https://lore.kernel.org/r/20210920154009.3110ff75be0c.Ib6a2ff9e9cc9bc6fca50fce631ec1ce725cc926b@changeid Signed-off-by: Johannes Berg Signed-off-by: Sasha Levin --- net/mac80211/mesh_ps.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/mac80211/mesh_ps.c b/net/mac80211/mesh_ps.c index 031e905f684a2..bf83f512f748b 100644 --- a/net/mac80211/mesh_ps.c +++ b/net/mac80211/mesh_ps.c @@ -2,6 +2,7 @@ /* * Copyright 2012-2013, Marco Porsch * Copyright 2012-2013, cozybit Inc. + * Copyright (C) 2021 Intel Corporation */ #include "mesh.h" @@ -584,7 +585,7 @@ void ieee80211_mps_frame_release(struct sta_info *sta, /* only transmit to PS STA with announced, non-zero awake window */ if (test_sta_flag(sta, WLAN_STA_PS_STA) && - (!elems->awake_window || !le16_to_cpu(*elems->awake_window))) + (!elems->awake_window || !get_unaligned_le16(elems->awake_window))) return; if (!test_sta_flag(sta, WLAN_STA_MPSP_OWNER)) From 3e5c850d830b4c37e02684f5f81ff8e7ce3cdc7d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 15 Sep 2021 11:29:37 +0200 Subject: [PATCH 1022/5344] mac80211-hwsim: fix late beacon hrtimer handling [ Upstream commit 313bbd1990b6ddfdaa7da098d0c56b098a833572 ] Thomas explained in https://lore.kernel.org/r/87mtoeb4hb.ffs@tglx that our handling of the hrtimer here is wrong: If the timer fires late (e.g. due to vCPU scheduling, as reported by Dmitry/syzbot) then it tries to actually rearm the timer at the next deadline, which might be in the past already: 1 2 3 N N+1 | | | ... | | ^ intended to fire here (1) ^ next deadline here (2) ^ actually fired here The next time it fires, it's later, but will still try to schedule for the next deadline (now 3), etc. until it catches up with N, but that might take a long time, causing stalls etc. Now, all of this is simulation, so we just have to fix it, but note that the behaviour is wrong even per spec, since there's no value then in sending all those beacons unaligned - they should be aligned to the TBTT (1, 2, 3, ... in the picture), and if we're a bit (or a lot) late, then just resume at that point. Therefore, change the code to use hrtimer_forward_now() which will ensure that the next firing of the timer would be at N+1 (in the picture), i.e. the next interval point after the current time. Suggested-by: Thomas Gleixner Reported-by: Dmitry Vyukov Reported-by: syzbot+0e964fad69a9c462bc1e@syzkaller.appspotmail.com Fixes: 01e59e467ecf ("mac80211_hwsim: hrtimer beacon") Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210915112936.544f383472eb.I3f9712009027aa09244b65399bf18bf482a8c4f1@changeid Signed-off-by: Johannes Berg Signed-off-by: Sasha Levin --- drivers/net/wireless/mac80211_hwsim.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 1033513d3d9de..07b070b14d75d 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -1603,8 +1603,8 @@ mac80211_hwsim_beacon(struct hrtimer *timer) bcn_int -= data->bcn_delta; data->bcn_delta = 0; } - hrtimer_forward(&data->beacon_timer, hrtimer_get_expires(timer), - ns_to_ktime(bcn_int * NSEC_PER_USEC)); + hrtimer_forward_now(&data->beacon_timer, + ns_to_ktime(bcn_int * NSEC_PER_USEC)); return HRTIMER_RESTART; } From 9843d14a8f985eb6a7c39545a563a3533ec959b0 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 23 Sep 2021 00:05:04 -0400 Subject: [PATCH 1023/5344] sctp: break out if skb_header_pointer returns NULL in sctp_rcv_ootb [ Upstream commit f7e745f8e94492a8ac0b0a26e25f2b19d342918f ] We should always check if skb_header_pointer's return is NULL before using it, otherwise it may cause null-ptr-deref, as syzbot reported: KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] RIP: 0010:sctp_rcv_ootb net/sctp/input.c:705 [inline] RIP: 0010:sctp_rcv+0x1d84/0x3220 net/sctp/input.c:196 Call Trace: sctp6_rcv+0x38/0x60 net/sctp/ipv6.c:1109 ip6_protocol_deliver_rcu+0x2e9/0x1ca0 net/ipv6/ip6_input.c:422 ip6_input_finish+0x62/0x170 net/ipv6/ip6_input.c:463 NF_HOOK include/linux/netfilter.h:307 [inline] NF_HOOK include/linux/netfilter.h:301 [inline] ip6_input+0x9c/0xd0 net/ipv6/ip6_input.c:472 dst_input include/net/dst.h:460 [inline] ip6_rcv_finish net/ipv6/ip6_input.c:76 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] NF_HOOK include/linux/netfilter.h:301 [inline] ipv6_rcv+0x28c/0x3c0 net/ipv6/ip6_input.c:297 Fixes: 3acb50c18d8d ("sctp: delay as much as possible skb_linearize") Reported-by: syzbot+581aff2ae6b860625116@syzkaller.appspotmail.com Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/sctp/input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sctp/input.c b/net/sctp/input.c index 2aca37717ed1e..9616b600a8766 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -676,7 +676,7 @@ static int sctp_rcv_ootb(struct sk_buff *skb) ch = skb_header_pointer(skb, offset, sizeof(*ch), &_ch); /* Break out if chunk length is less then minimal. */ - if (ntohs(ch->length) < sizeof(_ch)) + if (!ch || ntohs(ch->length) < sizeof(_ch)) break; ch_end = offset + SCTP_PAD4(ntohs(ch->length)); From dee68e9e1eaafd17f1aababb4bb9ca17c0012821 Mon Sep 17 00:00:00 2001 From: Paul Fertser Date: Fri, 24 Sep 2021 12:30:10 +0300 Subject: [PATCH 1024/5344] hwmon: (tmp421) report /PVLD condition as fault [ Upstream commit 540effa7f283d25bcc13c0940d808002fee340b8 ] For both local and remote sensors all the supported ICs can report an "undervoltage lockout" condition which means the conversion wasn't properly performed due to insufficient power supply voltage and so the measurement results can't be trusted. Fixes: 9410700b881f ("hwmon: Add driver for Texas Instruments TMP421/422/423 sensor chips") Signed-off-by: Paul Fertser Link: https://lore.kernel.org/r/20210924093011.26083-2-fercerpav@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Sasha Levin --- drivers/hwmon/tmp421.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/hwmon/tmp421.c b/drivers/hwmon/tmp421.c index a94e35cff3e5f..e245ae272f7de 100644 --- a/drivers/hwmon/tmp421.c +++ b/drivers/hwmon/tmp421.c @@ -160,10 +160,10 @@ static int tmp421_read(struct device *dev, enum hwmon_sensor_types type, return 0; case hwmon_temp_fault: /* - * The OPEN bit signals a fault. This is bit 0 of the temperature - * register (low byte). + * Any of OPEN or /PVLD bits indicate a hardware mulfunction + * and the conversion result may be incorrect */ - *val = tmp421->temp[channel] & 0x01; + *val = !!(tmp421->temp[channel] & 0x03); return 0; default: return -EOPNOTSUPP; @@ -176,9 +176,6 @@ static umode_t tmp421_is_visible(const void *data, enum hwmon_sensor_types type, { switch (attr) { case hwmon_temp_fault: - if (channel == 0) - return 0; - return 0444; case hwmon_temp_input: return 0444; default: From 5dbca8fa8cb0a3c45a76aefeb04b59217f190d4d Mon Sep 17 00:00:00 2001 From: Paul Fertser Date: Fri, 24 Sep 2021 12:30:11 +0300 Subject: [PATCH 1025/5344] hwmon: (tmp421) fix rounding for negative values [ Upstream commit 724e8af85854c4d3401313b6dd7d79cf792d8990 ] Old code produces -24999 for 0b1110011100000000 input in standard format due to always rounding up rather than "away from zero". Use the common macro for division, unify and simplify the conversion code along the way. Fixes: 9410700b881f ("hwmon: Add driver for Texas Instruments TMP421/422/423 sensor chips") Signed-off-by: Paul Fertser Link: https://lore.kernel.org/r/20210924093011.26083-3-fercerpav@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Sasha Levin --- drivers/hwmon/tmp421.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/drivers/hwmon/tmp421.c b/drivers/hwmon/tmp421.c index e245ae272f7de..876ccf77a825a 100644 --- a/drivers/hwmon/tmp421.c +++ b/drivers/hwmon/tmp421.c @@ -100,23 +100,17 @@ struct tmp421_data { s16 temp[4]; }; -static int temp_from_s16(s16 reg) +static int temp_from_raw(u16 reg, bool extended) { /* Mask out status bits */ int temp = reg & ~0xf; - return (temp * 1000 + 128) / 256; -} - -static int temp_from_u16(u16 reg) -{ - /* Mask out status bits */ - int temp = reg & ~0xf; - - /* Add offset for extended temperature range. */ - temp -= 64 * 256; + if (extended) + temp = temp - 64 * 256; + else + temp = (s16)temp; - return (temp * 1000 + 128) / 256; + return DIV_ROUND_CLOSEST(temp * 1000, 256); } static struct tmp421_data *tmp421_update_device(struct device *dev) @@ -153,10 +147,8 @@ static int tmp421_read(struct device *dev, enum hwmon_sensor_types type, switch (attr) { case hwmon_temp_input: - if (tmp421->config & TMP421_CONFIG_RANGE) - *val = temp_from_u16(tmp421->temp[channel]); - else - *val = temp_from_s16(tmp421->temp[channel]); + *val = temp_from_raw(tmp421->temp[channel], + tmp421->config & TMP421_CONFIG_RANGE); return 0; case hwmon_temp_fault: /* From 296e6156b4c74c278be21591b1612bac12a68092 Mon Sep 17 00:00:00 2001 From: Xiao Liang Date: Thu, 23 Sep 2021 23:03:19 +0800 Subject: [PATCH 1026/5344] net: ipv4: Fix rtnexthop len when RTA_FLOW is present [ Upstream commit 597aa16c782496bf74c5dc3b45ff472ade6cee64 ] Multipath RTA_FLOW is embedded in nexthop. Dump it in fib_add_nexthop() to get the length of rtnexthop correct. Fixes: b0f60193632e ("ipv4: Refactor nexthop attributes in fib_dump_info") Signed-off-by: Xiao Liang Reviewed-by: David Ahern Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/net/ip_fib.h | 2 +- include/net/nexthop.h | 2 +- net/ipv4/fib_semantics.c | 16 +++++++++------- net/ipv6/route.c | 5 +++-- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index ffbae7683450a..cb6c125628990 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -524,5 +524,5 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nh, u8 rt_family, unsigned char *flags, bool skip_oif); int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nh, - int nh_weight, u8 rt_family); + int nh_weight, u8 rt_family, u32 nh_tclassid); #endif /* _NET_FIB_H */ diff --git a/include/net/nexthop.h b/include/net/nexthop.h index 18a5aca264767..5ad614793af26 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -173,7 +173,7 @@ int nexthop_mpath_fill_node(struct sk_buff *skb, struct nexthop *nh, struct fib_nh_common *nhc = &nhi->fib_nhc; int weight = nhg->nh_entries[i].weight; - if (fib_add_nexthop(skb, nhc, weight, rt_family) < 0) + if (fib_add_nexthop(skb, nhc, weight, rt_family, 0) < 0) return -EMSGSIZE; } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index b1b3220917ca0..dce85a9c20c60 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1654,7 +1654,7 @@ EXPORT_SYMBOL_GPL(fib_nexthop_info); #if IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) || IS_ENABLED(CONFIG_IPV6) int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc, - int nh_weight, u8 rt_family) + int nh_weight, u8 rt_family, u32 nh_tclassid) { const struct net_device *dev = nhc->nhc_dev; struct rtnexthop *rtnh; @@ -1672,6 +1672,9 @@ int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc, rtnh->rtnh_flags = flags; + if (nh_tclassid && nla_put_u32(skb, RTA_FLOW, nh_tclassid)) + goto nla_put_failure; + /* length of rtnetlink header + attributes */ rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; @@ -1699,14 +1702,13 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) } for_nexthops(fi) { - if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight, - AF_INET) < 0) - goto nla_put_failure; + u32 nh_tclassid = 0; #ifdef CONFIG_IP_ROUTE_CLASSID - if (nh->nh_tclassid && - nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) - goto nla_put_failure; + nh_tclassid = nh->nh_tclassid; #endif + if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight, + AF_INET, nh_tclassid) < 0) + goto nla_put_failure; } endfor_nexthops(fi); mp_end: diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 575bd0f1b0089..3fb259c20546e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5523,14 +5523,15 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, goto nla_put_failure; if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common, - rt->fib6_nh->fib_nh_weight, AF_INET6) < 0) + rt->fib6_nh->fib_nh_weight, AF_INET6, + 0) < 0) goto nla_put_failure; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) { if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common, sibling->fib6_nh->fib_nh_weight, - AF_INET6) < 0) + AF_INET6, 0) < 0) goto nla_put_failure; } From 4c58d77ec71f30f6137a64c680817f392d8085ab Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Wed, 8 Sep 2021 10:52:36 -0700 Subject: [PATCH 1027/5344] e100: fix length calculation in e100_get_regs_len [ Upstream commit 4329c8dc110b25d5f04ed20c6821bb60deff279f ] commit abf9b902059f ("e100: cleanup unneeded math") tried to simplify e100_get_regs_len and remove a double 'divide and then multiply' calculation that the e100_reg_regs_len function did. This change broke the size calculation entirely as it failed to account for the fact that the numbered registers are actually 4 bytes wide and not 1 byte. This resulted in a significant under allocation of the register buffer used by e100_get_regs. Fix this by properly multiplying the register count by u32 first before adding the size of the dump buffer. Fixes: abf9b902059f ("e100: cleanup unneeded math") Reported-by: Felicitas Hetzelt Signed-off-by: Jacob Keller Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/e100.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/e100.c b/drivers/net/ethernet/intel/e100.c index 911b3d2a94e1c..ea0f97d769648 100644 --- a/drivers/net/ethernet/intel/e100.c +++ b/drivers/net/ethernet/intel/e100.c @@ -2439,7 +2439,11 @@ static void e100_get_drvinfo(struct net_device *netdev, static int e100_get_regs_len(struct net_device *netdev) { struct nic *nic = netdev_priv(netdev); - return 1 + E100_PHY_REGS + sizeof(nic->mem->dump_buf); + + /* We know the number of registers, and the size of the dump buffer. + * Calculate the total size in bytes. + */ + return (1 + E100_PHY_REGS) * sizeof(u32) + sizeof(nic->mem->dump_buf); } static void e100_get_regs(struct net_device *netdev, From e462ca0b78b038f52fd47ee55756cc44732fe8c0 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Wed, 8 Sep 2021 10:52:37 -0700 Subject: [PATCH 1028/5344] e100: fix buffer overrun in e100_get_regs [ Upstream commit 51032e6f17ce990d06123ad7307f258c50d25aa7 ] The e100_get_regs function is used to implement a simple register dump for the e100 device. The data is broken into a couple of MAC control registers, and then a series of PHY registers, followed by a memory dump buffer. The total length of the register dump is defined as (1 + E100_PHY_REGS) * sizeof(u32) + sizeof(nic->mem->dump_buf). The logic for filling in the PHY registers uses a convoluted inverted count for loop which counts from E100_PHY_REGS (0x1C) down to 0, and assigns the slots 1 + E100_PHY_REGS - i. The first loop iteration will fill in [1] and the final loop iteration will fill in [1 + 0x1C]. This is actually one more than the supposed number of PHY registers. The memory dump buffer is then filled into the space at [2 + E100_PHY_REGS] which will cause that memcpy to assign 4 bytes past the total size. The end result is that we overrun the total buffer size allocated by the kernel, which could lead to a panic or other issues due to memory corruption. It is difficult to determine the actual total number of registers here. The only 8255x datasheet I could find indicates there are 28 total MDI registers. However, we're reading 29 here, and reading them in reverse! In addition, the ethtool e100 register dump interface appears to read the first PHY register to determine if the device is in MDI or MDIx mode. This doesn't appear to be documented anywhere within the 8255x datasheet. I can only assume it must be in register 28 (the extra register we're reading here). Lets not change any of the intended meaning of what we copy here. Just extend the space by 4 bytes to account for the extra register and continue copying the data out in the same order. Change the E100_PHY_REGS value to be the correct total (29) so that the total register dump size is calculated properly. Fix the offset for where we copy the dump buffer so that it doesn't overrun the total size. Re-write the for loop to use counting up instead of the convoluted down-counting. Correct the mdio_read offset to use the 0-based register offsets, but maintain the bizarre reverse ordering so that we have the ABI expected by applications like ethtool. This requires and additional subtraction of 1. It seems a bit odd but it makes the flow of assignment into the register buffer easier to follow. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Felicitas Hetzelt Signed-off-by: Jacob Keller Tested-by: Jacob Keller Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/e100.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/e100.c b/drivers/net/ethernet/intel/e100.c index ea0f97d769648..70962967d7141 100644 --- a/drivers/net/ethernet/intel/e100.c +++ b/drivers/net/ethernet/intel/e100.c @@ -2435,7 +2435,7 @@ static void e100_get_drvinfo(struct net_device *netdev, sizeof(info->bus_info)); } -#define E100_PHY_REGS 0x1C +#define E100_PHY_REGS 0x1D static int e100_get_regs_len(struct net_device *netdev) { struct nic *nic = netdev_priv(netdev); @@ -2457,14 +2457,18 @@ static void e100_get_regs(struct net_device *netdev, buff[0] = ioread8(&nic->csr->scb.cmd_hi) << 24 | ioread8(&nic->csr->scb.cmd_lo) << 16 | ioread16(&nic->csr->scb.status); - for (i = E100_PHY_REGS; i >= 0; i--) - buff[1 + E100_PHY_REGS - i] = - mdio_read(netdev, nic->mii.phy_id, i); + for (i = 0; i < E100_PHY_REGS; i++) + /* Note that we read the registers in reverse order. This + * ordering is the ABI apparently used by ethtool and other + * applications. + */ + buff[1 + i] = mdio_read(netdev, nic->mii.phy_id, + E100_PHY_REGS - 1 - i); memset(nic->mem->dump_buf, 0, sizeof(nic->mem->dump_buf)); e100_exec_cb(nic, NULL, e100_dump); msleep(10); - memcpy(&buff[2 + E100_PHY_REGS], nic->mem->dump_buf, - sizeof(nic->mem->dump_buf)); + memcpy(&buff[1 + E100_PHY_REGS], nic->mem->dump_buf, + sizeof(nic->mem->dump_buf)); } static void e100_get_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) From 886cd481d36972d6a971530aaf34dfc6e53b5586 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 23 Sep 2021 10:40:22 +0200 Subject: [PATCH 1029/5344] selftests, bpf: test_lwt_ip_encap: Really disable rp_filter [ Upstream commit 79e2c306667542b8ee2d9a9d947eadc7039f0a3c ] It's not enough to set net.ipv4.conf.all.rp_filter=0, that does not override a greater rp_filter value on the individual interfaces. We also need to set net.ipv4.conf.default.rp_filter=0 before creating the interfaces. That way, they'll also get their own rp_filter value of zero. Fixes: 0fde56e4385b0 ("selftests: bpf: add test_lwt_ip_encap selftest") Signed-off-by: Jiri Benc Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/b1cdd9d469f09ea6e01e9c89a6071c79b7380f89.1632386362.git.jbenc@redhat.com Signed-off-by: Sasha Levin --- tools/testing/selftests/bpf/test_lwt_ip_encap.sh | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/test_lwt_ip_encap.sh b/tools/testing/selftests/bpf/test_lwt_ip_encap.sh index 59ea56945e6cd..b497bb85b667f 100755 --- a/tools/testing/selftests/bpf/test_lwt_ip_encap.sh +++ b/tools/testing/selftests/bpf/test_lwt_ip_encap.sh @@ -112,6 +112,14 @@ setup() ip netns add "${NS2}" ip netns add "${NS3}" + # rp_filter gets confused by what these tests are doing, so disable it + ip netns exec ${NS1} sysctl -wq net.ipv4.conf.all.rp_filter=0 + ip netns exec ${NS2} sysctl -wq net.ipv4.conf.all.rp_filter=0 + ip netns exec ${NS3} sysctl -wq net.ipv4.conf.all.rp_filter=0 + ip netns exec ${NS1} sysctl -wq net.ipv4.conf.default.rp_filter=0 + ip netns exec ${NS2} sysctl -wq net.ipv4.conf.default.rp_filter=0 + ip netns exec ${NS3} sysctl -wq net.ipv4.conf.default.rp_filter=0 + ip link add veth1 type veth peer name veth2 ip link add veth3 type veth peer name veth4 ip link add veth5 type veth peer name veth6 @@ -236,11 +244,6 @@ setup() ip -netns ${NS1} -6 route add ${IPv6_GRE}/128 dev veth5 via ${IPv6_6} ${VRF} ip -netns ${NS2} -6 route add ${IPv6_GRE}/128 dev veth7 via ${IPv6_8} ${VRF} - # rp_filter gets confused by what these tests are doing, so disable it - ip netns exec ${NS1} sysctl -wq net.ipv4.conf.all.rp_filter=0 - ip netns exec ${NS2} sysctl -wq net.ipv4.conf.all.rp_filter=0 - ip netns exec ${NS3} sysctl -wq net.ipv4.conf.all.rp_filter=0 - TMPFILE=$(mktemp /tmp/test_lwt_ip_encap.XXXXXX) sleep 1 # reduce flakiness From 229ab61a5c0bc83d51900744b6ac71b3ef2c517c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 28 Sep 2021 06:33:15 -0600 Subject: [PATCH 1030/5344] Revert "block, bfq: honor already-setup queue merges" [ Upstream commit ebc69e897e17373fbe1daaff1debaa77583a5284 ] This reverts commit 2d52c58b9c9bdae0ca3df6a1eab5745ab3f7d80b. We have had several folks complain that this causes hangs for them, which is especially problematic as the commit has also hit stable already. As no resolution seems to be forthcoming right now, revert the patch. Link: https://bugzilla.kernel.org/show_bug.cgi?id=214503 Fixes: 2d52c58b9c9b ("block, bfq: honor already-setup queue merges") Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- block/bfq-iosched.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 2acc814f9ad21..c5da1955b1c94 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2526,15 +2526,6 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) * are likely to increase the throughput. */ bfqq->new_bfqq = new_bfqq; - /* - * The above assignment schedules the following redirections: - * each time some I/O for bfqq arrives, the process that - * generated that I/O is disassociated from bfqq and - * associated with new_bfqq. Here we increases new_bfqq->ref - * in advance, adding the number of processes that are - * expected to be associated with new_bfqq as they happen to - * issue I/O. - */ new_bfqq->ref += process_refs; return new_bfqq; } @@ -2594,10 +2585,6 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, { struct bfq_queue *in_service_bfqq, *new_bfqq; - /* if a merge has already been setup, then proceed with that first */ - if (bfqq->new_bfqq) - return bfqq->new_bfqq; - /* * Do not perform queue merging if the device is non * rotational and performs internal queueing. In fact, such a @@ -2652,6 +2639,9 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (bfq_too_late_for_merging(bfqq)) return NULL; + if (bfqq->new_bfqq) + return bfqq->new_bfqq; + if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) return NULL; From 011c3cb5bb3bcc76cba25ee6e6d834f65262c67b Mon Sep 17 00:00:00 2001 From: Rahul Lakkireddy Date: Mon, 27 Sep 2021 21:44:08 +0530 Subject: [PATCH 1031/5344] scsi: csiostor: Add module softdep on cxgb4 [ Upstream commit 79a7482249a7353bc86aff8127954d5febf02472 ] Both cxgb4 and csiostor drivers run on their own independent Physical Function. But when cxgb4 and csiostor are both being loaded in parallel via modprobe, there is a race when firmware upgrade is attempted by both the drivers. When the cxgb4 driver initiates the firmware upgrade, it halts the firmware and the chip until upgrade is complete. When the csiostor driver is coming up in parallel, the firmware mailbox communication fails with timeouts and the csiostor driver probe fails. Add a module soft dependency on cxgb4 driver to ensure loading csiostor triggers cxgb4 to load first when available to avoid the firmware upgrade race. Link: https://lore.kernel.org/r/1632759248-15382-1-git-send-email-rahul.lakkireddy@chelsio.com Fixes: a3667aaed569 ("[SCSI] csiostor: Chelsio FCoE offload driver") Signed-off-by: Rahul Lakkireddy Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/csiostor/csio_init.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/csiostor/csio_init.c b/drivers/scsi/csiostor/csio_init.c index a6dd704d7f2de..1b8ccadc7cf67 100644 --- a/drivers/scsi/csiostor/csio_init.c +++ b/drivers/scsi/csiostor/csio_init.c @@ -1257,3 +1257,4 @@ MODULE_DEVICE_TABLE(pci, csio_pci_tbl); MODULE_VERSION(CSIO_DRV_VERSION); MODULE_FIRMWARE(FW_FNAME_T5); MODULE_FIRMWARE(FW_FNAME_T6); +MODULE_SOFTDEP("pre: cxgb4"); From 95a7e6be2e49a00e15a869974d1bdc63b90324d4 Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Wed, 29 Sep 2021 17:35:49 +0800 Subject: [PATCH 1032/5344] net: hns3: do not allow call hns3_nic_net_open repeatedly [ Upstream commit 5b09e88e1bf7fe86540fab4b5f3eece8abead39e ] hns3_nic_net_open() is not allowed to called repeatly, but there is no checking for this. When doing device reset and setup tc concurrently, there is a small oppotunity to call hns3_nic_net_open repeatedly, and cause kernel bug by calling napi_enable twice. The calltrace information is like below: [ 3078.222780] ------------[ cut here ]------------ [ 3078.230255] kernel BUG at net/core/dev.c:6991! [ 3078.236224] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP [ 3078.243431] Modules linked in: hns3 hclgevf hclge hnae3 vfio_iommu_type1 vfio_pci vfio_virqfd vfio pv680_mii(O) [ 3078.258880] CPU: 0 PID: 295 Comm: kworker/u8:5 Tainted: G O 5.14.0-rc4+ #1 [ 3078.269102] Hardware name: , BIOS KpxxxFPGA 1P B600 V181 08/12/2021 [ 3078.276801] Workqueue: hclge hclge_service_task [hclge] [ 3078.288774] pstate: 60400009 (nZCv daif +PAN -UAO -TCO BTYPE=--) [ 3078.296168] pc : napi_enable+0x80/0x84 tc qdisc sho[w 3d0e7v8 .e3t0h218 79] lr : hns3_nic_net_open+0x138/0x510 [hns3] [ 3078.314771] sp : ffff8000108abb20 [ 3078.319099] x29: ffff8000108abb20 x28: 0000000000000000 x27: ffff0820a8490300 [ 3078.329121] x26: 0000000000000001 x25: ffff08209cfc6200 x24: 0000000000000000 [ 3078.339044] x23: ffff0820a8490300 x22: ffff08209cd76000 x21: ffff0820abfe3880 [ 3078.349018] x20: 0000000000000000 x19: ffff08209cd76900 x18: 0000000000000000 [ 3078.358620] x17: 0000000000000000 x16: ffffc816e1727a50 x15: 0000ffff8f4ff930 [ 3078.368895] x14: 0000000000000000 x13: 0000000000000000 x12: 0000259e9dbeb6b4 [ 3078.377987] x11: 0096a8f7e764eb40 x10: 634615ad28d3eab5 x9 : ffffc816ad8885b8 [ 3078.387091] x8 : ffff08209cfc6fb8 x7 : ffff0820ac0da058 x6 : ffff0820a8490344 [ 3078.396356] x5 : 0000000000000140 x4 : 0000000000000003 x3 : ffff08209cd76938 [ 3078.405365] x2 : 0000000000000000 x1 : 0000000000000010 x0 : ffff0820abfe38a0 [ 3078.414657] Call trace: [ 3078.418517] napi_enable+0x80/0x84 [ 3078.424626] hns3_reset_notify_up_enet+0x78/0xd0 [hns3] [ 3078.433469] hns3_reset_notify+0x64/0x80 [hns3] [ 3078.441430] hclge_notify_client+0x68/0xb0 [hclge] [ 3078.450511] hclge_reset_rebuild+0x524/0x884 [hclge] [ 3078.458879] hclge_reset_service_task+0x3c4/0x680 [hclge] [ 3078.467470] hclge_service_task+0xb0/0xb54 [hclge] [ 3078.475675] process_one_work+0x1dc/0x48c [ 3078.481888] worker_thread+0x15c/0x464 [ 3078.487104] kthread+0x160/0x170 [ 3078.492479] ret_from_fork+0x10/0x18 [ 3078.498785] Code: c8027c81 35ffffa2 d50323bf d65f03c0 (d4210000) [ 3078.506889] ---[ end trace 8ebe0340a1b0fb44 ]--- Once hns3_nic_net_open() is excute success, the flag HNS3_NIC_STATE_DOWN will be cleared. So add checking for this flag, directly return when HNS3_NIC_STATE_DOWN is no set. Fixes: e888402789b9 ("net: hns3: call hns3_nic_net_open() while doing HNAE3_UP_CLIENT") Signed-off-by: Jian Shen Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index db9c8f943811b..ffd1018d43fbe 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -452,6 +452,11 @@ static int hns3_nic_net_open(struct net_device *netdev) if (hns3_nic_resetting(netdev)) return -EBUSY; + if (!test_bit(HNS3_NIC_STATE_DOWN, &priv->state)) { + netdev_warn(netdev, "net open repeatedly!\n"); + return 0; + } + netif_carrier_off(netdev); ret = hns3_nic_set_real_num_queue(netdev); From 3750b682eaefbbdd571af0c9c21566b918c3cbdd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 29 Sep 2021 15:57:50 -0700 Subject: [PATCH 1033/5344] af_unix: fix races in sk_peer_pid and sk_peer_cred accesses [ Upstream commit 35306eb23814444bd4021f8a1c3047d3cb0c8b2b ] Jann Horn reported that SO_PEERCRED and SO_PEERGROUPS implementations are racy, as af_unix can concurrently change sk_peer_pid and sk_peer_cred. In order to fix this issue, this patch adds a new spinlock that needs to be used whenever these fields are read or written. Jann also pointed out that l2cap_sock_get_peer_pid_cb() is currently reading sk->sk_peer_pid which makes no sense, as this field is only possibly set by AF_UNIX sockets. We will have to clean this in a separate patch. This could be done by reverting b48596d1dc25 "Bluetooth: L2CAP: Add get_peer_pid callback" or implementing what was truly expected. Fixes: 109f6e39fa07 ("af_unix: Allow SO_PEERCRED to work across namespaces.") Signed-off-by: Eric Dumazet Reported-by: Jann Horn Cc: Eric W. Biederman Cc: Luiz Augusto von Dentz Cc: Marcel Holtmann Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/net/sock.h | 2 ++ net/core/sock.c | 32 ++++++++++++++++++++++++++------ net/unix/af_unix.c | 34 ++++++++++++++++++++++++++++------ 3 files changed, 56 insertions(+), 12 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 535666196c52e..c7161316ae5ea 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -470,8 +470,10 @@ struct sock { u32 sk_ack_backlog; u32 sk_max_ack_backlog; kuid_t sk_uid; + spinlock_t sk_peer_lock; struct pid *sk_peer_pid; const struct cred *sk_peer_cred; + long sk_rcvtimeo; ktime_t sk_stamp; #if BITS_PER_LONG==32 diff --git a/net/core/sock.c b/net/core/sock.c index b9c8f7c08ce16..0e98929972f46 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1181,6 +1181,16 @@ int sock_setsockopt(struct socket *sock, int level, int optname, } EXPORT_SYMBOL(sock_setsockopt); +static const struct cred *sk_get_peer_cred(struct sock *sk) +{ + const struct cred *cred; + + spin_lock(&sk->sk_peer_lock); + cred = get_cred(sk->sk_peer_cred); + spin_unlock(&sk->sk_peer_lock); + + return cred; +} static void cred_to_ucred(struct pid *pid, const struct cred *cred, struct ucred *ucred) @@ -1355,7 +1365,11 @@ int sock_getsockopt(struct socket *sock, int level, int optname, struct ucred peercred; if (len > sizeof(peercred)) len = sizeof(peercred); + + spin_lock(&sk->sk_peer_lock); cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); + spin_unlock(&sk->sk_peer_lock); + if (copy_to_user(optval, &peercred, len)) return -EFAULT; goto lenout; @@ -1363,20 +1377,23 @@ int sock_getsockopt(struct socket *sock, int level, int optname, case SO_PEERGROUPS: { + const struct cred *cred; int ret, n; - if (!sk->sk_peer_cred) + cred = sk_get_peer_cred(sk); + if (!cred) return -ENODATA; - n = sk->sk_peer_cred->group_info->ngroups; + n = cred->group_info->ngroups; if (len < n * sizeof(gid_t)) { len = n * sizeof(gid_t); + put_cred(cred); return put_user(len, optlen) ? -EFAULT : -ERANGE; } len = n * sizeof(gid_t); - ret = groups_to_user((gid_t __user *)optval, - sk->sk_peer_cred->group_info); + ret = groups_to_user((gid_t __user *)optval, cred->group_info); + put_cred(cred); if (ret) return ret; goto lenout; @@ -1714,9 +1731,10 @@ static void __sk_destruct(struct rcu_head *head) sk->sk_frag.page = NULL; } - if (sk->sk_peer_cred) - put_cred(sk->sk_peer_cred); + /* We do not need to acquire sk->sk_peer_lock, we are the last user. */ + put_cred(sk->sk_peer_cred); put_pid(sk->sk_peer_pid); + if (likely(sk->sk_net_refcnt)) put_net(sock_net(sk)); sk_prot_free(sk->sk_prot_creator, sk); @@ -2920,6 +2938,8 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_peer_pid = NULL; sk->sk_peer_cred = NULL; + spin_lock_init(&sk->sk_peer_lock); + sk->sk_write_pending = 0; sk->sk_rcvlowat = 1; sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 3098710c9c344..05470ca91bd94 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -595,20 +595,42 @@ static void unix_release_sock(struct sock *sk, int embrion) static void init_peercred(struct sock *sk) { - put_pid(sk->sk_peer_pid); - if (sk->sk_peer_cred) - put_cred(sk->sk_peer_cred); + const struct cred *old_cred; + struct pid *old_pid; + + spin_lock(&sk->sk_peer_lock); + old_pid = sk->sk_peer_pid; + old_cred = sk->sk_peer_cred; sk->sk_peer_pid = get_pid(task_tgid(current)); sk->sk_peer_cred = get_current_cred(); + spin_unlock(&sk->sk_peer_lock); + + put_pid(old_pid); + put_cred(old_cred); } static void copy_peercred(struct sock *sk, struct sock *peersk) { - put_pid(sk->sk_peer_pid); - if (sk->sk_peer_cred) - put_cred(sk->sk_peer_cred); + const struct cred *old_cred; + struct pid *old_pid; + + if (sk < peersk) { + spin_lock(&sk->sk_peer_lock); + spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING); + } else { + spin_lock(&peersk->sk_peer_lock); + spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING); + } + old_pid = sk->sk_peer_pid; + old_cred = sk->sk_peer_cred; sk->sk_peer_pid = get_pid(peersk->sk_peer_pid); sk->sk_peer_cred = get_cred(peersk->sk_peer_cred); + + spin_unlock(&sk->sk_peer_lock); + spin_unlock(&peersk->sk_peer_lock); + + put_pid(old_pid); + put_cred(old_cred); } static int unix_listen(struct socket *sock, int backlog) From cc64b800e4989a5243853a2f8da355b5cd0da93d Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 28 Sep 2021 08:19:03 -0700 Subject: [PATCH 1034/5344] perf/x86/intel: Update event constraints for ICX [ Upstream commit ecc2123e09f9e71ddc6c53d71e283b8ada685fe2 ] According to the latest event list, the event encoding 0xEF is only available on the first 4 counters. Add it into the event constraints table. Fixes: 6017608936c1 ("perf/x86/intel: Add Icelake support") Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1632842343-25862-1-git-send-email-kan.liang@linux.intel.com Signed-off-by: Sasha Levin --- arch/x86/events/intel/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 9faccd187b9e7..1396403e6dc42 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -263,6 +263,7 @@ static struct event_constraint intel_icl_event_constraints[] = { INTEL_EVENT_CONSTRAINT_RANGE(0xa8, 0xb0, 0xf), INTEL_EVENT_CONSTRAINT_RANGE(0xb7, 0xbd, 0xf), INTEL_EVENT_CONSTRAINT_RANGE(0xd0, 0xe6, 0xf), + INTEL_EVENT_CONSTRAINT(0xef, 0xf), INTEL_EVENT_CONSTRAINT_RANGE(0xf0, 0xf4, 0xf), EVENT_CONSTRAINT_END }; From 8357793a6aba701bafcdde39a78a4796327eeb00 Mon Sep 17 00:00:00 2001 From: Chen Jingwen Date: Tue, 28 Sep 2021 20:56:57 +0800 Subject: [PATCH 1035/5344] elf: don't use MAP_FIXED_NOREPLACE for elf interpreter mappings commit 9b2f72cc0aa4bb444541bb87581c35b7508b37d3 upstream. In commit b212921b13bd ("elf: don't use MAP_FIXED_NOREPLACE for elf executable mappings") we still leave MAP_FIXED_NOREPLACE in place for load_elf_interp. Unfortunately, this will cause kernel to fail to start with: 1 (init): Uhuuh, elf segment at 00003ffff7ffd000 requested but the memory is mapped already Failed to execute /init (error -17) The reason is that the elf interpreter (ld.so) has overlapping segments. readelf -l ld-2.31.so Program Headers: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flags Align LOAD 0x0000000000000000 0x0000000000000000 0x0000000000000000 0x000000000002c94c 0x000000000002c94c R E 0x10000 LOAD 0x000000000002dae0 0x000000000003dae0 0x000000000003dae0 0x00000000000021e8 0x0000000000002320 RW 0x10000 LOAD 0x000000000002fe00 0x000000000003fe00 0x000000000003fe00 0x00000000000011ac 0x0000000000001328 RW 0x10000 The reason for this problem is the same as described in commit ad55eac74f20 ("elf: enforce MAP_FIXED on overlaying elf segments"). Not only executable binaries, elf interpreters (e.g. ld.so) can have overlapping elf segments, so we better drop MAP_FIXED_NOREPLACE and go back to MAP_FIXED in load_elf_interp. Fixes: 4ed28639519c ("fs, elf: drop MAP_FIXED usage from elf_map") Cc: # v4.19 Cc: Andrew Morton Cc: Michal Hocko Signed-off-by: Chen Jingwen Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/binfmt_elf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 7ce3cfd965d25..72cd871544ac0 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -583,7 +583,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, vaddr = eppnt->p_vaddr; if (interp_elf_ex->e_type == ET_EXEC || load_addr_set) - elf_type |= MAP_FIXED_NOREPLACE; + elf_type |= MAP_FIXED; else if (no_base && interp_elf_ex->e_type == ET_DYN) load_addr = -vaddr; From f37861ad7ee2e719579693c967c49d00334ed550 Mon Sep 17 00:00:00 2001 From: Nirmoy Das Date: Thu, 2 Sep 2021 12:29:17 +0200 Subject: [PATCH 1036/5344] debugfs: debugfs_create_file_size(): use IS_ERR to check for error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit af505cad9567f7a500d34bf183696d570d7f6810 upstream. debugfs_create_file() returns encoded error so use IS_ERR for checking return value. Reviewed-by: Christian König Signed-off-by: Nirmoy Das Fixes: ff9fb72bc077 ("debugfs: return error values, not NULL") Cc: stable References: https://gitlab.freedesktop.org/drm/amd/-/issues/1686 Link: https://lore.kernel.org/r/20210902102917.2233-1-nirmoy.das@amd.com Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 1c74a7cbf5b19..e595a29bf46e3 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -522,7 +522,7 @@ struct dentry *debugfs_create_file_size(const char *name, umode_t mode, { struct dentry *de = debugfs_create_file(name, mode, parent, data, fops); - if (de) + if (!IS_ERR(de)) d_inode(de)->i_size = file_size; return de; } From 5402a343d3548ba88fc6b78feb53aabe849bd0dc Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 17 Sep 2021 13:46:17 +0200 Subject: [PATCH 1037/5344] ipack: ipoctal: fix stack information leak commit a89936cce87d60766a75732a9e7e25c51164f47c upstream. The tty driver name is used also after registering the driver and must specifically not be allocated on the stack to avoid leaking information to user space (or triggering an oops). Drivers should not try to encode topology information in the tty device name but this one snuck in through staging without anyone noticing and another driver has since copied this malpractice. Fixing the ABI is a separate issue, but this at least plugs the security hole. Fixes: ba4dc61fe8c5 ("Staging: ipack: add support for IP-OCTAL mezzanine board") Cc: stable@vger.kernel.org # 3.5 Acked-by: Samuel Iglesias Gonsalvez Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20210917114622.5412-2-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/ipack/devices/ipoctal.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/ipack/devices/ipoctal.c b/drivers/ipack/devices/ipoctal.c index 9c2a4b5d30cfc..621f7bbe6223b 100644 --- a/drivers/ipack/devices/ipoctal.c +++ b/drivers/ipack/devices/ipoctal.c @@ -266,7 +266,6 @@ static int ipoctal_inst_slot(struct ipoctal *ipoctal, unsigned int bus_nr, int res; int i; struct tty_driver *tty; - char name[20]; struct ipoctal_channel *channel; struct ipack_region *region; void __iomem *addr; @@ -357,8 +356,11 @@ static int ipoctal_inst_slot(struct ipoctal *ipoctal, unsigned int bus_nr, /* Fill struct tty_driver with ipoctal data */ tty->owner = THIS_MODULE; tty->driver_name = KBUILD_MODNAME; - sprintf(name, KBUILD_MODNAME ".%d.%d.", bus_nr, slot); - tty->name = name; + tty->name = kasprintf(GFP_KERNEL, KBUILD_MODNAME ".%d.%d.", bus_nr, slot); + if (!tty->name) { + res = -ENOMEM; + goto err_put_driver; + } tty->major = 0; tty->minor_start = 0; @@ -374,8 +376,7 @@ static int ipoctal_inst_slot(struct ipoctal *ipoctal, unsigned int bus_nr, res = tty_register_driver(tty); if (res) { dev_err(&ipoctal->dev->dev, "Can't register tty driver.\n"); - put_tty_driver(tty); - return res; + goto err_free_name; } /* Save struct tty_driver for use it when uninstalling the device */ @@ -412,6 +413,13 @@ static int ipoctal_inst_slot(struct ipoctal *ipoctal, unsigned int bus_nr, ipoctal_irq_handler, ipoctal); return 0; + +err_free_name: + kfree(tty->name); +err_put_driver: + put_tty_driver(tty); + + return res; } static inline int ipoctal_copy_write_buffer(struct ipoctal_channel *channel, @@ -700,6 +708,7 @@ static void __ipoctal_remove(struct ipoctal *ipoctal) } tty_unregister_driver(ipoctal->tty_drv); + kfree(ipoctal->tty_drv->name); put_tty_driver(ipoctal->tty_drv); kfree(ipoctal); } From 4ca2588b78637466c7f69b4b0b4536518ec11141 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 17 Sep 2021 13:46:18 +0200 Subject: [PATCH 1038/5344] ipack: ipoctal: fix tty registration race commit 65c001df517a7bf9be8621b53d43c89f426ce8d6 upstream. Make sure to set the tty class-device driver data before registering the tty to avoid having a racing open() dereference a NULL pointer. Fixes: 9c1d784afc6f ("Staging: ipack/devices/ipoctal: Get rid of ipoctal_list.") Cc: stable@vger.kernel.org # 3.7 Acked-by: Samuel Iglesias Gonsalvez Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20210917114622.5412-3-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/ipack/devices/ipoctal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/ipack/devices/ipoctal.c b/drivers/ipack/devices/ipoctal.c index 621f7bbe6223b..c5b598eeaea68 100644 --- a/drivers/ipack/devices/ipoctal.c +++ b/drivers/ipack/devices/ipoctal.c @@ -395,13 +395,13 @@ static int ipoctal_inst_slot(struct ipoctal *ipoctal, unsigned int bus_nr, spin_lock_init(&channel->lock); channel->pointer_read = 0; channel->pointer_write = 0; - tty_dev = tty_port_register_device(&channel->tty_port, tty, i, NULL); + tty_dev = tty_port_register_device_attr(&channel->tty_port, tty, + i, NULL, channel, NULL); if (IS_ERR(tty_dev)) { dev_err(&ipoctal->dev->dev, "Failed to register tty device.\n"); tty_port_destroy(&channel->tty_port); continue; } - dev_set_drvdata(tty_dev, channel); } /* From 9474810c3cce35b6fd0c212fcde96dcc9b4f5ea5 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 17 Sep 2021 13:46:19 +0200 Subject: [PATCH 1039/5344] ipack: ipoctal: fix tty-registration error handling commit cd20d59291d1790dc74248476e928f57fc455189 upstream. Registration of the ipoctal tty devices is unlikely to fail, but if it ever does, make sure not to deregister a never registered tty device (and dereference a NULL pointer) when the driver is later unbound. Fixes: 2afb41d9d30d ("Staging: ipack/devices/ipoctal: Check tty_register_device return value.") Cc: stable@vger.kernel.org # 3.7 Acked-by: Samuel Iglesias Gonsalvez Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20210917114622.5412-4-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/ipack/devices/ipoctal.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/ipack/devices/ipoctal.c b/drivers/ipack/devices/ipoctal.c index c5b598eeaea68..498f52784c3ac 100644 --- a/drivers/ipack/devices/ipoctal.c +++ b/drivers/ipack/devices/ipoctal.c @@ -35,6 +35,7 @@ struct ipoctal_channel { unsigned int pointer_read; unsigned int pointer_write; struct tty_port tty_port; + bool tty_registered; union scc2698_channel __iomem *regs; union scc2698_block __iomem *block_regs; unsigned int board_id; @@ -399,9 +400,11 @@ static int ipoctal_inst_slot(struct ipoctal *ipoctal, unsigned int bus_nr, i, NULL, channel, NULL); if (IS_ERR(tty_dev)) { dev_err(&ipoctal->dev->dev, "Failed to register tty device.\n"); + tty_port_free_xmit_buf(&channel->tty_port); tty_port_destroy(&channel->tty_port); continue; } + channel->tty_registered = true; } /* @@ -702,6 +705,10 @@ static void __ipoctal_remove(struct ipoctal *ipoctal) for (i = 0; i < NR_CHANNELS; i++) { struct ipoctal_channel *channel = &ipoctal->channel[i]; + + if (!channel->tty_registered) + continue; + tty_unregister_device(ipoctal->tty_drv, i); tty_port_free_xmit_buf(&channel->tty_port); tty_port_destroy(&channel->tty_port); From caf338e8757a30327bf4408c0c8a8151f85cda3b Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 17 Sep 2021 13:46:20 +0200 Subject: [PATCH 1040/5344] ipack: ipoctal: fix missing allocation-failure check commit 445c8132727728dc297492a7d9fc074af3e94ba3 upstream. Add the missing error handling when allocating the transmit buffer to avoid dereferencing a NULL pointer in write() should the allocation ever fail. Fixes: ba4dc61fe8c5 ("Staging: ipack: add support for IP-OCTAL mezzanine board") Cc: stable@vger.kernel.org # 3.5 Acked-by: Samuel Iglesias Gonsalvez Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20210917114622.5412-5-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/ipack/devices/ipoctal.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/ipack/devices/ipoctal.c b/drivers/ipack/devices/ipoctal.c index 498f52784c3ac..e22696ebd4067 100644 --- a/drivers/ipack/devices/ipoctal.c +++ b/drivers/ipack/devices/ipoctal.c @@ -388,7 +388,9 @@ static int ipoctal_inst_slot(struct ipoctal *ipoctal, unsigned int bus_nr, channel = &ipoctal->channel[i]; tty_port_init(&channel->tty_port); - tty_port_alloc_xmit_buf(&channel->tty_port); + res = tty_port_alloc_xmit_buf(&channel->tty_port); + if (res) + continue; channel->tty_port.ops = &ipoctal_tty_port_ops; ipoctal_reset_stats(&channel->stats); From b7c6191a57edb24f951221fd0c3d940591bb00e6 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 17 Sep 2021 13:46:21 +0200 Subject: [PATCH 1041/5344] ipack: ipoctal: fix module reference leak commit bb8a4fcb2136508224c596a7e665bdba1d7c3c27 upstream. A reference to the carrier module was taken on every open but was only released once when the final reference to the tty struct was dropped. Fix this by taking the module reference and initialising the tty driver data when installing the tty. Fixes: 82a82340bab6 ("ipoctal: get carrier driver to avoid rmmod") Cc: stable@vger.kernel.org # 3.18 Cc: Federico Vaga Acked-by: Samuel Iglesias Gonsalvez Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20210917114622.5412-6-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/ipack/devices/ipoctal.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/drivers/ipack/devices/ipoctal.c b/drivers/ipack/devices/ipoctal.c index e22696ebd4067..0f1dca623b5a1 100644 --- a/drivers/ipack/devices/ipoctal.c +++ b/drivers/ipack/devices/ipoctal.c @@ -84,22 +84,34 @@ static int ipoctal_port_activate(struct tty_port *port, struct tty_struct *tty) return 0; } -static int ipoctal_open(struct tty_struct *tty, struct file *file) +static int ipoctal_install(struct tty_driver *driver, struct tty_struct *tty) { struct ipoctal_channel *channel = dev_get_drvdata(tty->dev); struct ipoctal *ipoctal = chan_to_ipoctal(channel, tty->index); - int err; - - tty->driver_data = channel; + int res; if (!ipack_get_carrier(ipoctal->dev)) return -EBUSY; - err = tty_port_open(&channel->tty_port, tty, file); - if (err) - ipack_put_carrier(ipoctal->dev); + res = tty_standard_install(driver, tty); + if (res) + goto err_put_carrier; + + tty->driver_data = channel; + + return 0; + +err_put_carrier: + ipack_put_carrier(ipoctal->dev); + + return res; +} + +static int ipoctal_open(struct tty_struct *tty, struct file *file) +{ + struct ipoctal_channel *channel = tty->driver_data; - return err; + return tty_port_open(&channel->tty_port, tty, file); } static void ipoctal_reset_stats(struct ipoctal_stats *stats) @@ -665,6 +677,7 @@ static void ipoctal_cleanup(struct tty_struct *tty) static const struct tty_operations ipoctal_fops = { .ioctl = NULL, + .install = ipoctal_install, .open = ipoctal_open, .close = ipoctal_close, .write = ipoctal_write_tty, From 6a5abdedf5ca3c451800fa293148be26e138f809 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sat, 5 Jun 2021 10:39:32 +0530 Subject: [PATCH 1042/5344] ext4: fix loff_t overflow in ext4_max_bitmap_size() commit 75ca6ad408f459f00b09a64f04c774559848c097 upstream. We should use unsigned long long rather than loff_t to avoid overflow in ext4_max_bitmap_size() for comparison before returning. w/o this patch sbi->s_bitmap_maxbytes was becoming a negative value due to overflow of upper_limit (with has_huge_files as true) Below is a quick test to trigger it on a 64KB pagesize system. sudo mkfs.ext4 -b 65536 -O ^has_extents,^64bit /dev/loop2 sudo mount /dev/loop2 /mnt sudo echo "hello" > /mnt/hello -> This will error out with "echo: write error: File too large" Signed-off-by: Ritesh Harjani Reviewed-by: Jan Kara Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Link: https://lore.kernel.org/r/594f409e2c543e90fd836b78188dfa5c575065ba.1622867594.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/super.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ce8372ceaa43e..ec0fcd6d70bf4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2830,17 +2830,17 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files) */ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) { - loff_t res = EXT4_NDIR_BLOCKS; + unsigned long long upper_limit, res = EXT4_NDIR_BLOCKS; int meta_blocks; - loff_t upper_limit; - /* This is calculated to be the largest file size for a dense, block + + /* + * This is calculated to be the largest file size for a dense, block * mapped file such that the file's total number of 512-byte sectors, * including data and all indirect blocks, does not exceed (2^48 - 1). * * __u32 i_blocks_lo and _u16 i_blocks_high represent the total * number of 512-byte sectors of the file. */ - if (!has_huge_files) { /* * !has_huge_files or implies that the inode i_block field @@ -2883,7 +2883,7 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) if (res > MAX_LFS_FILESIZE) res = MAX_LFS_FILESIZE; - return res; + return (loff_t)res; } static ext4_fsblk_t descriptor_loc(struct super_block *sb, From 7b733385fb9a69bea7473e03640cbb62b157c0e9 Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Mon, 23 Aug 2021 14:13:58 +0800 Subject: [PATCH 1043/5344] ext4: fix reserved space counter leakage commit 6fed83957f21eff11c8496e9f24253b03d2bc1dc upstream. When ext4_insert_delayed block receives and recovers from an error from ext4_es_insert_delayed_block(), e.g., ENOMEM, it does not release the space it has reserved for that block insertion as it should. One effect of this bug is that s_dirtyclusters_counter is not decremented and remains incorrectly elevated until the file system has been unmounted. This can result in premature ENOSPC returns and apparent loss of free space. Another effect of this bug is that /sys/fs/ext4//delayed_allocation_blocks can remain non-zero even after syncfs has been executed on the filesystem. Besides, add check for s_dirtyclusters_counter when inode is going to be evicted and freed. s_dirtyclusters_counter can still keep non-zero until inode is written back in .evict_inode(), and thus the check is delayed to .destroy_inode(). Fixes: 51865fda28e5 ("ext4: let ext4 maintain extent status tree") Cc: stable@kernel.org Suggested-by: Gao Xiang Signed-off-by: Jeffle Xu Reviewed-by: Eric Whitney Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20210823061358.84473-1-jefflexu@linux.alibaba.com Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 5 +++++ fs/ext4/super.c | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1429d01d836bb..48b467353f6f1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1782,6 +1782,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int ret; bool allocated = false; + bool reserved = false; /* * If the cluster containing lblk is shared with a delayed, @@ -1798,6 +1799,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) ret = ext4_da_reserve_space(inode); if (ret != 0) /* ENOSPC */ goto errout; + reserved = true; } else { /* bigalloc */ if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) { if (!ext4_es_scan_clu(inode, @@ -1810,6 +1812,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) ret = ext4_da_reserve_space(inode); if (ret != 0) /* ENOSPC */ goto errout; + reserved = true; } else { allocated = true; } @@ -1820,6 +1823,8 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) } ret = ext4_es_insert_delayed_block(inode, lblk, allocated); + if (ret && reserved) + ext4_da_release_space(inode, 1); errout: return ret; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ec0fcd6d70bf4..1211ae203face 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1141,6 +1141,12 @@ static void ext4_destroy_inode(struct inode *inode) true); dump_stack(); } + + if (EXT4_I(inode)->i_reserved_data_blocks) + ext4_msg(inode->i_sb, KERN_ERR, + "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!", + inode->i_ino, EXT4_I(inode), + EXT4_I(inode)->i_reserved_data_blocks); } static void init_once(void *foo) From 7f6834b09690a7b6bc3e5cc5166177eac2facaa6 Mon Sep 17 00:00:00 2001 From: yangerkun Date: Tue, 14 Sep 2021 19:14:15 +0800 Subject: [PATCH 1044/5344] ext4: fix potential infinite loop in ext4_dx_readdir() commit 42cb447410d024e9d54139ae9c21ea132a8c384c upstream. When ext4_htree_fill_tree() fails, ext4_dx_readdir() can run into an infinite loop since if info->last_pos != ctx->pos this will reset the directory scan and reread the failing entry. For example: 1. a dx_dir which has 3 block, block 0 as dx_root block, block 1/2 as leaf block which own the ext4_dir_entry_2 2. block 1 read ok and call_filldir which will fill the dirent and update the ctx->pos 3. block 2 read fail, but we has already fill some dirent, so we will return back to userspace will a positive return val(see ksys_getdents64) 4. the second ext4_dx_readdir will reset the world since info->last_pos != ctx->pos, and will also init the curr_hash which pos to block 1 5. So we will read block1 too, and once block2 still read fail, we can only fill one dirent because the hash of the entry in block1(besides the last one) won't greater than curr_hash 6. this time, we forget update last_pos too since the read for block2 will fail, and since we has got the one entry, ksys_getdents64 can return success 7. Latter we will trapped in a loop with step 4~6 Cc: stable@kernel.org Signed-off-by: yangerkun Reviewed-by: Jan Kara Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20210914111415.3921954-1-yangerkun@huawei.com Signed-off-by: Greg Kroah-Hartman --- fs/ext4/dir.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 0589e914663fb..e8275b5d27439 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -536,7 +536,7 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) struct dir_private_info *info = file->private_data; struct inode *inode = file_inode(file); struct fname *fname; - int ret; + int ret = 0; if (!info) { info = ext4_htree_create_dir_info(file, ctx->pos); @@ -584,7 +584,7 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) info->curr_minor_hash, &info->next_hash); if (ret < 0) - return ret; + goto finished; if (ret == 0) { ctx->pos = ext4_get_htree_eof(file); break; @@ -615,7 +615,7 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) } finished: info->last_pos = ctx->pos; - return 0; + return ret < 0 ? ret : 0; } static int ext4_dir_open(struct inode * inode, struct file * filp) From 555e98006c454cf74a70b19a7fc6b08ddc8480ee Mon Sep 17 00:00:00 2001 From: Andrej Shadura Date: Thu, 16 Sep 2021 17:33:11 +0100 Subject: [PATCH 1045/5344] HID: u2fzero: ignore incomplete packets without data commit 22d65765f211cc83186fd8b87521159f354c0da9 upstream. Since the actual_length calculation is performed unsigned, packets shorter than 7 bytes (e.g. packets without data or otherwise truncated) or non-received packets ("zero" bytes) can cause buffer overflow. Link: https://bugzilla.kernel.org/show_bug.cgi?id=214437 Fixes: 42337b9d4d958("HID: add driver for U2F Zero built-in LED and RNG") Signed-off-by: Andrej Shadura Signed-off-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- drivers/hid/hid-u2fzero.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/hid/hid-u2fzero.c b/drivers/hid/hid-u2fzero.c index 95e0807878c7e..d70cd3d7f583b 100644 --- a/drivers/hid/hid-u2fzero.c +++ b/drivers/hid/hid-u2fzero.c @@ -198,7 +198,9 @@ static int u2fzero_rng_read(struct hwrng *rng, void *data, } ret = u2fzero_recv(dev, &req, &resp); - if (ret < 0) + + /* ignore errors or packets without data */ + if (ret < offsetof(struct u2f_hid_msg, init.data)) return 0; /* only take the minimum amount of data it is safe to take */ From 01f7110d1b529a95a8c6c62f68cab7a09444a864 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 27 Sep 2021 17:29:24 -0700 Subject: [PATCH 1046/5344] net: udp: annotate data race around udp_sk(sk)->corkflag commit a9f5970767d11eadc805d5283f202612c7ba1f59 upstream. up->corkflag field can be read or written without any lock. Annotate accesses to avoid possible syzbot/KCSAN reports. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/udp.c | 10 +++++----- net/ipv6/udp.c | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 63c45e665d248..579efc0a1f251 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -981,7 +981,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) __be16 dport; u8 tos; int err, is_udplite = IS_UDPLITE(sk); - int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; + int corkreq = READ_ONCE(up->corkflag) || msg->msg_flags&MSG_MORE; int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); struct sk_buff *skb; struct ip_options_data opt_copy; @@ -1292,7 +1292,7 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset, } up->len += size; - if (!(up->corkflag || (flags&MSG_MORE))) + if (!(READ_ONCE(up->corkflag) || (flags&MSG_MORE))) ret = udp_push_pending_frames(sk); if (!ret) ret = size; @@ -2562,9 +2562,9 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, switch (optname) { case UDP_CORK: if (val != 0) { - up->corkflag = 1; + WRITE_ONCE(up->corkflag, 1); } else { - up->corkflag = 0; + WRITE_ONCE(up->corkflag, 0); lock_sock(sk); push_pending_frames(sk); release_sock(sk); @@ -2687,7 +2687,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, switch (optname) { case UDP_CORK: - val = up->corkflag; + val = READ_ONCE(up->corkflag); break; case UDP_ENCAP: diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 1a2f4e1a000ac..894d9102928ea 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1234,7 +1234,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int addr_len = msg->msg_namelen; bool connected = false; int ulen = len; - int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; + int corkreq = READ_ONCE(up->corkflag) || msg->msg_flags&MSG_MORE; int err; int is_udplite = IS_UDPLITE(sk); int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); From fc4a2237c1a66b974424c00b2784b81ed4a6256c Mon Sep 17 00:00:00 2001 From: Leon Yu Date: Fri, 22 May 2020 23:29:43 +0800 Subject: [PATCH 1047/5344] net: stmmac: don't attach interface until resume finishes commit 31096c3e8b1163c6e966bf4d1f36d8b699008f84 upstream. Commit 14b41a2959fb ("net: stmmac: Delete txtimer in suspend") was the first attempt to fix a race between mod_timer() and setup_timer() during stmmac_resume(). However the issue still exists as the commit only addressed half of the issue. Same race can still happen as stmmac_resume() re-attaches interface way too early - even before hardware is fully initialized. Worse, doing so allows network traffic to restart and stmmac_tx_timer_arm() being called in the middle of stmmac_resume(), which re-init tx timers in stmmac_init_coalesce(). timer_list will be corrupted and system crashes as a result of race between mod_timer() and setup_timer(). systemd--1995 2.... 552950018us : stmmac_suspend: 4994 ksoftirq-9 0..s2 553123133us : stmmac_tx_timer_arm: 2276 systemd--1995 0.... 553127896us : stmmac_resume: 5101 systemd--320 7...2 553132752us : stmmac_tx_timer_arm: 2276 (sd-exec-1999 5...2 553135204us : stmmac_tx_timer_arm: 2276 --------------------------------- pc : run_timer_softirq+0x468/0x5e0 lr : run_timer_softirq+0x570/0x5e0 Call trace: run_timer_softirq+0x468/0x5e0 __do_softirq+0x124/0x398 irq_exit+0xd8/0xe0 __handle_domain_irq+0x6c/0xc0 gic_handle_irq+0x60/0xb0 el1_irq+0xb8/0x180 arch_cpu_idle+0x38/0x230 default_idle_call+0x24/0x3c do_idle+0x1e0/0x2b8 cpu_startup_entry+0x28/0x48 secondary_start_kernel+0x1b4/0x208 Fix this by deferring netif_device_attach() to the end of stmmac_resume(). Signed-off-by: Leon Yu Signed-off-by: David S. Miller Cc: Macpaul Lin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index e09851c7da9b8..835ac178bc8c0 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -4855,8 +4855,6 @@ int stmmac_resume(struct device *dev) stmmac_mdio_reset(priv->mii); } - netif_device_attach(ndev); - mutex_lock(&priv->lock); stmmac_reset_queues_param(priv); @@ -4880,6 +4878,8 @@ int stmmac_resume(struct device *dev) phylink_mac_change(priv->phylink, true); + netif_device_attach(ndev); + return 0; } EXPORT_SYMBOL_GPL(stmmac_resume); From 65422360065d7601db80fca2af706330c5295c32 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Wed, 5 Aug 2020 14:07:07 +0200 Subject: [PATCH 1048/5344] hso: fix bailout in error case of probe commit 5fcfb6d0bfcda17f0d0656e4e5b3710af2bbaae5 upstream. The driver tries to reuse code for disconnect in case of a failed probe. If resources need to be freed after an error in probe, the netdev must not be freed because it has never been registered. Fix it by telling the helper which path we are in. Signed-off-by: Oliver Neukum Signed-off-by: David S. Miller Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman --- drivers/net/usb/hso.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/usb/hso.c b/drivers/net/usb/hso.c index 9ad1f093c4ae1..8aa2e85f61a74 100644 --- a/drivers/net/usb/hso.c +++ b/drivers/net/usb/hso.c @@ -2354,7 +2354,7 @@ static int remove_net_device(struct hso_device *hso_dev) } /* Frees our network device */ -static void hso_free_net_device(struct hso_device *hso_dev) +static void hso_free_net_device(struct hso_device *hso_dev, bool bailout) { int i; struct hso_net *hso_net = dev2net(hso_dev); @@ -2377,7 +2377,7 @@ static void hso_free_net_device(struct hso_device *hso_dev) kfree(hso_net->mux_bulk_tx_buf); hso_net->mux_bulk_tx_buf = NULL; - if (hso_net->net) + if (hso_net->net && !bailout) free_netdev(hso_net->net); kfree(hso_dev); @@ -2553,7 +2553,7 @@ static struct hso_device *hso_create_net_device(struct usb_interface *interface, return hso_dev; exit: - hso_free_net_device(hso_dev); + hso_free_net_device(hso_dev, true); return NULL; } @@ -3120,7 +3120,7 @@ static void hso_free_interface(struct usb_interface *interface) rfkill_unregister(rfk); rfkill_destroy(rfk); } - hso_free_net_device(network_table[i]); + hso_free_net_device(network_table[i], false); } } } From d644a5028b6138d29378f5565a8ba5112a29ef06 Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Wed, 14 Jul 2021 17:13:22 +0800 Subject: [PATCH 1049/5344] usb: hso: fix error handling code of hso_create_net_device commit a6ecfb39ba9d7316057cea823b196b734f6b18ca upstream. The current error handling code of hso_create_net_device is hso_free_net_device, no matter which errors lead to. For example, WARNING in hso_free_net_device [1]. Fix this by refactoring the error handling code of hso_create_net_device by handling different errors by different code. [1] https://syzkaller.appspot.com/bug?id=66eff8d49af1b28370ad342787413e35bbe76efe Reported-by: syzbot+44d53c7255bb1aea22d2@syzkaller.appspotmail.com Fixes: 5fcfb6d0bfcd ("hso: fix bailout in error case of probe") Signed-off-by: Dongliang Mu Signed-off-by: David S. Miller Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman --- drivers/net/usb/hso.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/drivers/net/usb/hso.c b/drivers/net/usb/hso.c index 8aa2e85f61a74..41b2724e7ce39 100644 --- a/drivers/net/usb/hso.c +++ b/drivers/net/usb/hso.c @@ -2497,7 +2497,7 @@ static struct hso_device *hso_create_net_device(struct usb_interface *interface, hso_net_init); if (!net) { dev_err(&interface->dev, "Unable to create ethernet device\n"); - goto exit; + goto err_hso_dev; } hso_net = netdev_priv(net); @@ -2510,13 +2510,13 @@ static struct hso_device *hso_create_net_device(struct usb_interface *interface, USB_DIR_IN); if (!hso_net->in_endp) { dev_err(&interface->dev, "Can't find BULK IN endpoint\n"); - goto exit; + goto err_net; } hso_net->out_endp = hso_get_ep(interface, USB_ENDPOINT_XFER_BULK, USB_DIR_OUT); if (!hso_net->out_endp) { dev_err(&interface->dev, "Can't find BULK OUT endpoint\n"); - goto exit; + goto err_net; } SET_NETDEV_DEV(net, &interface->dev); SET_NETDEV_DEVTYPE(net, &hso_type); @@ -2525,18 +2525,18 @@ static struct hso_device *hso_create_net_device(struct usb_interface *interface, for (i = 0; i < MUX_BULK_RX_BUF_COUNT; i++) { hso_net->mux_bulk_rx_urb_pool[i] = usb_alloc_urb(0, GFP_KERNEL); if (!hso_net->mux_bulk_rx_urb_pool[i]) - goto exit; + goto err_mux_bulk_rx; hso_net->mux_bulk_rx_buf_pool[i] = kzalloc(MUX_BULK_RX_BUF_SIZE, GFP_KERNEL); if (!hso_net->mux_bulk_rx_buf_pool[i]) - goto exit; + goto err_mux_bulk_rx; } hso_net->mux_bulk_tx_urb = usb_alloc_urb(0, GFP_KERNEL); if (!hso_net->mux_bulk_tx_urb) - goto exit; + goto err_mux_bulk_rx; hso_net->mux_bulk_tx_buf = kzalloc(MUX_BULK_TX_BUF_SIZE, GFP_KERNEL); if (!hso_net->mux_bulk_tx_buf) - goto exit; + goto err_free_tx_urb; add_net_device(hso_dev); @@ -2544,7 +2544,7 @@ static struct hso_device *hso_create_net_device(struct usb_interface *interface, result = register_netdev(net); if (result) { dev_err(&interface->dev, "Failed to register device\n"); - goto exit; + goto err_free_tx_buf; } hso_log_port(hso_dev); @@ -2552,8 +2552,21 @@ static struct hso_device *hso_create_net_device(struct usb_interface *interface, hso_create_rfkill(hso_dev, interface); return hso_dev; -exit: - hso_free_net_device(hso_dev, true); + +err_free_tx_buf: + remove_net_device(hso_dev); + kfree(hso_net->mux_bulk_tx_buf); +err_free_tx_urb: + usb_free_urb(hso_net->mux_bulk_tx_urb); +err_mux_bulk_rx: + for (i = 0; i < MUX_BULK_RX_BUF_COUNT; i++) { + usb_free_urb(hso_net->mux_bulk_rx_urb_pool[i]); + kfree(hso_net->mux_bulk_rx_buf_pool[i]); + } +err_net: + free_netdev(net); +err_hso_dev: + kfree(hso_dev); return NULL; } From 2636af9590b46c9cc407f484ee836449c8add098 Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Wed, 21 Jul 2021 16:14:57 +0800 Subject: [PATCH 1050/5344] usb: hso: remove the bailout parameter commit dcb713d53e2eadf42b878c12a471e74dc6ed3145 upstream. There are two invocation sites of hso_free_net_device. After refactoring hso_create_net_device, this parameter is useless. Remove the bailout in the hso_free_net_device and change the invocation sites of this function. Signed-off-by: Dongliang Mu Signed-off-by: David S. Miller Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman --- drivers/net/usb/hso.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/usb/hso.c b/drivers/net/usb/hso.c index 41b2724e7ce39..31200275a0640 100644 --- a/drivers/net/usb/hso.c +++ b/drivers/net/usb/hso.c @@ -2354,7 +2354,7 @@ static int remove_net_device(struct hso_device *hso_dev) } /* Frees our network device */ -static void hso_free_net_device(struct hso_device *hso_dev, bool bailout) +static void hso_free_net_device(struct hso_device *hso_dev) { int i; struct hso_net *hso_net = dev2net(hso_dev); @@ -2377,7 +2377,7 @@ static void hso_free_net_device(struct hso_device *hso_dev, bool bailout) kfree(hso_net->mux_bulk_tx_buf); hso_net->mux_bulk_tx_buf = NULL; - if (hso_net->net && !bailout) + if (hso_net->net) free_netdev(hso_net->net); kfree(hso_dev); @@ -3133,7 +3133,7 @@ static void hso_free_interface(struct usb_interface *interface) rfkill_unregister(rfk); rfkill_destroy(rfk); } - hso_free_net_device(network_table[i], false); + hso_free_net_device(network_table[i]); } } } From a2a74a1ec4b63a486811e597ab6c332c2354f659 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 26 Aug 2021 16:04:27 +0300 Subject: [PATCH 1051/5344] crypto: ccp - fix resource leaks in ccp_run_aes_gcm_cmd() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 505d9dcb0f7ddf9d075e729523a33d38642ae680 upstream. There are three bugs in this code: 1) If we ccp_init_data() fails for &src then we need to free aad. Use goto e_aad instead of goto e_ctx. 2) The label to free the &final_wa was named incorrectly as "e_tag" but it should have been "e_final_wa". One error path leaked &final_wa. 3) The &tag was leaked on one error path. In that case, I added a free before the goto because the resource was local to that block. Fixes: 36cf515b9bbe ("crypto: ccp - Enable support for AES GCM on v5 CCPs") Reported-by: "minihanshen(沈明航)" Signed-off-by: Dan Carpenter Reviewed-by: John Allen Tested-by: John Allen Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- drivers/crypto/ccp/ccp-ops.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/crypto/ccp/ccp-ops.c b/drivers/crypto/ccp/ccp-ops.c index 7234b95241e91..e826c4b6b3afd 100644 --- a/drivers/crypto/ccp/ccp-ops.c +++ b/drivers/crypto/ccp/ccp-ops.c @@ -778,7 +778,7 @@ ccp_run_aes_gcm_cmd(struct ccp_cmd_queue *cmd_q, struct ccp_cmd *cmd) in_place ? DMA_BIDIRECTIONAL : DMA_TO_DEVICE); if (ret) - goto e_ctx; + goto e_aad; if (in_place) { dst = src; @@ -863,7 +863,7 @@ ccp_run_aes_gcm_cmd(struct ccp_cmd_queue *cmd_q, struct ccp_cmd *cmd) op.u.aes.size = 0; ret = cmd_q->ccp->vdata->perform->aes(&op); if (ret) - goto e_dst; + goto e_final_wa; if (aes->action == CCP_AES_ACTION_ENCRYPT) { /* Put the ciphered tag after the ciphertext. */ @@ -873,17 +873,19 @@ ccp_run_aes_gcm_cmd(struct ccp_cmd_queue *cmd_q, struct ccp_cmd *cmd) ret = ccp_init_dm_workarea(&tag, cmd_q, authsize, DMA_BIDIRECTIONAL); if (ret) - goto e_tag; + goto e_final_wa; ret = ccp_set_dm_area(&tag, 0, p_tag, 0, authsize); - if (ret) - goto e_tag; + if (ret) { + ccp_dm_free(&tag); + goto e_final_wa; + } ret = crypto_memneq(tag.address, final_wa.address, authsize) ? -EBADMSG : 0; ccp_dm_free(&tag); } -e_tag: +e_final_wa: ccp_dm_free(&final_wa); e_dst: From 5711258784a83bdb7b92f2af82795a9952c09de6 Mon Sep 17 00:00:00 2001 From: "F.A.Sulaiman" Date: Tue, 24 Aug 2021 20:37:30 +0530 Subject: [PATCH 1052/5344] HID: betop: fix slab-out-of-bounds Write in betop_probe commit 1e4ce418b1cb1a810256b5fb3fd33d22d1325993 upstream. Syzbot reported slab-out-of-bounds Write bug in hid-betopff driver. The problem is the driver assumes the device must have an input report but some malicious devices violate this assumption. So this patch checks hid_device's input is non empty before it's been used. Reported-by: syzbot+07efed3bc5a1407bd742@syzkaller.appspotmail.com Signed-off-by: F.A. SULAIMAN Reviewed-by: Pavel Skripkin Signed-off-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- drivers/hid/hid-betopff.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/hid/hid-betopff.c b/drivers/hid/hid-betopff.c index 0790fbd3fc9a2..467d789f9bc2d 100644 --- a/drivers/hid/hid-betopff.c +++ b/drivers/hid/hid-betopff.c @@ -56,15 +56,22 @@ static int betopff_init(struct hid_device *hid) { struct betopff_device *betopff; struct hid_report *report; - struct hid_input *hidinput = - list_first_entry(&hid->inputs, struct hid_input, list); + struct hid_input *hidinput; struct list_head *report_list = &hid->report_enum[HID_OUTPUT_REPORT].report_list; - struct input_dev *dev = hidinput->input; + struct input_dev *dev; int field_count = 0; int error; int i, j; + if (list_empty(&hid->inputs)) { + hid_err(hid, "no inputs found\n"); + return -ENODEV; + } + + hidinput = list_first_entry(&hid->inputs, struct hid_input, list); + dev = hidinput->input; + if (list_empty(report_list)) { hid_err(hid, "no output reports found\n"); return -ENODEV; From f4db06a546d16d7ef973b8c35bd6edfe7166aebf Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Mon, 6 Sep 2021 18:26:34 +0200 Subject: [PATCH 1053/5344] netfilter: ipset: Fix oversized kvmalloc() calls commit 7bbc3d385bd813077acaf0e6fdb2a86a901f5382 upstream. The commit commit 7661809d493b426e979f39ab512e3adf41fbcc69 Author: Linus Torvalds Date: Wed Jul 14 09:45:49 2021 -0700 mm: don't allow oversized kvmalloc() calls limits the max allocatable memory via kvmalloc() to MAX_INT. Apply the same limit in ipset. Reported-by: syzbot+3493b1873fb3ea827986@syzkaller.appspotmail.com Reported-by: syzbot+2b8443c35458a617c904@syzkaller.appspotmail.com Reported-by: syzbot+ee5cb15f4a0e85e0d54e@syzkaller.appspotmail.com Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso Signed-off-by: Greg Kroah-Hartman --- net/netfilter/ipset/ip_set_hash_gen.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 500de37858ac8..1b44dfa7ba856 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -132,11 +132,11 @@ htable_size(u8 hbits) { size_t hsize; - /* We must fit both into u32 in jhash and size_t */ + /* We must fit both into u32 in jhash and INT_MAX in kvmalloc_node() */ if (hbits > 31) return 0; hsize = jhash_size(hbits); - if ((((size_t)-1) - sizeof(struct htable)) / sizeof(struct hbucket *) + if ((INT_MAX - sizeof(struct htable)) / sizeof(struct hbucket *) < hsize) return 0; From f4214be4575dc73156b1a505984a5de1b2f6efe1 Mon Sep 17 00:00:00 2001 From: Anirudh Rayabharam Date: Thu, 24 Jun 2021 00:10:29 +0530 Subject: [PATCH 1054/5344] HID: usbhid: free raw_report buffers in usbhid_stop commit f7744fa16b96da57187dc8e5634152d3b63d72de upstream. Free the unsent raw_report buffers when the device is removed. Fixes a memory leak reported by syzbot at: https://syzkaller.appspot.com/bug?id=7b4fa7cb1a7c2d3342a2a8a6c53371c8c418ab47 Reported-by: syzbot+47b26cd837ececfc666d@syzkaller.appspotmail.com Tested-by: syzbot+47b26cd837ececfc666d@syzkaller.appspotmail.com Signed-off-by: Anirudh Rayabharam Signed-off-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- drivers/hid/usbhid/hid-core.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c index 1cfbbaf6901da..8537fcdb456df 100644 --- a/drivers/hid/usbhid/hid-core.c +++ b/drivers/hid/usbhid/hid-core.c @@ -503,7 +503,7 @@ static void hid_ctrl(struct urb *urb) if (unplug) { usbhid->ctrltail = usbhid->ctrlhead; - } else { + } else if (usbhid->ctrlhead != usbhid->ctrltail) { usbhid->ctrltail = (usbhid->ctrltail + 1) & (HID_CONTROL_FIFO_SIZE - 1); if (usbhid->ctrlhead != usbhid->ctrltail && @@ -1221,9 +1221,20 @@ static void usbhid_stop(struct hid_device *hid) mutex_lock(&usbhid->mutex); clear_bit(HID_STARTED, &usbhid->iofl); + spin_lock_irq(&usbhid->lock); /* Sync with error and led handlers */ set_bit(HID_DISCONNECTED, &usbhid->iofl); + while (usbhid->ctrltail != usbhid->ctrlhead) { + if (usbhid->ctrl[usbhid->ctrltail].dir == USB_DIR_OUT) { + kfree(usbhid->ctrl[usbhid->ctrltail].raw_report); + usbhid->ctrl[usbhid->ctrltail].raw_report = NULL; + } + + usbhid->ctrltail = (usbhid->ctrltail + 1) & + (HID_CONTROL_FIFO_SIZE - 1); + } spin_unlock_irq(&usbhid->lock); + usb_kill_urb(usbhid->urbin); usb_kill_urb(usbhid->urbout); usb_kill_urb(usbhid->urbctrl); From f160c253528990e7528218cd34a4d7308ff86091 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 6 Oct 2021 15:42:39 +0200 Subject: [PATCH 1055/5344] Linux 5.4.151 Link: https://lore.kernel.org/r/20211004125030.002116402@linuxfoundation.org Tested-by: Shuah Khan Tested-by: Guenter Roeck Link: https://lore.kernel.org/r/20211005083256.183739807@linuxfoundation.org Tested-by: Jon Hunter Tested-by: Florian Fainelli Tested-by: Shuah Khan Tested-by: Linux Kernel Functional Testing Tested-by: Sudip Mukherjee Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index c6b3a3d62f6ca..4eeb72027815b 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 PATCHLEVEL = 4 -SUBLEVEL = 150 +SUBLEVEL = 151 EXTRAVERSION = NAME = Kleptomaniac Octopus From acc40357c78c5fa9e9e20f8b69b0ca9b725d6601 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 17 Sep 2021 16:34:32 +0300 Subject: [PATCH 1056/5344] net: mdio: introduce a shutdown method to mdio device drivers [ Upstream commit cf9579976f724ad517cc15b7caadea728c7e245c ] MDIO-attached devices might have interrupts and other things that might need quiesced when we kexec into a new kernel. Things are even more creepy when those interrupt lines are shared, and in that case it is absolutely mandatory to disable all interrupt sources. Moreover, MDIO devices might be DSA switches, and DSA needs its own shutdown method to unlink from the DSA master, which is a new requirement that appeared after commit 2f1e8ea726e9 ("net: dsa: link interfaces with the DSA master to get rid of lockdep warnings"). So introduce a ->shutdown method in the MDIO device driver structure. Signed-off-by: Vladimir Oltean Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/phy/mdio_device.c | 11 +++++++++++ include/linux/mdio.h | 3 +++ 2 files changed, 14 insertions(+) diff --git a/drivers/net/phy/mdio_device.c b/drivers/net/phy/mdio_device.c index c1d345c3cab35..b2dd293fc87eb 100644 --- a/drivers/net/phy/mdio_device.c +++ b/drivers/net/phy/mdio_device.c @@ -180,6 +180,16 @@ static int mdio_remove(struct device *dev) return 0; } +static void mdio_shutdown(struct device *dev) +{ + struct mdio_device *mdiodev = to_mdio_device(dev); + struct device_driver *drv = mdiodev->dev.driver; + struct mdio_driver *mdiodrv = to_mdio_driver(drv); + + if (mdiodrv->shutdown) + mdiodrv->shutdown(mdiodev); +} + /** * mdio_driver_register - register an mdio_driver with the MDIO layer * @new_driver: new mdio_driver to register @@ -194,6 +204,7 @@ int mdio_driver_register(struct mdio_driver *drv) mdiodrv->driver.bus = &mdio_bus_type; mdiodrv->driver.probe = mdio_probe; mdiodrv->driver.remove = mdio_remove; + mdiodrv->driver.shutdown = mdio_shutdown; retval = driver_register(&mdiodrv->driver); if (retval) { diff --git a/include/linux/mdio.h b/include/linux/mdio.h index a7604248777b7..0f1f784de80e9 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -64,6 +64,9 @@ struct mdio_driver { /* Clears up any memory if needed */ void (*remove)(struct mdio_device *mdiodev); + + /* Quiesces the device on system shutdown, turns off interrupts etc */ + void (*shutdown)(struct mdio_device *mdiodev); }; #define to_mdio_driver(d) \ container_of(to_mdio_common_driver(d), struct mdio_driver, mdiodrv) From 37fcdf07b17118eed935338aa91cf24c77c4853d Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 17 Sep 2021 08:27:10 +0200 Subject: [PATCH 1057/5344] xen-netback: correct success/error reporting for the SKB-with-fraglist case [ Upstream commit 3ede7f84c7c21f93c5eac611d60eba3f2c765e0f ] When re-entering the main loop of xenvif_tx_check_gop() a 2nd time, the special considerations for the head of the SKB no longer apply. Don't mistakenly report ERROR to the frontend for the first entry in the list, even if - from all I can tell - this shouldn't matter much as the overall transmit will need to be considered failed anyway. Signed-off-by: Jan Beulich Reviewed-by: Paul Durrant Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/xen-netback/netback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index c213f2b812691..995566a2785fa 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -492,7 +492,7 @@ static int xenvif_tx_check_gop(struct xenvif_queue *queue, * the header's copy failed, and they are * sharing a slot, send an error */ - if (i == 0 && sharedslot) + if (i == 0 && !first_shinfo && sharedslot) xenvif_idx_release(queue, pending_idx, XEN_NETIF_RSP_ERROR); else From afb892bd226ca0ef01be6470ac8d01328eaebef0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 20 Sep 2021 10:56:32 -0700 Subject: [PATCH 1058/5344] sparc64: fix pci_iounmap() when CONFIG_PCI is not set [ Upstream commit d8b1e10a2b8efaf71d151aa756052fbf2f3b6d57 ] Guenter reported [1] that the pci_iounmap() changes remain problematic, with sparc64 allnoconfig and tinyconfig still not building due to the header file changes and confusion with the arch-specific pci_iounmap() implementation. I'm pretty convinced that sparc should just use GENERIC_IOMAP instead of doing its own thing, since it turns out that the sparc64 version of pci_iounmap() is somewhat buggy (see [2]). But in the meantime, this just fixes the build by avoiding the trivial re-definition of the empty case. Link: https://lore.kernel.org/lkml/20210920134424.GA346531@roeck-us.net/ [1] Link: https://lore.kernel.org/lkml/CAHk-=wgheheFx9myQyy5osh79BAazvmvYURAtub2gQtMvLrhqQ@mail.gmail.com/ [2] Reported-by: Guenter Roeck Cc: David Miller Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- arch/sparc/lib/iomap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/sparc/lib/iomap.c b/arch/sparc/lib/iomap.c index c9da9f139694d..f3a8cd491ce0d 100644 --- a/arch/sparc/lib/iomap.c +++ b/arch/sparc/lib/iomap.c @@ -19,8 +19,10 @@ void ioport_unmap(void __iomem *addr) EXPORT_SYMBOL(ioport_map); EXPORT_SYMBOL(ioport_unmap); +#ifdef CONFIG_PCI void pci_iounmap(struct pci_dev *dev, void __iomem * addr) { /* nothing to do */ } EXPORT_SYMBOL(pci_iounmap); +#endif From f201a53764557ded42b0cfefc6292d1e1064adb1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 21 Sep 2021 23:32:33 +0300 Subject: [PATCH 1059/5344] ext2: fix sleeping in atomic bugs on error [ Upstream commit 372d1f3e1bfede719864d0d1fbf3146b1e638c88 ] The ext2_error() function syncs the filesystem so it sleeps. The caller is holding a spinlock so it's not allowed to sleep. ext2_statfs() <- disables preempt -> ext2_count_free_blocks() -> ext2_get_group_desc() Fix this by using WARN() to print an error message and a stack trace instead of using ext2_error(). Link: https://lore.kernel.org/r/20210921203233.GA16529@kili Signed-off-by: Dan Carpenter Signed-off-by: Jan Kara Signed-off-by: Sasha Levin --- fs/ext2/balloc.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index e0cc551645059..abac81a2e694c 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -48,10 +48,9 @@ struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb, struct ext2_sb_info *sbi = EXT2_SB(sb); if (block_group >= sbi->s_groups_count) { - ext2_error (sb, "ext2_get_group_desc", - "block_group >= groups_count - " - "block_group = %d, groups_count = %lu", - block_group, sbi->s_groups_count); + WARN(1, "block_group >= groups_count - " + "block_group = %d, groups_count = %lu", + block_group, sbi->s_groups_count); return NULL; } @@ -59,10 +58,9 @@ struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb, group_desc = block_group >> EXT2_DESC_PER_BLOCK_BITS(sb); offset = block_group & (EXT2_DESC_PER_BLOCK(sb) - 1); if (!sbi->s_group_desc[group_desc]) { - ext2_error (sb, "ext2_get_group_desc", - "Group descriptor not loaded - " - "block_group = %d, group_desc = %lu, desc = %lu", - block_group, group_desc, offset); + WARN(1, "Group descriptor not loaded - " + "block_group = %d, group_desc = %lu, desc = %lu", + block_group, group_desc, offset); return NULL; } From ff47c39ca1476cce4c65870a87cf9480b3cc51c9 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 6 Sep 2021 17:01:12 +0800 Subject: [PATCH 1060/5344] scsi: sd: Free scsi_disk device via put_device() [ Upstream commit 265dfe8ebbabae7959060bd1c3f75c2473b697ed ] After a device is initialized via device_initialize() it should be freed via put_device(). sd_probe() currently gets this wrong, fix it up. Link: https://lore.kernel.org/r/20210906090112.531442-1-ming.lei@redhat.com Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/sd.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index f55249766d224..152b486051522 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3345,15 +3345,16 @@ static int sd_probe(struct device *dev) } device_initialize(&sdkp->dev); - sdkp->dev.parent = dev; + sdkp->dev.parent = get_device(dev); sdkp->dev.class = &sd_disk_class; dev_set_name(&sdkp->dev, "%s", dev_name(dev)); error = device_add(&sdkp->dev); - if (error) - goto out_free_index; + if (error) { + put_device(&sdkp->dev); + goto out; + } - get_device(dev); dev_set_drvdata(dev, sdkp); gd->major = sd_major((index & 0xf0) >> 4); From f713728101cb0f2bde985aa1ea758f3d74008b34 Mon Sep 17 00:00:00 2001 From: Faizel K B Date: Thu, 2 Sep 2021 17:14:44 +0530 Subject: [PATCH 1061/5344] usb: testusb: Fix for showing the connection speed [ Upstream commit f81c08f897adafd2ed43f86f00207ff929f0b2eb ] testusb' application which uses 'usbtest' driver reports 'unknown speed' from the function 'find_testdev'. The variable 'entry->speed' was not updated from the application. The IOCTL mentioned in the FIXME comment can only report whether the connection is low speed or not. Speed is read using the IOCTL USBDEVFS_GET_SPEED which reports the proper speed grade. The call is implemented in the function 'handle_testdev' where the file descriptor was availble locally. Sample output is given below where 'high speed' is printed as the connected speed. sudo ./testusb -a high speed /dev/bus/usb/001/011 0 /dev/bus/usb/001/011 test 0, 0.000015 secs /dev/bus/usb/001/011 test 1, 0.194208 secs /dev/bus/usb/001/011 test 2, 0.077289 secs /dev/bus/usb/001/011 test 3, 0.170604 secs /dev/bus/usb/001/011 test 4, 0.108335 secs /dev/bus/usb/001/011 test 5, 2.788076 secs /dev/bus/usb/001/011 test 6, 2.594610 secs /dev/bus/usb/001/011 test 7, 2.905459 secs /dev/bus/usb/001/011 test 8, 2.795193 secs /dev/bus/usb/001/011 test 9, 8.372651 secs /dev/bus/usb/001/011 test 10, 6.919731 secs /dev/bus/usb/001/011 test 11, 16.372687 secs /dev/bus/usb/001/011 test 12, 16.375233 secs /dev/bus/usb/001/011 test 13, 2.977457 secs /dev/bus/usb/001/011 test 14 --> 22 (Invalid argument) /dev/bus/usb/001/011 test 17, 0.148826 secs /dev/bus/usb/001/011 test 18, 0.068718 secs /dev/bus/usb/001/011 test 19, 0.125992 secs /dev/bus/usb/001/011 test 20, 0.127477 secs /dev/bus/usb/001/011 test 21 --> 22 (Invalid argument) /dev/bus/usb/001/011 test 24, 4.133763 secs /dev/bus/usb/001/011 test 27, 2.140066 secs /dev/bus/usb/001/011 test 28, 2.120713 secs /dev/bus/usb/001/011 test 29, 0.507762 secs Signed-off-by: Faizel K B Link: https://lore.kernel.org/r/20210902114444.15106-1-faizel.kb@dicortech.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- tools/usb/testusb.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tools/usb/testusb.c b/tools/usb/testusb.c index ee8208b2f9460..69c3ead25313d 100644 --- a/tools/usb/testusb.c +++ b/tools/usb/testusb.c @@ -265,12 +265,6 @@ static int find_testdev(const char *name, const struct stat *sb, int flag) } entry->ifnum = ifnum; - - /* FIXME update USBDEVFS_CONNECTINFO so it tells about high speed etc */ - - fprintf(stderr, "%s speed\t%s\t%u\n", - speed(entry->speed), entry->name, entry->ifnum); - entry->next = testdevs; testdevs = entry; return 0; @@ -299,6 +293,14 @@ static void *handle_testdev (void *arg) return 0; } + status = ioctl(fd, USBDEVFS_GET_SPEED, NULL); + if (status < 0) + fprintf(stderr, "USBDEVFS_GET_SPEED failed %d\n", status); + else + dev->speed = status; + fprintf(stderr, "%s speed\t%s\t%u\n", + speed(dev->speed), dev->name, dev->ifnum); + restart: for (i = 0; i < TEST_CASES; i++) { if (dev->test != -1 && dev->test != i) From 7d0788956c78afdc99a1d0547b3f98267990454d Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 31 Aug 2021 16:42:36 +0800 Subject: [PATCH 1062/5344] usb: dwc2: check return value after calling platform_get_resource() [ Upstream commit 856e6e8e0f9300befa87dde09edb578555c99a82 ] It will cause null-ptr-deref if platform_get_resource() returns NULL, we need check the return value. Signed-off-by: Yang Yingliang Link: https://lore.kernel.org/r/20210831084236.1359677-1-yangyingliang@huawei.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/usb/dwc2/hcd.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/usb/dwc2/hcd.c b/drivers/usb/dwc2/hcd.c index f29fbadb05487..78329d0e9af0a 100644 --- a/drivers/usb/dwc2/hcd.c +++ b/drivers/usb/dwc2/hcd.c @@ -5074,6 +5074,10 @@ int dwc2_hcd_init(struct dwc2_hsotg *hsotg) hcd->has_tt = 1; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) { + retval = -EINVAL; + goto error1; + } hcd->rsrc_start = res->start; hcd->rsrc_len = resource_size(res); From bab48e694120402c981bf7959090cbc72d2551e9 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Wed, 15 Sep 2021 21:45:54 +0800 Subject: [PATCH 1063/5344] selftests: be sure to make khdr before other targets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 8914a7a247e065438a0ec86a58c1c359223d2c9e ] LKP/0Day reported some building errors about kvm, and errors message are not always same: - lib/x86_64/processor.c:1083:31: error: ‘KVM_CAP_NESTED_STATE’ undeclared (first use in this function); did you mean ‘KVM_CAP_PIT_STATE2’? - lib/test_util.c:189:30: error: ‘MAP_HUGE_16KB’ undeclared (first use in this function); did you mean ‘MAP_HUGE_16GB’? Although kvm relies on the khdr, they still be built in parallel when -j is specified. In this case, it will cause compiling errors. Here we mark target khdr as NOTPARALLEL to make it be always built first. CC: Philip Li Reported-by: kernel test robot Signed-off-by: Li Zhijian Signed-off-by: Shuah Khan Signed-off-by: Sasha Levin --- tools/testing/selftests/lib.mk | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index 67386aa3f31d1..8794ce382bf5a 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -48,6 +48,7 @@ ARCH ?= $(SUBARCH) # When local build is done, headers are installed in the default # INSTALL_HDR_PATH usr/include. .PHONY: khdr +.NOTPARALLEL: khdr: ifndef KSFT_KHDR_INSTALL_DONE ifeq (1,$(DEFAULT_INSTALL_HDR_PATH)) From 83b929ef0c4d06bcd0eedd1ee11c1c835fe218c8 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 15 Sep 2021 15:28:06 -0600 Subject: [PATCH 1064/5344] selftests:kvm: fix get_warnings_count() ignoring fscanf() return warn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 39a71f712d8a13728febd8f3cb3f6db7e1fa7221 ] Fix get_warnings_count() to check fscanf() return value to get rid of the following warning: x86_64/mmio_warning_test.c: In function ‘get_warnings_count’: x86_64/mmio_warning_test.c:85:2: warning: ignoring return value of ‘fscanf’ declared with attribute ‘warn_unused_result’ [-Wunused-result] 85 | fscanf(f, "%d", &warnings); | ^~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Shuah Khan Acked-by: Paolo Bonzini Signed-off-by: Shuah Khan Signed-off-by: Sasha Levin --- tools/testing/selftests/kvm/x86_64/mmio_warning_test.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c b/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c index 00bb97d76000f..2cbc09aad7f64 100644 --- a/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c +++ b/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c @@ -82,7 +82,8 @@ int get_warnings_count(void) FILE *f; f = popen("dmesg | grep \"WARNING:\" | wc -l", "r"); - fscanf(f, "%d", &warnings); + if (fscanf(f, "%d", &warnings) < 1) + warnings = 0; fclose(f); return warnings; From 82d5e8a2af27e1ef191c4c2ba85a3f36a7e83d8f Mon Sep 17 00:00:00 2001 From: Wen Xiong Date: Thu, 16 Sep 2021 22:24:21 -0500 Subject: [PATCH 1065/5344] scsi: ses: Retry failed Send/Receive Diagnostic commands [ Upstream commit fbdac19e642899455b4e64c63aafe2325df7aafa ] Setting SCSI logging level with error=3, we saw some errors from enclosues: [108017.360833] ses 0:0:9:0: tag#641 Done: NEEDS_RETRY Result: hostbyte=DID_ERROR driverbyte=DRIVER_OK cmd_age=0s [108017.360838] ses 0:0:9:0: tag#641 CDB: Receive Diagnostic 1c 01 01 00 20 00 [108017.427778] ses 0:0:9:0: Power-on or device reset occurred [108017.427784] ses 0:0:9:0: tag#641 Done: SUCCESS Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s [108017.427788] ses 0:0:9:0: tag#641 CDB: Receive Diagnostic 1c 01 01 00 20 00 [108017.427791] ses 0:0:9:0: tag#641 Sense Key : Unit Attention [current] [108017.427793] ses 0:0:9:0: tag#641 Add. Sense: Bus device reset function occurred [108017.427801] ses 0:0:9:0: Failed to get diagnostic page 0x1 [108017.427804] ses 0:0:9:0: Failed to bind enclosure -19 [108017.427895] ses 0:0:10:0: Attached Enclosure device [108017.427942] ses 0:0:10:0: Attached scsi generic sg18 type 13 Retry if the Send/Receive Diagnostic commands complete with a transient error status (NOT_READY or UNIT_ATTENTION with ASC 0x29). Link: https://lore.kernel.org/r/1631849061-10210-2-git-send-email-wenxiong@linux.ibm.com Reviewed-by: Brian King Reviewed-by: James Bottomley Signed-off-by: Wen Xiong Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/ses.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/ses.c b/drivers/scsi/ses.c index c2afba2a5414d..43e682297fd5f 100644 --- a/drivers/scsi/ses.c +++ b/drivers/scsi/ses.c @@ -87,9 +87,16 @@ static int ses_recv_diag(struct scsi_device *sdev, int page_code, 0 }; unsigned char recv_page_code; + unsigned int retries = SES_RETRIES; + struct scsi_sense_hdr sshdr; + + do { + ret = scsi_execute_req(sdev, cmd, DMA_FROM_DEVICE, buf, bufflen, + &sshdr, SES_TIMEOUT, 1, NULL); + } while (ret > 0 && --retries && scsi_sense_valid(&sshdr) && + (sshdr.sense_key == NOT_READY || + (sshdr.sense_key == UNIT_ATTENTION && sshdr.asc == 0x29))); - ret = scsi_execute_req(sdev, cmd, DMA_FROM_DEVICE, buf, bufflen, - NULL, SES_TIMEOUT, SES_RETRIES, NULL); if (unlikely(ret)) return ret; @@ -121,9 +128,16 @@ static int ses_send_diag(struct scsi_device *sdev, int page_code, bufflen & 0xff, 0 }; + struct scsi_sense_hdr sshdr; + unsigned int retries = SES_RETRIES; + + do { + result = scsi_execute_req(sdev, cmd, DMA_TO_DEVICE, buf, bufflen, + &sshdr, SES_TIMEOUT, 1, NULL); + } while (result > 0 && --retries && scsi_sense_valid(&sshdr) && + (sshdr.sense_key == NOT_READY || + (sshdr.sense_key == UNIT_ATTENTION && sshdr.asc == 0x29))); - result = scsi_execute_req(sdev, cmd, DMA_TO_DEVICE, buf, bufflen, - NULL, SES_TIMEOUT, SES_RETRIES, NULL); if (result) sdev_printk(KERN_ERR, sdev, "SEND DIAGNOSTIC result: %8x\n", result); From 9c6a7890783d0b6e15e059b3551149147cecb73b Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Fri, 24 Sep 2021 15:43:41 -0700 Subject: [PATCH 1066/5344] tools/vm/page-types: remove dependency on opt_file for idle page tracking [ Upstream commit ebaeab2fe87987cef28eb5ab174c42cd28594387 ] Idle page tracking can also be used for process address space, not only file mappings. Without this change, using with '-i' option for process address space encounters below errors reported. $ sudo ./page-types -p $(pidof bash) -i mark page idle: Bad file descriptor mark page idle: Bad file descriptor mark page idle: Bad file descriptor mark page idle: Bad file descriptor ... Link: https://lkml.kernel.org/r/20210917032826.10669-1-changbin.du@gmail.com Signed-off-by: Changbin Du Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- tools/vm/page-types.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c index 58c0eab71bca3..d2e836b2d16b0 100644 --- a/tools/vm/page-types.c +++ b/tools/vm/page-types.c @@ -1329,7 +1329,7 @@ int main(int argc, char *argv[]) if (opt_list && opt_list_mapcnt) kpagecount_fd = checked_open(PROC_KPAGECOUNT, O_RDONLY); - if (opt_mark_idle && opt_file) + if (opt_mark_idle) page_idle_fd = checked_open(SYS_KERNEL_MM_PAGE_IDLE, O_RDWR); if (opt_list && opt_pid) From e5dd92781299ff02da86eda1bd5e204e831bae0d Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 2 Sep 2021 12:11:00 +0900 Subject: [PATCH 1067/5344] KVM: do not shrink halt_poll_ns below grow_start [ Upstream commit ae232ea460888dc5a8b37e840c553b02521fbf18 ] grow_halt_poll_ns() ignores values between 0 and halt_poll_ns_grow_start (10000 by default). However, when we shrink halt_poll_ns we may fall way below halt_poll_ns_grow_start and endup with halt_poll_ns values that don't make a lot of sense: like 1 or 9, or 19. VCPU1 trace (halt_poll_ns_shrink equals 2): VCPU1 grow 10000 VCPU1 shrink 5000 VCPU1 shrink 2500 VCPU1 shrink 1250 VCPU1 shrink 625 VCPU1 shrink 312 VCPU1 shrink 156 VCPU1 shrink 78 VCPU1 shrink 39 VCPU1 shrink 19 VCPU1 shrink 9 VCPU1 shrink 4 Mirror what grow_halt_poll_ns() does and set halt_poll_ns to 0 as soon as new shrink-ed halt_poll_ns value falls below halt_poll_ns_grow_start. Signed-off-by: Sergey Senozhatsky Signed-off-by: Paolo Bonzini Message-Id: <20210902031100.252080-1-senozhatsky@chromium.org> Signed-off-by: Paolo Bonzini Signed-off-by: Sasha Levin --- virt/kvm/kvm_main.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index c77a0ee76eaa0..cfe67ac78648d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2470,15 +2470,19 @@ static void grow_halt_poll_ns(struct kvm_vcpu *vcpu) static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu) { - unsigned int old, val, shrink; + unsigned int old, val, shrink, grow_start; old = val = vcpu->halt_poll_ns; shrink = READ_ONCE(halt_poll_ns_shrink); + grow_start = READ_ONCE(halt_poll_ns_grow_start); if (shrink == 0) val = 0; else val /= shrink; + if (val < grow_start) + val = 0; + vcpu->halt_poll_ns = val; trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old); } From ed1dcc235fcb4c6c0de8db6e3fd0ebfad9861d46 Mon Sep 17 00:00:00 2001 From: Fares Mehanna Date: Wed, 15 Sep 2021 13:39:50 +0000 Subject: [PATCH 1068/5344] kvm: x86: Add AMD PMU MSRs to msrs_to_save_all[] [ Upstream commit e1fc1553cd78292ab3521c94c9dd6e3e70e606a1 ] Intel PMU MSRs is in msrs_to_save_all[], so add AMD PMU MSRs to have a consistent behavior between Intel and AMD when using KVM_GET_MSRS, KVM_SET_MSRS or KVM_GET_MSR_INDEX_LIST. We have to add legacy and new MSRs to handle guests running without X86_FEATURE_PERFCTR_CORE. Signed-off-by: Fares Mehanna Message-Id: <20210915133951.22389-1-faresx@amazon.de> Signed-off-by: Paolo Bonzini Signed-off-by: Sasha Levin --- arch/x86/kvm/x86.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c9ca600258a09..cc62a1feb9290 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1301,6 +1301,13 @@ static const u32 msrs_to_save_all[] = { MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, + + MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, + MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, + MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2, + MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, + MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, + MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, }; static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)]; From 467ddd5adcecc2b5012067c248b21874efa67cb5 Mon Sep 17 00:00:00 2001 From: Anand K Mistry Date: Wed, 29 Sep 2021 17:04:21 +1000 Subject: [PATCH 1069/5344] perf/x86: Reset destroy callback on event init failure commit 02d029a41dc986e2d5a77ecca45803857b346829 upstream. perf_init_event tries multiple init callbacks and does not reset the event state between tries. When x86_pmu_event_init runs, it unconditionally sets the destroy callback to hw_perf_event_destroy. On the next init attempt after x86_pmu_event_init, in perf_try_init_event, if the pmu's capabilities includes PERF_PMU_CAP_NO_EXCLUDE, the destroy callback will be run. However, if the next init didn't set the destroy callback, hw_perf_event_destroy will be run (since the callback wasn't reset). Looking at other pmu init functions, the common pattern is to only set the destroy callback on a successful init. Resetting the callback on failure tries to replicate that pattern. This was discovered after commit f11dd0d80555 ("perf/x86/amd/ibs: Extend PERF_PMU_CAP_NO_EXCLUDE to IBS Op") when the second (and only second) run of the perf tool after a reboot results in 0 samples being generated. The extra run of hw_perf_event_destroy results in active_events having an extra decrement on each perf run. The second run has active_events == 0 and every subsequent run has active_events < 0. When active_events == 0, the NMI handler will early-out and not record any samples. Signed-off-by: Anand K Mistry Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210929170405.1.I078b98ee7727f9ae9d6df8262bad7e325e40faf0@changeid Signed-off-by: Greg Kroah-Hartman --- arch/x86/events/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index d608ad17b69cc..48b8cce4404fa 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2214,6 +2214,7 @@ static int x86_pmu_event_init(struct perf_event *event) if (err) { if (event->destroy) event->destroy(event); + event->destroy = NULL; } if (READ_ONCE(x86_pmu.attr_rdpmc) && From dbc9af6dfd5a72b034101da71a9e263a35b2e717 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 14 Sep 2020 13:07:19 -0400 Subject: [PATCH 1070/5344] silence nfscache allocation warnings with kvzalloc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 8c38b705b4f4ca4e7f9cc116141bc38391917c30 upstream. silence nfscache allocation warnings with kvzalloc Currently nfsd_reply_cache_init attempts hash table allocation through kmalloc, and manually falls back to vzalloc if that fails. This makes the code a little larger than needed, and creates a significant amount of serial console spam if you have enough systems. Switching to kvzalloc gets rid of the allocation warnings, and makes the code a little cleaner too as a side effect. Freeing of nn->drc_hashtbl is already done using kvfree currently. Signed-off-by: Rik van Riel Signed-off-by: J. Bruce Fields Cc: Krzysztof Olędzki Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfscache.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 4a258065188e1..670e97dd67f06 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -173,14 +173,10 @@ int nfsd_reply_cache_init(struct nfsd_net *nn) if (status) goto out_nomem; - nn->drc_hashtbl = kcalloc(hashsize, - sizeof(*nn->drc_hashtbl), GFP_KERNEL); - if (!nn->drc_hashtbl) { - nn->drc_hashtbl = vzalloc(array_size(hashsize, - sizeof(*nn->drc_hashtbl))); - if (!nn->drc_hashtbl) - goto out_shrinker; - } + nn->drc_hashtbl = kvzalloc(array_size(hashsize, + sizeof(*nn->drc_hashtbl)), GFP_KERNEL); + if (!nn->drc_hashtbl) + goto out_shrinker; for (i = 0; i < hashsize; i++) { INIT_LIST_HEAD(&nn->drc_hashtbl[i].lru_head); From 667489618721ca00168ae7a5b00a0116d6301ce9 Mon Sep 17 00:00:00 2001 From: Kate Hsuan Date: Fri, 3 Sep 2021 17:44:11 +0800 Subject: [PATCH 1071/5344] libata: Add ATA_HORKAGE_NO_NCQ_ON_ATI for Samsung 860 and 870 SSD. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 7a8526a5cd51cf5f070310c6c37dd7293334ac49 upstream. Many users are reporting that the Samsung 860 and 870 SSD are having various issues when combined with AMD/ATI (vendor ID 0x1002) SATA controllers and only completely disabling NCQ helps to avoid these issues. Always disabling NCQ for Samsung 860/870 SSDs regardless of the host SATA adapter vendor will cause I/O performance degradation with well behaved adapters. To limit the performance impact to ATI adapters, introduce the ATA_HORKAGE_NO_NCQ_ON_ATI flag to force disable NCQ only for these adapters. Also, two libata.force parameters (noncqati and ncqati) are introduced to disable and enable the NCQ for the system which equipped with ATI SATA adapter and Samsung 860 and 870 SSDs. The user can determine NCQ function to be enabled or disabled according to the demand. After verifying the chipset from the user reports, the issue appears on AMD/ATI SB7x0/SB8x0/SB9x0 SATA Controllers and does not appear on recent AMD SATA adapters. The vendor ID of ATI should be 0x1002. Therefore, ATA_HORKAGE_NO_NCQ_ON_AMD was modified to ATA_HORKAGE_NO_NCQ_ON_ATI. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=201693 Signed-off-by: Kate Hsuan Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20210903094411.58749-1-hpa@redhat.com Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe Cc: Krzysztof Olędzki Signed-off-by: Greg Kroah-Hartman --- drivers/ata/libata-core.c | 34 ++++++++++++++++++++++++++++++++-- include/linux/libata.h | 1 + 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 5c354c7aff946..7c6c05fd5dfc3 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2252,6 +2252,25 @@ static void ata_dev_config_ncq_prio(struct ata_device *dev) } +static bool ata_dev_check_adapter(struct ata_device *dev, + unsigned short vendor_id) +{ + struct pci_dev *pcidev = NULL; + struct device *parent_dev = NULL; + + for (parent_dev = dev->tdev.parent; parent_dev != NULL; + parent_dev = parent_dev->parent) { + if (dev_is_pci(parent_dev)) { + pcidev = to_pci_dev(parent_dev); + if (pcidev->vendor == vendor_id) + return true; + break; + } + } + + return false; +} + static int ata_dev_config_ncq(struct ata_device *dev, char *desc, size_t desc_sz) { @@ -2268,6 +2287,13 @@ static int ata_dev_config_ncq(struct ata_device *dev, snprintf(desc, desc_sz, "NCQ (not used)"); return 0; } + + if (dev->horkage & ATA_HORKAGE_NO_NCQ_ON_ATI && + ata_dev_check_adapter(dev, PCI_VENDOR_ID_ATI)) { + snprintf(desc, desc_sz, "NCQ (not used)"); + return 0; + } + if (ap->flags & ATA_FLAG_NCQ) { hdepth = min(ap->scsi_host->can_queue, ATA_MAX_QUEUE); dev->flags |= ATA_DFLAG_NCQ; @@ -4557,9 +4583,11 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { { "Samsung SSD 850*", NULL, ATA_HORKAGE_NO_NCQ_TRIM | ATA_HORKAGE_ZERO_AFTER_TRIM, }, { "Samsung SSD 860*", NULL, ATA_HORKAGE_NO_NCQ_TRIM | - ATA_HORKAGE_ZERO_AFTER_TRIM, }, + ATA_HORKAGE_ZERO_AFTER_TRIM | + ATA_HORKAGE_NO_NCQ_ON_ATI, }, { "Samsung SSD 870*", NULL, ATA_HORKAGE_NO_NCQ_TRIM | - ATA_HORKAGE_ZERO_AFTER_TRIM, }, + ATA_HORKAGE_ZERO_AFTER_TRIM | + ATA_HORKAGE_NO_NCQ_ON_ATI, }, { "FCCT*M500*", NULL, ATA_HORKAGE_NO_NCQ_TRIM | ATA_HORKAGE_ZERO_AFTER_TRIM, }, @@ -6916,6 +6944,8 @@ static int __init ata_parse_force_one(char **cur, { "ncq", .horkage_off = ATA_HORKAGE_NONCQ }, { "noncqtrim", .horkage_on = ATA_HORKAGE_NO_NCQ_TRIM }, { "ncqtrim", .horkage_off = ATA_HORKAGE_NO_NCQ_TRIM }, + { "noncqati", .horkage_on = ATA_HORKAGE_NO_NCQ_ON_ATI }, + { "ncqati", .horkage_off = ATA_HORKAGE_NO_NCQ_ON_ATI }, { "dump_id", .horkage_on = ATA_HORKAGE_DUMP_ID }, { "pio0", .xfer_mask = 1 << (ATA_SHIFT_PIO + 0) }, { "pio1", .xfer_mask = 1 << (ATA_SHIFT_PIO + 1) }, diff --git a/include/linux/libata.h b/include/linux/libata.h index 3c3d8d6b16183..3d5adbaf8214f 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -423,6 +423,7 @@ enum { ATA_HORKAGE_NOTRIM = (1 << 24), /* don't use TRIM */ ATA_HORKAGE_MAX_SEC_1024 = (1 << 25), /* Limit max sects to 1024 */ ATA_HORKAGE_MAX_TRIM_128M = (1 << 26), /* Limit max trim size to 128M */ + ATA_HORKAGE_NO_NCQ_ON_ATI = (1 << 27), /* Disable NCQ on ATI chipset */ /* DMA mask for user DMA control: User visible values; DO NOT renumber */ From 56a9c52bc11b03241afcdb8e4eda337103bf2b03 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 9 Oct 2021 14:39:50 +0200 Subject: [PATCH 1072/5344] Linux 5.4.152 Link: https://lore.kernel.org/r/20211008112715.444305067@linuxfoundation.org Tested-by: Jon Hunter Tested-by: Florian Fainelli Tested-by: Shuah Khan Tested-by: Shuah Khan Tested-by: Guenter Roeck Tested-by: Linux Kernel Functional Testing Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 4eeb72027815b..ffcdc36c56f54 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 PATCHLEVEL = 4 -SUBLEVEL = 151 +SUBLEVEL = 152 EXTRAVERSION = NAME = Kleptomaniac Octopus From 8bfff89d71b1da570f1f5f17dcb806fdc66023be Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 21 Sep 2021 16:34:42 +0200 Subject: [PATCH 1073/5344] Partially revert "usb: Kconfig: using select for USB_COMMON dependency" commit 4d1aa9112c8e6995ef2c8a76972c9671332ccfea upstream. This reverts commit cb9c1cfc86926d0e86d19c8e34f6c23458cd3478 for USB_LED_TRIG. This config symbol has bool type and enables extra code in usb_common itself, not a separate driver. Enabling it should not force usb_common to be built-in! Fixes: cb9c1cfc8692 ("usb: Kconfig: using select for USB_COMMON dependency") Cc: stable Signed-off-by: Ben Hutchings Signed-off-by: Salvatore Bonaccorso Link: https://lore.kernel.org/r/20210921143442.340087-1-carnil@debian.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/common/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/usb/common/Kconfig b/drivers/usb/common/Kconfig index d611477aae414..196f4a3975871 100644 --- a/drivers/usb/common/Kconfig +++ b/drivers/usb/common/Kconfig @@ -6,8 +6,7 @@ config USB_COMMON config USB_LED_TRIG bool "USB LED Triggers" - depends on LEDS_CLASS && LEDS_TRIGGERS - select USB_COMMON + depends on LEDS_CLASS && USB_COMMON && LEDS_TRIGGERS help This option adds LED triggers for USB host and/or gadget activity. From c66e655cb5c82c5737df95dc933890404531e041 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Tue, 28 Sep 2021 19:16:39 +0800 Subject: [PATCH 1074/5344] usb: typec: tcpm: handle SRC_STARTUP state if cc changes commit 6d91017a295e9790eec02c4e43f020cdb55f5d98 upstream. TCPM for DRP should do the same action as SRC_ATTACHED when cc changes in SRC_STARTUP state. Otherwise, TCPM will transition to SRC_UNATTACHED state which is not satisfied with the Type-C spec. Per Type-C spec: DRP port should move to Unattached.SNK instead of Unattached.SRC if sink removed. Fixes: 4b4e02c83167 ("typec: tcpm: Move out of staging") cc: Reviewed-by: Guenter Roeck Acked-by: Heikki Krogerus Signed-off-by: Xu Yang Link: https://lore.kernel.org/r/20210928111639.3854174-1-xu.yang_2@nxp.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index b40db48f8874d..89391939630bd 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -3679,6 +3679,7 @@ static void _tcpm_cc_change(struct tcpm_port *port, enum typec_cc_status cc1, tcpm_set_state(port, SRC_ATTACH_WAIT, 0); break; case SRC_ATTACHED: + case SRC_STARTUP: case SRC_SEND_CAPABILITIES: case SRC_READY: if (tcpm_port_is_disconnected(port) || From 554981484cff33ac2bada1d99d91d9aea2cb3a1c Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 22 Sep 2021 12:17:48 +0200 Subject: [PATCH 1075/5344] xen/privcmd: fix error handling in mmap-resource processing commit e11423d6721dd63b23fb41ade5e8d0b448b17780 upstream. xen_pfn_t is the same size as int only on 32-bit builds (and not even on Arm32). Hence pfns[] can't be used directly to read individual error values returned from xen_remap_domain_mfn_array(); every other error indicator would be skipped/ignored on 64-bit. Fixes: 3ad0876554ca ("xen/privcmd: add IOCTL_PRIVCMD_MMAP_RESOURCE") Cc: stable@vger.kernel.org Signed-off-by: Jan Beulich Reviewed-by: Boris Ostrovsky Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/aa6d6a67-6889-338a-a910-51e889f792d5@suse.com Signed-off-by: Juergen Gross --- drivers/xen/privcmd.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 4ea47de6e3676..3a01a17d614a7 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -810,11 +810,12 @@ static long privcmd_ioctl_mmap_resource(struct file *file, unsigned int domid = (xdata.flags & XENMEM_rsrc_acq_caller_owned) ? DOMID_SELF : kdata.dom; - int num; + int num, *errs = (int *)pfns; + BUILD_BUG_ON(sizeof(*errs) > sizeof(*pfns)); num = xen_remap_domain_mfn_array(vma, kdata.addr & PAGE_MASK, - pfns, kdata.num, (int *)pfns, + pfns, kdata.num, errs, vma->vm_page_prot, domid, vma->vm_private_data); @@ -824,7 +825,7 @@ static long privcmd_ioctl_mmap_resource(struct file *file, unsigned int i; for (i = 0; i < num; i++) { - rc = pfns[i]; + rc = errs[i]; if (rc < 0) break; } From 366b276dcccafa46edc6e75f7b138fb559f5ae9b Mon Sep 17 00:00:00 2001 From: Zheng Liang Date: Fri, 24 Sep 2021 09:16:27 +0800 Subject: [PATCH 1076/5344] ovl: fix missing negative dentry check in ovl_rename() commit a295aef603e109a47af355477326bd41151765b6 upstream. The following reproducer mkdir lower upper work merge touch lower/old touch lower/new mount -t overlay overlay -olowerdir=lower,upperdir=upper,workdir=work merge rm merge/new mv merge/old merge/new & unlink upper/new may result in this race: PROCESS A: rename("merge/old", "merge/new"); overwrite=true,ovl_lower_positive(old)=true, ovl_dentry_is_whiteout(new)=true -> flags |= RENAME_EXCHANGE PROCESS B: unlink("upper/new"); PROCESS A: lookup newdentry in new_upperdir call vfs_rename() with negative newdentry and RENAME_EXCHANGE Fix by adding the missing check for negative newdentry. Signed-off-by: Zheng Liang Fixes: e9be9d5e76e3 ("overlay filesystem") Cc: # v3.18 Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/overlayfs/dir.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 073be36b0686c..876de87f604cd 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -1162,9 +1162,13 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, goto out_dput; } } else { - if (!d_is_negative(newdentry) && - (!new_opaque || !ovl_is_whiteout(newdentry))) - goto out_dput; + if (!d_is_negative(newdentry)) { + if (!new_opaque || !ovl_is_whiteout(newdentry)) + goto out_dput; + } else { + if (flags & RENAME_EXCHANGE) + goto out_dput; + } } if (olddentry == trap) From f76af5d4a38f618ea69399046e59de97ef480b3d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 30 Sep 2021 15:44:41 -0400 Subject: [PATCH 1077/5344] nfsd4: Handle the NFSv4 READDIR 'dircount' hint being zero commit f2e717d655040d632c9015f19aa4275f8b16e7f2 upstream. RFC3530 notes that the 'dircount' field may be zero, in which case the recommendation is to ignore it, and only enforce the 'maxcount' field. In RFC5661, this recommendation to ignore a zero valued field becomes a requirement. Fixes: aee377644146 ("nfsd4: fix rd_dircount enforcement") Cc: Signed-off-by: Trond Myklebust Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4xdr.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index d6f244559e759..e61d9c4359573 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3131,15 +3131,18 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, goto fail; cd->rd_maxcount -= entry_bytes; /* - * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so - * let's always let through the first entry, at least: + * RFC 3530 14.2.24 describes rd_dircount as only a "hint", and + * notes that it could be zero. If it is zero, then the server + * should enforce only the rd_maxcount value. */ - if (!cd->rd_dircount) - goto fail; - name_and_cookie = 4 + 4 * XDR_QUADLEN(namlen) + 8; - if (name_and_cookie > cd->rd_dircount && cd->cookie_offset) - goto fail; - cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie); + if (cd->rd_dircount) { + name_and_cookie = 4 + 4 * XDR_QUADLEN(namlen) + 8; + if (name_and_cookie > cd->rd_dircount && cd->cookie_offset) + goto fail; + cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie); + if (!cd->rd_dircount) + cd->rd_maxcount = 0; + } cd->cookie_offset = cookie_offset; skip_entry: From 070b408045c455adcae7d0e3fc7fd9e5aa13c142 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 5 Oct 2021 15:34:33 +0200 Subject: [PATCH 1078/5344] xen/balloon: fix cancelled balloon action MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 319933a80fd4f07122466a77f93e5019d71be74c upstream. In case a ballooning action is cancelled the new kernel thread handling the ballooning might end up in a busy loop. Fix that by handling the cancelled action gracefully. While at it introduce a short wait for the BP_WAIT case. Cc: stable@vger.kernel.org Fixes: 8480ed9c2bbd56 ("xen/balloon: use a kernel thread instead a workqueue") Reported-by: Marek Marczykowski-Górecki Signed-off-by: Juergen Gross Tested-by: Jason Andryuk Reviewed-by: Boris Ostrovsky Link: https://lore.kernel.org/r/20211005133433.32008-1-jgross@suse.com Signed-off-by: Juergen Gross Signed-off-by: Greg Kroah-Hartman --- drivers/xen/balloon.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index be31c296eed4c..07f362c63ae90 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -508,12 +508,12 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) } /* - * Stop waiting if either state is not BP_EAGAIN and ballooning action is - * needed, or if the credit has changed while state is BP_EAGAIN. + * Stop waiting if either state is BP_DONE and ballooning action is + * needed, or if the credit has changed while state is not BP_DONE. */ static bool balloon_thread_cond(enum bp_state state, long credit) { - if (state != BP_EAGAIN) + if (state == BP_DONE) credit = 0; return current_credit() != credit || kthread_should_stop(); @@ -533,10 +533,19 @@ static int balloon_thread(void *unused) set_freezable(); for (;;) { - if (state == BP_EAGAIN) - timeout = balloon_stats.schedule_delay * HZ; - else + switch (state) { + case BP_DONE: + case BP_ECANCELED: timeout = 3600 * HZ; + break; + case BP_EAGAIN: + timeout = balloon_stats.schedule_delay * HZ; + break; + case BP_WAIT: + timeout = HZ; + break; + } + credit = current_credit(); wait_event_freezable_timeout(balloon_thread_wq, From 3159c08d2bb3f5eb885f8f0a3478de43ace0fcb6 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Thu, 2 Sep 2021 12:58:28 +0300 Subject: [PATCH 1079/5344] ARM: dts: omap3430-sdp: Fix NAND device node commit 80d680fdccba214e8106dc1aa33de5207ad75394 upstream. Nand is on CS1 so reg properties first field should be 1 not 0. Fixes: 44e4716499b8 ("ARM: dts: omap3: Fix NAND device nodes") Cc: stable@vger.kernel.org # v4.6+ Signed-off-by: Roger Quadros Signed-off-by: Tony Lindgren Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/omap3430-sdp.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/omap3430-sdp.dts b/arch/arm/boot/dts/omap3430-sdp.dts index 0abd61108a539..ec16979825378 100644 --- a/arch/arm/boot/dts/omap3430-sdp.dts +++ b/arch/arm/boot/dts/omap3430-sdp.dts @@ -101,7 +101,7 @@ nand@1,0 { compatible = "ti,omap2-nand"; - reg = <0 0 4>; /* CS0, offset 0, IO size 4 */ + reg = <1 0 4>; /* CS1, offset 0, IO size 4 */ interrupt-parent = <&gpmc>; interrupts = <0 IRQ_TYPE_NONE>, /* fifoevent */ <1 IRQ_TYPE_NONE>; /* termcount */ From bfe810d829b3d8cd5bc25fc3da9352f3320cafc5 Mon Sep 17 00:00:00 2001 From: David Heidelberg Date: Wed, 18 Aug 2021 08:53:17 +0200 Subject: [PATCH 1080/5344] ARM: dts: qcom: apq8064: use compatible which contains chipid commit f5c03f131dae3f06d08464e6157dd461200f78d9 upstream. Also resolves these kernel warnings for APQ8064: adreno 4300000.adreno-3xx: Using legacy qcom,chipid binding! adreno 4300000.adreno-3xx: Use compatible qcom,adreno-320.2 instead. Tested on Nexus 7 2013, no functional changes. Cc: Signed-off-by: David Heidelberg Link: https://lore.kernel.org/r/20210818065317.19822-1-david@ixit.cz Signed-off-by: Bjorn Andersson Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/qcom-apq8064.dtsi | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/qcom-apq8064.dtsi b/arch/arm/boot/dts/qcom-apq8064.dtsi index 2b075e287610f..27accc164df37 100644 --- a/arch/arm/boot/dts/qcom-apq8064.dtsi +++ b/arch/arm/boot/dts/qcom-apq8064.dtsi @@ -1147,7 +1147,7 @@ }; gpu: adreno-3xx@4300000 { - compatible = "qcom,adreno-3xx"; + compatible = "qcom,adreno-320.2", "qcom,adreno"; reg = <0x04300000 0x20000>; reg-names = "kgsl_3d0_reg_memory"; interrupts = ; @@ -1162,7 +1162,6 @@ <&mmcc GFX3D_AHB_CLK>, <&mmcc GFX3D_AXI_CLK>, <&mmcc MMSS_IMEM_AHB_CLK>; - qcom,chipid = <0x03020002>; iommus = <&gfx3d 0 &gfx3d 1 From cef83d1eb2863436e324babbea768c84ec79a5fb Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Thu, 5 Dec 2019 10:23:18 -0800 Subject: [PATCH 1081/5344] MIPS: BPF: Restore MIPS32 cBPF JIT commit 36366e367ee93ced84fddb8fae6675e12985f5a4 upstream. Commit 716850ab104d ("MIPS: eBPF: Initial eBPF support for MIPS32 architecture.") enabled our eBPF JIT for MIPS32 kernels, whereas it has previously only been availailable for MIPS64. It was my understanding at the time that the BPF test suite was passing & JITing a comparable number of tests to our cBPF JIT [1], but it turns out that was not the case. The eBPF JIT has a number of problems on MIPS32: - Most notably various code paths still result in emission of MIPS64 instructions which will cause reserved instruction exceptions & kernel panics when run on MIPS32 CPUs. - The eBPF JIT doesn't account for differences between the O32 ABI used by MIPS32 kernels versus the N64 ABI used by MIPS64 kernels. Notably arguments beyond the first 4 are passed on the stack in O32, and this is entirely unhandled when JITing a BPF_CALL instruction. Stack space must be reserved for arguments even if they all fit in registers, and the callee is free to assume that stack space has been reserved for its use - with the eBPF JIT this is not the case, so calling any function can result in clobbering values on the stack & unpredictable behaviour. Function arguments in eBPF are always 64-bit values which is also entirely unhandled - the JIT still uses a single (32-bit) register per argument. As a result all function arguments are always passed incorrectly when JITing a BPF_CALL instruction, leading to kernel crashes or strange behavior. - The JIT attempts to bail our on use of ALU64 instructions or 64-bit memory access instructions. The code doing this at the start of build_one_insn() incorrectly checks whether BPF_OP() equals BPF_DW, when it should really be checking BPF_SIZE() & only doing so when BPF_CLASS() is one of BPF_{LD,LDX,ST,STX}. This results in false positives that cause more bailouts than intended, and that in turns hides some of the problems described above. - The kernel's cBPF->eBPF translation makes heavy use of 64-bit eBPF instructions that the MIPS32 eBPF JIT bails out on, leading to most cBPF programs not being JITed at all. Until these problems are resolved, revert the removal of the cBPF JIT performed by commit 716850ab104d ("MIPS: eBPF: Initial eBPF support for MIPS32 architecture."). Together with commit f8fffebdea75 ("MIPS: BPF: Disable MIPS32 eBPF JIT") this restores MIPS32 BPF JIT behavior back to the same state it was prior to the introduction of the broken eBPF JIT support. [1] https://lore.kernel.org/linux-mips/MWHPR2201MB13583388481F01A422CE7D66D4410@MWHPR2201MB1358.namprd22.prod.outlook.com/ Signed-off-by: Paul Burton Fixes: 716850ab104d ("MIPS: eBPF: Initial eBPF support for MIPS32 architecture.") Cc: Daniel Borkmann Cc: Hassan Naveed Cc: Tony Ambardar Cc: bpf@vger.kernel.org Cc: netdev@vger.kernel.org Cc: linux-mips@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman --- arch/mips/Kconfig | 1 + arch/mips/net/Makefile | 1 + arch/mips/net/bpf_jit.c | 1270 +++++++++++++++++++++++++++++++++++ arch/mips/net/bpf_jit_asm.S | 285 ++++++++ 4 files changed, 1557 insertions(+) create mode 100644 arch/mips/net/bpf_jit.c create mode 100644 arch/mips/net/bpf_jit_asm.S diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 6ecdc690f7336..2bfef67d52c63 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -46,6 +46,7 @@ config MIPS select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE if CPU_SUPPORTS_HUGEPAGES select HAVE_ASM_MODVERSIONS + select HAVE_CBPF_JIT if !64BIT && !CPU_MICROMIPS select HAVE_EBPF_JIT if 64BIT && !CPU_MICROMIPS && TARGET_ISA_REV >= 2 select HAVE_CONTEXT_TRACKING select HAVE_COPY_THREAD_TLS diff --git a/arch/mips/net/Makefile b/arch/mips/net/Makefile index 2d03af7d6b19d..d55912349039c 100644 --- a/arch/mips/net/Makefile +++ b/arch/mips/net/Makefile @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only # MIPS networking code +obj-$(CONFIG_MIPS_CBPF_JIT) += bpf_jit.o bpf_jit_asm.o obj-$(CONFIG_MIPS_EBPF_JIT) += ebpf_jit.o diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c new file mode 100644 index 0000000000000..3a0e34f4e6153 --- /dev/null +++ b/arch/mips/net/bpf_jit.c @@ -0,0 +1,1270 @@ +/* + * Just-In-Time compiler for BPF filters on MIPS + * + * Copyright (c) 2014 Imagination Technologies Ltd. + * Author: Markos Chandras + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; version 2 of the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bpf_jit.h" + +/* ABI + * r_skb_hl SKB header length + * r_data SKB data pointer + * r_off Offset + * r_A BPF register A + * r_X BPF register X + * r_skb *skb + * r_M *scratch memory + * r_skb_len SKB length + * + * On entry (*bpf_func)(*skb, *filter) + * a0 = MIPS_R_A0 = skb; + * a1 = MIPS_R_A1 = filter; + * + * Stack + * ... + * M[15] + * M[14] + * M[13] + * ... + * M[0] <-- r_M + * saved reg k-1 + * saved reg k-2 + * ... + * saved reg 0 <-- r_sp + * + * + * Packet layout + * + * <--------------------- len ------------------------> + * <--skb-len(r_skb_hl)-->< ----- skb->data_len ------> + * ---------------------------------------------------- + * | skb->data | + * ---------------------------------------------------- + */ + +#define ptr typeof(unsigned long) + +#define SCRATCH_OFF(k) (4 * (k)) + +/* JIT flags */ +#define SEEN_CALL (1 << BPF_MEMWORDS) +#define SEEN_SREG_SFT (BPF_MEMWORDS + 1) +#define SEEN_SREG_BASE (1 << SEEN_SREG_SFT) +#define SEEN_SREG(x) (SEEN_SREG_BASE << (x)) +#define SEEN_OFF SEEN_SREG(2) +#define SEEN_A SEEN_SREG(3) +#define SEEN_X SEEN_SREG(4) +#define SEEN_SKB SEEN_SREG(5) +#define SEEN_MEM SEEN_SREG(6) +/* SEEN_SK_DATA also implies skb_hl an skb_len */ +#define SEEN_SKB_DATA (SEEN_SREG(7) | SEEN_SREG(1) | SEEN_SREG(0)) + +/* Arguments used by JIT */ +#define ARGS_USED_BY_JIT 2 /* only applicable to 64-bit */ + +#define SBIT(x) (1 << (x)) /* Signed version of BIT() */ + +/** + * struct jit_ctx - JIT context + * @skf: The sk_filter + * @prologue_bytes: Number of bytes for prologue + * @idx: Instruction index + * @flags: JIT flags + * @offsets: Instruction offsets + * @target: Memory location for the compiled filter + */ +struct jit_ctx { + const struct bpf_prog *skf; + unsigned int prologue_bytes; + u32 idx; + u32 flags; + u32 *offsets; + u32 *target; +}; + + +static inline int optimize_div(u32 *k) +{ + /* power of 2 divides can be implemented with right shift */ + if (!(*k & (*k-1))) { + *k = ilog2(*k); + return 1; + } + + return 0; +} + +static inline void emit_jit_reg_move(ptr dst, ptr src, struct jit_ctx *ctx); + +/* Simply emit the instruction if the JIT memory space has been allocated */ +#define emit_instr(ctx, func, ...) \ +do { \ + if ((ctx)->target != NULL) { \ + u32 *p = &(ctx)->target[ctx->idx]; \ + uasm_i_##func(&p, ##__VA_ARGS__); \ + } \ + (ctx)->idx++; \ +} while (0) + +/* + * Similar to emit_instr but it must be used when we need to emit + * 32-bit or 64-bit instructions + */ +#define emit_long_instr(ctx, func, ...) \ +do { \ + if ((ctx)->target != NULL) { \ + u32 *p = &(ctx)->target[ctx->idx]; \ + UASM_i_##func(&p, ##__VA_ARGS__); \ + } \ + (ctx)->idx++; \ +} while (0) + +/* Determine if immediate is within the 16-bit signed range */ +static inline bool is_range16(s32 imm) +{ + return !(imm >= SBIT(15) || imm < -SBIT(15)); +} + +static inline void emit_addu(unsigned int dst, unsigned int src1, + unsigned int src2, struct jit_ctx *ctx) +{ + emit_instr(ctx, addu, dst, src1, src2); +} + +static inline void emit_nop(struct jit_ctx *ctx) +{ + emit_instr(ctx, nop); +} + +/* Load a u32 immediate to a register */ +static inline void emit_load_imm(unsigned int dst, u32 imm, struct jit_ctx *ctx) +{ + if (ctx->target != NULL) { + /* addiu can only handle s16 */ + if (!is_range16(imm)) { + u32 *p = &ctx->target[ctx->idx]; + uasm_i_lui(&p, r_tmp_imm, (s32)imm >> 16); + p = &ctx->target[ctx->idx + 1]; + uasm_i_ori(&p, dst, r_tmp_imm, imm & 0xffff); + } else { + u32 *p = &ctx->target[ctx->idx]; + uasm_i_addiu(&p, dst, r_zero, imm); + } + } + ctx->idx++; + + if (!is_range16(imm)) + ctx->idx++; +} + +static inline void emit_or(unsigned int dst, unsigned int src1, + unsigned int src2, struct jit_ctx *ctx) +{ + emit_instr(ctx, or, dst, src1, src2); +} + +static inline void emit_ori(unsigned int dst, unsigned src, u32 imm, + struct jit_ctx *ctx) +{ + if (imm >= BIT(16)) { + emit_load_imm(r_tmp, imm, ctx); + emit_or(dst, src, r_tmp, ctx); + } else { + emit_instr(ctx, ori, dst, src, imm); + } +} + +static inline void emit_daddiu(unsigned int dst, unsigned int src, + int imm, struct jit_ctx *ctx) +{ + /* + * Only used for stack, so the imm is relatively small + * and it fits in 15-bits + */ + emit_instr(ctx, daddiu, dst, src, imm); +} + +static inline void emit_addiu(unsigned int dst, unsigned int src, + u32 imm, struct jit_ctx *ctx) +{ + if (!is_range16(imm)) { + emit_load_imm(r_tmp, imm, ctx); + emit_addu(dst, r_tmp, src, ctx); + } else { + emit_instr(ctx, addiu, dst, src, imm); + } +} + +static inline void emit_and(unsigned int dst, unsigned int src1, + unsigned int src2, struct jit_ctx *ctx) +{ + emit_instr(ctx, and, dst, src1, src2); +} + +static inline void emit_andi(unsigned int dst, unsigned int src, + u32 imm, struct jit_ctx *ctx) +{ + /* If imm does not fit in u16 then load it to register */ + if (imm >= BIT(16)) { + emit_load_imm(r_tmp, imm, ctx); + emit_and(dst, src, r_tmp, ctx); + } else { + emit_instr(ctx, andi, dst, src, imm); + } +} + +static inline void emit_xor(unsigned int dst, unsigned int src1, + unsigned int src2, struct jit_ctx *ctx) +{ + emit_instr(ctx, xor, dst, src1, src2); +} + +static inline void emit_xori(ptr dst, ptr src, u32 imm, struct jit_ctx *ctx) +{ + /* If imm does not fit in u16 then load it to register */ + if (imm >= BIT(16)) { + emit_load_imm(r_tmp, imm, ctx); + emit_xor(dst, src, r_tmp, ctx); + } else { + emit_instr(ctx, xori, dst, src, imm); + } +} + +static inline void emit_stack_offset(int offset, struct jit_ctx *ctx) +{ + emit_long_instr(ctx, ADDIU, r_sp, r_sp, offset); +} + +static inline void emit_subu(unsigned int dst, unsigned int src1, + unsigned int src2, struct jit_ctx *ctx) +{ + emit_instr(ctx, subu, dst, src1, src2); +} + +static inline void emit_neg(unsigned int reg, struct jit_ctx *ctx) +{ + emit_subu(reg, r_zero, reg, ctx); +} + +static inline void emit_sllv(unsigned int dst, unsigned int src, + unsigned int sa, struct jit_ctx *ctx) +{ + emit_instr(ctx, sllv, dst, src, sa); +} + +static inline void emit_sll(unsigned int dst, unsigned int src, + unsigned int sa, struct jit_ctx *ctx) +{ + /* sa is 5-bits long */ + if (sa >= BIT(5)) + /* Shifting >= 32 results in zero */ + emit_jit_reg_move(dst, r_zero, ctx); + else + emit_instr(ctx, sll, dst, src, sa); +} + +static inline void emit_srlv(unsigned int dst, unsigned int src, + unsigned int sa, struct jit_ctx *ctx) +{ + emit_instr(ctx, srlv, dst, src, sa); +} + +static inline void emit_srl(unsigned int dst, unsigned int src, + unsigned int sa, struct jit_ctx *ctx) +{ + /* sa is 5-bits long */ + if (sa >= BIT(5)) + /* Shifting >= 32 results in zero */ + emit_jit_reg_move(dst, r_zero, ctx); + else + emit_instr(ctx, srl, dst, src, sa); +} + +static inline void emit_slt(unsigned int dst, unsigned int src1, + unsigned int src2, struct jit_ctx *ctx) +{ + emit_instr(ctx, slt, dst, src1, src2); +} + +static inline void emit_sltu(unsigned int dst, unsigned int src1, + unsigned int src2, struct jit_ctx *ctx) +{ + emit_instr(ctx, sltu, dst, src1, src2); +} + +static inline void emit_sltiu(unsigned dst, unsigned int src, + unsigned int imm, struct jit_ctx *ctx) +{ + /* 16 bit immediate */ + if (!is_range16((s32)imm)) { + emit_load_imm(r_tmp, imm, ctx); + emit_sltu(dst, src, r_tmp, ctx); + } else { + emit_instr(ctx, sltiu, dst, src, imm); + } + +} + +/* Store register on the stack */ +static inline void emit_store_stack_reg(ptr reg, ptr base, + unsigned int offset, + struct jit_ctx *ctx) +{ + emit_long_instr(ctx, SW, reg, offset, base); +} + +static inline void emit_store(ptr reg, ptr base, unsigned int offset, + struct jit_ctx *ctx) +{ + emit_instr(ctx, sw, reg, offset, base); +} + +static inline void emit_load_stack_reg(ptr reg, ptr base, + unsigned int offset, + struct jit_ctx *ctx) +{ + emit_long_instr(ctx, LW, reg, offset, base); +} + +static inline void emit_load(unsigned int reg, unsigned int base, + unsigned int offset, struct jit_ctx *ctx) +{ + emit_instr(ctx, lw, reg, offset, base); +} + +static inline void emit_load_byte(unsigned int reg, unsigned int base, + unsigned int offset, struct jit_ctx *ctx) +{ + emit_instr(ctx, lb, reg, offset, base); +} + +static inline void emit_half_load(unsigned int reg, unsigned int base, + unsigned int offset, struct jit_ctx *ctx) +{ + emit_instr(ctx, lh, reg, offset, base); +} + +static inline void emit_half_load_unsigned(unsigned int reg, unsigned int base, + unsigned int offset, struct jit_ctx *ctx) +{ + emit_instr(ctx, lhu, reg, offset, base); +} + +static inline void emit_mul(unsigned int dst, unsigned int src1, + unsigned int src2, struct jit_ctx *ctx) +{ + emit_instr(ctx, mul, dst, src1, src2); +} + +static inline void emit_div(unsigned int dst, unsigned int src, + struct jit_ctx *ctx) +{ + if (ctx->target != NULL) { + u32 *p = &ctx->target[ctx->idx]; + uasm_i_divu(&p, dst, src); + p = &ctx->target[ctx->idx + 1]; + uasm_i_mflo(&p, dst); + } + ctx->idx += 2; /* 2 insts */ +} + +static inline void emit_mod(unsigned int dst, unsigned int src, + struct jit_ctx *ctx) +{ + if (ctx->target != NULL) { + u32 *p = &ctx->target[ctx->idx]; + uasm_i_divu(&p, dst, src); + p = &ctx->target[ctx->idx + 1]; + uasm_i_mfhi(&p, dst); + } + ctx->idx += 2; /* 2 insts */ +} + +static inline void emit_dsll(unsigned int dst, unsigned int src, + unsigned int sa, struct jit_ctx *ctx) +{ + emit_instr(ctx, dsll, dst, src, sa); +} + +static inline void emit_dsrl32(unsigned int dst, unsigned int src, + unsigned int sa, struct jit_ctx *ctx) +{ + emit_instr(ctx, dsrl32, dst, src, sa); +} + +static inline void emit_wsbh(unsigned int dst, unsigned int src, + struct jit_ctx *ctx) +{ + emit_instr(ctx, wsbh, dst, src); +} + +/* load pointer to register */ +static inline void emit_load_ptr(unsigned int dst, unsigned int src, + int imm, struct jit_ctx *ctx) +{ + /* src contains the base addr of the 32/64-pointer */ + emit_long_instr(ctx, LW, dst, imm, src); +} + +/* load a function pointer to register */ +static inline void emit_load_func(unsigned int reg, ptr imm, + struct jit_ctx *ctx) +{ + if (IS_ENABLED(CONFIG_64BIT)) { + /* At this point imm is always 64-bit */ + emit_load_imm(r_tmp, (u64)imm >> 32, ctx); + emit_dsll(r_tmp_imm, r_tmp, 16, ctx); /* left shift by 16 */ + emit_ori(r_tmp, r_tmp_imm, (imm >> 16) & 0xffff, ctx); + emit_dsll(r_tmp_imm, r_tmp, 16, ctx); /* left shift by 16 */ + emit_ori(reg, r_tmp_imm, imm & 0xffff, ctx); + } else { + emit_load_imm(reg, imm, ctx); + } +} + +/* Move to real MIPS register */ +static inline void emit_reg_move(ptr dst, ptr src, struct jit_ctx *ctx) +{ + emit_long_instr(ctx, ADDU, dst, src, r_zero); +} + +/* Move to JIT (32-bit) register */ +static inline void emit_jit_reg_move(ptr dst, ptr src, struct jit_ctx *ctx) +{ + emit_addu(dst, src, r_zero, ctx); +} + +/* Compute the immediate value for PC-relative branches. */ +static inline u32 b_imm(unsigned int tgt, struct jit_ctx *ctx) +{ + if (ctx->target == NULL) + return 0; + + /* + * We want a pc-relative branch. We only do forward branches + * so tgt is always after pc. tgt is the instruction offset + * we want to jump to. + + * Branch on MIPS: + * I: target_offset <- sign_extend(offset) + * I+1: PC += target_offset (delay slot) + * + * ctx->idx currently points to the branch instruction + * but the offset is added to the delay slot so we need + * to subtract 4. + */ + return ctx->offsets[tgt] - + (ctx->idx * 4 - ctx->prologue_bytes) - 4; +} + +static inline void emit_bcond(int cond, unsigned int reg1, unsigned int reg2, + unsigned int imm, struct jit_ctx *ctx) +{ + if (ctx->target != NULL) { + u32 *p = &ctx->target[ctx->idx]; + + switch (cond) { + case MIPS_COND_EQ: + uasm_i_beq(&p, reg1, reg2, imm); + break; + case MIPS_COND_NE: + uasm_i_bne(&p, reg1, reg2, imm); + break; + case MIPS_COND_ALL: + uasm_i_b(&p, imm); + break; + default: + pr_warn("%s: Unhandled branch conditional: %d\n", + __func__, cond); + } + } + ctx->idx++; +} + +static inline void emit_b(unsigned int imm, struct jit_ctx *ctx) +{ + emit_bcond(MIPS_COND_ALL, r_zero, r_zero, imm, ctx); +} + +static inline void emit_jalr(unsigned int link, unsigned int reg, + struct jit_ctx *ctx) +{ + emit_instr(ctx, jalr, link, reg); +} + +static inline void emit_jr(unsigned int reg, struct jit_ctx *ctx) +{ + emit_instr(ctx, jr, reg); +} + +static inline u16 align_sp(unsigned int num) +{ + /* Double word alignment for 32-bit, quadword for 64-bit */ + unsigned int align = IS_ENABLED(CONFIG_64BIT) ? 16 : 8; + num = (num + (align - 1)) & -align; + return num; +} + +static void save_bpf_jit_regs(struct jit_ctx *ctx, unsigned offset) +{ + int i = 0, real_off = 0; + u32 sflags, tmp_flags; + + /* Adjust the stack pointer */ + if (offset) + emit_stack_offset(-align_sp(offset), ctx); + + tmp_flags = sflags = ctx->flags >> SEEN_SREG_SFT; + /* sflags is essentially a bitmap */ + while (tmp_flags) { + if ((sflags >> i) & 0x1) { + emit_store_stack_reg(MIPS_R_S0 + i, r_sp, real_off, + ctx); + real_off += SZREG; + } + i++; + tmp_flags >>= 1; + } + + /* save return address */ + if (ctx->flags & SEEN_CALL) { + emit_store_stack_reg(r_ra, r_sp, real_off, ctx); + real_off += SZREG; + } + + /* Setup r_M leaving the alignment gap if necessary */ + if (ctx->flags & SEEN_MEM) { + if (real_off % (SZREG * 2)) + real_off += SZREG; + emit_long_instr(ctx, ADDIU, r_M, r_sp, real_off); + } +} + +static void restore_bpf_jit_regs(struct jit_ctx *ctx, + unsigned int offset) +{ + int i, real_off = 0; + u32 sflags, tmp_flags; + + tmp_flags = sflags = ctx->flags >> SEEN_SREG_SFT; + /* sflags is a bitmap */ + i = 0; + while (tmp_flags) { + if ((sflags >> i) & 0x1) { + emit_load_stack_reg(MIPS_R_S0 + i, r_sp, real_off, + ctx); + real_off += SZREG; + } + i++; + tmp_flags >>= 1; + } + + /* restore return address */ + if (ctx->flags & SEEN_CALL) + emit_load_stack_reg(r_ra, r_sp, real_off, ctx); + + /* Restore the sp and discard the scrach memory */ + if (offset) + emit_stack_offset(align_sp(offset), ctx); +} + +static unsigned int get_stack_depth(struct jit_ctx *ctx) +{ + int sp_off = 0; + + + /* How may s* regs do we need to preserved? */ + sp_off += hweight32(ctx->flags >> SEEN_SREG_SFT) * SZREG; + + if (ctx->flags & SEEN_MEM) + sp_off += 4 * BPF_MEMWORDS; /* BPF_MEMWORDS are 32-bit */ + + if (ctx->flags & SEEN_CALL) + sp_off += SZREG; /* Space for our ra register */ + + return sp_off; +} + +static void build_prologue(struct jit_ctx *ctx) +{ + int sp_off; + + /* Calculate the total offset for the stack pointer */ + sp_off = get_stack_depth(ctx); + save_bpf_jit_regs(ctx, sp_off); + + if (ctx->flags & SEEN_SKB) + emit_reg_move(r_skb, MIPS_R_A0, ctx); + + if (ctx->flags & SEEN_SKB_DATA) { + /* Load packet length */ + emit_load(r_skb_len, r_skb, offsetof(struct sk_buff, len), + ctx); + emit_load(r_tmp, r_skb, offsetof(struct sk_buff, data_len), + ctx); + /* Load the data pointer */ + emit_load_ptr(r_skb_data, r_skb, + offsetof(struct sk_buff, data), ctx); + /* Load the header length */ + emit_subu(r_skb_hl, r_skb_len, r_tmp, ctx); + } + + if (ctx->flags & SEEN_X) + emit_jit_reg_move(r_X, r_zero, ctx); + + /* + * Do not leak kernel data to userspace, we only need to clear + * r_A if it is ever used. In fact if it is never used, we + * will not save/restore it, so clearing it in this case would + * corrupt the state of the caller. + */ + if (bpf_needs_clear_a(&ctx->skf->insns[0]) && + (ctx->flags & SEEN_A)) + emit_jit_reg_move(r_A, r_zero, ctx); +} + +static void build_epilogue(struct jit_ctx *ctx) +{ + unsigned int sp_off; + + /* Calculate the total offset for the stack pointer */ + + sp_off = get_stack_depth(ctx); + restore_bpf_jit_regs(ctx, sp_off); + + /* Return */ + emit_jr(r_ra, ctx); + emit_nop(ctx); +} + +#define CHOOSE_LOAD_FUNC(K, func) \ + ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative : func) : \ + func##_positive) + +static int build_body(struct jit_ctx *ctx) +{ + const struct bpf_prog *prog = ctx->skf; + const struct sock_filter *inst; + unsigned int i, off, condt; + u32 k, b_off __maybe_unused; + u8 (*sk_load_func)(unsigned long *skb, int offset); + + for (i = 0; i < prog->len; i++) { + u16 code; + + inst = &(prog->insns[i]); + pr_debug("%s: code->0x%02x, jt->0x%x, jf->0x%x, k->0x%x\n", + __func__, inst->code, inst->jt, inst->jf, inst->k); + k = inst->k; + code = bpf_anc_helper(inst); + + if (ctx->target == NULL) + ctx->offsets[i] = ctx->idx * 4; + + switch (code) { + case BPF_LD | BPF_IMM: + /* A <- k ==> li r_A, k */ + ctx->flags |= SEEN_A; + emit_load_imm(r_A, k, ctx); + break; + case BPF_LD | BPF_W | BPF_LEN: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); + /* A <- len ==> lw r_A, offset(skb) */ + ctx->flags |= SEEN_SKB | SEEN_A; + off = offsetof(struct sk_buff, len); + emit_load(r_A, r_skb, off, ctx); + break; + case BPF_LD | BPF_MEM: + /* A <- M[k] ==> lw r_A, offset(M) */ + ctx->flags |= SEEN_MEM | SEEN_A; + emit_load(r_A, r_M, SCRATCH_OFF(k), ctx); + break; + case BPF_LD | BPF_W | BPF_ABS: + /* A <- P[k:4] */ + sk_load_func = CHOOSE_LOAD_FUNC(k, sk_load_word); + goto load; + case BPF_LD | BPF_H | BPF_ABS: + /* A <- P[k:2] */ + sk_load_func = CHOOSE_LOAD_FUNC(k, sk_load_half); + goto load; + case BPF_LD | BPF_B | BPF_ABS: + /* A <- P[k:1] */ + sk_load_func = CHOOSE_LOAD_FUNC(k, sk_load_byte); +load: + emit_load_imm(r_off, k, ctx); +load_common: + ctx->flags |= SEEN_CALL | SEEN_OFF | + SEEN_SKB | SEEN_A | SEEN_SKB_DATA; + + emit_load_func(r_s0, (ptr)sk_load_func, ctx); + emit_reg_move(MIPS_R_A0, r_skb, ctx); + emit_jalr(MIPS_R_RA, r_s0, ctx); + /* Load second argument to delay slot */ + emit_reg_move(MIPS_R_A1, r_off, ctx); + /* Check the error value */ + emit_bcond(MIPS_COND_EQ, r_ret, 0, b_imm(i + 1, ctx), + ctx); + /* Load return register on DS for failures */ + emit_reg_move(r_ret, r_zero, ctx); + /* Return with error */ + emit_b(b_imm(prog->len, ctx), ctx); + emit_nop(ctx); + break; + case BPF_LD | BPF_W | BPF_IND: + /* A <- P[X + k:4] */ + sk_load_func = sk_load_word; + goto load_ind; + case BPF_LD | BPF_H | BPF_IND: + /* A <- P[X + k:2] */ + sk_load_func = sk_load_half; + goto load_ind; + case BPF_LD | BPF_B | BPF_IND: + /* A <- P[X + k:1] */ + sk_load_func = sk_load_byte; +load_ind: + ctx->flags |= SEEN_OFF | SEEN_X; + emit_addiu(r_off, r_X, k, ctx); + goto load_common; + case BPF_LDX | BPF_IMM: + /* X <- k */ + ctx->flags |= SEEN_X; + emit_load_imm(r_X, k, ctx); + break; + case BPF_LDX | BPF_MEM: + /* X <- M[k] */ + ctx->flags |= SEEN_X | SEEN_MEM; + emit_load(r_X, r_M, SCRATCH_OFF(k), ctx); + break; + case BPF_LDX | BPF_W | BPF_LEN: + /* X <- len */ + ctx->flags |= SEEN_X | SEEN_SKB; + off = offsetof(struct sk_buff, len); + emit_load(r_X, r_skb, off, ctx); + break; + case BPF_LDX | BPF_B | BPF_MSH: + /* X <- 4 * (P[k:1] & 0xf) */ + ctx->flags |= SEEN_X | SEEN_CALL | SEEN_SKB; + /* Load offset to a1 */ + emit_load_func(r_s0, (ptr)sk_load_byte, ctx); + /* + * This may emit two instructions so it may not fit + * in the delay slot. So use a0 in the delay slot. + */ + emit_load_imm(MIPS_R_A1, k, ctx); + emit_jalr(MIPS_R_RA, r_s0, ctx); + emit_reg_move(MIPS_R_A0, r_skb, ctx); /* delay slot */ + /* Check the error value */ + emit_bcond(MIPS_COND_NE, r_ret, 0, + b_imm(prog->len, ctx), ctx); + emit_reg_move(r_ret, r_zero, ctx); + /* We are good */ + /* X <- P[1:K] & 0xf */ + emit_andi(r_X, r_A, 0xf, ctx); + /* X << 2 */ + emit_b(b_imm(i + 1, ctx), ctx); + emit_sll(r_X, r_X, 2, ctx); /* delay slot */ + break; + case BPF_ST: + /* M[k] <- A */ + ctx->flags |= SEEN_MEM | SEEN_A; + emit_store(r_A, r_M, SCRATCH_OFF(k), ctx); + break; + case BPF_STX: + /* M[k] <- X */ + ctx->flags |= SEEN_MEM | SEEN_X; + emit_store(r_X, r_M, SCRATCH_OFF(k), ctx); + break; + case BPF_ALU | BPF_ADD | BPF_K: + /* A += K */ + ctx->flags |= SEEN_A; + emit_addiu(r_A, r_A, k, ctx); + break; + case BPF_ALU | BPF_ADD | BPF_X: + /* A += X */ + ctx->flags |= SEEN_A | SEEN_X; + emit_addu(r_A, r_A, r_X, ctx); + break; + case BPF_ALU | BPF_SUB | BPF_K: + /* A -= K */ + ctx->flags |= SEEN_A; + emit_addiu(r_A, r_A, -k, ctx); + break; + case BPF_ALU | BPF_SUB | BPF_X: + /* A -= X */ + ctx->flags |= SEEN_A | SEEN_X; + emit_subu(r_A, r_A, r_X, ctx); + break; + case BPF_ALU | BPF_MUL | BPF_K: + /* A *= K */ + /* Load K to scratch register before MUL */ + ctx->flags |= SEEN_A; + emit_load_imm(r_s0, k, ctx); + emit_mul(r_A, r_A, r_s0, ctx); + break; + case BPF_ALU | BPF_MUL | BPF_X: + /* A *= X */ + ctx->flags |= SEEN_A | SEEN_X; + emit_mul(r_A, r_A, r_X, ctx); + break; + case BPF_ALU | BPF_DIV | BPF_K: + /* A /= k */ + if (k == 1) + break; + if (optimize_div(&k)) { + ctx->flags |= SEEN_A; + emit_srl(r_A, r_A, k, ctx); + break; + } + ctx->flags |= SEEN_A; + emit_load_imm(r_s0, k, ctx); + emit_div(r_A, r_s0, ctx); + break; + case BPF_ALU | BPF_MOD | BPF_K: + /* A %= k */ + if (k == 1) { + ctx->flags |= SEEN_A; + emit_jit_reg_move(r_A, r_zero, ctx); + } else { + ctx->flags |= SEEN_A; + emit_load_imm(r_s0, k, ctx); + emit_mod(r_A, r_s0, ctx); + } + break; + case BPF_ALU | BPF_DIV | BPF_X: + /* A /= X */ + ctx->flags |= SEEN_X | SEEN_A; + /* Check if r_X is zero */ + emit_bcond(MIPS_COND_EQ, r_X, r_zero, + b_imm(prog->len, ctx), ctx); + emit_load_imm(r_ret, 0, ctx); /* delay slot */ + emit_div(r_A, r_X, ctx); + break; + case BPF_ALU | BPF_MOD | BPF_X: + /* A %= X */ + ctx->flags |= SEEN_X | SEEN_A; + /* Check if r_X is zero */ + emit_bcond(MIPS_COND_EQ, r_X, r_zero, + b_imm(prog->len, ctx), ctx); + emit_load_imm(r_ret, 0, ctx); /* delay slot */ + emit_mod(r_A, r_X, ctx); + break; + case BPF_ALU | BPF_OR | BPF_K: + /* A |= K */ + ctx->flags |= SEEN_A; + emit_ori(r_A, r_A, k, ctx); + break; + case BPF_ALU | BPF_OR | BPF_X: + /* A |= X */ + ctx->flags |= SEEN_A; + emit_ori(r_A, r_A, r_X, ctx); + break; + case BPF_ALU | BPF_XOR | BPF_K: + /* A ^= k */ + ctx->flags |= SEEN_A; + emit_xori(r_A, r_A, k, ctx); + break; + case BPF_ANC | SKF_AD_ALU_XOR_X: + case BPF_ALU | BPF_XOR | BPF_X: + /* A ^= X */ + ctx->flags |= SEEN_A; + emit_xor(r_A, r_A, r_X, ctx); + break; + case BPF_ALU | BPF_AND | BPF_K: + /* A &= K */ + ctx->flags |= SEEN_A; + emit_andi(r_A, r_A, k, ctx); + break; + case BPF_ALU | BPF_AND | BPF_X: + /* A &= X */ + ctx->flags |= SEEN_A | SEEN_X; + emit_and(r_A, r_A, r_X, ctx); + break; + case BPF_ALU | BPF_LSH | BPF_K: + /* A <<= K */ + ctx->flags |= SEEN_A; + emit_sll(r_A, r_A, k, ctx); + break; + case BPF_ALU | BPF_LSH | BPF_X: + /* A <<= X */ + ctx->flags |= SEEN_A | SEEN_X; + emit_sllv(r_A, r_A, r_X, ctx); + break; + case BPF_ALU | BPF_RSH | BPF_K: + /* A >>= K */ + ctx->flags |= SEEN_A; + emit_srl(r_A, r_A, k, ctx); + break; + case BPF_ALU | BPF_RSH | BPF_X: + ctx->flags |= SEEN_A | SEEN_X; + emit_srlv(r_A, r_A, r_X, ctx); + break; + case BPF_ALU | BPF_NEG: + /* A = -A */ + ctx->flags |= SEEN_A; + emit_neg(r_A, ctx); + break; + case BPF_JMP | BPF_JA: + /* pc += K */ + emit_b(b_imm(i + k + 1, ctx), ctx); + emit_nop(ctx); + break; + case BPF_JMP | BPF_JEQ | BPF_K: + /* pc += ( A == K ) ? pc->jt : pc->jf */ + condt = MIPS_COND_EQ | MIPS_COND_K; + goto jmp_cmp; + case BPF_JMP | BPF_JEQ | BPF_X: + ctx->flags |= SEEN_X; + /* pc += ( A == X ) ? pc->jt : pc->jf */ + condt = MIPS_COND_EQ | MIPS_COND_X; + goto jmp_cmp; + case BPF_JMP | BPF_JGE | BPF_K: + /* pc += ( A >= K ) ? pc->jt : pc->jf */ + condt = MIPS_COND_GE | MIPS_COND_K; + goto jmp_cmp; + case BPF_JMP | BPF_JGE | BPF_X: + ctx->flags |= SEEN_X; + /* pc += ( A >= X ) ? pc->jt : pc->jf */ + condt = MIPS_COND_GE | MIPS_COND_X; + goto jmp_cmp; + case BPF_JMP | BPF_JGT | BPF_K: + /* pc += ( A > K ) ? pc->jt : pc->jf */ + condt = MIPS_COND_GT | MIPS_COND_K; + goto jmp_cmp; + case BPF_JMP | BPF_JGT | BPF_X: + ctx->flags |= SEEN_X; + /* pc += ( A > X ) ? pc->jt : pc->jf */ + condt = MIPS_COND_GT | MIPS_COND_X; +jmp_cmp: + /* Greater or Equal */ + if ((condt & MIPS_COND_GE) || + (condt & MIPS_COND_GT)) { + if (condt & MIPS_COND_K) { /* K */ + ctx->flags |= SEEN_A; + emit_sltiu(r_s0, r_A, k, ctx); + } else { /* X */ + ctx->flags |= SEEN_A | + SEEN_X; + emit_sltu(r_s0, r_A, r_X, ctx); + } + /* A < (K|X) ? r_scrach = 1 */ + b_off = b_imm(i + inst->jf + 1, ctx); + emit_bcond(MIPS_COND_NE, r_s0, r_zero, b_off, + ctx); + emit_nop(ctx); + /* A > (K|X) ? scratch = 0 */ + if (condt & MIPS_COND_GT) { + /* Checking for equality */ + ctx->flags |= SEEN_A | SEEN_X; + if (condt & MIPS_COND_K) + emit_load_imm(r_s0, k, ctx); + else + emit_jit_reg_move(r_s0, r_X, + ctx); + b_off = b_imm(i + inst->jf + 1, ctx); + emit_bcond(MIPS_COND_EQ, r_A, r_s0, + b_off, ctx); + emit_nop(ctx); + /* Finally, A > K|X */ + b_off = b_imm(i + inst->jt + 1, ctx); + emit_b(b_off, ctx); + emit_nop(ctx); + } else { + /* A >= (K|X) so jump */ + b_off = b_imm(i + inst->jt + 1, ctx); + emit_b(b_off, ctx); + emit_nop(ctx); + } + } else { + /* A == K|X */ + if (condt & MIPS_COND_K) { /* K */ + ctx->flags |= SEEN_A; + emit_load_imm(r_s0, k, ctx); + /* jump true */ + b_off = b_imm(i + inst->jt + 1, ctx); + emit_bcond(MIPS_COND_EQ, r_A, r_s0, + b_off, ctx); + emit_nop(ctx); + /* jump false */ + b_off = b_imm(i + inst->jf + 1, + ctx); + emit_bcond(MIPS_COND_NE, r_A, r_s0, + b_off, ctx); + emit_nop(ctx); + } else { /* X */ + /* jump true */ + ctx->flags |= SEEN_A | SEEN_X; + b_off = b_imm(i + inst->jt + 1, + ctx); + emit_bcond(MIPS_COND_EQ, r_A, r_X, + b_off, ctx); + emit_nop(ctx); + /* jump false */ + b_off = b_imm(i + inst->jf + 1, ctx); + emit_bcond(MIPS_COND_NE, r_A, r_X, + b_off, ctx); + emit_nop(ctx); + } + } + break; + case BPF_JMP | BPF_JSET | BPF_K: + ctx->flags |= SEEN_A; + /* pc += (A & K) ? pc -> jt : pc -> jf */ + emit_load_imm(r_s1, k, ctx); + emit_and(r_s0, r_A, r_s1, ctx); + /* jump true */ + b_off = b_imm(i + inst->jt + 1, ctx); + emit_bcond(MIPS_COND_NE, r_s0, r_zero, b_off, ctx); + emit_nop(ctx); + /* jump false */ + b_off = b_imm(i + inst->jf + 1, ctx); + emit_b(b_off, ctx); + emit_nop(ctx); + break; + case BPF_JMP | BPF_JSET | BPF_X: + ctx->flags |= SEEN_X | SEEN_A; + /* pc += (A & X) ? pc -> jt : pc -> jf */ + emit_and(r_s0, r_A, r_X, ctx); + /* jump true */ + b_off = b_imm(i + inst->jt + 1, ctx); + emit_bcond(MIPS_COND_NE, r_s0, r_zero, b_off, ctx); + emit_nop(ctx); + /* jump false */ + b_off = b_imm(i + inst->jf + 1, ctx); + emit_b(b_off, ctx); + emit_nop(ctx); + break; + case BPF_RET | BPF_A: + ctx->flags |= SEEN_A; + if (i != prog->len - 1) + /* + * If this is not the last instruction + * then jump to the epilogue + */ + emit_b(b_imm(prog->len, ctx), ctx); + emit_reg_move(r_ret, r_A, ctx); /* delay slot */ + break; + case BPF_RET | BPF_K: + /* + * It can emit two instructions so it does not fit on + * the delay slot. + */ + emit_load_imm(r_ret, k, ctx); + if (i != prog->len - 1) { + /* + * If this is not the last instruction + * then jump to the epilogue + */ + emit_b(b_imm(prog->len, ctx), ctx); + emit_nop(ctx); + } + break; + case BPF_MISC | BPF_TAX: + /* X = A */ + ctx->flags |= SEEN_X | SEEN_A; + emit_jit_reg_move(r_X, r_A, ctx); + break; + case BPF_MISC | BPF_TXA: + /* A = X */ + ctx->flags |= SEEN_A | SEEN_X; + emit_jit_reg_move(r_A, r_X, ctx); + break; + /* AUX */ + case BPF_ANC | SKF_AD_PROTOCOL: + /* A = ntohs(skb->protocol */ + ctx->flags |= SEEN_SKB | SEEN_OFF | SEEN_A; + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, + protocol) != 2); + off = offsetof(struct sk_buff, protocol); + emit_half_load(r_A, r_skb, off, ctx); +#ifdef CONFIG_CPU_LITTLE_ENDIAN + /* This needs little endian fixup */ + if (cpu_has_wsbh) { + /* R2 and later have the wsbh instruction */ + emit_wsbh(r_A, r_A, ctx); + } else { + /* Get first byte */ + emit_andi(r_tmp_imm, r_A, 0xff, ctx); + /* Shift it */ + emit_sll(r_tmp, r_tmp_imm, 8, ctx); + /* Get second byte */ + emit_srl(r_tmp_imm, r_A, 8, ctx); + emit_andi(r_tmp_imm, r_tmp_imm, 0xff, ctx); + /* Put everyting together in r_A */ + emit_or(r_A, r_tmp, r_tmp_imm, ctx); + } +#endif + break; + case BPF_ANC | SKF_AD_CPU: + ctx->flags |= SEEN_A | SEEN_OFF; + /* A = current_thread_info()->cpu */ + BUILD_BUG_ON(FIELD_SIZEOF(struct thread_info, + cpu) != 4); + off = offsetof(struct thread_info, cpu); + /* $28/gp points to the thread_info struct */ + emit_load(r_A, 28, off, ctx); + break; + case BPF_ANC | SKF_AD_IFINDEX: + /* A = skb->dev->ifindex */ + case BPF_ANC | SKF_AD_HATYPE: + /* A = skb->dev->type */ + ctx->flags |= SEEN_SKB | SEEN_A; + off = offsetof(struct sk_buff, dev); + /* Load *dev pointer */ + emit_load_ptr(r_s0, r_skb, off, ctx); + /* error (0) in the delay slot */ + emit_bcond(MIPS_COND_EQ, r_s0, r_zero, + b_imm(prog->len, ctx), ctx); + emit_reg_move(r_ret, r_zero, ctx); + if (code == (BPF_ANC | SKF_AD_IFINDEX)) { + BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); + off = offsetof(struct net_device, ifindex); + emit_load(r_A, r_s0, off, ctx); + } else { /* (code == (BPF_ANC | SKF_AD_HATYPE) */ + BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2); + off = offsetof(struct net_device, type); + emit_half_load_unsigned(r_A, r_s0, off, ctx); + } + break; + case BPF_ANC | SKF_AD_MARK: + ctx->flags |= SEEN_SKB | SEEN_A; + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); + off = offsetof(struct sk_buff, mark); + emit_load(r_A, r_skb, off, ctx); + break; + case BPF_ANC | SKF_AD_RXHASH: + ctx->flags |= SEEN_SKB | SEEN_A; + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); + off = offsetof(struct sk_buff, hash); + emit_load(r_A, r_skb, off, ctx); + break; + case BPF_ANC | SKF_AD_VLAN_TAG: + ctx->flags |= SEEN_SKB | SEEN_A; + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, + vlan_tci) != 2); + off = offsetof(struct sk_buff, vlan_tci); + emit_half_load_unsigned(r_A, r_skb, off, ctx); + break; + case BPF_ANC | SKF_AD_VLAN_TAG_PRESENT: + ctx->flags |= SEEN_SKB | SEEN_A; + emit_load_byte(r_A, r_skb, PKT_VLAN_PRESENT_OFFSET(), ctx); + if (PKT_VLAN_PRESENT_BIT) + emit_srl(r_A, r_A, PKT_VLAN_PRESENT_BIT, ctx); + if (PKT_VLAN_PRESENT_BIT < 7) + emit_andi(r_A, r_A, 1, ctx); + break; + case BPF_ANC | SKF_AD_PKTTYPE: + ctx->flags |= SEEN_SKB; + + emit_load_byte(r_tmp, r_skb, PKT_TYPE_OFFSET(), ctx); + /* Keep only the last 3 bits */ + emit_andi(r_A, r_tmp, PKT_TYPE_MAX, ctx); +#ifdef __BIG_ENDIAN_BITFIELD + /* Get the actual packet type to the lower 3 bits */ + emit_srl(r_A, r_A, 5, ctx); +#endif + break; + case BPF_ANC | SKF_AD_QUEUE: + ctx->flags |= SEEN_SKB | SEEN_A; + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, + queue_mapping) != 2); + BUILD_BUG_ON(offsetof(struct sk_buff, + queue_mapping) > 0xff); + off = offsetof(struct sk_buff, queue_mapping); + emit_half_load_unsigned(r_A, r_skb, off, ctx); + break; + default: + pr_debug("%s: Unhandled opcode: 0x%02x\n", __FILE__, + inst->code); + return -1; + } + } + + /* compute offsets only during the first pass */ + if (ctx->target == NULL) + ctx->offsets[i] = ctx->idx * 4; + + return 0; +} + +void bpf_jit_compile(struct bpf_prog *fp) +{ + struct jit_ctx ctx; + unsigned int alloc_size, tmp_idx; + + if (!bpf_jit_enable) + return; + + memset(&ctx, 0, sizeof(ctx)); + + ctx.offsets = kcalloc(fp->len + 1, sizeof(*ctx.offsets), GFP_KERNEL); + if (ctx.offsets == NULL) + return; + + ctx.skf = fp; + + if (build_body(&ctx)) + goto out; + + tmp_idx = ctx.idx; + build_prologue(&ctx); + ctx.prologue_bytes = (ctx.idx - tmp_idx) * 4; + /* just to complete the ctx.idx count */ + build_epilogue(&ctx); + + alloc_size = 4 * ctx.idx; + ctx.target = module_alloc(alloc_size); + if (ctx.target == NULL) + goto out; + + /* Clean it */ + memset(ctx.target, 0, alloc_size); + + ctx.idx = 0; + + /* Generate the actual JIT code */ + build_prologue(&ctx); + build_body(&ctx); + build_epilogue(&ctx); + + /* Update the icache */ + flush_icache_range((ptr)ctx.target, (ptr)(ctx.target + ctx.idx)); + + if (bpf_jit_enable > 1) + /* Dump JIT code */ + bpf_jit_dump(fp->len, alloc_size, 2, ctx.target); + + fp->bpf_func = (void *)ctx.target; + fp->jited = 1; + +out: + kfree(ctx.offsets); +} + +void bpf_jit_free(struct bpf_prog *fp) +{ + if (fp->jited) + module_memfree(fp->bpf_func); + + bpf_prog_unlock_free(fp); +} diff --git a/arch/mips/net/bpf_jit_asm.S b/arch/mips/net/bpf_jit_asm.S new file mode 100644 index 0000000000000..57154c5883b6f --- /dev/null +++ b/arch/mips/net/bpf_jit_asm.S @@ -0,0 +1,285 @@ +/* + * bpf_jib_asm.S: Packet/header access helper functions for MIPS/MIPS64 BPF + * compiler. + * + * Copyright (C) 2015 Imagination Technologies Ltd. + * Author: Markos Chandras + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; version 2 of the License. + */ + +#include +#include +#include +#include "bpf_jit.h" + +/* ABI + * + * r_skb_hl skb header length + * r_skb_data skb data + * r_off(a1) offset register + * r_A BPF register A + * r_X PF register X + * r_skb(a0) *skb + * r_M *scratch memory + * r_skb_le skb length + * r_s0 Scratch register 0 + * r_s1 Scratch register 1 + * + * On entry: + * a0: *skb + * a1: offset (imm or imm + X) + * + * All non-BPF-ABI registers are free for use. On return, we only + * care about r_ret. The BPF-ABI registers are assumed to remain + * unmodified during the entire filter operation. + */ + +#define skb a0 +#define offset a1 +#define SKF_LL_OFF (-0x200000) /* Can't include linux/filter.h in assembly */ + + /* We know better :) so prevent assembler reordering etc */ + .set noreorder + +#define is_offset_negative(TYPE) \ + /* If offset is negative we have more work to do */ \ + slti t0, offset, 0; \ + bgtz t0, bpf_slow_path_##TYPE##_neg; \ + /* Be careful what follows in DS. */ + +#define is_offset_in_header(SIZE, TYPE) \ + /* Reading from header? */ \ + addiu $r_s0, $r_skb_hl, -SIZE; \ + slt t0, $r_s0, offset; \ + bgtz t0, bpf_slow_path_##TYPE; \ + +LEAF(sk_load_word) + is_offset_negative(word) +FEXPORT(sk_load_word_positive) + is_offset_in_header(4, word) + /* Offset within header boundaries */ + PTR_ADDU t1, $r_skb_data, offset + .set reorder + lw $r_A, 0(t1) + .set noreorder +#ifdef CONFIG_CPU_LITTLE_ENDIAN +# if MIPS_ISA_REV >= 2 + wsbh t0, $r_A + rotr $r_A, t0, 16 +# else + sll t0, $r_A, 24 + srl t1, $r_A, 24 + srl t2, $r_A, 8 + or t0, t0, t1 + andi t2, t2, 0xff00 + andi t1, $r_A, 0xff00 + or t0, t0, t2 + sll t1, t1, 8 + or $r_A, t0, t1 +# endif +#endif + jr $r_ra + move $r_ret, zero + END(sk_load_word) + +LEAF(sk_load_half) + is_offset_negative(half) +FEXPORT(sk_load_half_positive) + is_offset_in_header(2, half) + /* Offset within header boundaries */ + PTR_ADDU t1, $r_skb_data, offset + lhu $r_A, 0(t1) +#ifdef CONFIG_CPU_LITTLE_ENDIAN +# if MIPS_ISA_REV >= 2 + wsbh $r_A, $r_A +# else + sll t0, $r_A, 8 + srl t1, $r_A, 8 + andi t0, t0, 0xff00 + or $r_A, t0, t1 +# endif +#endif + jr $r_ra + move $r_ret, zero + END(sk_load_half) + +LEAF(sk_load_byte) + is_offset_negative(byte) +FEXPORT(sk_load_byte_positive) + is_offset_in_header(1, byte) + /* Offset within header boundaries */ + PTR_ADDU t1, $r_skb_data, offset + lbu $r_A, 0(t1) + jr $r_ra + move $r_ret, zero + END(sk_load_byte) + +/* + * call skb_copy_bits: + * (prototype in linux/skbuff.h) + * + * int skb_copy_bits(sk_buff *skb, int offset, void *to, int len) + * + * o32 mandates we leave 4 spaces for argument registers in case + * the callee needs to use them. Even though we don't care about + * the argument registers ourselves, we need to allocate that space + * to remain ABI compliant since the callee may want to use that space. + * We also allocate 2 more spaces for $r_ra and our return register (*to). + * + * n64 is a bit different. The *caller* will allocate the space to preserve + * the arguments. So in 64-bit kernels, we allocate the 4-arg space for no + * good reason but it does not matter that much really. + * + * (void *to) is returned in r_s0 + * + */ +#ifdef CONFIG_CPU_LITTLE_ENDIAN +#define DS_OFFSET(SIZE) (4 * SZREG) +#else +#define DS_OFFSET(SIZE) ((4 * SZREG) + (4 - SIZE)) +#endif +#define bpf_slow_path_common(SIZE) \ + /* Quick check. Are we within reasonable boundaries? */ \ + LONG_ADDIU $r_s1, $r_skb_len, -SIZE; \ + sltu $r_s0, offset, $r_s1; \ + beqz $r_s0, fault; \ + /* Load 4th argument in DS */ \ + LONG_ADDIU a3, zero, SIZE; \ + PTR_ADDIU $r_sp, $r_sp, -(6 * SZREG); \ + PTR_LA t0, skb_copy_bits; \ + PTR_S $r_ra, (5 * SZREG)($r_sp); \ + /* Assign low slot to a2 */ \ + PTR_ADDIU a2, $r_sp, DS_OFFSET(SIZE); \ + jalr t0; \ + /* Reset our destination slot (DS but it's ok) */ \ + INT_S zero, (4 * SZREG)($r_sp); \ + /* \ + * skb_copy_bits returns 0 on success and -EFAULT \ + * on error. Our data live in a2. Do not bother with \ + * our data if an error has been returned. \ + */ \ + /* Restore our frame */ \ + PTR_L $r_ra, (5 * SZREG)($r_sp); \ + INT_L $r_s0, (4 * SZREG)($r_sp); \ + bltz v0, fault; \ + PTR_ADDIU $r_sp, $r_sp, 6 * SZREG; \ + move $r_ret, zero; \ + +NESTED(bpf_slow_path_word, (6 * SZREG), $r_sp) + bpf_slow_path_common(4) +#ifdef CONFIG_CPU_LITTLE_ENDIAN +# if MIPS_ISA_REV >= 2 + wsbh t0, $r_s0 + jr $r_ra + rotr $r_A, t0, 16 +# else + sll t0, $r_s0, 24 + srl t1, $r_s0, 24 + srl t2, $r_s0, 8 + or t0, t0, t1 + andi t2, t2, 0xff00 + andi t1, $r_s0, 0xff00 + or t0, t0, t2 + sll t1, t1, 8 + jr $r_ra + or $r_A, t0, t1 +# endif +#else + jr $r_ra + move $r_A, $r_s0 +#endif + + END(bpf_slow_path_word) + +NESTED(bpf_slow_path_half, (6 * SZREG), $r_sp) + bpf_slow_path_common(2) +#ifdef CONFIG_CPU_LITTLE_ENDIAN +# if MIPS_ISA_REV >= 2 + jr $r_ra + wsbh $r_A, $r_s0 +# else + sll t0, $r_s0, 8 + andi t1, $r_s0, 0xff00 + andi t0, t0, 0xff00 + srl t1, t1, 8 + jr $r_ra + or $r_A, t0, t1 +# endif +#else + jr $r_ra + move $r_A, $r_s0 +#endif + + END(bpf_slow_path_half) + +NESTED(bpf_slow_path_byte, (6 * SZREG), $r_sp) + bpf_slow_path_common(1) + jr $r_ra + move $r_A, $r_s0 + + END(bpf_slow_path_byte) + +/* + * Negative entry points + */ + .macro bpf_is_end_of_data + li t0, SKF_LL_OFF + /* Reading link layer data? */ + slt t1, offset, t0 + bgtz t1, fault + /* Be careful what follows in DS. */ + .endm +/* + * call skb_copy_bits: + * (prototype in linux/filter.h) + * + * void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, + * int k, unsigned int size) + * + * see above (bpf_slow_path_common) for ABI restrictions + */ +#define bpf_negative_common(SIZE) \ + PTR_ADDIU $r_sp, $r_sp, -(6 * SZREG); \ + PTR_LA t0, bpf_internal_load_pointer_neg_helper; \ + PTR_S $r_ra, (5 * SZREG)($r_sp); \ + jalr t0; \ + li a2, SIZE; \ + PTR_L $r_ra, (5 * SZREG)($r_sp); \ + /* Check return pointer */ \ + beqz v0, fault; \ + PTR_ADDIU $r_sp, $r_sp, 6 * SZREG; \ + /* Preserve our pointer */ \ + move $r_s0, v0; \ + /* Set return value */ \ + move $r_ret, zero; \ + +bpf_slow_path_word_neg: + bpf_is_end_of_data +NESTED(sk_load_word_negative, (6 * SZREG), $r_sp) + bpf_negative_common(4) + jr $r_ra + lw $r_A, 0($r_s0) + END(sk_load_word_negative) + +bpf_slow_path_half_neg: + bpf_is_end_of_data +NESTED(sk_load_half_negative, (6 * SZREG), $r_sp) + bpf_negative_common(2) + jr $r_ra + lhu $r_A, 0($r_s0) + END(sk_load_half_negative) + +bpf_slow_path_byte_neg: + bpf_is_end_of_data +NESTED(sk_load_byte_negative, (6 * SZREG), $r_sp) + bpf_negative_common(1) + jr $r_ra + lbu $r_A, 0($r_s0) + END(sk_load_byte_negative) + +fault: + jr $r_ra + addiu $r_ret, zero, 1 From 9f894011b5983709f8a833807df91aa47d14a2ff Mon Sep 17 00:00:00 2001 From: Piotr Krysiuk Date: Wed, 15 Sep 2021 17:04:37 +0100 Subject: [PATCH 1082/5344] bpf, mips: Validate conditional branch offsets commit 37cb28ec7d3a36a5bace7063a3dba633ab110f8b upstream. The conditional branch instructions on MIPS use 18-bit signed offsets allowing for a branch range of 128 KBytes (backward and forward). However, this limit is not observed by the cBPF JIT compiler, and so the JIT compiler emits out-of-range branches when translating certain cBPF programs. A specific example of such a cBPF program is included in the "BPF_MAXINSNS: exec all MSH" test from lib/test_bpf.c that executes anomalous machine code containing incorrect branch offsets under JIT. Furthermore, this issue can be abused to craft undesirable machine code, where the control flow is hijacked to execute arbitrary Kernel code. The following steps can be used to reproduce the issue: # echo 1 > /proc/sys/net/core/bpf_jit_enable # modprobe test_bpf test_name="BPF_MAXINSNS: exec all MSH" This should produce multiple warnings from build_bimm() similar to: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 209 at arch/mips/mm/uasm-mips.c:210 build_insn+0x558/0x590 Micro-assembler field overflow Modules linked in: test_bpf(+) CPU: 0 PID: 209 Comm: modprobe Not tainted 5.14.3 #1 Stack : 00000000 807bb824 82b33c9c 801843c0 00000000 00000004 00000000 63c9b5ee 82b33af4 80999898 80910000 80900000 82fd6030 00000001 82b33a98 82087180 00000000 00000000 80873b28 00000000 000000fc 82b3394c 00000000 2e34312e 6d6d6f43 809a180f 809a1836 6f6d203a 80900000 00000001 82b33bac 80900000 00027f80 00000000 00000000 807bb824 00000000 804ed790 001cc317 00000001 [...] Call Trace: [<80108f44>] show_stack+0x38/0x118 [<807a7aac>] dump_stack_lvl+0x5c/0x7c [<807a4b3c>] __warn+0xcc/0x140 [<807a4c3c>] warn_slowpath_fmt+0x8c/0xb8 [<8011e198>] build_insn+0x558/0x590 [<8011e358>] uasm_i_bne+0x20/0x2c [<80127b48>] build_body+0xa58/0x2a94 [<80129c98>] bpf_jit_compile+0x114/0x1e4 [<80613fc4>] bpf_prepare_filter+0x2ec/0x4e4 [<8061423c>] bpf_prog_create+0x80/0xc4 [] test_bpf_init+0x300/0xba8 [test_bpf] [<8010051c>] do_one_initcall+0x50/0x1d4 [<801c5e54>] do_init_module+0x60/0x220 [<801c8b20>] sys_finit_module+0xc4/0xfc [<801144d0>] syscall_common+0x34/0x58 [...] ---[ end trace a287d9742503c645 ]--- Then the anomalous machine code executes: => 0xc0a18000: addiu sp,sp,-16 0xc0a18004: sw s3,0(sp) 0xc0a18008: sw s4,4(sp) 0xc0a1800c: sw s5,8(sp) 0xc0a18010: sw ra,12(sp) 0xc0a18014: move s5,a0 0xc0a18018: move s4,zero 0xc0a1801c: move s3,zero # __BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0) 0xc0a18020: lui t6,0x8012 0xc0a18024: ori t4,t6,0x9e14 0xc0a18028: li a1,0 0xc0a1802c: jalr t4 0xc0a18030: move a0,s5 0xc0a18034: bnez v0,0xc0a1ffb8 # incorrect branch offset 0xc0a18038: move v0,zero 0xc0a1803c: andi s4,s3,0xf 0xc0a18040: b 0xc0a18048 0xc0a18044: sll s4,s4,0x2 [...] # __BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0) 0xc0a1ffa0: lui t6,0x8012 0xc0a1ffa4: ori t4,t6,0x9e14 0xc0a1ffa8: li a1,0 0xc0a1ffac: jalr t4 0xc0a1ffb0: move a0,s5 0xc0a1ffb4: bnez v0,0xc0a1ffb8 # incorrect branch offset 0xc0a1ffb8: move v0,zero 0xc0a1ffbc: andi s4,s3,0xf 0xc0a1ffc0: b 0xc0a1ffc8 0xc0a1ffc4: sll s4,s4,0x2 # __BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0) 0xc0a1ffc8: lui t6,0x8012 0xc0a1ffcc: ori t4,t6,0x9e14 0xc0a1ffd0: li a1,0 0xc0a1ffd4: jalr t4 0xc0a1ffd8: move a0,s5 0xc0a1ffdc: bnez v0,0xc0a3ffb8 # correct branch offset 0xc0a1ffe0: move v0,zero 0xc0a1ffe4: andi s4,s3,0xf 0xc0a1ffe8: b 0xc0a1fff0 0xc0a1ffec: sll s4,s4,0x2 [...] # epilogue 0xc0a3ffb8: lw s3,0(sp) 0xc0a3ffbc: lw s4,4(sp) 0xc0a3ffc0: lw s5,8(sp) 0xc0a3ffc4: lw ra,12(sp) 0xc0a3ffc8: addiu sp,sp,16 0xc0a3ffcc: jr ra 0xc0a3ffd0: nop To mitigate this issue, we assert the branch ranges for each emit call that could generate an out-of-range branch. Fixes: 36366e367ee9 ("MIPS: BPF: Restore MIPS32 cBPF JIT") Fixes: c6610de353da ("MIPS: net: Add BPF JIT") Signed-off-by: Piotr Krysiuk Signed-off-by: Daniel Borkmann Tested-by: Johan Almbladh Acked-by: Johan Almbladh Cc: Paul Burton Cc: Thomas Bogendoerfer Link: https://lore.kernel.org/bpf/20210915160437.4080-1-piotras@gmail.com Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman --- arch/mips/net/bpf_jit.c | 57 +++++++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 14 deletions(-) diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c index 3a0e34f4e6153..29a288ff4f183 100644 --- a/arch/mips/net/bpf_jit.c +++ b/arch/mips/net/bpf_jit.c @@ -662,6 +662,11 @@ static void build_epilogue(struct jit_ctx *ctx) ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative : func) : \ func##_positive) +static bool is_bad_offset(int b_off) +{ + return b_off > 0x1ffff || b_off < -0x20000; +} + static int build_body(struct jit_ctx *ctx) { const struct bpf_prog *prog = ctx->skf; @@ -728,7 +733,10 @@ static int build_body(struct jit_ctx *ctx) /* Load return register on DS for failures */ emit_reg_move(r_ret, r_zero, ctx); /* Return with error */ - emit_b(b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_b(b_off, ctx); emit_nop(ctx); break; case BPF_LD | BPF_W | BPF_IND: @@ -775,8 +783,10 @@ static int build_body(struct jit_ctx *ctx) emit_jalr(MIPS_R_RA, r_s0, ctx); emit_reg_move(MIPS_R_A0, r_skb, ctx); /* delay slot */ /* Check the error value */ - emit_bcond(MIPS_COND_NE, r_ret, 0, - b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_bcond(MIPS_COND_NE, r_ret, 0, b_off, ctx); emit_reg_move(r_ret, r_zero, ctx); /* We are good */ /* X <- P[1:K] & 0xf */ @@ -855,8 +865,10 @@ static int build_body(struct jit_ctx *ctx) /* A /= X */ ctx->flags |= SEEN_X | SEEN_A; /* Check if r_X is zero */ - emit_bcond(MIPS_COND_EQ, r_X, r_zero, - b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_bcond(MIPS_COND_EQ, r_X, r_zero, b_off, ctx); emit_load_imm(r_ret, 0, ctx); /* delay slot */ emit_div(r_A, r_X, ctx); break; @@ -864,8 +876,10 @@ static int build_body(struct jit_ctx *ctx) /* A %= X */ ctx->flags |= SEEN_X | SEEN_A; /* Check if r_X is zero */ - emit_bcond(MIPS_COND_EQ, r_X, r_zero, - b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_bcond(MIPS_COND_EQ, r_X, r_zero, b_off, ctx); emit_load_imm(r_ret, 0, ctx); /* delay slot */ emit_mod(r_A, r_X, ctx); break; @@ -926,7 +940,10 @@ static int build_body(struct jit_ctx *ctx) break; case BPF_JMP | BPF_JA: /* pc += K */ - emit_b(b_imm(i + k + 1, ctx), ctx); + b_off = b_imm(i + k + 1, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_b(b_off, ctx); emit_nop(ctx); break; case BPF_JMP | BPF_JEQ | BPF_K: @@ -1056,12 +1073,16 @@ static int build_body(struct jit_ctx *ctx) break; case BPF_RET | BPF_A: ctx->flags |= SEEN_A; - if (i != prog->len - 1) + if (i != prog->len - 1) { /* * If this is not the last instruction * then jump to the epilogue */ - emit_b(b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_b(b_off, ctx); + } emit_reg_move(r_ret, r_A, ctx); /* delay slot */ break; case BPF_RET | BPF_K: @@ -1075,7 +1096,10 @@ static int build_body(struct jit_ctx *ctx) * If this is not the last instruction * then jump to the epilogue */ - emit_b(b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_b(b_off, ctx); emit_nop(ctx); } break; @@ -1133,8 +1157,10 @@ static int build_body(struct jit_ctx *ctx) /* Load *dev pointer */ emit_load_ptr(r_s0, r_skb, off, ctx); /* error (0) in the delay slot */ - emit_bcond(MIPS_COND_EQ, r_s0, r_zero, - b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_bcond(MIPS_COND_EQ, r_s0, r_zero, b_off, ctx); emit_reg_move(r_ret, r_zero, ctx); if (code == (BPF_ANC | SKF_AD_IFINDEX)) { BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); @@ -1244,7 +1270,10 @@ void bpf_jit_compile(struct bpf_prog *fp) /* Generate the actual JIT code */ build_prologue(&ctx); - build_body(&ctx); + if (build_body(&ctx)) { + module_memfree(ctx.target); + goto out; + } build_epilogue(&ctx); /* Update the icache */ From acc3c32240513a1d3515ac4174de602810c79f35 Mon Sep 17 00:00:00 2001 From: Antonio Martorana Date: Mon, 16 Aug 2021 17:24:39 -0700 Subject: [PATCH 1083/5344] soc: qcom: socinfo: Fixed argument passed to platform_set_data() [ Upstream commit 9c5a4ec69bbf5951f84ada9e0db9c6c50de61808 ] Set qcom_socinfo pointer as data being stored instead of pointer to soc_device structure. Aligns with future calls to platform_get_data() which expects qcom_socinfo pointer. Fixes: efb448d0a3fc ("soc: qcom: Add socinfo driver") Signed-off-by: Antonio Martorana Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/1629159879-95777-1-git-send-email-amartora@codeaurora.org Signed-off-by: Bjorn Andersson Signed-off-by: Sasha Levin --- drivers/soc/qcom/socinfo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/soc/qcom/socinfo.c b/drivers/soc/qcom/socinfo.c index 176696f8f38d1..3303bcaf67154 100644 --- a/drivers/soc/qcom/socinfo.c +++ b/drivers/soc/qcom/socinfo.c @@ -447,7 +447,7 @@ static int qcom_socinfo_probe(struct platform_device *pdev) /* Feed the soc specific unique data into entropy pool */ add_device_randomness(info, item_size); - platform_set_drvdata(pdev, qs->soc_dev); + platform_set_drvdata(pdev, qs); return 0; } From 5752585673b09a9afbad982b39a777c4fe843e18 Mon Sep 17 00:00:00 2001 From: Marijn Suijten Date: Sun, 29 Aug 2021 22:30:25 +0200 Subject: [PATCH 1084/5344] ARM: dts: qcom: apq8064: Use 27MHz PXO clock as DSI PLL reference [ Upstream commit f1db21c315f4b4f8c3fbea56aac500673132d317 ] The 28NM DSI PLL driver for msm8960 calculates with a 27MHz reference clock and should hence use PXO, not CXO which runs at 19.2MHz. Note that none of the DSI PHY/PLL drivers currently use this "ref" clock; they all rely on (sometimes inexistant) global clock names and usually function normally without a parent clock. This discrepancy will be corrected in a future patch, for which this change needs to be in place first. Fixes: 6969d1d9c615 ("ARM: dts: qcom-apq8064: Set 'cxo_board' as ref clock of the DSI PHY") Reviewed-by: Dmitry Baryshkov Signed-off-by: Marijn Suijten Link: https://lore.kernel.org/r/20210829203027.276143-2-marijn.suijten@somainline.org Signed-off-by: Bjorn Andersson Signed-off-by: Sasha Levin --- arch/arm/boot/dts/qcom-apq8064.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/qcom-apq8064.dtsi b/arch/arm/boot/dts/qcom-apq8064.dtsi index 27accc164df37..764984c95c686 100644 --- a/arch/arm/boot/dts/qcom-apq8064.dtsi +++ b/arch/arm/boot/dts/qcom-apq8064.dtsi @@ -198,7 +198,7 @@ clock-frequency = <19200000>; }; - pxo_board { + pxo_board: pxo_board { compatible = "fixed-clock"; #clock-cells = <0>; clock-frequency = <27000000>; @@ -1304,7 +1304,7 @@ reg-names = "dsi_pll", "dsi_phy", "dsi_phy_regulator"; clock-names = "iface_clk", "ref"; clocks = <&mmcc DSI_M_AHB_CLK>, - <&cxo_board>; + <&pxo_board>; }; From f431e1e6d30b184d94b8edd826209b71dfa7c101 Mon Sep 17 00:00:00 2001 From: Shawn Guo Date: Sat, 28 Aug 2021 15:02:02 +0800 Subject: [PATCH 1085/5344] soc: qcom: mdt_loader: Drop PT_LOAD check on hash segment [ Upstream commit 833d51d7c66d6708abbc02398892b96b950167b9 ] PT_LOAD type denotes that the segment should be loaded into the final firmware memory region. Hash segment is not one such, because it's only needed for PAS init and shouldn't be in the final firmware memory region. That's why mdt_phdr_valid() explicitly reject non PT_LOAD segment and hash segment. This actually makes the hash segment type check in qcom_mdt_read_metadata() unnecessary and redundant. For a hash segment, it won't be loaded into firmware memory region anyway, due to the QCOM_MDT_TYPE_HASH check in mdt_phdr_valid(), even if it has a PT_LOAD type for some reason (misusing or abusing?). Some firmware files on Sony phones are such examples, e.g WCNSS firmware of Sony Xperia M4 Aqua phone. The type of hash segment is just PT_LOAD. Drop the unnecessary hash segment type check in qcom_mdt_read_metadata() to fix firmware loading failure on these phones, while hash segment is still kept away from the final firmware memory region. Fixes: 498b98e93900 ("soc: qcom: mdt_loader: Support loading non-split images") Signed-off-by: Shawn Guo Reviewed-by: Marijn Suijten Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20210828070202.7033-1-shawn.guo@linaro.org Signed-off-by: Sasha Levin --- drivers/soc/qcom/mdt_loader.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/soc/qcom/mdt_loader.c b/drivers/soc/qcom/mdt_loader.c index eba7f76f9d61a..6034cd8992b0e 100644 --- a/drivers/soc/qcom/mdt_loader.c +++ b/drivers/soc/qcom/mdt_loader.c @@ -98,7 +98,7 @@ void *qcom_mdt_read_metadata(const struct firmware *fw, size_t *data_len) if (ehdr->e_phnum < 2) return ERR_PTR(-EINVAL); - if (phdrs[0].p_type == PT_LOAD || phdrs[1].p_type == PT_LOAD) + if (phdrs[0].p_type == PT_LOAD) return ERR_PTR(-EINVAL); if ((phdrs[1].p_flags & QCOM_MDT_TYPE_MASK) != QCOM_MDT_TYPE_HASH) From 447b7d19a422e59eb94b6c4ffa14ccec70dd8aaa Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Sun, 5 Sep 2021 02:00:48 +0200 Subject: [PATCH 1086/5344] ARM: dts: imx: Add missing pinctrl-names for panel on M53Menlo [ Upstream commit c8c1efe14a4aadcfe93a158b1272e48298d2de15 ] The panel already contains pinctrl-0 phandle, but it is missing the default pinctrl-names property, so the pin configuration is ignored. Fill in the missing pinctrl-names property, so the pin configuration is applied. Fixes: d81765d693db6 ("ARM: dts: imx53: Update LCD panel node on M53Menlo") Signed-off-by: Marek Vasut Cc: Shawn Guo Cc: Fabio Estevam Cc: NXP Linux Team Reviewed-by: Fabio Estevam Signed-off-by: Shawn Guo Signed-off-by: Sasha Levin --- arch/arm/boot/dts/imx53-m53menlo.dts | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/boot/dts/imx53-m53menlo.dts b/arch/arm/boot/dts/imx53-m53menlo.dts index 64faf5b46d92f..a6eb8319f321b 100644 --- a/arch/arm/boot/dts/imx53-m53menlo.dts +++ b/arch/arm/boot/dts/imx53-m53menlo.dts @@ -56,6 +56,7 @@ panel { compatible = "edt,etm0700g0dh6"; pinctrl-0 = <&pinctrl_display_gpio>; + pinctrl-names = "default"; enable-gpios = <&gpio6 0 GPIO_ACTIVE_HIGH>; port { From fee51173ae7461d2bc274b0428758b5a94bc5b72 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Sun, 5 Sep 2021 02:01:37 +0200 Subject: [PATCH 1087/5344] ARM: dts: imx: Fix USB host power regulator polarity on M53Menlo [ Upstream commit 5c187e2eb3f92daa38cb3d4ab45e1107ea34108e ] The MIC2025 switch input signal nEN is active low, describe it as such in the DT. The previous change to this regulator polarity was incorrectly influenced by broken quirks in gpiolib-of.c, which is now long fixed. So fix this regulator polarity setting here once and for all. Fixes: 3c3601cd6a6d3 ("ARM: dts: imx53: Update USB configuration on M53Menlo") Signed-off-by: Marek Vasut Cc: Shawn Guo Cc: Fabio Estevam Cc: NXP Linux Team Reviewed-by: Fabio Estevam Signed-off-by: Shawn Guo Signed-off-by: Sasha Levin --- arch/arm/boot/dts/imx53-m53menlo.dts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/imx53-m53menlo.dts b/arch/arm/boot/dts/imx53-m53menlo.dts index a6eb8319f321b..03c43c1912a7e 100644 --- a/arch/arm/boot/dts/imx53-m53menlo.dts +++ b/arch/arm/boot/dts/imx53-m53menlo.dts @@ -77,8 +77,7 @@ regulator-name = "vbus"; regulator-min-microvolt = <5000000>; regulator-max-microvolt = <5000000>; - gpio = <&gpio1 2 GPIO_ACTIVE_HIGH>; - enable-active-high; + gpio = <&gpio1 2 0>; }; }; From f5135362666c7970a819499cf1c9a1d2bee9f5f8 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Thu, 16 Sep 2021 18:13:39 +0300 Subject: [PATCH 1088/5344] arm64: dts: qcom: pm8150: use qcom,pm8998-pon binding [ Upstream commit a153d317168aa3d61a204fadc85bac3995381d33 ] Change pm8150 to use the qcom,pm8998-pon compatible string for the pon in order to pass reboot mode properly. Fixes: 5101f22a5c37 ("arm64: dts: qcom: pm8150: Add base dts file") Signed-off-by: Dmitry Baryshkov Tested-by: Amit Pundir Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20210916151341.1797512-1-dmitry.baryshkov@linaro.org Signed-off-by: Sasha Levin --- arch/arm64/boot/dts/qcom/pm8150.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/qcom/pm8150.dtsi b/arch/arm64/boot/dts/qcom/pm8150.dtsi index c0b197458665d..6f7dfcb8c0421 100644 --- a/arch/arm64/boot/dts/qcom/pm8150.dtsi +++ b/arch/arm64/boot/dts/qcom/pm8150.dtsi @@ -17,7 +17,7 @@ #size-cells = <0>; pon: power-on@800 { - compatible = "qcom,pm8916-pon"; + compatible = "qcom,pm8998-pon"; reg = <0x0800>; pwrkey { compatible = "qcom,pm8941-pwrkey"; From 86d3e25807f411b798c112ebff3b705d54c7c255 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Fri, 27 Sep 2019 21:28:47 -0700 Subject: [PATCH 1089/5344] xtensa: move XCHAL_KIO_* definitions to kmem_layout.h [ Upstream commit 6591685d50043f615a1ad7ddd5bb263ef54808fc ] These address and size definitions define xtensa kernel memory layout, move them from vectors.h to the kmem_layout.h Signed-off-by: Max Filippov Signed-off-by: Sasha Levin --- arch/xtensa/include/asm/kmem_layout.h | 29 ++++++++++++++++++ arch/xtensa/include/asm/vectors.h | 42 ++------------------------- 2 files changed, 32 insertions(+), 39 deletions(-) diff --git a/arch/xtensa/include/asm/kmem_layout.h b/arch/xtensa/include/asm/kmem_layout.h index 9c12babc016cd..7cbf68ca71060 100644 --- a/arch/xtensa/include/asm/kmem_layout.h +++ b/arch/xtensa/include/asm/kmem_layout.h @@ -11,6 +11,7 @@ #ifndef _XTENSA_KMEM_LAYOUT_H #define _XTENSA_KMEM_LAYOUT_H +#include #include #ifdef CONFIG_MMU @@ -65,6 +66,34 @@ #endif +/* KIO definition */ + +#if XCHAL_HAVE_PTP_MMU +#define XCHAL_KIO_CACHED_VADDR 0xe0000000 +#define XCHAL_KIO_BYPASS_VADDR 0xf0000000 +#define XCHAL_KIO_DEFAULT_PADDR 0xf0000000 +#else +#define XCHAL_KIO_BYPASS_VADDR XCHAL_KIO_PADDR +#define XCHAL_KIO_DEFAULT_PADDR 0x90000000 +#endif +#define XCHAL_KIO_SIZE 0x10000000 + +#if (!XCHAL_HAVE_PTP_MMU || XCHAL_HAVE_SPANNING_WAY) && defined(CONFIG_OF) +#define XCHAL_KIO_PADDR xtensa_get_kio_paddr() +#ifndef __ASSEMBLY__ +extern unsigned long xtensa_kio_paddr; + +static inline unsigned long xtensa_get_kio_paddr(void) +{ + return xtensa_kio_paddr; +} +#endif +#else +#define XCHAL_KIO_PADDR XCHAL_KIO_DEFAULT_PADDR +#endif + +/* KERNEL_STACK definition */ + #ifndef CONFIG_KASAN #define KERNEL_STACK_SHIFT 13 #else diff --git a/arch/xtensa/include/asm/vectors.h b/arch/xtensa/include/asm/vectors.h index 79fe3007919eb..4220c6dac44f4 100644 --- a/arch/xtensa/include/asm/vectors.h +++ b/arch/xtensa/include/asm/vectors.h @@ -21,50 +21,14 @@ #include #include -#if XCHAL_HAVE_PTP_MMU -#define XCHAL_KIO_CACHED_VADDR 0xe0000000 -#define XCHAL_KIO_BYPASS_VADDR 0xf0000000 -#define XCHAL_KIO_DEFAULT_PADDR 0xf0000000 -#else -#define XCHAL_KIO_BYPASS_VADDR XCHAL_KIO_PADDR -#define XCHAL_KIO_DEFAULT_PADDR 0x90000000 -#endif -#define XCHAL_KIO_SIZE 0x10000000 - -#if (!XCHAL_HAVE_PTP_MMU || XCHAL_HAVE_SPANNING_WAY) && defined(CONFIG_OF) -#define XCHAL_KIO_PADDR xtensa_get_kio_paddr() -#ifndef __ASSEMBLY__ -extern unsigned long xtensa_kio_paddr; - -static inline unsigned long xtensa_get_kio_paddr(void) -{ - return xtensa_kio_paddr; -} -#endif -#else -#define XCHAL_KIO_PADDR XCHAL_KIO_DEFAULT_PADDR -#endif - -#if defined(CONFIG_MMU) - -#if XCHAL_HAVE_PTP_MMU && XCHAL_HAVE_SPANNING_WAY -/* Image Virtual Start Address */ -#define KERNELOFFSET (XCHAL_KSEG_CACHED_VADDR + \ - CONFIG_KERNEL_LOAD_ADDRESS - \ +#if defined(CONFIG_MMU) && XCHAL_HAVE_PTP_MMU && XCHAL_HAVE_SPANNING_WAY +#define KERNELOFFSET (CONFIG_KERNEL_LOAD_ADDRESS + \ + XCHAL_KSEG_CACHED_VADDR - \ XCHAL_KSEG_PADDR) #else #define KERNELOFFSET CONFIG_KERNEL_LOAD_ADDRESS #endif -#else /* !defined(CONFIG_MMU) */ - /* MMU Not being used - Virtual == Physical */ - -/* Location of the start of the kernel text, _start */ -#define KERNELOFFSET CONFIG_KERNEL_LOAD_ADDRESS - - -#endif /* CONFIG_MMU */ - #define RESET_VECTOR1_VADDR (XCHAL_RESET_VECTOR1_VADDR) #ifdef CONFIG_VECTORS_OFFSET #define VECBASE_VADDR (KERNELOFFSET - CONFIG_VECTORS_OFFSET) From ce4b32f60c14c4e498fa7f1211b892eb48697a68 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 24 Sep 2021 20:29:51 -0700 Subject: [PATCH 1090/5344] xtensa: use CONFIG_USE_OF instead of CONFIG_OF MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit d67ed2510d28a1eb33171010d35cf52178cfcbdd ] CONFIG_OF can be set by a randconfig or by a user -- without setting the early flattree option (OF_EARLY_FLATTREE). This causes build errors. However, if randconfig or a user sets USE_OF in the Xtensa config, the right kconfig symbols are set to fix the build. Fixes these build errors: ../arch/xtensa/kernel/setup.c:67:19: error: ‘__dtb_start’ undeclared here (not in a function); did you mean ‘dtb_start’? 67 | void *dtb_start = __dtb_start; | ^~~~~~~~~~~ ../arch/xtensa/kernel/setup.c: In function 'xtensa_dt_io_area': ../arch/xtensa/kernel/setup.c:201:14: error: implicit declaration of function 'of_flat_dt_is_compatible'; did you mean 'of_machine_is_compatible'? [-Werror=implicit-function-declaration] 201 | if (!of_flat_dt_is_compatible(node, "simple-bus")) ../arch/xtensa/kernel/setup.c:204:18: error: implicit declaration of function 'of_get_flat_dt_prop' [-Werror=implicit-function-declaration] 204 | ranges = of_get_flat_dt_prop(node, "ranges", &len); ../arch/xtensa/kernel/setup.c:204:16: error: assignment to 'const __be32 *' {aka 'const unsigned int *'} from 'int' makes pointer from integer without a cast [-Werror=int-conversion] 204 | ranges = of_get_flat_dt_prop(node, "ranges", &len); | ^ ../arch/xtensa/kernel/setup.c: In function 'early_init_devtree': ../arch/xtensa/kernel/setup.c:228:9: error: implicit declaration of function 'early_init_dt_scan'; did you mean 'early_init_devtree'? [-Werror=implicit-function-declaration] 228 | early_init_dt_scan(params); ../arch/xtensa/kernel/setup.c:229:9: error: implicit declaration of function 'of_scan_flat_dt' [-Werror=implicit-function-declaration] 229 | of_scan_flat_dt(xtensa_dt_io_area, NULL); xtensa-elf-ld: arch/xtensa/mm/mmu.o:(.text+0x0): undefined reference to `xtensa_kio_paddr' Fixes: da844a81779e ("xtensa: add device trees support") Fixes: 6cb971114f63 ("xtensa: remap io area defined in device tree") Signed-off-by: Randy Dunlap Signed-off-by: Max Filippov Signed-off-by: Sasha Levin --- arch/xtensa/include/asm/kmem_layout.h | 2 +- arch/xtensa/kernel/setup.c | 12 ++++++------ arch/xtensa/mm/mmu.c | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/xtensa/include/asm/kmem_layout.h b/arch/xtensa/include/asm/kmem_layout.h index 7cbf68ca71060..6fc05cba61a27 100644 --- a/arch/xtensa/include/asm/kmem_layout.h +++ b/arch/xtensa/include/asm/kmem_layout.h @@ -78,7 +78,7 @@ #endif #define XCHAL_KIO_SIZE 0x10000000 -#if (!XCHAL_HAVE_PTP_MMU || XCHAL_HAVE_SPANNING_WAY) && defined(CONFIG_OF) +#if (!XCHAL_HAVE_PTP_MMU || XCHAL_HAVE_SPANNING_WAY) && defined(CONFIG_USE_OF) #define XCHAL_KIO_PADDR xtensa_get_kio_paddr() #ifndef __ASSEMBLY__ extern unsigned long xtensa_kio_paddr; diff --git a/arch/xtensa/kernel/setup.c b/arch/xtensa/kernel/setup.c index d08172138369b..5a25bc2b80521 100644 --- a/arch/xtensa/kernel/setup.c +++ b/arch/xtensa/kernel/setup.c @@ -64,7 +64,7 @@ extern unsigned long initrd_end; extern int initrd_below_start_ok; #endif -#ifdef CONFIG_OF +#ifdef CONFIG_USE_OF void *dtb_start = __dtb_start; #endif @@ -126,7 +126,7 @@ __tagtable(BP_TAG_INITRD, parse_tag_initrd); #endif /* CONFIG_BLK_DEV_INITRD */ -#ifdef CONFIG_OF +#ifdef CONFIG_USE_OF static int __init parse_tag_fdt(const bp_tag_t *tag) { @@ -136,7 +136,7 @@ static int __init parse_tag_fdt(const bp_tag_t *tag) __tagtable(BP_TAG_FDT, parse_tag_fdt); -#endif /* CONFIG_OF */ +#endif /* CONFIG_USE_OF */ static int __init parse_tag_cmdline(const bp_tag_t* tag) { @@ -184,7 +184,7 @@ static int __init parse_bootparam(const bp_tag_t *tag) } #endif -#ifdef CONFIG_OF +#ifdef CONFIG_USE_OF #if !XCHAL_HAVE_PTP_MMU || XCHAL_HAVE_SPANNING_WAY unsigned long xtensa_kio_paddr = XCHAL_KIO_DEFAULT_PADDR; @@ -233,7 +233,7 @@ void __init early_init_devtree(void *params) strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); } -#endif /* CONFIG_OF */ +#endif /* CONFIG_USE_OF */ /* * Initialize architecture. (Early stage) @@ -254,7 +254,7 @@ void __init init_arch(bp_tag_t *bp_start) if (bp_start) parse_bootparam(bp_start); -#ifdef CONFIG_OF +#ifdef CONFIG_USE_OF early_init_devtree(dtb_start); #endif diff --git a/arch/xtensa/mm/mmu.c b/arch/xtensa/mm/mmu.c index 03678c4afc39b..bc858a7f98ba4 100644 --- a/arch/xtensa/mm/mmu.c +++ b/arch/xtensa/mm/mmu.c @@ -101,7 +101,7 @@ void init_mmu(void) void init_kio(void) { -#if XCHAL_HAVE_PTP_MMU && XCHAL_HAVE_SPANNING_WAY && defined(CONFIG_OF) +#if XCHAL_HAVE_PTP_MMU && XCHAL_HAVE_SPANNING_WAY && defined(CONFIG_USE_OF) /* * Update the IO area mapping in case xtensa_kio_paddr has changed */ From dafe9957b683622860b364a2a44aa4025906572b Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Mon, 27 Sep 2021 09:46:33 -0700 Subject: [PATCH 1091/5344] xtensa: call irqchip_init only when CONFIG_USE_OF is selected [ Upstream commit 6489f8d0e1d93a3603d8dad8125797559e4cf2a2 ] During boot time kernel configured with OF=y but USE_OF=n displays the following warnings and hangs shortly after starting userspace: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 0 at kernel/irq/irqdomain.c:695 irq_create_mapping_affinity+0x29/0xc0 irq_create_mapping_affinity(, 6) called with NULL domain CPU: 0 PID: 0 Comm: swapper Not tainted 5.15.0-rc3-00001-gd67ed2510d28 #30 Call Trace: __warn+0x69/0xc4 warn_slowpath_fmt+0x6c/0x94 irq_create_mapping_affinity+0x29/0xc0 local_timer_setup+0x40/0x88 time_init+0xb1/0xe8 start_kernel+0x31d/0x3f4 _startup+0x13b/0x13b ---[ end trace 1e6630e1c5eda35b ]--- ------------[ cut here ]------------ WARNING: CPU: 0 PID: 0 at arch/xtensa/kernel/time.c:141 local_timer_setup+0x58/0x88 error: can't map timer irq CPU: 0 PID: 0 Comm: swapper Tainted: G W 5.15.0-rc3-00001-gd67ed2510d28 #30 Call Trace: __warn+0x69/0xc4 warn_slowpath_fmt+0x6c/0x94 local_timer_setup+0x58/0x88 time_init+0xb1/0xe8 start_kernel+0x31d/0x3f4 _startup+0x13b/0x13b ---[ end trace 1e6630e1c5eda35c ]--- Failed to request irq 0 (timer) Fix that by calling irqchip_init only when CONFIG_USE_OF is selected and calling legacy interrupt controller init otherwise. Fixes: da844a81779e ("xtensa: add device trees support") Signed-off-by: Max Filippov Signed-off-by: Sasha Levin --- arch/xtensa/kernel/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/xtensa/kernel/irq.c b/arch/xtensa/kernel/irq.c index a48bf2d10ac2d..80cc9770a8d2d 100644 --- a/arch/xtensa/kernel/irq.c +++ b/arch/xtensa/kernel/irq.c @@ -145,7 +145,7 @@ unsigned xtensa_get_ext_irq_no(unsigned irq) void __init init_IRQ(void) { -#ifdef CONFIG_OF +#ifdef CONFIG_USE_OF irqchip_init(); #else #ifdef CONFIG_HAVE_SMP From dabe60f8780c773d50ed44059c49abb2e026674f Mon Sep 17 00:00:00 2001 From: Johan Almbladh Date: Tue, 28 Sep 2021 11:13:10 +0200 Subject: [PATCH 1092/5344] bpf, arm: Fix register clobbering in div/mod implementation [ Upstream commit 79e3445b38e0cab94264a3894c0c3d57c930b97e ] On ARM CPUs that lack div/mod instructions, ALU32 BPF_DIV and BPF_MOD are implemented using a call to a helper function. Before, the emitted code for those function calls failed to preserve caller-saved ARM registers. Since some of those registers happen to be mapped to BPF registers, it resulted in eBPF register values being overwritten. This patch emits code to push and pop the remaining caller-saved ARM registers r2-r3 into the stack during the div/mod function call. ARM registers r0-r1 are used as arguments and return value, and those were already saved and restored correctly. Fixes: 39c13c204bb1 ("arm: eBPF JIT compiler") Signed-off-by: Johan Almbladh Signed-off-by: Daniel Borkmann Signed-off-by: Sasha Levin --- arch/arm/net/bpf_jit_32.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index b51a8c7b01114..1c6e57f1dbc48 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -36,6 +36,10 @@ * +-----+ * |RSVD | JIT scratchpad * current ARM_SP => +-----+ <= (BPF_FP - STACK_SIZE + SCRATCH_SIZE) + * | ... | caller-saved registers + * +-----+ + * | ... | arguments passed on stack + * ARM_SP during call => +-----| * | | * | ... | Function call stack * | | @@ -63,6 +67,12 @@ * * When popping registers off the stack at the end of a BPF function, we * reference them via the current ARM_FP register. + * + * Some eBPF operations are implemented via a call to a helper function. + * Such calls are "invisible" in the eBPF code, so it is up to the calling + * program to preserve any caller-saved ARM registers during the call. The + * JIT emits code to push and pop those registers onto the stack, immediately + * above the callee stack frame. */ #define CALLEE_MASK (1 << ARM_R4 | 1 << ARM_R5 | 1 << ARM_R6 | \ 1 << ARM_R7 | 1 << ARM_R8 | 1 << ARM_R9 | \ @@ -70,6 +80,8 @@ #define CALLEE_PUSH_MASK (CALLEE_MASK | 1 << ARM_LR) #define CALLEE_POP_MASK (CALLEE_MASK | 1 << ARM_PC) +#define CALLER_MASK (1 << ARM_R0 | 1 << ARM_R1 | 1 << ARM_R2 | 1 << ARM_R3) + enum { /* Stack layout - these are offsets from (top of stack - 4) */ BPF_R2_HI, @@ -464,6 +476,7 @@ static inline int epilogue_offset(const struct jit_ctx *ctx) static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, u8 op) { + const int exclude_mask = BIT(ARM_R0) | BIT(ARM_R1); const s8 *tmp = bpf2a32[TMP_REG_1]; #if __LINUX_ARM_ARCH__ == 7 @@ -495,11 +508,17 @@ static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, u8 op) emit(ARM_MOV_R(ARM_R0, rm), ctx); } + /* Push caller-saved registers on stack */ + emit(ARM_PUSH(CALLER_MASK & ~exclude_mask), ctx); + /* Call appropriate function */ emit_mov_i(ARM_IP, op == BPF_DIV ? (u32)jit_udiv32 : (u32)jit_mod32, ctx); emit_blx_r(ARM_IP, ctx); + /* Restore caller-saved registers from stack */ + emit(ARM_POP(CALLER_MASK & ~exclude_mask), ctx); + /* Save return value */ if (rd != ARM_R0) emit(ARM_MOV_R(rd, ARM_R0), ctx); From bbbbed729df34ceda65f85d854d4e2055bde69b5 Mon Sep 17 00:00:00 2001 From: Tatsuhiko Yasumatsu Date: Thu, 30 Sep 2021 22:55:45 +0900 Subject: [PATCH 1093/5344] bpf: Fix integer overflow in prealloc_elems_and_freelist() [ Upstream commit 30e29a9a2bc6a4888335a6ede968b75cd329657a ] In prealloc_elems_and_freelist(), the multiplication to calculate the size passed to bpf_map_area_alloc() could lead to an integer overflow. As a result, out-of-bounds write could occur in pcpu_freelist_populate() as reported by KASAN: [...] [ 16.968613] BUG: KASAN: slab-out-of-bounds in pcpu_freelist_populate+0xd9/0x100 [ 16.969408] Write of size 8 at addr ffff888104fc6ea0 by task crash/78 [ 16.970038] [ 16.970195] CPU: 0 PID: 78 Comm: crash Not tainted 5.15.0-rc2+ #1 [ 16.970878] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 [ 16.972026] Call Trace: [ 16.972306] dump_stack_lvl+0x34/0x44 [ 16.972687] print_address_description.constprop.0+0x21/0x140 [ 16.973297] ? pcpu_freelist_populate+0xd9/0x100 [ 16.973777] ? pcpu_freelist_populate+0xd9/0x100 [ 16.974257] kasan_report.cold+0x7f/0x11b [ 16.974681] ? pcpu_freelist_populate+0xd9/0x100 [ 16.975190] pcpu_freelist_populate+0xd9/0x100 [ 16.975669] stack_map_alloc+0x209/0x2a0 [ 16.976106] __sys_bpf+0xd83/0x2ce0 [...] The possibility of this overflow was originally discussed in [0], but was overlooked. Fix the integer overflow by changing elem_size to u64 from u32. [0] https://lore.kernel.org/bpf/728b238e-a481-eb50-98e9-b0f430ab01e7@gmail.com/ Fixes: 557c0c6e7df8 ("bpf: convert stackmap to pre-allocation") Signed-off-by: Tatsuhiko Yasumatsu Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210930135545.173698-1-th.yasumatsu@gmail.com Signed-off-by: Sasha Levin --- kernel/bpf/stackmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index d7e57143f5195..45a7130154a98 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -59,7 +59,8 @@ static inline int stack_map_data_size(struct bpf_map *map) static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) { - u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size; + u64 elem_size = sizeof(struct stack_map_bucket) + + (u64)smap->map.value_size; int err; smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries, From 412cc19fc3e2b25c1b1a668bfb956c128f38e1d6 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Thu, 30 Sep 2021 20:50:28 +0300 Subject: [PATCH 1094/5344] phy: mdio: fix memory leak [ Upstream commit ca6e11c337daf7925ff8a2aac8e84490a8691905 ] Syzbot reported memory leak in MDIO bus interface, the problem was in wrong state logic. MDIOBUS_ALLOCATED indicates 2 states: 1. Bus is only allocated 2. Bus allocated and __mdiobus_register() fails, but device_register() was called In case of device_register() has been called we should call put_device() to correctly free the memory allocated for this device, but mdiobus_free() calls just kfree(dev) in case of MDIOBUS_ALLOCATED state To avoid this behaviour we need to set bus->state to MDIOBUS_UNREGISTERED _before_ calling device_register(), because put_device() should be called even in case of device_register() failure. Link: https://lore.kernel.org/netdev/YVMRWNDZDUOvQjHL@shell.armlinux.org.uk/ Fixes: 46abc02175b3 ("phylib: give mdio buses a device tree presence") Reported-and-tested-by: syzbot+398e7dc692ddbbb4cfec@syzkaller.appspotmail.com Reviewed-by: Dan Carpenter Signed-off-by: Pavel Skripkin Link: https://lore.kernel.org/r/eceae1429fbf8fa5c73dd2a0d39d525aa905074d.1633024062.git.paskripkin@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/phy/mdio_bus.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 5bf06eac04ba3..bec73f0640d03 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -385,6 +385,13 @@ int __mdiobus_register(struct mii_bus *bus, struct module *owner) bus->dev.groups = NULL; dev_set_name(&bus->dev, "%s", bus->id); + /* We need to set state to MDIOBUS_UNREGISTERED to correctly release + * the device in mdiobus_free() + * + * State will be updated later in this function in case of success + */ + bus->state = MDIOBUS_UNREGISTERED; + err = device_register(&bus->dev); if (err) { pr_err("mii_bus %s failed to register\n", bus->id); From 3752bf87c6685a922a7bce9709d3bf765072ad75 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Sep 2021 14:22:39 -0700 Subject: [PATCH 1095/5344] net_sched: fix NULL deref in fifo_set_limit() [ Upstream commit 560ee196fe9e5037e5015e2cdb14b3aecb1cd7dc ] syzbot reported another NULL deref in fifo_set_limit() [1] I could repro the issue with : unshare -n tc qd add dev lo root handle 1:0 tbf limit 200000 burst 70000 rate 100Mbit tc qd replace dev lo parent 1:0 pfifo_fast tc qd change dev lo root handle 1:0 tbf limit 300000 burst 70000 rate 100Mbit pfifo_fast does not have a change() operation. Make fifo_set_limit() more robust about this. [1] BUG: kernel NULL pointer dereference, address: 0000000000000000 PGD 1cf99067 P4D 1cf99067 PUD 7ca49067 PMD 0 Oops: 0010 [#1] PREEMPT SMP KASAN CPU: 1 PID: 14443 Comm: syz-executor959 Not tainted 5.15.0-rc3-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:0x0 Code: Unable to access opcode bytes at RIP 0xffffffffffffffd6. RSP: 0018:ffffc9000e2f7310 EFLAGS: 00010246 RAX: dffffc0000000000 RBX: ffffffff8d6ecc00 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffff888024c27910 RDI: ffff888071e34000 RBP: ffff888071e34000 R08: 0000000000000001 R09: ffffffff8fcfb947 R10: 0000000000000001 R11: 0000000000000000 R12: ffff888024c27910 R13: ffff888071e34018 R14: 0000000000000000 R15: ffff88801ef74800 FS: 00007f321d897700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffffffffd6 CR3: 00000000722c3000 CR4: 00000000003506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: fifo_set_limit net/sched/sch_fifo.c:242 [inline] fifo_set_limit+0x198/0x210 net/sched/sch_fifo.c:227 tbf_change+0x6ec/0x16d0 net/sched/sch_tbf.c:418 qdisc_change net/sched/sch_api.c:1332 [inline] tc_modify_qdisc+0xd9a/0x1a60 net/sched/sch_api.c:1634 rtnetlink_rcv_msg+0x413/0xb80 net/core/rtnetlink.c:5572 netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2504 netlink_unicast_kernel net/netlink/af_netlink.c:1314 [inline] netlink_unicast+0x533/0x7d0 net/netlink/af_netlink.c:1340 netlink_sendmsg+0x86d/0xdb0 net/netlink/af_netlink.c:1929 sock_sendmsg_nosec net/socket.c:704 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:724 ____sys_sendmsg+0x6e8/0x810 net/socket.c:2409 ___sys_sendmsg+0xf3/0x170 net/socket.c:2463 __sys_sendmsg+0xe5/0x1b0 net/socket.c:2492 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae Fixes: fb0305ce1b03 ("net-sched: consolidate default fifo qdisc setup") Signed-off-by: Eric Dumazet Reported-by: syzbot Link: https://lore.kernel.org/r/20210930212239.3430364-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/sched/sch_fifo.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index 37c8aa75d70c5..56f4c1621e444 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -148,6 +148,9 @@ int fifo_set_limit(struct Qdisc *q, unsigned int limit) if (strncmp(q->ops->id + 1, "fifo", 4) != 0) return 0; + if (!q->ops->change) + return 0; + nla = kmalloc(nla_attr_size(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); if (nla) { nla->nla_type = RTM_NEWQDISC; From 4456ba2f542fd9893657ce30e5391e5d8a1af8c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Sat, 2 Oct 2021 11:04:09 +0200 Subject: [PATCH 1096/5344] powerpc/fsl/dts: Fix phy-connection-type for fm1mac3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit eed183abc0d3b8adb64fd1363b7cea7986cd58d6 ] Property phy-connection-type contains invalid value "sgmii-2500" per scheme defined in file ethernet-controller.yaml. Correct phy-connection-type value should be "2500base-x". Signed-off-by: Pali Rohár Fixes: 84e0f1c13806 ("powerpc/mpc85xx: Add MDIO bus muxing support to the board device tree(s)") Acked-by: Scott Wood Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- arch/powerpc/boot/dts/fsl/t1023rdb.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/boot/dts/fsl/t1023rdb.dts b/arch/powerpc/boot/dts/fsl/t1023rdb.dts index 5ba6fbfca2742..f82f85c65964c 100644 --- a/arch/powerpc/boot/dts/fsl/t1023rdb.dts +++ b/arch/powerpc/boot/dts/fsl/t1023rdb.dts @@ -154,7 +154,7 @@ fm1mac3: ethernet@e4000 { phy-handle = <&sgmii_aqr_phy3>; - phy-connection-type = "sgmii-2500"; + phy-connection-type = "2500base-x"; sleep = <&rcpm 0x20000000>; }; From 1bb5248dcab51d207d9a0ebe0b2760d23dcba87e Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 1 Oct 2021 19:20:33 +0300 Subject: [PATCH 1097/5344] ptp_pch: Load module automatically if ID matches [ Upstream commit 7cd8b1542a7ba0720c5a0a85ed414a122015228b ] The driver can't be loaded automatically because it misses module alias to be provided. Add corresponding MODULE_DEVICE_TABLE() call to the driver. Fixes: 863d08ece9bf ("supports eg20t ptp clock") Signed-off-by: Andy Shevchenko Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/ptp/ptp_pch.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/ptp/ptp_pch.c b/drivers/ptp/ptp_pch.c index dcd6e00c80467..a50656632df93 100644 --- a/drivers/ptp/ptp_pch.c +++ b/drivers/ptp/ptp_pch.c @@ -683,6 +683,7 @@ static const struct pci_device_id pch_ieee1588_pcidev_id[] = { }, {0} }; +MODULE_DEVICE_TABLE(pci, pch_ieee1588_pcidev_id); static struct pci_driver pch_driver = { .name = KBUILD_MODNAME, From 70f00935f677b886e3d3c511857c002f8d487681 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 28 Aug 2020 14:05:56 +0100 Subject: [PATCH 1098/5344] arm64: dts: freescale: Fix SP805 clock-names [ Upstream commit f2dc2359b75e1fd345fd710862f73db20dc55864 ] The SP805 binding sets the order of the clock-names to be: "wdog_clk", "apb_pclk" (in exactly that order). Change the order in the DTs for Freescale platforms to match that. The two clocks given in all nodes are actually the same, so that does not change any behaviour. Signed-off-by: Andre Przywara Signed-off-by: Shawn Guo Signed-off-by: Sasha Levin --- arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi | 4 ++-- arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi | 16 ++++++++-------- arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi | 16 ++++++++-------- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi index 5716ac20bddd1..963091069ab30 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi @@ -496,14 +496,14 @@ compatible = "arm,sp805", "arm,primecell"; reg = <0x0 0xc000000 0x0 0x1000>; clocks = <&clockgen 4 15>, <&clockgen 4 15>; - clock-names = "apb_pclk", "wdog_clk"; + clock-names = "wdog_clk", "apb_pclk"; }; cluster1_core1_watchdog: watchdog@c010000 { compatible = "arm,sp805", "arm,primecell"; reg = <0x0 0xc010000 0x0 0x1000>; clocks = <&clockgen 4 15>, <&clockgen 4 15>; - clock-names = "apb_pclk", "wdog_clk"; + clock-names = "wdog_clk", "apb_pclk"; }; sai1: audio-controller@f100000 { diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi index c676d0771762f..407ebdb35cd2e 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi @@ -640,56 +640,56 @@ compatible = "arm,sp805-wdt", "arm,primecell"; reg = <0x0 0xc000000 0x0 0x1000>; clocks = <&clockgen 4 3>, <&clockgen 4 3>; - clock-names = "apb_pclk", "wdog_clk"; + clock-names = "wdog_clk", "apb_pclk"; }; cluster1_core1_watchdog: wdt@c010000 { compatible = "arm,sp805-wdt", "arm,primecell"; reg = <0x0 0xc010000 0x0 0x1000>; clocks = <&clockgen 4 3>, <&clockgen 4 3>; - clock-names = "apb_pclk", "wdog_clk"; + clock-names = "wdog_clk", "apb_pclk"; }; cluster1_core2_watchdog: wdt@c020000 { compatible = "arm,sp805-wdt", "arm,primecell"; reg = <0x0 0xc020000 0x0 0x1000>; clocks = <&clockgen 4 3>, <&clockgen 4 3>; - clock-names = "apb_pclk", "wdog_clk"; + clock-names = "wdog_clk", "apb_pclk"; }; cluster1_core3_watchdog: wdt@c030000 { compatible = "arm,sp805-wdt", "arm,primecell"; reg = <0x0 0xc030000 0x0 0x1000>; clocks = <&clockgen 4 3>, <&clockgen 4 3>; - clock-names = "apb_pclk", "wdog_clk"; + clock-names = "wdog_clk", "apb_pclk"; }; cluster2_core0_watchdog: wdt@c100000 { compatible = "arm,sp805-wdt", "arm,primecell"; reg = <0x0 0xc100000 0x0 0x1000>; clocks = <&clockgen 4 3>, <&clockgen 4 3>; - clock-names = "apb_pclk", "wdog_clk"; + clock-names = "wdog_clk", "apb_pclk"; }; cluster2_core1_watchdog: wdt@c110000 { compatible = "arm,sp805-wdt", "arm,primecell"; reg = <0x0 0xc110000 0x0 0x1000>; clocks = <&clockgen 4 3>, <&clockgen 4 3>; - clock-names = "apb_pclk", "wdog_clk"; + clock-names = "wdog_clk", "apb_pclk"; }; cluster2_core2_watchdog: wdt@c120000 { compatible = "arm,sp805-wdt", "arm,primecell"; reg = <0x0 0xc120000 0x0 0x1000>; clocks = <&clockgen 4 3>, <&clockgen 4 3>; - clock-names = "apb_pclk", "wdog_clk"; + clock-names = "wdog_clk", "apb_pclk"; }; cluster2_core3_watchdog: wdt@c130000 { compatible = "arm,sp805-wdt", "arm,primecell"; reg = <0x0 0xc130000 0x0 0x1000>; clocks = <&clockgen 4 3>, <&clockgen 4 3>; - clock-names = "apb_pclk", "wdog_clk"; + clock-names = "wdog_clk", "apb_pclk"; }; fsl_mc: fsl-mc@80c000000 { diff --git a/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi index cdb2fa47637da..82f0fe6acbfb7 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi @@ -230,56 +230,56 @@ compatible = "arm,sp805-wdt", "arm,primecell"; reg = <0x0 0xc000000 0x0 0x1000>; clocks = <&clockgen 4 3>, <&clockgen 4 3>; - clock-names = "apb_pclk", "wdog_clk"; + clock-names = "wdog_clk", "apb_pclk"; }; cluster1_core1_watchdog: wdt@c010000 { compatible = "arm,sp805-wdt", "arm,primecell"; reg = <0x0 0xc010000 0x0 0x1000>; clocks = <&clockgen 4 3>, <&clockgen 4 3>; - clock-names = "apb_pclk", "wdog_clk"; + clock-names = "wdog_clk", "apb_pclk"; }; cluster2_core0_watchdog: wdt@c100000 { compatible = "arm,sp805-wdt", "arm,primecell"; reg = <0x0 0xc100000 0x0 0x1000>; clocks = <&clockgen 4 3>, <&clockgen 4 3>; - clock-names = "apb_pclk", "wdog_clk"; + clock-names = "wdog_clk", "apb_pclk"; }; cluster2_core1_watchdog: wdt@c110000 { compatible = "arm,sp805-wdt", "arm,primecell"; reg = <0x0 0xc110000 0x0 0x1000>; clocks = <&clockgen 4 3>, <&clockgen 4 3>; - clock-names = "apb_pclk", "wdog_clk"; + clock-names = "wdog_clk", "apb_pclk"; }; cluster3_core0_watchdog: wdt@c200000 { compatible = "arm,sp805-wdt", "arm,primecell"; reg = <0x0 0xc200000 0x0 0x1000>; clocks = <&clockgen 4 3>, <&clockgen 4 3>; - clock-names = "apb_pclk", "wdog_clk"; + clock-names = "wdog_clk", "apb_pclk"; }; cluster3_core1_watchdog: wdt@c210000 { compatible = "arm,sp805-wdt", "arm,primecell"; reg = <0x0 0xc210000 0x0 0x1000>; clocks = <&clockgen 4 3>, <&clockgen 4 3>; - clock-names = "apb_pclk", "wdog_clk"; + clock-names = "wdog_clk", "apb_pclk"; }; cluster4_core0_watchdog: wdt@c300000 { compatible = "arm,sp805-wdt", "arm,primecell"; reg = <0x0 0xc300000 0x0 0x1000>; clocks = <&clockgen 4 3>, <&clockgen 4 3>; - clock-names = "apb_pclk", "wdog_clk"; + clock-names = "wdog_clk", "apb_pclk"; }; cluster4_core1_watchdog: wdt@c310000 { compatible = "arm,sp805-wdt", "arm,primecell"; reg = <0x0 0xc310000 0x0 0x1000>; clocks = <&clockgen 4 3>, <&clockgen 4 3>; - clock-names = "apb_pclk", "wdog_clk"; + clock-names = "wdog_clk", "apb_pclk"; }; crypto: crypto@8000000 { From 7667616316968199711504db52a562df4808acd8 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Thu, 1 Oct 2020 11:11:30 +0200 Subject: [PATCH 1099/5344] arm64: dts: ls1028a: add missing CAN nodes [ Upstream commit 04fa4f03e3533f51b4db19cb487435f5862a0514 ] The LS1028A has two FlexCAN controller. These are compatible with the ones from the LX2160A. Add the nodes. The first controller was tested on the Kontron sl28 board. Signed-off-by: Michael Walle Signed-off-by: Shawn Guo Signed-off-by: Sasha Levin --- arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi index 963091069ab30..02ae6bfff5658 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi @@ -287,6 +287,24 @@ status = "disabled"; }; + can0: can@2180000 { + compatible = "fsl,ls1028ar1-flexcan", "fsl,lx2160ar1-flexcan"; + reg = <0x0 0x2180000 0x0 0x10000>; + interrupts = ; + clocks = <&sysclk>, <&clockgen 4 1>; + clock-names = "ipg", "per"; + status = "disabled"; + }; + + can1: can@2190000 { + compatible = "fsl,ls1028ar1-flexcan", "fsl,lx2160ar1-flexcan"; + reg = <0x0 0x2190000 0x0 0x10000>; + interrupts = ; + clocks = <&sysclk>, <&clockgen 4 1>; + clock-names = "ipg", "per"; + status = "disabled"; + }; + duart0: serial@21c0500 { compatible = "fsl,ns16550", "ns16550a"; reg = <0x00 0x21c0500 0x0 0x100>; From e5c4ef992d4c02cbcb4fbf9eb0f7d811544c0174 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Tue, 28 Sep 2021 15:49:40 +0200 Subject: [PATCH 1100/5344] ARM: imx6: disable the GIC CPU interface before calling stby-poweroff sequence [ Upstream commit 783f3db030563f7bcdfe2d26428af98ea1699a8e ] Any pending interrupt can prevent entering standby based power off state. To avoid it, disable the GIC CPU interface. Fixes: 8148d2136002 ("ARM: imx6: register pm_power_off handler if "fsl,pmic-stby-poweroff" is set") Signed-off-by: Oleksij Rempel Signed-off-by: Shawn Guo Signed-off-by: Sasha Levin --- arch/arm/mach-imx/pm-imx6.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/mach-imx/pm-imx6.c b/arch/arm/mach-imx/pm-imx6.c index baf3b47601af0..1b73e4e76310c 100644 --- a/arch/arm/mach-imx/pm-imx6.c +++ b/arch/arm/mach-imx/pm-imx6.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -618,6 +619,7 @@ static void __init imx6_pm_common_init(const struct imx6_pm_socdata static void imx6_pm_stby_poweroff(void) { + gic_cpu_if_down(0); imx6_set_lpm(STOP_POWER_OFF); imx6q_suspend_finish(0); From ee969f1e2511e5ff07c23349414b978a988406b1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Oct 2021 18:05:07 -0700 Subject: [PATCH 1101/5344] net: bridge: use nla_total_size_64bit() in br_get_linkxstats_size() [ Upstream commit dbe0b88064494b7bb6a9b2aa7e085b14a3112d44 ] bridge_fill_linkxstats() is using nla_reserve_64bit(). We must use nla_total_size_64bit() instead of nla_total_size() for corresponding data structure. Fixes: 1080ab95e3c7 ("net: bridge: add support for IGMP/MLD stats and export them via netlink") Signed-off-by: Eric Dumazet Cc: Nikolay Aleksandrov Cc: Vivien Didelot Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/bridge/br_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 8a664148f57aa..cbcbc19efcb34 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1536,7 +1536,7 @@ static size_t br_get_linkxstats_size(const struct net_device *dev, int attr) } return numvls * nla_total_size(sizeof(struct bridge_vlan_xstats)) + - nla_total_size(sizeof(struct br_mcast_stats)) + + nla_total_size_64bit(sizeof(struct br_mcast_stats)) + nla_total_size(0); } From ec1216b0aea124a70bc71fdd6ff84404897c2f99 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Mon, 4 Oct 2021 17:50:02 -0400 Subject: [PATCH 1102/5344] net: sfp: Fix typo in state machine debug string [ Upstream commit 25a9da6641f1f66006e93ddbefee13a437efa8c0 ] The string should be "tx_disable" to match the state enum. Fixes: 4005a7cb4f55 ("net: phy: sftp: print debug message with text, not numbers") Signed-off-by: Sean Anderson Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/phy/sfp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c index 27b67f12ec455..5657c604602e8 100644 --- a/drivers/net/phy/sfp.c +++ b/drivers/net/phy/sfp.c @@ -115,7 +115,7 @@ static const char * const sm_state_strings[] = { [SFP_S_LINK_UP] = "link_up", [SFP_S_TX_FAULT] = "tx_fault", [SFP_S_REINIT] = "reinit", - [SFP_S_TX_DISABLE] = "rx_disable", + [SFP_S_TX_DISABLE] = "tx_disable", }; static const char *sm_state_to_str(unsigned short sm_state) From 689bc4607f70d9a92ba356459c459f7773ce3b4c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Oct 2021 14:24:15 -0700 Subject: [PATCH 1103/5344] netlink: annotate data races around nlk->bound [ Upstream commit 7707a4d01a648e4c655101a469c956cb11273655 ] While existing code is correct, KCSAN is reporting a data-race in netlink_insert / netlink_sendmsg [1] It is correct to read nlk->bound without a lock, as netlink_autobind() will acquire all needed locks. [1] BUG: KCSAN: data-race in netlink_insert / netlink_sendmsg write to 0xffff8881031c8b30 of 1 bytes by task 18752 on cpu 0: netlink_insert+0x5cc/0x7f0 net/netlink/af_netlink.c:597 netlink_autobind+0xa9/0x150 net/netlink/af_netlink.c:842 netlink_sendmsg+0x479/0x7c0 net/netlink/af_netlink.c:1892 sock_sendmsg_nosec net/socket.c:703 [inline] sock_sendmsg net/socket.c:723 [inline] ____sys_sendmsg+0x360/0x4d0 net/socket.c:2392 ___sys_sendmsg net/socket.c:2446 [inline] __sys_sendmsg+0x1ed/0x270 net/socket.c:2475 __do_sys_sendmsg net/socket.c:2484 [inline] __se_sys_sendmsg net/socket.c:2482 [inline] __x64_sys_sendmsg+0x42/0x50 net/socket.c:2482 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3d/0x90 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff8881031c8b30 of 1 bytes by task 18751 on cpu 1: netlink_sendmsg+0x270/0x7c0 net/netlink/af_netlink.c:1891 sock_sendmsg_nosec net/socket.c:703 [inline] sock_sendmsg net/socket.c:723 [inline] __sys_sendto+0x2a8/0x370 net/socket.c:2019 __do_sys_sendto net/socket.c:2031 [inline] __se_sys_sendto net/socket.c:2027 [inline] __x64_sys_sendto+0x74/0x90 net/socket.c:2027 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3d/0x90 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x00 -> 0x01 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 18751 Comm: syz-executor.0 Not tainted 5.14.0-rc1-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: da314c9923fe ("netlink: Replace rhash_portid with bound") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/netlink/af_netlink.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index acc76a738cfd8..cb35680db9b29 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -585,7 +585,10 @@ static int netlink_insert(struct sock *sk, u32 portid) /* We need to ensure that the socket is hashed and visible. */ smp_wmb(); - nlk_sk(sk)->bound = portid; + /* Paired with lockless reads from netlink_bind(), + * netlink_connect() and netlink_sendmsg(). + */ + WRITE_ONCE(nlk_sk(sk)->bound, portid); err: release_sock(sk); @@ -1003,7 +1006,8 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, if (nlk->ngroups < BITS_PER_LONG) groups &= (1UL << nlk->ngroups) - 1; - bound = nlk->bound; + /* Paired with WRITE_ONCE() in netlink_insert() */ + bound = READ_ONCE(nlk->bound); if (bound) { /* Ensure nlk->portid is up-to-date. */ smp_rmb(); @@ -1089,8 +1093,9 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, /* No need for barriers here as we return to user-space without * using any of the bound attributes. + * Paired with WRITE_ONCE() in netlink_insert(). */ - if (!nlk->bound) + if (!READ_ONCE(nlk->bound)) err = netlink_autobind(sock); if (err == 0) { @@ -1879,7 +1884,8 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) dst_group = nlk->dst_group; } - if (!nlk->bound) { + /* Paired with WRITE_ONCE() in netlink_insert() */ + if (!READ_ONCE(nlk->bound)) { err = netlink_autobind(sock); if (err) goto out; From c2dedecb8e56cd1d5f2a613ad3881766a70d424a Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Wed, 8 Sep 2021 08:49:36 +0300 Subject: [PATCH 1104/5344] bus: ti-sysc: Use CLKDM_NOAUTO for dra7 dcan1 for errata i893 [ Upstream commit b13a270ace2e4c70653aa1d1d0394c553905802f ] Commit 94f6345712b3 ("bus: ti-sysc: Implement quirk handling for CLKDM_NOAUTO") should have also added the quirk for dra7 dcan1 in addition to dcan2 for errata i893 handling. Let's also pass the quirk flag for legacy mode booting for if "ti,hwmods" dts property is used with related dcan hwmod data. This should be only needed if anybody needs to git bisect earlier stable trees though. Fixes: 94f6345712b3 ("bus: ti-sysc: Implement quirk handling for CLKDM_NOAUTO") Signed-off-by: Tony Lindgren Signed-off-by: Sasha Levin --- arch/arm/mach-omap2/omap_hwmod.c | 2 ++ drivers/bus/ti-sysc.c | 3 +++ 2 files changed, 5 insertions(+) diff --git a/arch/arm/mach-omap2/omap_hwmod.c b/arch/arm/mach-omap2/omap_hwmod.c index eb74aa1826614..6289b288d60a6 100644 --- a/arch/arm/mach-omap2/omap_hwmod.c +++ b/arch/arm/mach-omap2/omap_hwmod.c @@ -3656,6 +3656,8 @@ int omap_hwmod_init_module(struct device *dev, oh->flags |= HWMOD_SWSUP_SIDLE_ACT; if (data->cfg->quirks & SYSC_QUIRK_SWSUP_MSTANDBY) oh->flags |= HWMOD_SWSUP_MSTANDBY; + if (data->cfg->quirks & SYSC_QUIRK_CLKDM_NOAUTO) + oh->flags |= HWMOD_CLKDM_NOAUTO; error = omap_hwmod_check_module(dev, oh, data, sysc_fields, rev_offs, sysc_offs, syss_offs, diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c index 90053c4a8290d..469ca73de4ce7 100644 --- a/drivers/bus/ti-sysc.c +++ b/drivers/bus/ti-sysc.c @@ -1388,6 +1388,9 @@ static const struct sysc_revision_quirk sysc_revision_quirks[] = { /* Quirks that need to be set based on detected module */ SYSC_QUIRK("aess", 0, 0, 0x10, -ENODEV, 0x40000000, 0xffffffff, SYSC_MODULE_QUIRK_AESS), + /* Errata i893 handling for dra7 dcan1 and 2 */ + SYSC_QUIRK("dcan", 0x4ae3c000, 0x20, -ENODEV, -ENODEV, 0xa3170504, 0xffffffff, + SYSC_QUIRK_CLKDM_NOAUTO), SYSC_QUIRK("dcan", 0x48480000, 0x20, -ENODEV, -ENODEV, 0xa3170504, 0xffffffff, SYSC_QUIRK_CLKDM_NOAUTO), SYSC_QUIRK("dss", 0x4832a000, 0, 0x10, 0x14, 0x00000020, 0xffffffff, From 0c7219f17b36c249775e8ed127460eb9f800fc37 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 21 Sep 2021 22:21:02 +0100 Subject: [PATCH 1105/5344] video: fbdev: gbefb: Only instantiate device when built for IP32 [ Upstream commit 11b8e2bb986d23157e82e267fb8cc6b281dfdee9 ] The gbefb driver not only registers a driver but also the device for that driver. This is all well and good when run on the IP32 machines that are supported by the driver but since the driver supports building with COMPILE_TEST we might also be building on other platforms which do not have this hardware and will crash instantiating the driver. Add an IS_ENABLED() check so we compile out the device registration if we don't have the Kconfig option for the machine enabled. Fixes: 552ccf6b259d290c0c ("video: fbdev: gbefb: add COMPILE_TEST support") Signed-off-by: Mark Brown Cc: Bartlomiej Zolnierkiewicz Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20210921212102.30803-1-broonie@kernel.org Signed-off-by: Maarten Lankhorst Signed-off-by: Sasha Levin --- drivers/video/fbdev/gbefb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/fbdev/gbefb.c b/drivers/video/fbdev/gbefb.c index b9f6a82a04953..6fdc6ab3ceb87 100644 --- a/drivers/video/fbdev/gbefb.c +++ b/drivers/video/fbdev/gbefb.c @@ -1269,7 +1269,7 @@ static struct platform_device *gbefb_device; static int __init gbefb_init(void) { int ret = platform_driver_register(&gbefb_driver); - if (!ret) { + if (IS_ENABLED(CONFIG_SGI_IP32) && !ret) { gbefb_device = platform_device_alloc("gbefb", 0); if (gbefb_device) { ret = platform_device_add(gbefb_device); From 3b97ca53454970ce771a945260f3fc068a9cdce2 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Sat, 11 Sep 2021 15:50:23 +0800 Subject: [PATCH 1106/5344] drm/nouveau/debugfs: fix file release memory leak [ Upstream commit f5a8703a9c418c6fc54eb772712dfe7641e3991c ] When using single_open() for opening, single_release() should be called, otherwise the 'op' allocated in single_open() will be leaked. Fixes: 6e9fc177399f ("drm/nouveau/debugfs: add copy of sysfs pstate interface ported to debugfs") Reported-by: Hulk Robot Signed-off-by: Yang Yingliang Reviewed-by: Karol Herbst Signed-off-by: Karol Herbst Link: https://patchwork.freedesktop.org/patch/msgid/20210911075023.3969054-2-yangyingliang@huawei.com Signed-off-by: Maarten Lankhorst Signed-off-by: Sasha Levin --- drivers/gpu/drm/nouveau/nouveau_debugfs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/nouveau/nouveau_debugfs.c b/drivers/gpu/drm/nouveau/nouveau_debugfs.c index 3b13feca970f7..3c54d61e4fa94 100644 --- a/drivers/gpu/drm/nouveau/nouveau_debugfs.c +++ b/drivers/gpu/drm/nouveau/nouveau_debugfs.c @@ -207,6 +207,7 @@ static const struct file_operations nouveau_pstate_fops = { .open = nouveau_debugfs_pstate_open, .read = seq_read, .write = nouveau_debugfs_pstate_set, + .release = single_release, }; static struct drm_info_list nouveau_debugfs_list[] = { From ae374e4c847b88034392e6117904c2ef21ab8714 Mon Sep 17 00:00:00 2001 From: Catherine Sullivan Date: Tue, 5 Oct 2021 19:42:19 -0700 Subject: [PATCH 1107/5344] gve: Correct available tx qpl check [ Upstream commit d03477ee10f4bc35d3573cf1823814378ef2dca2 ] The qpl_map_size is rounded up to a multiple of sizeof(long), but the number of qpls doesn't have to be. Fixes: f5cedc84a30d2 ("gve: Add transmit and receive support") Signed-off-by: Catherine Sullivan Signed-off-by: Jeroen de Borst Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/google/gve/gve.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h index ebc37e2569221..f19edd4c6c5bb 100644 --- a/drivers/net/ethernet/google/gve/gve.h +++ b/drivers/net/ethernet/google/gve/gve.h @@ -391,7 +391,7 @@ struct gve_queue_page_list *gve_assign_rx_qpl(struct gve_priv *priv) gve_num_tx_qpls(priv)); /* we are out of rx qpls */ - if (id == priv->qpl_cfg.qpl_map_size) + if (id == gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv)) return NULL; set_bit(id, priv->qpl_cfg.qpl_id_map); From 4511b3a1943c38c46d38b796a36cb8f6e20813fa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Oct 2021 14:04:17 -0700 Subject: [PATCH 1108/5344] rtnetlink: fix if_nlmsg_stats_size() under estimation [ Upstream commit d34367991933d28bd7331f67a759be9a8c474014 ] rtnl_fill_statsinfo() is filling skb with one mandatory if_stats_msg structure. nlmsg_put(skb, pid, seq, type, sizeof(struct if_stats_msg), flags); But if_nlmsg_stats_size() never considered the needed storage. This bug did not show up because alloc_skb(X) allocates skb with extra tailroom, because of added alignments. This could very well be changed in the future to have deterministic behavior. Fixes: 10c9ead9f3c6 ("rtnetlink: add new RTM_GETSTATS message to dump link stats") Signed-off-by: Eric Dumazet Cc: Roopa Prabhu Acked-by: Roopa Prabhu Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/core/rtnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 6fbc9cb09dc0e..a53b101ce41ae 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4950,7 +4950,7 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, static size_t if_nlmsg_stats_size(const struct net_device *dev, u32 filter_mask) { - size_t size = 0; + size_t size = NLMSG_ALIGN(sizeof(struct if_stats_msg)); if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, 0)) size += nla_total_size_64bit(sizeof(struct rtnl_link_stats64)); From d32208c4c2e68281d24b6cfd3dc1c43f728ef795 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Oct 2021 17:30:30 -0700 Subject: [PATCH 1109/5344] gve: fix gve_get_stats() [ Upstream commit 2f57d4975fa027eabd35fdf23a49f8222ef3abf2 ] gve_get_stats() can report wrong numbers if/when u64_stats_fetch_retry() returns true. What is needed here is to sample values in temporary variables, and only use them after each loop is ended. Fixes: f5cedc84a30d ("gve: Add transmit and receive support") Signed-off-by: Eric Dumazet Cc: Catherine Sullivan Cc: Sagi Shahar Cc: Jon Olson Cc: Willem de Bruijn Cc: Luigi Rizzo Cc: Jeroen de Borst Cc: Tao Liu Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/google/gve/gve_main.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index f8dfa7501f65a..5b450c6100add 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -30,6 +30,7 @@ static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s) { struct gve_priv *priv = netdev_priv(dev); unsigned int start; + u64 packets, bytes; int ring; if (priv->rx) { @@ -37,10 +38,12 @@ static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s) do { start = u64_stats_fetch_begin(&priv->rx[ring].statss); - s->rx_packets += priv->rx[ring].rpackets; - s->rx_bytes += priv->rx[ring].rbytes; + packets = priv->rx[ring].rpackets; + bytes = priv->rx[ring].rbytes; } while (u64_stats_fetch_retry(&priv->rx[ring].statss, start)); + s->rx_packets += packets; + s->rx_bytes += bytes; } } if (priv->tx) { @@ -48,10 +51,12 @@ static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s) do { start = u64_stats_fetch_begin(&priv->tx[ring].statss); - s->tx_packets += priv->tx[ring].pkt_done; - s->tx_bytes += priv->tx[ring].bytes_done; + packets = priv->tx[ring].pkt_done; + bytes = priv->tx[ring].bytes_done; } while (u64_stats_fetch_retry(&priv->tx[ring].statss, start)); + s->tx_packets += packets; + s->tx_bytes += bytes; } } } From 0b409f3f4018921160a0c57d5a774874ea8c7b5c Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Tue, 14 Sep 2021 10:54:42 +0200 Subject: [PATCH 1110/5344] i40e: fix endless loop under rtnl [ Upstream commit 857b6c6f665cca9828396d9743faf37fd09e9ac3 ] The loop in i40e_get_capabilities can never end. The problem is that although i40e_aq_discover_capabilities returns with an error if there's a firmware problem, the returned error is not checked. There is a check for pf->hw.aq.asq_last_status but that value is set to I40E_AQ_RC_OK on most firmware problems. When i40e_aq_discover_capabilities encounters a firmware problem, it will encounter the same problem on its next invocation. As the result, the loop becomes endless. We hit this with I40E_ERR_ADMIN_QUEUE_TIMEOUT but looking at the code, it can happen with a range of other firmware errors. I don't know what the correct behavior should be: whether the firmware should be retried a few times, or whether pf->hw.aq.asq_last_status should be always set to the encountered firmware error (but then it would be pointless and can be just replaced by the i40e_aq_discover_capabilities return value). However, the current behavior with an endless loop under the rtnl mutex(!) is unacceptable and Intel has not submitted a fix, although we explained the bug to them 7 months ago. This may not be the best possible fix but it's better than hanging the whole system on a firmware bug. Fixes: 56a62fc86895 ("i40e: init code and hardware support") Tested-by: Stefan Assmann Signed-off-by: Jiri Benc Reviewed-by: Jesse Brandeburg Tested-by: Dave Switzer Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index bfc59813e6d18..6311d0be7d609 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -9616,7 +9616,7 @@ static int i40e_get_capabilities(struct i40e_pf *pf, if (pf->hw.aq.asq_last_status == I40E_AQ_RC_ENOMEM) { /* retry with a larger buffer */ buf_len = data_size; - } else if (pf->hw.aq.asq_last_status != I40E_AQ_RC_OK) { + } else if (pf->hw.aq.asq_last_status != I40E_AQ_RC_OK || err) { dev_info(&pf->pdev->dev, "capability discovery failed, err %s aq_err %s\n", i40e_stat_str(&pf->hw, err), From f557005a504e31d6278b2a280b5f8083212a6c26 Mon Sep 17 00:00:00 2001 From: Sylwester Dziedziuch Date: Fri, 24 Sep 2021 11:40:41 +0200 Subject: [PATCH 1111/5344] i40e: Fix freeing of uninitialized misc IRQ vector [ Upstream commit 2e5a20573a926302b233b0c2e1077f5debc7ab2e ] When VSI set up failed in i40e_probe() as part of PF switch set up driver was trying to free misc IRQ vectors in i40e_clear_interrupt_scheme and produced a kernel Oops: Trying to free already-free IRQ 266 WARNING: CPU: 0 PID: 5 at kernel/irq/manage.c:1731 __free_irq+0x9a/0x300 Workqueue: events work_for_cpu_fn RIP: 0010:__free_irq+0x9a/0x300 Call Trace: ? synchronize_irq+0x3a/0xa0 free_irq+0x2e/0x60 i40e_clear_interrupt_scheme+0x53/0x190 [i40e] i40e_probe.part.108+0x134b/0x1a40 [i40e] ? kmem_cache_alloc+0x158/0x1c0 ? acpi_ut_update_ref_count.part.1+0x8e/0x345 ? acpi_ut_update_object_reference+0x15e/0x1e2 ? strstr+0x21/0x70 ? irq_get_irq_data+0xa/0x20 ? mp_check_pin_attr+0x13/0xc0 ? irq_get_irq_data+0xa/0x20 ? mp_map_pin_to_irq+0xd3/0x2f0 ? acpi_register_gsi_ioapic+0x93/0x170 ? pci_conf1_read+0xa4/0x100 ? pci_bus_read_config_word+0x49/0x70 ? do_pci_enable_device+0xcc/0x100 local_pci_probe+0x41/0x90 work_for_cpu_fn+0x16/0x20 process_one_work+0x1a7/0x360 worker_thread+0x1cf/0x390 ? create_worker+0x1a0/0x1a0 kthread+0x112/0x130 ? kthread_flush_work_fn+0x10/0x10 ret_from_fork+0x1f/0x40 The problem is that at that point misc IRQ vectors were not allocated yet and we get a call trace that driver is trying to free already free IRQ vectors. Add a check in i40e_clear_interrupt_scheme for __I40E_MISC_IRQ_REQUESTED PF state before calling i40e_free_misc_vector. This state is set only if misc IRQ vectors were properly initialized. Fixes: c17401a1dd21 ("i40e: use separate state bit for miscellaneous IRQ setup") Reported-by: PJ Waskiewicz Signed-off-by: Sylwester Dziedziuch Signed-off-by: Mateusz Palczewski Tested-by: Dave Switzer Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/i40e/i40e_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 6311d0be7d609..06fcd41495548 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -4817,7 +4817,8 @@ static void i40e_clear_interrupt_scheme(struct i40e_pf *pf) { int i; - i40e_free_misc_vector(pf); + if (test_bit(__I40E_MISC_IRQ_REQUESTED, pf->state)) + i40e_free_misc_vector(pf); i40e_put_lump(pf->irq_pile, pf->iwarp_base_vector, I40E_IWARP_IRQ_PILE_ID); From d1bb31bfb8f1d10f5ad950b7253a3a55948b1877 Mon Sep 17 00:00:00 2001 From: Mike Manning Date: Tue, 5 Oct 2021 14:03:42 +0100 Subject: [PATCH 1112/5344] net: prefer socket bound to interface when not in VRF [ Upstream commit 8d6c414cd2fb74aa6812e9bfec6178f8246c4f3a ] The commit 6da5b0f027a8 ("net: ensure unbound datagram socket to be chosen when not in a VRF") modified compute_score() so that a device match is always made, not just in the case of an l3mdev skb, then increments the score also for unbound sockets. This ensures that sockets bound to an l3mdev are never selected when not in a VRF. But as unbound and bound sockets are now scored equally, this results in the last opened socket being selected if there are matches in the default VRF for an unbound socket and a socket bound to a dev that is not an l3mdev. However, handling prior to this commit was to always select the bound socket in this case. Reinstate this handling by incrementing the score only for bound sockets. The required isolation due to choosing between an unbound socket and a socket bound to an l3mdev remains in place due to the device match always being made. The same approach is taken for compute_score() for stream sockets. Fixes: 6da5b0f027a8 ("net: ensure unbound datagram socket to be chosen when not in a VRF") Fixes: e78190581aff ("net: ensure unbound stream socket to be chosen when not in a VRF") Signed-off-by: Mike Manning Reviewed-by: David Ahern Link: https://lore.kernel.org/r/cf0a8523-b362-1edf-ee78-eef63cbbb428@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/ipv4/inet_hashtables.c | 4 +++- net/ipv4/udp.c | 3 ++- net/ipv6/inet6_hashtables.c | 2 +- net/ipv6/udp.c | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 006a34b185378..72fdf1fcbcaa9 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -239,8 +239,10 @@ static inline int compute_score(struct sock *sk, struct net *net, if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return -1; + score = sk->sk_bound_dev_if ? 2 : 1; - score = sk->sk_family == PF_INET ? 2 : 1; + if (sk->sk_family == PF_INET) + score++; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 579efc0a1f251..92d85332a5daf 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -386,7 +386,8 @@ static int compute_score(struct sock *sk, struct net *net, dif, sdif); if (!dev_match) return -1; - score += 4; + if (sk->sk_bound_dev_if) + score += 4; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index fbe9d4295eac3..ab12e00f6bfff 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -104,7 +104,7 @@ static inline int compute_score(struct sock *sk, struct net *net, if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return -1; - score = 1; + score = sk->sk_bound_dev_if ? 2 : 1; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 894d9102928ea..5248fd11e619c 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -133,7 +133,8 @@ static int compute_score(struct sock *sk, struct net *net, dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif); if (!dev_match) return -1; - score++; + if (sk->sk_bound_dev_if) + score++; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; From 2df0e26efb714b0536cbbec58a3c87ca017969c5 Mon Sep 17 00:00:00 2001 From: Jamie Iles Date: Wed, 22 Sep 2021 17:57:18 +0100 Subject: [PATCH 1113/5344] i2c: acpi: fix resource leak in reconfiguration device addition [ Upstream commit 6558b646ce1c2a872fe1c2c7cb116f05a2c1950f ] acpi_i2c_find_adapter_by_handle() calls bus_find_device() which takes a reference on the adapter which is never released which will result in a reference count leak and render the adapter unremovable. Make sure to put the adapter after creating the client in the same manner that we do for OF. Fixes: 525e6fabeae2 ("i2c / ACPI: add support for ACPI reconfigure notifications") Signed-off-by: Jamie Iles Acked-by: Mika Westerberg [wsa: fixed title] Signed-off-by: Wolfram Sang Signed-off-by: Sasha Levin --- drivers/i2c/i2c-core-acpi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/i2c/i2c-core-acpi.c b/drivers/i2c/i2c-core-acpi.c index c70983780ae79..fe466ee4c49bf 100644 --- a/drivers/i2c/i2c-core-acpi.c +++ b/drivers/i2c/i2c-core-acpi.c @@ -436,6 +436,7 @@ static int i2c_acpi_notify(struct notifier_block *nb, unsigned long value, break; i2c_acpi_register_device(adapter, adev, &info); + put_device(&adapter->dev); break; case ACPI_RECONFIG_DEVICE_REMOVE: if (!acpi_device_enumerated(adev)) From 50d56a9291a2fef51d4bd5c791eb42478fd06803 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Mon, 27 Sep 2021 15:06:14 +0800 Subject: [PATCH 1114/5344] bpf, s390: Fix potential memory leak about jit_data [ Upstream commit 686cb8b9f6b46787f035afe8fbd132a74e6b1bdd ] Make sure to free jit_data through kfree() in the error path. Fixes: 1c8f9b91c456 ("bpf: s390: add JIT support for multi-function programs") Signed-off-by: Tiezhu Yang Acked-by: Ilya Leoshkevich Reviewed-by: Christian Borntraeger Signed-off-by: Christian Borntraeger Signed-off-by: Vasily Gorbik Signed-off-by: Sasha Levin --- arch/s390/net/bpf_jit_comp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 2d29966276296..f63e4cb6c9b31 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1385,7 +1385,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) jit.addrs = kvcalloc(fp->len + 1, sizeof(*jit.addrs), GFP_KERNEL); if (jit.addrs == NULL) { fp = orig_fp; - goto out; + goto free_addrs; } /* * Three initial passes: From a60882079d296dc93cdde41f64c23a422767d170 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Sat, 2 Oct 2021 17:21:20 -0700 Subject: [PATCH 1115/5344] RISC-V: Include clone3() on rv32 [ Upstream commit 59a4e0d5511ba61353ea9a4efdb1b86c23ecf134 ] As far as I can tell this should be enabled on rv32 as well, I'm not sure why it's rv64-only. checksyscalls is complaining about our lack of clone3() on rv32. Fixes: 56ac5e213933 ("riscv: enable sys_clone3 syscall for rv64") Signed-off-by: Palmer Dabbelt Reviewed-by: Arnd Bergmann Acked-by: Christian Brauner Signed-off-by: Palmer Dabbelt Signed-off-by: Sasha Levin --- arch/riscv/include/uapi/asm/unistd.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/riscv/include/uapi/asm/unistd.h b/arch/riscv/include/uapi/asm/unistd.h index 13ce76cc5affe..80dff2c2bf677 100644 --- a/arch/riscv/include/uapi/asm/unistd.h +++ b/arch/riscv/include/uapi/asm/unistd.h @@ -18,9 +18,10 @@ #ifdef __LP64__ #define __ARCH_WANT_NEW_STAT #define __ARCH_WANT_SET_GET_RLIMIT -#define __ARCH_WANT_SYS_CLONE3 #endif /* __LP64__ */ +#define __ARCH_WANT_SYS_CLONE3 + #include /* From 1c5d4d791b9c0fab02e8b05ca1f42416248c0e0a Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Tue, 3 Aug 2021 13:35:24 +0200 Subject: [PATCH 1116/5344] x86/platform/olpc: Correct ifdef symbol to intended CONFIG_OLPC_XO15_SCI commit 4758fd801f919b8b9acad78d2e49a195ec2be46b upstream. The refactoring in the commit in Fixes introduced an ifdef CONFIG_OLPC_XO1_5_SCI, however the config symbol is actually called "CONFIG_OLPC_XO15_SCI". Fortunately, ./scripts/checkkconfigsymbols.py warns: OLPC_XO1_5_SCI Referencing files: arch/x86/platform/olpc/olpc.c Correct this ifdef condition to the intended config symbol. Fixes: ec9964b48033 ("Platform: OLPC: Move EC-specific functionality out from x86") Suggested-by: Randy Dunlap Signed-off-by: Lukas Bulwahn Signed-off-by: Borislav Petkov Cc: Link: https://lkml.kernel.org/r/20210803113531.30720-3-lukas.bulwahn@gmail.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/platform/olpc/olpc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c index ee2beda590d0d..1d4a00e767ece 100644 --- a/arch/x86/platform/olpc/olpc.c +++ b/arch/x86/platform/olpc/olpc.c @@ -274,7 +274,7 @@ static struct olpc_ec_driver ec_xo1_driver = { static struct olpc_ec_driver ec_xo1_5_driver = { .ec_cmd = olpc_xo1_ec_cmd, -#ifdef CONFIG_OLPC_XO1_5_SCI +#ifdef CONFIG_OLPC_XO15_SCI /* * XO-1.5 EC wakeups are available when olpc-xo15-sci driver is * compiled in From 63aea6e30fa233081b1e74aa84bc79fc2f6bf34c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 30 Sep 2021 19:21:39 +0200 Subject: [PATCH 1117/5344] x86/hpet: Use another crystalball to evaluate HPET usability commit 6e3cd95234dc1eda488f4f487c281bac8fef4d9b upstream. On recent Intel systems the HPET stops working when the system reaches PC10 idle state. The approach of adding PCI ids to the early quirks to disable HPET on these systems is a whack a mole game which makes no sense. Check for PC10 instead and force disable HPET if supported. The check is overbroad as it does not take ACPI, intel_idle enablement and command line parameters into account. That's fine as long as there is at least PMTIMER available to calibrate the TSC frequency. The decision can be overruled by adding "hpet=force" on the kernel command line. Remove the related early PCI quirks for affected Ice Cake and Coffin Lake systems as they are not longer required. That should also cover all other systems, i.e. Tiger Rag and newer generations, which are most likely affected by this as well. Fixes: Yet another hardware trainwreck Reported-by: Jakub Kicinski Signed-off-by: Thomas Gleixner Tested-by: Jakub Kicinski Reviewed-by: Rafael J. Wysocki Cc: stable@vger.kernel.org Cc: Kai-Heng Feng Cc: Bjorn Helgaas Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/early-quirks.c | 6 --- arch/x86/kernel/hpet.c | 81 ++++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 2f9ec14be3b11..6f6b1d04dadf9 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -710,12 +710,6 @@ static struct chipset early_qrk[] __initdata = { */ { PCI_VENDOR_ID_INTEL, 0x0f00, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, - { PCI_VENDOR_ID_INTEL, 0x3e20, - PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, - { PCI_VENDOR_ID_INTEL, 0x3ec4, - PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, - { PCI_VENDOR_ID_INTEL, 0x8a12, - PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, { PCI_VENDOR_ID_BROADCOM, 0x4331, PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset}, {} diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index c6f791bc481eb..9834d221e390f 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -9,6 +9,7 @@ #include #include +#include #undef pr_fmt #define pr_fmt(fmt) "hpet: " fmt @@ -806,6 +807,83 @@ static bool __init hpet_counting(void) return false; } +static bool __init mwait_pc10_supported(void) +{ + unsigned int eax, ebx, ecx, mwait_substates; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return false; + + if (!cpu_feature_enabled(X86_FEATURE_MWAIT)) + return false; + + if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) + return false; + + cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates); + + return (ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) && + (ecx & CPUID5_ECX_INTERRUPT_BREAK) && + (mwait_substates & (0xF << 28)); +} + +/* + * Check whether the system supports PC10. If so force disable HPET as that + * stops counting in PC10. This check is overbroad as it does not take any + * of the following into account: + * + * - ACPI tables + * - Enablement of intel_idle + * - Command line arguments which limit intel_idle C-state support + * + * That's perfectly fine. HPET is a piece of hardware designed by committee + * and the only reasons why it is still in use on modern systems is the + * fact that it is impossible to reliably query TSC and CPU frequency via + * CPUID or firmware. + * + * If HPET is functional it is useful for calibrating TSC, but this can be + * done via PMTIMER as well which seems to be the last remaining timer on + * X86/INTEL platforms that has not been completely wreckaged by feature + * creep. + * + * In theory HPET support should be removed altogether, but there are older + * systems out there which depend on it because TSC and APIC timer are + * dysfunctional in deeper C-states. + * + * It's only 20 years now that hardware people have been asked to provide + * reliable and discoverable facilities which can be used for timekeeping + * and per CPU timer interrupts. + * + * The probability that this problem is going to be solved in the + * forseeable future is close to zero, so the kernel has to be cluttered + * with heuristics to keep up with the ever growing amount of hardware and + * firmware trainwrecks. Hopefully some day hardware people will understand + * that the approach of "This can be fixed in software" is not sustainable. + * Hope dies last... + */ +static bool __init hpet_is_pc10_damaged(void) +{ + unsigned long long pcfg; + + /* Check whether PC10 substates are supported */ + if (!mwait_pc10_supported()) + return false; + + /* Check whether PC10 is enabled in PKG C-state limit */ + rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, pcfg); + if ((pcfg & 0xF) < 8) + return false; + + if (hpet_force_user) { + pr_warn("HPET force enabled via command line, but dysfunctional in PC10.\n"); + return false; + } + + pr_info("HPET dysfunctional in PC10. Force disabled.\n"); + boot_hpet_disable = true; + return true; +} + /** * hpet_enable - Try to setup the HPET timer. Returns 1 on success. */ @@ -819,6 +897,9 @@ int __init hpet_enable(void) if (!is_hpet_capable()) return 0; + if (hpet_is_pc10_damaged()) + return 0; + hpet_set_mapping(); if (!hpet_virt_address) return 0; From a106df1f08a742ec3aebb2d584e8c6e63a87e5d0 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Tue, 3 Aug 2021 13:35:25 +0200 Subject: [PATCH 1118/5344] x86/Kconfig: Correct reference to MWINCHIP3D commit 225bac2dc5d192e55f2c50123ee539b1edf8a411 upstream. Commit in Fixes intended to exclude the Winchip series and referred to CONFIG_WINCHIP3D, but the config symbol is called CONFIG_MWINCHIP3D. Hence, scripts/checkkconfigsymbols.py warns: WINCHIP3D Referencing files: arch/x86/Kconfig Correct the reference to the intended config symbol. Fixes: 69b8d3fcabdc ("x86/Kconfig: Exclude i586-class CPUs lacking PAE support from the HIGHMEM64G Kconfig group") Suggested-by: Randy Dunlap Signed-off-by: Lukas Bulwahn Signed-off-by: Borislav Petkov Cc: Link: https://lkml.kernel.org/r/20210803113531.30720-4-lukas.bulwahn@gmail.com [manually adjusted the change to the state on the v4.19.y and v5.4.y stable tree] Signed-off-by: Lukas Bulwahn Signed-off-by: Greg Kroah-Hartman --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cd5a6c7ff30aa..c581fef615838 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1426,7 +1426,7 @@ config HIGHMEM4G config HIGHMEM64G bool "64GB" - depends on !M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !WINCHIP3D && !MK6 + depends on !M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !MWINCHIP3D && !MK6 select X86_PAE ---help--- Select this if you have a 32-bit processor and more than 4 From 86f6ec07bad05d9256b7d4cebc5753e237f7a6df Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 13 Oct 2021 10:08:21 +0200 Subject: [PATCH 1119/5344] Linux 5.4.153 Link: https://lore.kernel.org/r/20211011134503.715740503@linuxfoundation.org Tested-by: Florian Fainelli Tested-by: Shuah Khan Tested-by: Hulk Robot Link: https://lore.kernel.org/r/20211012064436.577746139@linuxfoundation.org Link: https://lore.kernel.org/r/20211012093344.002301190@linuxfoundation.org Tested-by: Jon Hunter Tested-by: Shuah Khan Tested-by: Linux Kernel Functional Testing Tested-by: Guenter Roeck Tested-by: Florian Fainelli Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index ffcdc36c56f54..df9b1d07ca097 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 PATCHLEVEL = 4 -SUBLEVEL = 152 +SUBLEVEL = 153 EXTRAVERSION = NAME = Kleptomaniac Octopus From ce9e93c11851c4e401e729efcda8b5f1c5de3597 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 28 Sep 2021 13:32:33 -0700 Subject: [PATCH 1120/5344] net: phy: bcm7xxx: Fixed indirect MMD operations commit d88fd1b546ff19c8040cfaea76bf16aed1c5a0bb upstream. When EEE support was added to the 28nm EPHY it was assumed that it would be able to support the standard clause 45 over clause 22 register access method. It turns out that the PHY does not support that, which is the very reason for using the indirect shadow mode 2 bank 3 access method. Implement {read,write}_mmd to allow the standard PHY library routines pertaining to EEE querying and configuration to work correctly on these PHYs. This forces us to implement a __phy_set_clr_bits() function that does not grab the MDIO bus lock since the PHY driver's {read,write}_mmd functions are always called with that lock held. Fixes: 83ee102a6998 ("net: phy: bcm7xxx: add support for 28nm EPHY") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/phy/bcm7xxx.c | 114 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 110 insertions(+), 4 deletions(-) diff --git a/drivers/net/phy/bcm7xxx.c b/drivers/net/phy/bcm7xxx.c index af8eabe7a6d44..d372626c603d4 100644 --- a/drivers/net/phy/bcm7xxx.c +++ b/drivers/net/phy/bcm7xxx.c @@ -26,7 +26,12 @@ #define MII_BCM7XXX_SHD_2_ADDR_CTRL 0xe #define MII_BCM7XXX_SHD_2_CTRL_STAT 0xf #define MII_BCM7XXX_SHD_2_BIAS_TRIM 0x1a +#define MII_BCM7XXX_SHD_3_PCS_CTRL 0x0 +#define MII_BCM7XXX_SHD_3_PCS_STATUS 0x1 +#define MII_BCM7XXX_SHD_3_EEE_CAP 0x2 #define MII_BCM7XXX_SHD_3_AN_EEE_ADV 0x3 +#define MII_BCM7XXX_SHD_3_EEE_LP 0x4 +#define MII_BCM7XXX_SHD_3_EEE_WK_ERR 0x5 #define MII_BCM7XXX_SHD_3_PCS_CTRL_2 0x6 #define MII_BCM7XXX_PCS_CTRL_2_DEF 0x4400 #define MII_BCM7XXX_SHD_3_AN_STAT 0xb @@ -210,25 +215,37 @@ static int bcm7xxx_28nm_resume(struct phy_device *phydev) return genphy_config_aneg(phydev); } -static int phy_set_clr_bits(struct phy_device *dev, int location, - int set_mask, int clr_mask) +static int __phy_set_clr_bits(struct phy_device *dev, int location, + int set_mask, int clr_mask) { int v, ret; - v = phy_read(dev, location); + v = __phy_read(dev, location); if (v < 0) return v; v &= ~clr_mask; v |= set_mask; - ret = phy_write(dev, location, v); + ret = __phy_write(dev, location, v); if (ret < 0) return ret; return v; } +static int phy_set_clr_bits(struct phy_device *dev, int location, + int set_mask, int clr_mask) +{ + int ret; + + mutex_lock(&dev->mdio.bus->mdio_lock); + ret = __phy_set_clr_bits(dev, location, set_mask, clr_mask); + mutex_unlock(&dev->mdio.bus->mdio_lock); + + return ret; +} + static int bcm7xxx_28nm_ephy_01_afe_config_init(struct phy_device *phydev) { int ret; @@ -392,6 +409,93 @@ static int bcm7xxx_28nm_ephy_config_init(struct phy_device *phydev) return bcm7xxx_28nm_ephy_apd_enable(phydev); } +#define MII_BCM7XXX_REG_INVALID 0xff + +static u8 bcm7xxx_28nm_ephy_regnum_to_shd(u16 regnum) +{ + switch (regnum) { + case MDIO_CTRL1: + return MII_BCM7XXX_SHD_3_PCS_CTRL; + case MDIO_STAT1: + return MII_BCM7XXX_SHD_3_PCS_STATUS; + case MDIO_PCS_EEE_ABLE: + return MII_BCM7XXX_SHD_3_EEE_CAP; + case MDIO_AN_EEE_ADV: + return MII_BCM7XXX_SHD_3_AN_EEE_ADV; + case MDIO_AN_EEE_LPABLE: + return MII_BCM7XXX_SHD_3_EEE_LP; + case MDIO_PCS_EEE_WK_ERR: + return MII_BCM7XXX_SHD_3_EEE_WK_ERR; + default: + return MII_BCM7XXX_REG_INVALID; + } +} + +static bool bcm7xxx_28nm_ephy_dev_valid(int devnum) +{ + return devnum == MDIO_MMD_AN || devnum == MDIO_MMD_PCS; +} + +static int bcm7xxx_28nm_ephy_read_mmd(struct phy_device *phydev, + int devnum, u16 regnum) +{ + u8 shd = bcm7xxx_28nm_ephy_regnum_to_shd(regnum); + int ret; + + if (!bcm7xxx_28nm_ephy_dev_valid(devnum) || + shd == MII_BCM7XXX_REG_INVALID) + return -EOPNOTSUPP; + + /* set shadow mode 2 */ + ret = __phy_set_clr_bits(phydev, MII_BCM7XXX_TEST, + MII_BCM7XXX_SHD_MODE_2, 0); + if (ret < 0) + return ret; + + /* Access the desired shadow register address */ + ret = __phy_write(phydev, MII_BCM7XXX_SHD_2_ADDR_CTRL, shd); + if (ret < 0) + goto reset_shadow_mode; + + ret = __phy_read(phydev, MII_BCM7XXX_SHD_2_CTRL_STAT); + +reset_shadow_mode: + /* reset shadow mode 2 */ + __phy_set_clr_bits(phydev, MII_BCM7XXX_TEST, 0, + MII_BCM7XXX_SHD_MODE_2); + return ret; +} + +static int bcm7xxx_28nm_ephy_write_mmd(struct phy_device *phydev, + int devnum, u16 regnum, u16 val) +{ + u8 shd = bcm7xxx_28nm_ephy_regnum_to_shd(regnum); + int ret; + + if (!bcm7xxx_28nm_ephy_dev_valid(devnum) || + shd == MII_BCM7XXX_REG_INVALID) + return -EOPNOTSUPP; + + /* set shadow mode 2 */ + ret = __phy_set_clr_bits(phydev, MII_BCM7XXX_TEST, + MII_BCM7XXX_SHD_MODE_2, 0); + if (ret < 0) + return ret; + + /* Access the desired shadow register address */ + ret = __phy_write(phydev, MII_BCM7XXX_SHD_2_ADDR_CTRL, shd); + if (ret < 0) + goto reset_shadow_mode; + + /* Write the desired value in the shadow register */ + __phy_write(phydev, MII_BCM7XXX_SHD_2_CTRL_STAT, val); + +reset_shadow_mode: + /* reset shadow mode 2 */ + return __phy_set_clr_bits(phydev, MII_BCM7XXX_TEST, 0, + MII_BCM7XXX_SHD_MODE_2); +} + static int bcm7xxx_28nm_ephy_resume(struct phy_device *phydev) { int ret; @@ -563,6 +667,8 @@ static int bcm7xxx_28nm_probe(struct phy_device *phydev) .get_strings = bcm_phy_get_strings, \ .get_stats = bcm7xxx_28nm_get_phy_stats, \ .probe = bcm7xxx_28nm_probe, \ + .read_mmd = bcm7xxx_28nm_ephy_read_mmd, \ + .write_mmd = bcm7xxx_28nm_ephy_write_mmd, \ } #define BCM7XXX_40NM_EPHY(_oui, _name) \ From 38bd3e7562021421a5806812984dd6cdd9c07218 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 16 Jul 2021 20:20:22 +0800 Subject: [PATCH 1121/5344] ext4: correct the error path of ext4_write_inline_data_end() [ Upstream commit 55ce2f649b9e88111270333a8127e23f4f8f42d7 ] Current error path of ext4_write_inline_data_end() is not correct. Firstly, it should pass out the error value if ext4_get_inode_loc() return fail, or else it could trigger infinite loop if we inject error here. And then it's better to add inode to orphan list if it return fail in ext4_journal_stop(), otherwise we could not restore inline xattr entry after power failure. Finally, we need to reset the 'ret' value if ext4_write_inline_data_end() return success in ext4_write_end() and ext4_journalled_write_end(), otherwise we could not get the error return value of ext4_journal_stop(). Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20210716122024.1105856-3-yi.zhang@huawei.com Signed-off-by: Sasha Levin --- fs/ext4/inline.c | 15 +++++---------- fs/ext4/inode.c | 7 +++++-- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 46151bda62368..cdb10e9fded65 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -733,18 +733,13 @@ int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, void *kaddr; struct ext4_iloc iloc; - if (unlikely(copied < len)) { - if (!PageUptodate(page)) { - copied = 0; - goto out; - } - } + if (unlikely(copied < len) && !PageUptodate(page)) + return 0; ret = ext4_get_inode_loc(inode, &iloc); if (ret) { ext4_std_error(inode->i_sb, ret); - copied = 0; - goto out; + return ret; } ext4_write_lock_xattr(inode, &no_expand); @@ -757,7 +752,7 @@ int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, (void) ext4_find_inline_data_nolock(inode); kaddr = kmap_atomic(page); - ext4_write_inline_data(inode, &iloc, kaddr, pos, len); + ext4_write_inline_data(inode, &iloc, kaddr, pos, copied); kunmap_atomic(kaddr); SetPageUptodate(page); /* clear page dirty so that writepages wouldn't work for us. */ @@ -766,7 +761,7 @@ int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, ext4_write_unlock_xattr(inode, &no_expand); brelse(iloc.bh); mark_inode_dirty(inode); -out: + return copied; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 48b467353f6f1..dcbd8ac8d4711 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1439,6 +1439,7 @@ static int ext4_write_end(struct file *file, goto errout; } copied = ret; + ret = 0; } else copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); @@ -1465,13 +1466,14 @@ static int ext4_write_end(struct file *file, if (i_size_changed || inline_data) ext4_mark_inode_dirty(handle, inode); +errout: if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. So truncate them */ ext4_orphan_add(handle, inode); -errout: + ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; @@ -1554,6 +1556,7 @@ static int ext4_journalled_write_end(struct file *file, goto errout; } copied = ret; + ret = 0; } else if (unlikely(copied < len) && !PageUptodate(page)) { copied = 0; ext4_journalled_zero_new_buffers(handle, page, from, to); @@ -1583,6 +1586,7 @@ static int ext4_journalled_write_end(struct file *file, ret = ret2; } +errout: if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside @@ -1590,7 +1594,6 @@ static int ext4_journalled_write_end(struct file *file, */ ext4_orphan_add(handle, inode); -errout: ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; From 6a8db9e3b0e5c08726c4b521ad3092e2181495f5 Mon Sep 17 00:00:00 2001 From: Mizuho Mori Date: Thu, 29 Jul 2021 20:03:25 +0900 Subject: [PATCH 1122/5344] HID: apple: Fix logical maximum and usage maximum of Magic Keyboard JIS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 67fd71ba16a37c663d139f5ba5296f344d80d072 ] Apple Magic Keyboard(JIS)'s Logical Maximum and Usage Maximum are wrong. Below is a report descriptor. 0x05, 0x01, /* Usage Page (Desktop), */ 0x09, 0x06, /* Usage (Keyboard), */ 0xA1, 0x01, /* Collection (Application), */ 0x85, 0x01, /* Report ID (1), */ 0x05, 0x07, /* Usage Page (Keyboard), */ 0x15, 0x00, /* Logical Minimum (0), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x19, 0xE0, /* Usage Minimum (KB Leftcontrol), */ 0x29, 0xE7, /* Usage Maximum (KB Right GUI), */ 0x75, 0x01, /* Report Size (1), */ 0x95, 0x08, /* Report Count (8), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x05, /* Report Count (5), */ 0x75, 0x01, /* Report Size (1), */ 0x05, 0x08, /* Usage Page (LED), */ 0x19, 0x01, /* Usage Minimum (01h), */ 0x29, 0x05, /* Usage Maximum (05h), */ 0x91, 0x02, /* Output (Variable), */ 0x95, 0x01, /* Report Count (1), */ 0x75, 0x03, /* Report Size (3), */ 0x91, 0x03, /* Output (Constant, Variable), */ 0x95, 0x08, /* Report Count (8), */ 0x75, 0x01, /* Report Size (1), */ 0x15, 0x00, /* Logical Minimum (0), */ 0x25, 0x01, /* Logical Maximum (1), */ here is a report descriptor which is parsed one in kernel. see sys/kernel/debug/hid//rdesc 05 01 09 06 a1 01 85 01 05 07 15 00 25 01 19 e0 29 e7 75 01 95 08 81 02 95 05 75 01 05 08 19 01 29 05 91 02 95 01 75 03 91 03 95 08 75 01 15 00 25 01 06 00 ff 09 03 81 03 95 06 75 08 15 00 25 [65] 05 07 19 00 29 [65] 81 00 95 01 75 01 15 00 25 01 05 0c 09 b8 81 02 95 01 75 01 06 01 ff 09 03 81 02 95 01 75 06 81 03 06 02 ff 09 55 85 55 15 00 26 ff 00 75 08 95 40 b1 a2 c0 06 00 ff 09 14 a1 01 85 90 05 84 75 01 95 03 15 00 25 01 09 61 05 85 09 44 09 46 81 02 95 05 81 01 75 08 95 01 15 00 26 ff 00 09 65 81 02 c0 00 Position 64(Logical Maximum) and 70(Usage Maximum) are 101. Both should be 0xE7 to support JIS specific keys(ろ, Eisu, Kana, |) support. position 117 is also 101 but not related(it is Usage 65h). There are no difference of product id between JIS and ANSI. They are same 0x0267. Signed-off-by: Mizuho Mori Signed-off-by: Jiri Kosina Signed-off-by: Sasha Levin --- drivers/hid/hid-apple.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/hid/hid-apple.c b/drivers/hid/hid-apple.c index 6909c045fece1..07df64daf7dae 100644 --- a/drivers/hid/hid-apple.c +++ b/drivers/hid/hid-apple.c @@ -301,12 +301,19 @@ static int apple_event(struct hid_device *hdev, struct hid_field *field, /* * MacBook JIS keyboard has wrong logical maximum + * Magic Keyboard JIS has wrong logical maximum */ static __u8 *apple_report_fixup(struct hid_device *hdev, __u8 *rdesc, unsigned int *rsize) { struct apple_sc *asc = hid_get_drvdata(hdev); + if(*rsize >=71 && rdesc[70] == 0x65 && rdesc[64] == 0x65) { + hid_info(hdev, + "fixing up Magic Keyboard JIS report descriptor\n"); + rdesc[64] = rdesc[70] = 0xe7; + } + if ((asc->quirks & APPLE_RDESC_JIS) && *rsize >= 60 && rdesc[53] == 0x65 && rdesc[59] == 0x65) { hid_info(hdev, From 4af96f14c8e98c25b62426e019ce7a05e7e71dff Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Sun, 12 Sep 2021 22:24:33 +0100 Subject: [PATCH 1123/5344] netfilter: ip6_tables: zero-initialize fragment offset [ Upstream commit 310e2d43c3ad429c1fba4b175806cf1f55ed73a6 ] ip6tables only sets the `IP6T_F_PROTO` flag on a rule if a protocol is specified (`-p tcp`, for example). However, if the flag is not set, `ip6_packet_match` doesn't call `ipv6_find_hdr` for the skb, in which case the fragment offset is left uninitialized and a garbage value is passed to each matcher. Signed-off-by: Jeremy Sowden Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/ipv6/netfilter/ip6_tables.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 8bb543b0e775e..41268612bdd4e 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -273,6 +273,7 @@ ip6t_do_table(struct sk_buff *skb, * things we don't know, ie. tcp syn flag or ports). If the * rule is also a fragment-specific rule, non-fragments won't * match it. */ + acpar.fragoff = 0; acpar.hotdrop = false; acpar.state = state; From 6faade40ca660d9578f0df9a2c7925ab40ab5199 Mon Sep 17 00:00:00 2001 From: Joshua-Dickens Date: Tue, 14 Sep 2021 13:28:25 -0400 Subject: [PATCH 1124/5344] HID: wacom: Add new Intuos BT (CTL-4100WL/CTL-6100WL) device IDs [ Upstream commit 0c8fbaa553077630e8eae45bd9676cfc01836aeb ] Add the new PIDs to wacom_wac.c to support the new models in the Intuos series. [jkosina@suse.cz: fix changelog] Signed-off-by: Joshua Dickens Reviewed-by: Ping Cheng Signed-off-by: Jiri Kosina Signed-off-by: Sasha Levin --- drivers/hid/wacom_wac.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index d5425bc1ad61a..f6be2e70a4967 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -4715,6 +4715,12 @@ static const struct wacom_features wacom_features_0x393 = { "Wacom Intuos Pro S", 31920, 19950, 8191, 63, INTUOSP2S_BT, WACOM_INTUOS3_RES, WACOM_INTUOS3_RES, 7, .touch_max = 10 }; +static const struct wacom_features wacom_features_0x3c6 = + { "Wacom Intuos BT S", 15200, 9500, 4095, 63, + INTUOSHT3_BT, WACOM_INTUOS_RES, WACOM_INTUOS_RES, 4 }; +static const struct wacom_features wacom_features_0x3c8 = + { "Wacom Intuos BT M", 21600, 13500, 4095, 63, + INTUOSHT3_BT, WACOM_INTUOS_RES, WACOM_INTUOS_RES, 4 }; static const struct wacom_features wacom_features_HID_ANY_ID = { "Wacom HID", .type = HID_GENERIC, .oVid = HID_ANY_ID, .oPid = HID_ANY_ID }; @@ -4888,6 +4894,8 @@ const struct hid_device_id wacom_ids[] = { { USB_DEVICE_WACOM(0x37A) }, { USB_DEVICE_WACOM(0x37B) }, { BT_DEVICE_WACOM(0x393) }, + { BT_DEVICE_WACOM(0x3c6) }, + { BT_DEVICE_WACOM(0x3c8) }, { USB_DEVICE_WACOM(0x4001) }, { USB_DEVICE_WACOM(0x4004) }, { USB_DEVICE_WACOM(0x5000) }, From b55c9bb6c2b1ce56239ab17e4f305ba5f8df6b6b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 15 Sep 2021 16:46:38 +0200 Subject: [PATCH 1125/5344] netfilter: nf_nat_masquerade: make async masq_inet6_event handling generic [ Upstream commit 30db406923b9285a9bac06a6af5e74bd6d0f1d06 ] masq_inet6_event is called asynchronously from system work queue, because the inet6 notifier is atomic and nf_iterate_cleanup can sleep. The ipv4 and device notifiers call nf_iterate_cleanup directly. This is legal, but these notifiers are called with RTNL mutex held. A large conntrack table with many devices coming and going will have severe impact on the system usability, with 'ip a' blocking for several seconds. This change places the defer code into a helper and makes it more generic so ipv4 and ifdown notifiers can be converted to defer the cleanup walk as well in a follow patch. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/nf_nat_masquerade.c | 122 ++++++++++++++++++------------ 1 file changed, 75 insertions(+), 47 deletions(-) diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c index 8e8a65d46345b..415919a6ac1ad 100644 --- a/net/netfilter/nf_nat_masquerade.c +++ b/net/netfilter/nf_nat_masquerade.c @@ -9,8 +9,19 @@ #include +struct masq_dev_work { + struct work_struct work; + struct net *net; + union nf_inet_addr addr; + int ifindex; + int (*iter)(struct nf_conn *i, void *data); +}; + +#define MAX_MASQ_WORKER_COUNT 16 + static DEFINE_MUTEX(masq_mutex); static unsigned int masq_refcnt __read_mostly; +static atomic_t masq_worker_count __read_mostly; unsigned int nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, @@ -63,6 +74,63 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4); +static void iterate_cleanup_work(struct work_struct *work) +{ + struct masq_dev_work *w; + + w = container_of(work, struct masq_dev_work, work); + + nf_ct_iterate_cleanup_net(w->net, w->iter, (void *)w, 0, 0); + + put_net(w->net); + kfree(w); + atomic_dec(&masq_worker_count); + module_put(THIS_MODULE); +} + +/* Iterate conntrack table in the background and remove conntrack entries + * that use the device/address being removed. + * + * In case too many work items have been queued already or memory allocation + * fails iteration is skipped, conntrack entries will time out eventually. + */ +static void nf_nat_masq_schedule(struct net *net, union nf_inet_addr *addr, + int ifindex, + int (*iter)(struct nf_conn *i, void *data), + gfp_t gfp_flags) +{ + struct masq_dev_work *w; + + if (atomic_read(&masq_worker_count) > MAX_MASQ_WORKER_COUNT) + return; + + net = maybe_get_net(net); + if (!net) + return; + + if (!try_module_get(THIS_MODULE)) + goto err_module; + + w = kzalloc(sizeof(*w), gfp_flags); + if (w) { + /* We can overshoot MAX_MASQ_WORKER_COUNT, no big deal */ + atomic_inc(&masq_worker_count); + + INIT_WORK(&w->work, iterate_cleanup_work); + w->ifindex = ifindex; + w->net = net; + w->iter = iter; + if (addr) + w->addr = *addr; + schedule_work(&w->work); + return; + } + + module_put(THIS_MODULE); + err_module: + put_net(net); +} + static int device_cmp(struct nf_conn *i, void *ifindex) { const struct nf_conn_nat *nat = nfct_nat(i); @@ -136,8 +204,6 @@ static struct notifier_block masq_inet_notifier = { }; #if IS_ENABLED(CONFIG_IPV6) -static atomic_t v6_worker_count __read_mostly; - static int nat_ipv6_dev_get_saddr(struct net *net, const struct net_device *dev, const struct in6_addr *daddr, unsigned int srcprefs, @@ -187,13 +253,6 @@ nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6); -struct masq_dev_work { - struct work_struct work; - struct net *net; - struct in6_addr addr; - int ifindex; -}; - static int inet6_cmp(struct nf_conn *ct, void *work) { struct masq_dev_work *w = (struct masq_dev_work *)work; @@ -204,21 +263,7 @@ static int inet6_cmp(struct nf_conn *ct, void *work) tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; - return ipv6_addr_equal(&w->addr, &tuple->dst.u3.in6); -} - -static void iterate_cleanup_work(struct work_struct *work) -{ - struct masq_dev_work *w; - - w = container_of(work, struct masq_dev_work, work); - - nf_ct_iterate_cleanup_net(w->net, inet6_cmp, (void *)w, 0, 0); - - put_net(w->net); - kfree(w); - atomic_dec(&v6_worker_count); - module_put(THIS_MODULE); + return nf_inet_addr_cmp(&w->addr, &tuple->dst.u3); } /* atomic notifier; can't call nf_ct_iterate_cleanup_net (it can sleep). @@ -233,36 +278,19 @@ static int masq_inet6_event(struct notifier_block *this, { struct inet6_ifaddr *ifa = ptr; const struct net_device *dev; - struct masq_dev_work *w; - struct net *net; + union nf_inet_addr addr; - if (event != NETDEV_DOWN || atomic_read(&v6_worker_count) >= 16) + if (event != NETDEV_DOWN) return NOTIFY_DONE; dev = ifa->idev->dev; - net = maybe_get_net(dev_net(dev)); - if (!net) - return NOTIFY_DONE; - if (!try_module_get(THIS_MODULE)) - goto err_module; + memset(&addr, 0, sizeof(addr)); - w = kmalloc(sizeof(*w), GFP_ATOMIC); - if (w) { - atomic_inc(&v6_worker_count); + addr.in6 = ifa->addr; - INIT_WORK(&w->work, iterate_cleanup_work); - w->ifindex = dev->ifindex; - w->net = net; - w->addr = ifa->addr; - schedule_work(&w->work); - - return NOTIFY_DONE; - } - - module_put(THIS_MODULE); - err_module: - put_net(net); + nf_nat_masq_schedule(dev_net(dev), &addr, dev->ifindex, inet6_cmp, + GFP_ATOMIC); return NOTIFY_DONE; } From 4de5fe4c9d611a2583d2f41489e2508d400c297c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 15 Sep 2021 16:46:39 +0200 Subject: [PATCH 1126/5344] netfilter: nf_nat_masquerade: defer conntrack walk to work queue [ Upstream commit 7970a19b71044bf4dc2c1becc200275bdf1884d4 ] The ipv4 and device notifiers are called with RTNL mutex held. The table walk can take some time, better not block other RTNL users. 'ip a' has been reported to block for up to 20 seconds when conntrack table has many entries and device down events are frequent (e.g., PPP). Reported-and-tested-by: Martin Zaharinov Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/nf_nat_masquerade.c | 50 +++++++++++++++---------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c index 415919a6ac1ad..acd73f717a088 100644 --- a/net/netfilter/nf_nat_masquerade.c +++ b/net/netfilter/nf_nat_masquerade.c @@ -131,13 +131,14 @@ static void nf_nat_masq_schedule(struct net *net, union nf_inet_addr *addr, put_net(net); } -static int device_cmp(struct nf_conn *i, void *ifindex) +static int device_cmp(struct nf_conn *i, void *arg) { const struct nf_conn_nat *nat = nfct_nat(i); + const struct masq_dev_work *w = arg; if (!nat) return 0; - return nat->masq_index == (int)(long)ifindex; + return nat->masq_index == w->ifindex; } static int masq_device_event(struct notifier_block *this, @@ -153,8 +154,8 @@ static int masq_device_event(struct notifier_block *this, * and forget them. */ - nf_ct_iterate_cleanup_net(net, device_cmp, - (void *)(long)dev->ifindex, 0, 0); + nf_nat_masq_schedule(net, NULL, dev->ifindex, + device_cmp, GFP_KERNEL); } return NOTIFY_DONE; @@ -162,35 +163,45 @@ static int masq_device_event(struct notifier_block *this, static int inet_cmp(struct nf_conn *ct, void *ptr) { - struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; - struct net_device *dev = ifa->ifa_dev->dev; struct nf_conntrack_tuple *tuple; + struct masq_dev_work *w = ptr; - if (!device_cmp(ct, (void *)(long)dev->ifindex)) + if (!device_cmp(ct, ptr)) return 0; tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; - return ifa->ifa_address == tuple->dst.u3.ip; + return nf_inet_addr_cmp(&w->addr, &tuple->dst.u3); } static int masq_inet_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct in_device *idev = ((struct in_ifaddr *)ptr)->ifa_dev; - struct net *net = dev_net(idev->dev); + const struct in_ifaddr *ifa = ptr; + const struct in_device *idev; + const struct net_device *dev; + union nf_inet_addr addr; + + if (event != NETDEV_DOWN) + return NOTIFY_DONE; /* The masq_dev_notifier will catch the case of the device going * down. So if the inetdev is dead and being destroyed we have * no work to do. Otherwise this is an individual address removal * and we have to perform the flush. */ + idev = ifa->ifa_dev; if (idev->dead) return NOTIFY_DONE; - if (event == NETDEV_DOWN) - nf_ct_iterate_cleanup_net(net, inet_cmp, ptr, 0, 0); + memset(&addr, 0, sizeof(addr)); + + addr.ip = ifa->ifa_address; + + dev = idev->dev; + nf_nat_masq_schedule(dev_net(idev->dev), &addr, dev->ifindex, + inet_cmp, GFP_KERNEL); return NOTIFY_DONE; } @@ -253,19 +264,6 @@ nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6); -static int inet6_cmp(struct nf_conn *ct, void *work) -{ - struct masq_dev_work *w = (struct masq_dev_work *)work; - struct nf_conntrack_tuple *tuple; - - if (!device_cmp(ct, (void *)(long)w->ifindex)) - return 0; - - tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; - - return nf_inet_addr_cmp(&w->addr, &tuple->dst.u3); -} - /* atomic notifier; can't call nf_ct_iterate_cleanup_net (it can sleep). * * Defer it to the system workqueue. @@ -289,7 +287,7 @@ static int masq_inet6_event(struct notifier_block *this, addr.in6 = ifa->addr; - nf_nat_masq_schedule(dev_net(dev), &addr, dev->ifindex, inet6_cmp, + nf_nat_masq_schedule(dev_net(dev), &addr, dev->ifindex, inet_cmp, GFP_ATOMIC); return NOTIFY_DONE; } From b012c88fe2d5753f24cb33009f8c0e36a558df8a Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 27 Aug 2021 22:42:30 +0800 Subject: [PATCH 1127/5344] mac80211: Drop frames from invalid MAC address in ad-hoc mode [ Upstream commit a6555f844549cd190eb060daef595f94d3de1582 ] WARNING: CPU: 1 PID: 9 at net/mac80211/sta_info.c:554 sta_info_insert_rcu+0x121/0x12a0 Modules linked in: CPU: 1 PID: 9 Comm: kworker/u8:1 Not tainted 5.14.0-rc7+ #253 Workqueue: phy3 ieee80211_iface_work RIP: 0010:sta_info_insert_rcu+0x121/0x12a0 ... Call Trace: ieee80211_ibss_finish_sta+0xbc/0x170 ieee80211_ibss_work+0x13f/0x7d0 ieee80211_iface_work+0x37a/0x500 process_one_work+0x357/0x850 worker_thread+0x41/0x4d0 If an Ad-Hoc node receives packets with invalid source MAC address, it hits a WARN_ON in sta_info_insert_check(), this can spam the log. Signed-off-by: YueHaibing Link: https://lore.kernel.org/r/20210827144230.39944-1-yuehaibing@huawei.com Signed-off-by: Johannes Berg Signed-off-by: Sasha Levin --- net/mac80211/rx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 670d84e54db73..c7e6bf7c22c78 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3952,7 +3952,8 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) if (!bssid) return false; if (ether_addr_equal(sdata->vif.addr, hdr->addr2) || - ether_addr_equal(sdata->u.ibss.bssid, hdr->addr2)) + ether_addr_equal(sdata->u.ibss.bssid, hdr->addr2) || + !is_valid_ether_addr(hdr->addr2)) return false; if (ieee80211_is_beacon(hdr->frame_control)) return true; From 6523dba38c746dc76c59ca7377dc2d7cd0904786 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 25 Jul 2021 17:19:00 +0000 Subject: [PATCH 1128/5344] m68k: Handle arrivals of multiple signals correctly [ Upstream commit 4bb0bd81ce5e97092dfda6a106d414b703ec0ee8 ] When we have several pending signals, have entered with the kernel with large exception frame *and* have already built at least one sigframe, regs->stkadj is going to be non-zero and regs->format/sr/pc are going to be junk - the real values are in shifted exception stack frame we'd built when putting together the first sigframe. If that happens, subsequent sigframes are going to be garbage. Not hard to fix - just need to find the "adjusted" frame first and look for format/vector/sr/pc in it. Signed-off-by: Al Viro Tested-by: Michael Schmitz Reviewed-by: Michael Schmitz Tested-by: Finn Thain Link: https://lore.kernel.org/r/YP2dBIAPTaVvHiZ6@zeniv-ca.linux.org.uk Signed-off-by: Geert Uytterhoeven Signed-off-by: Sasha Levin --- arch/m68k/kernel/signal.c | 88 +++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 46 deletions(-) diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index 617d105a92197..fb4c850de7652 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -448,7 +448,7 @@ static inline void save_fpu_state(struct sigcontext *sc, struct pt_regs *regs) if (CPU_IS_060 ? sc->sc_fpstate[2] : sc->sc_fpstate[0]) { fpu_version = sc->sc_fpstate[0]; - if (CPU_IS_020_OR_030 && + if (CPU_IS_020_OR_030 && !regs->stkadj && regs->vector >= (VEC_FPBRUC * 4) && regs->vector <= (VEC_FPNAN * 4)) { /* Clear pending exception in 68882 idle frame */ @@ -511,7 +511,7 @@ static inline int rt_save_fpu_state(struct ucontext __user *uc, struct pt_regs * if (!(CPU_IS_060 || CPU_IS_COLDFIRE)) context_size = fpstate[1]; fpu_version = fpstate[0]; - if (CPU_IS_020_OR_030 && + if (CPU_IS_020_OR_030 && !regs->stkadj && regs->vector >= (VEC_FPBRUC * 4) && regs->vector <= (VEC_FPNAN * 4)) { /* Clear pending exception in 68882 idle frame */ @@ -829,18 +829,24 @@ asmlinkage int do_rt_sigreturn(struct pt_regs *regs, struct switch_stack *sw) return 0; } +static inline struct pt_regs *rte_regs(struct pt_regs *regs) +{ + return (void *)regs + regs->stkadj; +} + static void setup_sigcontext(struct sigcontext *sc, struct pt_regs *regs, unsigned long mask) { + struct pt_regs *tregs = rte_regs(regs); sc->sc_mask = mask; sc->sc_usp = rdusp(); sc->sc_d0 = regs->d0; sc->sc_d1 = regs->d1; sc->sc_a0 = regs->a0; sc->sc_a1 = regs->a1; - sc->sc_sr = regs->sr; - sc->sc_pc = regs->pc; - sc->sc_formatvec = regs->format << 12 | regs->vector; + sc->sc_sr = tregs->sr; + sc->sc_pc = tregs->pc; + sc->sc_formatvec = tregs->format << 12 | tregs->vector; save_a5_state(sc, regs); save_fpu_state(sc, regs); } @@ -848,6 +854,7 @@ static void setup_sigcontext(struct sigcontext *sc, struct pt_regs *regs, static inline int rt_setup_ucontext(struct ucontext __user *uc, struct pt_regs *regs) { struct switch_stack *sw = (struct switch_stack *)regs - 1; + struct pt_regs *tregs = rte_regs(regs); greg_t __user *gregs = uc->uc_mcontext.gregs; int err = 0; @@ -868,9 +875,9 @@ static inline int rt_setup_ucontext(struct ucontext __user *uc, struct pt_regs * err |= __put_user(sw->a5, &gregs[13]); err |= __put_user(sw->a6, &gregs[14]); err |= __put_user(rdusp(), &gregs[15]); - err |= __put_user(regs->pc, &gregs[16]); - err |= __put_user(regs->sr, &gregs[17]); - err |= __put_user((regs->format << 12) | regs->vector, &uc->uc_formatvec); + err |= __put_user(tregs->pc, &gregs[16]); + err |= __put_user(tregs->sr, &gregs[17]); + err |= __put_user((tregs->format << 12) | tregs->vector, &uc->uc_formatvec); err |= rt_save_fpu_state(uc, regs); return err; } @@ -887,13 +894,14 @@ static int setup_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) { struct sigframe __user *frame; - int fsize = frame_extra_sizes(regs->format); + struct pt_regs *tregs = rte_regs(regs); + int fsize = frame_extra_sizes(tregs->format); struct sigcontext context; int err = 0, sig = ksig->sig; if (fsize < 0) { pr_debug("setup_frame: Unknown frame format %#x\n", - regs->format); + tregs->format); return -EFAULT; } @@ -904,7 +912,7 @@ static int setup_frame(struct ksignal *ksig, sigset_t *set, err |= __put_user(sig, &frame->sig); - err |= __put_user(regs->vector, &frame->code); + err |= __put_user(tregs->vector, &frame->code); err |= __put_user(&frame->sc, &frame->psc); if (_NSIG_WORDS > 1) @@ -929,34 +937,28 @@ static int setup_frame(struct ksignal *ksig, sigset_t *set, push_cache ((unsigned long) &frame->retcode); - /* - * Set up registers for signal handler. All the state we are about - * to destroy is successfully copied to sigframe. - */ - wrusp ((unsigned long) frame); - regs->pc = (unsigned long) ksig->ka.sa.sa_handler; - adjustformat(regs); - /* * This is subtle; if we build more than one sigframe, all but the * first one will see frame format 0 and have fsize == 0, so we won't * screw stkadj. */ - if (fsize) + if (fsize) { regs->stkadj = fsize; - - /* Prepare to skip over the extra stuff in the exception frame. */ - if (regs->stkadj) { - struct pt_regs *tregs = - (struct pt_regs *)((ulong)regs + regs->stkadj); + tregs = rte_regs(regs); pr_debug("Performing stackadjust=%04lx\n", regs->stkadj); - /* This must be copied with decreasing addresses to - handle overlaps. */ tregs->vector = 0; tregs->format = 0; - tregs->pc = regs->pc; tregs->sr = regs->sr; } + + /* + * Set up registers for signal handler. All the state we are about + * to destroy is successfully copied to sigframe. + */ + wrusp ((unsigned long) frame); + tregs->pc = (unsigned long) ksig->ka.sa.sa_handler; + adjustformat(regs); + return 0; } @@ -964,7 +966,8 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) { struct rt_sigframe __user *frame; - int fsize = frame_extra_sizes(regs->format); + struct pt_regs *tregs = rte_regs(regs); + int fsize = frame_extra_sizes(tregs->format); int err = 0, sig = ksig->sig; if (fsize < 0) { @@ -1013,34 +1016,27 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, push_cache ((unsigned long) &frame->retcode); - /* - * Set up registers for signal handler. All the state we are about - * to destroy is successfully copied to sigframe. - */ - wrusp ((unsigned long) frame); - regs->pc = (unsigned long) ksig->ka.sa.sa_handler; - adjustformat(regs); - /* * This is subtle; if we build more than one sigframe, all but the * first one will see frame format 0 and have fsize == 0, so we won't * screw stkadj. */ - if (fsize) + if (fsize) { regs->stkadj = fsize; - - /* Prepare to skip over the extra stuff in the exception frame. */ - if (regs->stkadj) { - struct pt_regs *tregs = - (struct pt_regs *)((ulong)regs + regs->stkadj); + tregs = rte_regs(regs); pr_debug("Performing stackadjust=%04lx\n", regs->stkadj); - /* This must be copied with decreasing addresses to - handle overlaps. */ tregs->vector = 0; tregs->format = 0; - tregs->pc = regs->pc; tregs->sr = regs->sr; } + + /* + * Set up registers for signal handler. All the state we are about + * to destroy is successfully copied to sigframe. + */ + wrusp ((unsigned long) frame); + tregs->pc = (unsigned long) ksig->ka.sa.sa_handler; + adjustformat(regs); return 0; } From 77c71e625c9d8556e9864931d0b608bda8dd4e4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B4=87?= Date: Fri, 24 Sep 2021 10:35:58 +0800 Subject: [PATCH 1129/5344] net: prevent user from passing illegal stab size [ Upstream commit b193e15ac69d56f35e1d8e2b5d16cbd47764d053 ] We observed below report when playing with netlink sock: UBSAN: shift-out-of-bounds in net/sched/sch_api.c:580:10 shift exponent 249 is too large for 32-bit type CPU: 0 PID: 685 Comm: a.out Not tainted Call Trace: dump_stack_lvl+0x8d/0xcf ubsan_epilogue+0xa/0x4e __ubsan_handle_shift_out_of_bounds+0x161/0x182 __qdisc_calculate_pkt_len+0xf0/0x190 __dev_queue_xmit+0x2ed/0x15b0 it seems like kernel won't check the stab log value passing from user, and will use the insane value later to calculate pkt_len. This patch just add a check on the size/cell_log to avoid insane calculation. Reported-by: Abaci Signed-off-by: Michael Wang Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/net/pkt_sched.h | 1 + net/sched/sch_api.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index b16f9236de147..d1585b54fb0bd 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -11,6 +11,7 @@ #include #define DEFAULT_TX_QUEUE_LEN 1000 +#define STAB_SIZE_LOG_MAX 30 struct qdisc_walker { int stop; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 3b1b5ee521379..e70f990334083 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -510,6 +510,12 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt, return stab; } + if (s->size_log > STAB_SIZE_LOG_MAX || + s->cell_log > STAB_SIZE_LOG_MAX) { + NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table"); + return ERR_PTR(-EINVAL); + } + stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL); if (!stab) return ERR_PTR(-ENOMEM); From 655b9a022298b2ca0e22877bbe5c27dfd595a103 Mon Sep 17 00:00:00 2001 From: MichelleJin Date: Mon, 27 Sep 2021 03:34:57 +0000 Subject: [PATCH 1130/5344] mac80211: check return value of rhashtable_init [ Upstream commit 111461d573741c17eafad029ac93474fa9adcce0 ] When rhashtable_init() fails, it returns -EINVAL. However, since error return value of rhashtable_init is not checked, it can cause use of uninitialized pointers. So, fix unhandled errors of rhashtable_init. Signed-off-by: MichelleJin Link: https://lore.kernel.org/r/20210927033457.1020967-4-shjy180909@gmail.com Signed-off-by: Johannes Berg Signed-off-by: Sasha Levin --- net/mac80211/mesh_pathtbl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index 1708b64d41094..d7ae7415d54d0 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -60,7 +60,10 @@ static struct mesh_table *mesh_table_alloc(void) atomic_set(&newtbl->entries, 0); spin_lock_init(&newtbl->gates_lock); spin_lock_init(&newtbl->walk_lock); - rhashtable_init(&newtbl->rhead, &mesh_rht_params); + if (rhashtable_init(&newtbl->rhead, &mesh_rht_params)) { + kfree(newtbl); + return NULL; + } return newtbl; } From 5ac649e6ca69d102116eceeb8d0b42c2d4320f87 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 27 Sep 2021 14:48:23 -0700 Subject: [PATCH 1131/5344] net: sun: SUNVNET_COMMON should depend on INET [ Upstream commit 103bde372f084206c6972be543ecc247ebbff9f3 ] When CONFIG_INET is not set, there are failing references to IPv4 functions, so make this driver depend on INET. Fixes these build errors: sparc64-linux-ld: drivers/net/ethernet/sun/sunvnet_common.o: in function `sunvnet_start_xmit_common': sunvnet_common.c:(.text+0x1a68): undefined reference to `__icmp_send' sparc64-linux-ld: drivers/net/ethernet/sun/sunvnet_common.o: in function `sunvnet_poll_common': sunvnet_common.c:(.text+0x358c): undefined reference to `ip_send_check' Signed-off-by: Randy Dunlap Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Aaron Young Cc: Rashmi Narasimhan Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/sun/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/sun/Kconfig b/drivers/net/ethernet/sun/Kconfig index 7b982e02ea3a4..1080a2a3e13a2 100644 --- a/drivers/net/ethernet/sun/Kconfig +++ b/drivers/net/ethernet/sun/Kconfig @@ -73,6 +73,7 @@ config CASSINI config SUNVNET_COMMON tristate "Common routines to support Sun Virtual Networking" depends on SUN_LDOMS + depends on INET default m config SUNVNET From 7f317e3787d6363c37c0af72d70c74a08f6c3b4a Mon Sep 17 00:00:00 2001 From: Leslie Shi Date: Thu, 23 Sep 2021 16:05:31 +0800 Subject: [PATCH 1132/5344] drm/amdgpu: fix gart.bo pin_count leak MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 66805763a97f8f7bdf742fc0851d85c02ed9411f ] gmc_v{9,10}_0_gart_disable() isn't called matched with correspoding gart_enbale function in SRIOV case. This will lead to gart.bo pin_count leak on driver unload. Cc: Hawking Zhang Signed-off-by: Leslie Shi Signed-off-by: Guchun Chen Reviewed-by: Christian König Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin --- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 3 ++- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index f642e066e67a2..85ee0e849647e 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -903,6 +903,8 @@ static int gmc_v10_0_hw_fini(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; + gmc_v10_0_gart_disable(adev); + if (amdgpu_sriov_vf(adev)) { /* full access mode, so don't touch any GMC register */ DRM_DEBUG("For SRIOV client, shouldn't do anything.\n"); @@ -910,7 +912,6 @@ static int gmc_v10_0_hw_fini(void *handle) } amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0); - gmc_v10_0_gart_disable(adev); return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 688111ef814de..63205de4a5656 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -1526,6 +1526,8 @@ static int gmc_v9_0_hw_fini(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; + gmc_v9_0_gart_disable(adev); + if (amdgpu_sriov_vf(adev)) { /* full access mode, so don't touch any GMC register */ DRM_DEBUG("For SRIOV client, shouldn't do anything.\n"); @@ -1534,7 +1536,6 @@ static int gmc_v9_0_hw_fini(void *handle) amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0); amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0); - gmc_v9_0_gart_disable(adev); return 0; } From ee163267b7f896890421b35ec49e30b33de85973 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 24 Sep 2021 17:51:53 +0800 Subject: [PATCH 1133/5344] scsi: ses: Fix unsigned comparison with less than zero [ Upstream commit dd689ed5aa905daf4ba4c99319a52aad6ea0a796 ] Fix the following coccicheck warning: ./drivers/scsi/ses.c:137:10-16: WARNING: Unsigned expression compared with zero: result > 0. Link: https://lore.kernel.org/r/1632477113-90378-1-git-send-email-jiapeng.chong@linux.alibaba.com Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/ses.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/ses.c b/drivers/scsi/ses.c index 43e682297fd5f..0a1734f34587d 100644 --- a/drivers/scsi/ses.c +++ b/drivers/scsi/ses.c @@ -118,7 +118,7 @@ static int ses_recv_diag(struct scsi_device *sdev, int page_code, static int ses_send_diag(struct scsi_device *sdev, int page_code, void *buf, int bufflen) { - u32 result; + int result; unsigned char cmd[] = { SEND_DIAGNOSTIC, From f01fef86243bae83d25380c48b9a04d3dcf2c83d Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sat, 25 Sep 2021 00:03:30 +0100 Subject: [PATCH 1134/5344] scsi: virtio_scsi: Fix spelling mistake "Unsupport" -> "Unsupported" [ Upstream commit cced4c0ec7c06f5230a2958907a409c849762293 ] There are a couple of spelling mistakes in pr_info and pr_err messages. Fix them. Link: https://lore.kernel.org/r/20210924230330.143785-1-colin.king@canonical.com Signed-off-by: Colin Ian King Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/virtio_scsi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index bfec84aacd90b..cb833c5fb9ce2 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -297,7 +297,7 @@ static void virtscsi_handle_transport_reset(struct virtio_scsi *vscsi, } break; default: - pr_info("Unsupport virtio scsi event reason %x\n", event->reason); + pr_info("Unsupported virtio scsi event reason %x\n", event->reason); } } @@ -381,7 +381,7 @@ static void virtscsi_handle_event(struct work_struct *work) virtscsi_handle_param_change(vscsi, event); break; default: - pr_err("Unsupport virtio scsi event %x\n", event->event); + pr_err("Unsupported virtio scsi event %x\n", event->event); } virtscsi_kick_event(vscsi, event_node); } From 347674923b3987eee6af4b5dfa0a706196dcb680 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 20 Sep 2021 15:31:11 +0200 Subject: [PATCH 1135/5344] sched: Always inline is_percpu_thread() [ Upstream commit 83d40a61046f73103b4e5d8f1310261487ff63b0 ] vmlinux.o: warning: objtool: check_preemption_disabled()+0x81: call to is_percpu_thread() leaves .noinstr.text section Reported-by: Stephen Rothwell Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210928084218.063371959@infradead.org Signed-off-by: Sasha Levin --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index da4f5f2baced5..f76c84f1d5665 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1526,7 +1526,7 @@ extern struct pid *cad_pid; #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) #define used_math() tsk_used_math(current) -static inline bool is_percpu_thread(void) +static __always_inline bool is_percpu_thread(void) { #ifdef CONFIG_SMP return (current->flags & PF_NO_SETAFFINITY) && From 20eaa1bff287c59456a88a11408a14827e105a48 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 17 Oct 2021 10:42:35 +0200 Subject: [PATCH 1136/5344] Linux 5.4.154 Link: https://lore.kernel.org/r/20211014145207.314256898@linuxfoundation.org Tested-by: Jon Hunter Tested-by: Florian Fainelli Tested-by: Shuah Khan Tested-by: Hulk Robot Tested-by: Sudip Mukherjee Tested-by: Linux Kernel Functional Testing Tested-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index df9b1d07ca097..3358f56a37f06 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 PATCHLEVEL = 4 -SUBLEVEL = 153 +SUBLEVEL = 154 EXTRAVERSION = NAME = Kleptomaniac Octopus From 9c048da0c6593634ee8e9cd99d0997246440cdaf Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 14 Dec 2020 15:26:14 +0100 Subject: [PATCH 1137/5344] ovl: simplify file splice commit 82a763e61e2b601309d696d4fa514c77d64ee1be upstream. generic_file_splice_read() and iter_file_splice_write() will call back into f_op->iter_read() and f_op->iter_write() respectively. These already do the real file lookup and cred override. So the code in ovl_splice_read() and ovl_splice_write() is redundant. In addition the ovl_file_accessed() call in ovl_splice_write() is incorrect, though probably harmless. Fix by calling generic_file_splice_read() and iter_file_splice_write() directly. Signed-off-by: Miklos Szeredi [reported to resolve issues with 1a980b8cbf00 ("ovl: add splice file read write helper")] Signed-off-by: Greg Kroah-Hartman --- fs/overlayfs/file.c | 46 ++------------------------------------------- 1 file changed, 2 insertions(+), 44 deletions(-) diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 8d43faf5bda04..2ba8b328f4d0c 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -299,48 +299,6 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) return ret; } -static ssize_t ovl_splice_read(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) -{ - ssize_t ret; - struct fd real; - const struct cred *old_cred; - - ret = ovl_real_fdget(in, &real); - if (ret) - return ret; - - old_cred = ovl_override_creds(file_inode(in)->i_sb); - ret = generic_file_splice_read(real.file, ppos, pipe, len, flags); - revert_creds(old_cred); - - ovl_file_accessed(in); - fdput(real); - return ret; -} - -static ssize_t -ovl_splice_write(struct pipe_inode_info *pipe, struct file *out, - loff_t *ppos, size_t len, unsigned int flags) -{ - struct fd real; - const struct cred *old_cred; - ssize_t ret; - - ret = ovl_real_fdget(out, &real); - if (ret) - return ret; - - old_cred = ovl_override_creds(file_inode(out)->i_sb); - ret = iter_file_splice_write(pipe, real.file, ppos, len, flags); - revert_creds(old_cred); - - ovl_file_accessed(out); - fdput(real); - return ret; -} - static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct fd real; @@ -701,8 +659,8 @@ const struct file_operations ovl_file_operations = { .fadvise = ovl_fadvise, .unlocked_ioctl = ovl_ioctl, .compat_ioctl = ovl_compat_ioctl, - .splice_read = ovl_splice_read, - .splice_write = ovl_splice_write, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, .copy_file_range = ovl_copy_file_range, .remap_file_range = ovl_remap_file_range, From 4ac910dead08fba1aa6625d7194a532542d1d31a Mon Sep 17 00:00:00 2001 From: Jonas Hahnfeld Date: Tue, 12 Oct 2021 22:09:07 +0200 Subject: [PATCH 1138/5344] ALSA: usb-audio: Add quirk for VF0770 commit 48827e1d6af58f219e89c7ec08dccbca28c7694e upstream. The device advertises 8 formats, but only a rate of 48kHz is honored by the hardware and 24 bits give chopped audio, so only report the one working combination. This fixes out-of-the-box audio experience with PipeWire which otherwise attempts to choose S24_3LE (while PulseAudio defaulted to S16_LE). Signed-off-by: Jonas Hahnfeld Cc: Link: https://lore.kernel.org/r/20211012200906.3492-1-hahnjo@hahnjo.de Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/quirks-table.h | 42 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/sound/usb/quirks-table.h b/sound/usb/quirks-table.h index 441335abb4018..9620ae0003ce4 100644 --- a/sound/usb/quirks-table.h +++ b/sound/usb/quirks-table.h @@ -125,6 +125,48 @@ .bInterfaceClass = USB_CLASS_AUDIO, }, +/* + * Creative Technology, Ltd Live! Cam Sync HD [VF0770] + * The device advertises 8 formats, but only a rate of 48kHz is honored by the + * hardware and 24 bits give chopped audio, so only report the one working + * combination. + */ +{ + USB_DEVICE(0x041e, 0x4095), + .driver_info = (unsigned long) &(const struct snd_usb_audio_quirk) { + .ifnum = QUIRK_ANY_INTERFACE, + .type = QUIRK_COMPOSITE, + .data = &(const struct snd_usb_audio_quirk[]) { + { + .ifnum = 2, + .type = QUIRK_AUDIO_STANDARD_MIXER, + }, + { + .ifnum = 3, + .type = QUIRK_AUDIO_FIXED_ENDPOINT, + .data = &(const struct audioformat) { + .formats = SNDRV_PCM_FMTBIT_S16_LE, + .channels = 2, + .fmt_bits = 16, + .iface = 3, + .altsetting = 4, + .altset_idx = 4, + .endpoint = 0x82, + .ep_attr = 0x05, + .rates = SNDRV_PCM_RATE_48000, + .rate_min = 48000, + .rate_max = 48000, + .nr_rates = 1, + .rate_table = (unsigned int[]) { 48000 }, + }, + }, + { + .ifnum = -1 + }, + }, + }, +}, + /* * HP Wireless Audio * When not ignored, causes instability issues for some users, forcing them to From 9876ccdeb458f3f6e271d05cd9131427efe9cbb8 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 30 Sep 2021 13:41:14 +0200 Subject: [PATCH 1139/5344] ALSA: seq: Fix a potential UAF by wrong private_free call order commit 1f8763c59c4ec6254d629fe77c0a52220bd907aa upstream. John Keeping reported and posted a patch for a potential UAF in rawmidi sequencer destruction: the snd_rawmidi_dev_seq_free() may be called after the associated rawmidi object got already freed. After a deeper look, it turned out that the bug is rather the incorrect private_free call order for a snd_seq_device. The snd_seq_device private_free gets called at the release callback of the sequencer device object, while this was rather expected to be executed at the snd_device call chains that runs at the beginning of the whole card-free procedure. It's been broken since the rewrite of sequencer-device binding (although it hasn't surfaced because the sequencer device release happens usually right along with the card device release). This patch corrects the private_free call to be done in the right place, at snd_seq_device_dev_free(). Fixes: 7c37ae5c625a ("ALSA: seq: Rewrite sequencer device binding with standard bus") Reported-and-tested-by: John Keeping Cc: Link: https://lore.kernel.org/r/20210930114114.8645-1-tiwai@suse.de Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/seq_device.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/sound/core/seq_device.c b/sound/core/seq_device.c index e9dbad93f9d09..c9223049551c4 100644 --- a/sound/core/seq_device.c +++ b/sound/core/seq_device.c @@ -147,6 +147,8 @@ static int snd_seq_device_dev_free(struct snd_device *device) struct snd_seq_device *dev = device->device_data; cancel_autoload_drivers(); + if (dev->private_free) + dev->private_free(dev); put_device(&dev->dev); return 0; } @@ -174,11 +176,7 @@ static int snd_seq_device_dev_disconnect(struct snd_device *device) static void snd_seq_dev_release(struct device *dev) { - struct snd_seq_device *sdev = to_seq_dev(dev); - - if (sdev->private_free) - sdev->private_free(sdev); - kfree(sdev); + kfree(to_seq_dev(dev)); } /* From 9c3dff3e11f18b19a8c84437ff52157061a4329a Mon Sep 17 00:00:00 2001 From: Werner Sembach Date: Fri, 1 Oct 2021 15:31:10 +0200 Subject: [PATCH 1140/5344] ALSA: hda/realtek: Complete partial device name to avoid ambiguity commit 1f8d398e1cd8813f8ec16d55c086e8270a9c18ab upstream. The string "Clevo X170" is not enough to unambiguously identify the correct device. Fixing it so another Clevo barebone name starting with "X170" can be added without causing confusion. Signed-off-by: Werner Sembach Cc: Link: https://lore.kernel.org/r/20211001133111.428249-2-wse@tuxedocomputers.com Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index abe371c01fba2..a152f026ac9b7 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -2539,7 +2539,7 @@ static const struct snd_pci_quirk alc882_fixup_tbl[] = { SND_PCI_QUIRK(0x1558, 0x67e1, "Clevo PB71[DE][CDF]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x67e5, "Clevo PC70D[PRS](?:-D|-G)?", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x70d1, "Clevo PC70[ER][CDF]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), - SND_PCI_QUIRK(0x1558, 0x7714, "Clevo X170", ALC1220_FIXUP_CLEVO_PB51ED_PINS), + SND_PCI_QUIRK(0x1558, 0x7714, "Clevo X170SM", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x9501, "Clevo P950HR", ALC1220_FIXUP_CLEVO_P950), SND_PCI_QUIRK(0x1558, 0x9506, "Clevo P955HQ", ALC1220_FIXUP_CLEVO_P950), SND_PCI_QUIRK(0x1558, 0x950a, "Clevo P955H[PR]", ALC1220_FIXUP_CLEVO_P950), From 7ac7cfb562db350bea89c9bd7bfdd9aa37076e40 Mon Sep 17 00:00:00 2001 From: Werner Sembach Date: Fri, 1 Oct 2021 15:31:11 +0200 Subject: [PATCH 1141/5344] ALSA: hda/realtek: Add quirk for Clevo X170KM-G commit cc03069a397005da24f6783835c274d5aedf6043 upstream. This applies a SND_PCI_QUIRK(...) to the Clevo X170KM-G barebone. This fixes the issue of the devices internal Speaker not working. Signed-off-by: Werner Sembach Cc: Link: https://lore.kernel.org/r/20211001133111.428249-3-wse@tuxedocomputers.com Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index a152f026ac9b7..1fbfe6d9d9efc 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -2540,6 +2540,7 @@ static const struct snd_pci_quirk alc882_fixup_tbl[] = { SND_PCI_QUIRK(0x1558, 0x67e5, "Clevo PC70D[PRS](?:-D|-G)?", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x70d1, "Clevo PC70[ER][CDF]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x7714, "Clevo X170SM", ALC1220_FIXUP_CLEVO_PB51ED_PINS), + SND_PCI_QUIRK(0x1558, 0x7715, "Clevo X170KM-G", ALC1220_FIXUP_CLEVO_PB51ED), SND_PCI_QUIRK(0x1558, 0x9501, "Clevo P950HR", ALC1220_FIXUP_CLEVO_P950), SND_PCI_QUIRK(0x1558, 0x9506, "Clevo P955HQ", ALC1220_FIXUP_CLEVO_P950), SND_PCI_QUIRK(0x1558, 0x950a, "Clevo P955H[PR]", ALC1220_FIXUP_CLEVO_P950), From fbc9089f35eb0bf9b5e61fad0ea3f6b097bf2971 Mon Sep 17 00:00:00 2001 From: Kailang Yang Date: Tue, 5 Oct 2021 14:35:14 +0800 Subject: [PATCH 1142/5344] ALSA: hda/realtek - ALC236 headset MIC recording issue commit 5aec98913095ed3b4424ed6c5fdeb6964e9734da upstream. In power save mode, the recording voice from headset mic will 2s more delay. Add this patch will solve this issue. [ minor coding style fix by tiwai ] Signed-off-by: Kailang Yang Tested-by: Kai-Heng Feng Cc: Link: https://lore.kernel.org/r/ccb0cdd5bbd7486eabbd8d987d384cb0@realtek.com Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 1fbfe6d9d9efc..9b3a24360c57e 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -517,6 +517,8 @@ static void alc_shutup_pins(struct hda_codec *codec) struct alc_spec *spec = codec->spec; switch (codec->core.vendor_id) { + case 0x10ec0236: + case 0x10ec0256: case 0x10ec0283: case 0x10ec0286: case 0x10ec0288: @@ -3522,7 +3524,8 @@ static void alc256_shutup(struct hda_codec *codec) /* If disable 3k pulldown control for alc257, the Mic detection will not work correctly * when booting with headset plugged. So skip setting it for the codec alc257 */ - if (codec->core.vendor_id != 0x10ec0257) + if (spec->codec_variant != ALC269_TYPE_ALC257 && + spec->codec_variant != ALC269_TYPE_ALC256) alc_update_coef_idx(codec, 0x46, 0, 3 << 12); if (!spec->no_shutup_pins) From 5dc7ecc9b4190774e90e8800f99399a7bd3fe4cf Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Tue, 12 Oct 2021 19:47:48 +0800 Subject: [PATCH 1143/5344] ALSA: hda/realtek: Fix the mic type detection issue for ASUS G551JW commit a3fd1a986e499a06ac5ef95c3a39aa4611e7444c upstream. We need to define the codec pin 0x1b to be the mic, but somehow the mic doesn't support hot plugging detection, and Windows also has this issue, so we set it to phantom headset-mic. Also the determine_headset_type() often returns the omtp type by a mistake when we plug a ctia headset, this makes the mic can't record sound at all. Because most of the headset are ctia type nowadays and some machines have the fixed ctia type audio jack, it is possible this machine has the fixed ctia jack too. Here we set this mic jack to fixed ctia type, this could avoid the mic type detection mistake and make the ctia headset work stable. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=214537 Reported-and-tested-by: msd Cc: Signed-off-by: Hui Wang Link: https://lore.kernel.org/r/20211012114748.5238-1-hui.wang@canonical.com Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 9b3a24360c57e..c4837c78a8624 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9692,6 +9692,9 @@ enum { ALC671_FIXUP_HP_HEADSET_MIC2, ALC662_FIXUP_ACER_X2660G_HEADSET_MODE, ALC662_FIXUP_ACER_NITRO_HEADSET_MODE, + ALC668_FIXUP_ASUS_NO_HEADSET_MIC, + ALC668_FIXUP_HEADSET_MIC, + ALC668_FIXUP_MIC_DET_COEF, }; static const struct hda_fixup alc662_fixups[] = { @@ -10075,6 +10078,29 @@ static const struct hda_fixup alc662_fixups[] = { .chained = true, .chain_id = ALC662_FIXUP_USI_FUNC }, + [ALC668_FIXUP_ASUS_NO_HEADSET_MIC] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + { 0x1b, 0x04a1112c }, + { } + }, + .chained = true, + .chain_id = ALC668_FIXUP_HEADSET_MIC + }, + [ALC668_FIXUP_HEADSET_MIC] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc269_fixup_headset_mic, + .chained = true, + .chain_id = ALC668_FIXUP_MIC_DET_COEF + }, + [ALC668_FIXUP_MIC_DET_COEF] = { + .type = HDA_FIXUP_VERBS, + .v.verbs = (const struct hda_verb[]) { + { 0x20, AC_VERB_SET_COEF_INDEX, 0x15 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x0d60 }, + {} + }, + }, }; static const struct snd_pci_quirk alc662_fixup_tbl[] = { @@ -10110,6 +10136,7 @@ static const struct snd_pci_quirk alc662_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x15a7, "ASUS UX51VZH", ALC662_FIXUP_BASS_16), SND_PCI_QUIRK(0x1043, 0x177d, "ASUS N551", ALC668_FIXUP_ASUS_Nx51), SND_PCI_QUIRK(0x1043, 0x17bd, "ASUS N751", ALC668_FIXUP_ASUS_Nx51), + SND_PCI_QUIRK(0x1043, 0x185d, "ASUS G551JW", ALC668_FIXUP_ASUS_NO_HEADSET_MIC), SND_PCI_QUIRK(0x1043, 0x1963, "ASUS X71SL", ALC662_FIXUP_ASUS_MODE8), SND_PCI_QUIRK(0x1043, 0x1b73, "ASUS N55SF", ALC662_FIXUP_BASS_16), SND_PCI_QUIRK(0x1043, 0x1bf3, "ASUS N76VZ", ALC662_FIXUP_BASS_MODE4_CHMAP), From f874c59d1b932b76f69cc4f1d6293371e80c1fad Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Oct 2021 14:35:07 -0400 Subject: [PATCH 1144/5344] nds32/ftrace: Fix Error: invalid operands (*UND* and *UND* sections) for `^' commit be358af1191b1b2fedebd8f3421cafdc8edacc7d upstream. I received a build failure for a new patch I'm working on the nds32 architecture, and when I went to test it, I couldn't get to my build error, because it failed to build with a bunch of: Error: invalid operands (*UND* and *UND* sections) for `^' issues with various files. Those files were temporary asm files that looked like: kernel/.tmp_mc_fork.s I decided to look deeper, and found that the "mc" portion of that name stood for "mcount", and was created by the recordmcount.pl script. One that I wrote over a decade ago. Once I knew the source of the problem, I was able to investigate it further. The way the recordmcount.pl script works (BTW, there's a C version that simply modifies the ELF object) is by doing an "objdump" on the object file. Looks for all the calls to "mcount", and creates an offset of those locations from some global variable it can use (usually a global function name, found with <.*>:). Creates a asm file that is a table of references to these locations, using the found variable/function. Compiles it and links it back into the original object file. This asm file is called ".tmp_mc_.s". The problem here is that the objdump produced by the nds32 object file, contains things that look like: 0000159a <.L3^B1>: 159a: c6 00 beqz38 $r6, 159a <.L3^B1> 159a: R_NDS32_9_PCREL_RELA .text+0x159e 159c: 84 d2 movi55 $r6, #-14 159e: 80 06 mov55 $r0, $r6 15a0: ec 3c addi10.sp #0x3c Where ".L3^B1 is somehow selected as the "global" variable to index off of. Then the assembly file that holds the mcount locations looks like this: .section __mcount_loc,"a",@progbits .align 2 .long .L3^B1 + -5522 .long .L3^B1 + -5384 .long .L3^B1 + -5270 .long .L3^B1 + -5098 .long .L3^B1 + -4970 .long .L3^B1 + -4758 .long .L3^B1 + -4122 [...] And when it is compiled back to an object to link to the original object, the compile fails on the "^" symbol. Simple solution for now, is to have the perl script ignore using function symbols that have an "^" in the name. Link: https://lkml.kernel.org/r/20211014143507.4ad2c0f7@gandalf.local.home Cc: stable@vger.kernel.org Acked-by: Greentime Hu Fixes: fbf58a52ac088 ("nds32/ftrace: Add RECORD_MCOUNT support") Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- scripts/recordmcount.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 4f84657f55c23..f459ae883a0a6 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -222,7 +222,7 @@ sub check_objcopy $local_regex = "^[0-9a-fA-F]+\\s+t\\s+(\\S+)"; $weak_regex = "^[0-9a-fA-F]+\\s+([wW])\\s+(\\S+)"; $section_regex = "Disassembly of section\\s+(\\S+):"; -$function_regex = "^([0-9a-fA-F]+)\\s+<(.*?)>:"; +$function_regex = "^([0-9a-fA-F]+)\\s+<([^^]*?)>:"; $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s(mcount|__fentry__)\$"; $section_type = '@progbits'; $mcount_adjust = 0; From aa0612c46b1a64261b1ae7e91a5983d137bfc27c Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Tue, 5 Oct 2021 14:08:36 +0200 Subject: [PATCH 1145/5344] s390: fix strrchr() implementation commit 8e0ab8e26b72a80e991c66a8abc16e6c856abe3d upstream. Fix two problems found in the strrchr() implementation for s390 architectures: evaluate empty strings (return the string address instead of NULL, if '\0' is passed as second argument); evaluate the first character of non-empty strings (the current implementation stops at the second). Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: stable@vger.kernel.org Reported-by: Heiko Carstens (incorrect behavior with empty strings) Signed-off-by: Roberto Sassu Link: https://lore.kernel.org/r/20211005120836.60630-1-roberto.sassu@huawei.com Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik Signed-off-by: Greg Kroah-Hartman --- arch/s390/lib/string.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/s390/lib/string.c b/arch/s390/lib/string.c index 0e30e6e43b0c5..18fbbb679851d 100644 --- a/arch/s390/lib/string.c +++ b/arch/s390/lib/string.c @@ -246,14 +246,13 @@ EXPORT_SYMBOL(strcmp); #ifdef __HAVE_ARCH_STRRCHR char *strrchr(const char *s, int c) { - size_t len = __strend(s) - s; - - if (len) - do { - if (s[len] == (char) c) - return (char *) s + len; - } while (--len > 0); - return NULL; + ssize_t len = __strend(s) - s; + + do { + if (s[len] == (char)c) + return (char *)s + len; + } while (--len >= 0); + return NULL; } EXPORT_SYMBOL(strrchr); #endif From 7da95c770fbf917d4292f27b5427295ddfbe976c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 24 Sep 2021 00:35:42 +0000 Subject: [PATCH 1146/5344] csky: don't let sigreturn play with priveleged bits of status register commit fbd63c08cdcca5fb1315aca3172b3c9c272cfb4f upstream. csky restore_sigcontext() blindly overwrites regs->sr with the value it finds in sigcontext. Attacker can store whatever they want in there, which includes things like S-bit. Userland shouldn't be able to set that, or anything other than C flag (bit 0). Do the same thing other architectures with protected bits in flags register do - preserve everything that shouldn't be settable in user mode, picking the rest from the value saved is sigcontext. Signed-off-by: Al Viro Signed-off-by: Guo Ren Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/csky/kernel/signal.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/csky/kernel/signal.c b/arch/csky/kernel/signal.c index d44f5fe0731d4..fb394f5f83819 100644 --- a/arch/csky/kernel/signal.c +++ b/arch/csky/kernel/signal.c @@ -52,10 +52,14 @@ static long restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) { int err = 0; + unsigned long sr = regs->sr; /* sc_pt_regs is structured the same as the start of pt_regs */ err |= __copy_from_user(regs, &sc->sc_pt_regs, sizeof(struct pt_regs)); + /* BIT(0) of regs->sr is Condition Code/Carry bit */ + regs->sr = (sr & ~1) | (regs->sr & 1); + /* Restore the floating-point state. */ err |= restore_fpu_state(sc); From 1a88f546486f3bfd6fd4d0d8a5441bbc2270bd1e Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Fri, 24 Sep 2021 15:33:38 +0800 Subject: [PATCH 1147/5344] csky: Fixup regs.sr broken in ptrace commit af89ebaa64de726ca0a39bbb0bf0c81a1f43ad50 upstream. gpr_get() return the entire pt_regs (include sr) to userspace, if we don't restore the C bit in gpr_set, it may break the ALU result in that context. So the C flag bit is part of gpr context, that's why riscv totally remove the C bit in the ISA. That makes sr reg clear from userspace to supervisor privilege. Signed-off-by: Guo Ren Cc: Al Viro Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/csky/kernel/ptrace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/csky/kernel/ptrace.c b/arch/csky/kernel/ptrace.c index 313623a19ecbf..885194720fe0b 100644 --- a/arch/csky/kernel/ptrace.c +++ b/arch/csky/kernel/ptrace.c @@ -96,7 +96,8 @@ static int gpr_set(struct task_struct *target, if (ret) return ret; - regs.sr = task_pt_regs(target)->sr; + /* BIT(0) of regs.sr is Condition Code/Carry bit */ + regs.sr = (regs.sr & BIT(0)) | (task_pt_regs(target)->sr & ~BIT(0)); #ifdef CONFIG_CPU_HAS_HILO regs.dcsr = task_pt_regs(target)->dcsr; #endif From a3559b98bf681c0d169908711135c4a04ee0810e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 14 Sep 2021 14:57:59 +0800 Subject: [PATCH 1148/5344] btrfs: unlock newly allocated extent buffer after error commit 19ea40dddf1833db868533958ca066f368862211 upstream. [BUG] There is a bug report that injected ENOMEM error could leave a tree block locked while we return to user-space: BTRFS info (device loop0): enabling ssd optimizations FAULT_INJECTION: forcing a failure. name failslab, interval 1, probability 0, space 0, times 0 CPU: 0 PID: 7579 Comm: syz-executor Not tainted 5.15.0-rc1 #16 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x8d/0xcf lib/dump_stack.c:106 fail_dump lib/fault-inject.c:52 [inline] should_fail+0x13c/0x160 lib/fault-inject.c:146 should_failslab+0x5/0x10 mm/slab_common.c:1328 slab_pre_alloc_hook.constprop.99+0x4e/0xc0 mm/slab.h:494 slab_alloc_node mm/slub.c:3120 [inline] slab_alloc mm/slub.c:3214 [inline] kmem_cache_alloc+0x44/0x280 mm/slub.c:3219 btrfs_alloc_delayed_extent_op fs/btrfs/delayed-ref.h:299 [inline] btrfs_alloc_tree_block+0x38c/0x670 fs/btrfs/extent-tree.c:4833 __btrfs_cow_block+0x16f/0x7d0 fs/btrfs/ctree.c:415 btrfs_cow_block+0x12a/0x300 fs/btrfs/ctree.c:570 btrfs_search_slot+0x6b0/0xee0 fs/btrfs/ctree.c:1768 btrfs_insert_empty_items+0x80/0xf0 fs/btrfs/ctree.c:3905 btrfs_new_inode+0x311/0xa60 fs/btrfs/inode.c:6530 btrfs_create+0x12b/0x270 fs/btrfs/inode.c:6783 lookup_open+0x660/0x780 fs/namei.c:3282 open_last_lookups fs/namei.c:3352 [inline] path_openat+0x465/0xe20 fs/namei.c:3557 do_filp_open+0xe3/0x170 fs/namei.c:3588 do_sys_openat2+0x357/0x4a0 fs/open.c:1200 do_sys_open+0x87/0xd0 fs/open.c:1216 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x34/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x46ae99 Code: f7 d8 64 89 02 b8 ff ff ff ff c3 66 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 bc ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f46711b9c48 EFLAGS: 00000246 ORIG_RAX: 0000000000000055 RAX: ffffffffffffffda RBX: 000000000078c0a0 RCX: 000000000046ae99 RDX: 0000000000000000 RSI: 00000000000000a1 RDI: 0000000020005800 RBP: 00007f46711b9c80 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000017 R13: 0000000000000000 R14: 000000000078c0a0 R15: 00007ffc129da6e0 ================================================ WARNING: lock held when returning to user space! 5.15.0-rc1 #16 Not tainted ------------------------------------------------ syz-executor/7579 is leaving the kernel with locks still held! 1 lock held by syz-executor/7579: #0: ffff888104b73da8 (btrfs-tree-01/1){+.+.}-{3:3}, at: __btrfs_tree_lock+0x2e/0x1a0 fs/btrfs/locking.c:112 [CAUSE] In btrfs_alloc_tree_block(), after btrfs_init_new_buffer(), the new extent buffer @buf is locked, but if later operations like adding delayed tree ref fail, we just free @buf without unlocking it, resulting above warning. [FIX] Unlock @buf in out_free_buf: label. Reported-by: Hao Sun Link: https://lore.kernel.org/linux-btrfs/CACkBjsZ9O6Zr0KK1yGn=1rQi6Crh1yeCRdTSBxx9R99L4xdn-Q@mail.gmail.com/ CC: stable@vger.kernel.org # 5.4+ Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent-tree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5273965226534..19d2104c04629 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4596,6 +4596,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, out_free_delayed: btrfs_free_delayed_extent_op(extent_op); out_free_buf: + btrfs_tree_unlock(buf); free_extent_buffer(buf); out_free_reserved: btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); From 1e2e87720bb6d24b720ba4c02ea0738c66fb657d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 1 Oct 2021 13:52:31 +0100 Subject: [PATCH 1149/5344] btrfs: deal with errors when replaying dir entry during log replay commit e15ac6413745e3def00e663de00aea5a717311c1 upstream. At replay_one_one(), we are treating any error returned from btrfs_lookup_dir_item() or from btrfs_lookup_dir_index_item() as meaning that there is no existing directory entry in the fs/subvolume tree. This is not correct since we can get errors such as, for example, -EIO when reading extent buffers while searching the fs/subvolume's btree. So fix that and return the error to the caller when it is not -ENOENT. CC: stable@vger.kernel.org # 4.14+ Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/tree-log.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 8ea4b3da85d1a..31d12d41e766e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1985,7 +1985,14 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, ret = -EINVAL; goto out; } - if (IS_ERR_OR_NULL(dst_di)) { + + if (dst_di == ERR_PTR(-ENOENT)) + dst_di = NULL; + + if (IS_ERR(dst_di)) { + ret = PTR_ERR(dst_di); + goto out; + } else if (!dst_di) { /* we need a sequence number to insert, so we only * do inserts for the BTRFS_DIR_INDEX_KEY types */ From 720c1ddf2bcd9779bc720f5de8f3560d49931554 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 1 Oct 2021 13:52:32 +0100 Subject: [PATCH 1150/5344] btrfs: deal with errors when adding inode reference during log replay commit 52db77791fe24538c8aa2a183248399715f6b380 upstream. At __inode_add_ref(), we treating any error returned from btrfs_lookup_dir_item() or from btrfs_lookup_dir_index_item() as meaning that there is no existing directory entry in the fs/subvolume tree. This is not correct since we can get errors such as, for example, -EIO when reading extent buffers while searching the fs/subvolume's btree. So fix that and return the error to the caller when it is not -ENOENT. CC: stable@vger.kernel.org # 4.14+ Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/tree-log.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 31d12d41e766e..c5251770a8d83 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1160,7 +1160,10 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, /* look for a conflicting sequence number */ di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), ref_index, name, namelen, 0); - if (di && !IS_ERR(di)) { + if (IS_ERR(di)) { + if (PTR_ERR(di) != -ENOENT) + return PTR_ERR(di); + } else if (di) { ret = drop_one_dir_item(trans, root, path, dir, di); if (ret) return ret; @@ -1170,7 +1173,9 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, /* look for a conflicting name */ di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, namelen, 0); - if (di && !IS_ERR(di)) { + if (IS_ERR(di)) { + return PTR_ERR(di); + } else if (di) { ret = drop_one_dir_item(trans, root, path, dir, di); if (ret) return ret; From d43f6a1bd63b0f825877ca4d3e1c203b42975af6 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 1 Oct 2021 13:48:18 +0100 Subject: [PATCH 1151/5344] btrfs: check for error when looking up inode during dir entry replay commit cfd312695b71df04c3a2597859ff12c470d1e2e4 upstream. At replay_one_name(), we are treating any error from btrfs_lookup_inode() as if the inode does not exists. Fix this by checking for an error and returning it to the caller. CC: stable@vger.kernel.org # 4.14+ Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/tree-log.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c5251770a8d83..9d358dafef367 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1949,8 +1949,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, struct btrfs_key log_key; struct inode *dir; u8 log_type; - int exists; - int ret = 0; + bool exists; + int ret; bool update_size = (key->type == BTRFS_DIR_INDEX_KEY); bool name_added = false; @@ -1970,12 +1970,12 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, name_len); btrfs_dir_item_key_to_cpu(eb, di, &log_key); - exists = btrfs_lookup_inode(trans, root, path, &log_key, 0); - if (exists == 0) - exists = 1; - else - exists = 0; + ret = btrfs_lookup_inode(trans, root, path, &log_key, 0); btrfs_release_path(path); + if (ret < 0) + goto out; + exists = (ret == 0); + ret = 0; if (key->type == BTRFS_DIR_ITEM_KEY) { dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, From 711d171f4a5e389c216cac9d1685fffd16f46bd6 Mon Sep 17 00:00:00 2001 From: Chris Packham Date: Fri, 13 Mar 2020 16:13:12 +1300 Subject: [PATCH 1152/5344] watchdog: orion: use 0 for unset heartbeat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit bb914088bd8a91c382f54d469367b2e5508b5493 upstream. If the heartbeat module param is not specified we would get an error message watchdog: f1020300.watchdog: driver supplied timeout (4294967295) out of range watchdog: f1020300.watchdog: falling back to default timeout (171) This is because we were initialising heartbeat to -1. By removing the initialisation (thus letting the C run time initialise it to 0) we silence the warning message and the default timeout is still used. Signed-off-by: Chris Packham Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20200313031312.1485-1-chris.packham@alliedtelesis.co.nz Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck Signed-off-by: Marek Behún Signed-off-by: Greg Kroah-Hartman --- drivers/watchdog/orion_wdt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/watchdog/orion_wdt.c b/drivers/watchdog/orion_wdt.c index 8e6dfe76f9c9d..4ddb4ea2e4a32 100644 --- a/drivers/watchdog/orion_wdt.c +++ b/drivers/watchdog/orion_wdt.c @@ -52,7 +52,7 @@ #define WDT_A370_RATIO (1 << WDT_A370_RATIO_SHIFT) static bool nowayout = WATCHDOG_NOWAYOUT; -static int heartbeat = -1; /* module parameter (seconds) */ +static int heartbeat; /* module parameter (seconds) */ struct orion_watchdog; From 2c00092bf57d1711df73ab8b1b81fa67a80b36e9 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 1 Oct 2021 20:36:44 +0300 Subject: [PATCH 1153/5344] mei: me: add Ice Lake-N device id. commit 75c10c5e7a715550afdd51ef8cfd1d975f48f9e1 upstream. Add Ice Lake-N device ID. The device can be found on MacBookPro16,2 [1]. [1]: https://linux-hardware.org/?probe=f1c5cf0c43 Signed-off-by: Andy Shevchenko Cc: stable Link: https://lore.kernel.org/r/20211001173644.16068-1-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/hw-me-regs.h | 1 + drivers/misc/mei/pci-me.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/misc/mei/hw-me-regs.h b/drivers/misc/mei/hw-me-regs.h index e56dc47540646..556133daa25e9 100644 --- a/drivers/misc/mei/hw-me-regs.h +++ b/drivers/misc/mei/hw-me-regs.h @@ -90,6 +90,7 @@ #define MEI_DEV_ID_CDF 0x18D3 /* Cedar Fork */ #define MEI_DEV_ID_ICP_LP 0x34E0 /* Ice Lake Point LP */ +#define MEI_DEV_ID_ICP_N 0x38E0 /* Ice Lake Point N */ #define MEI_DEV_ID_TGP_LP 0xA0E0 /* Tiger Lake Point LP */ diff --git a/drivers/misc/mei/pci-me.c b/drivers/misc/mei/pci-me.c index 75ab2ffbf235f..d7233f2238651 100644 --- a/drivers/misc/mei/pci-me.c +++ b/drivers/misc/mei/pci-me.c @@ -103,6 +103,7 @@ static const struct pci_device_id mei_me_pci_tbl[] = { {MEI_PCI_DEVICE(MEI_DEV_ID_CMP_H_3, MEI_ME_PCH8_CFG)}, {MEI_PCI_DEVICE(MEI_DEV_ID_ICP_LP, MEI_ME_PCH12_CFG)}, + {MEI_PCI_DEVICE(MEI_DEV_ID_ICP_N, MEI_ME_PCH12_CFG)}, {MEI_PCI_DEVICE(MEI_DEV_ID_TGP_LP, MEI_ME_PCH12_CFG)}, From eee2c48c94cb26284becfef014ffa60d2a292191 Mon Sep 17 00:00:00 2001 From: Jonathan Bell Date: Fri, 8 Oct 2021 12:25:43 +0300 Subject: [PATCH 1154/5344] xhci: guard accesses to ep_state in xhci_endpoint_reset() commit a01ba2a3378be85538e0183ae5367c1bc1d5aaf3 upstream. See https://github.com/raspberrypi/linux/issues/3981 Two read-modify-write cycles on ep->ep_state are not guarded by xhci->lock. Fix these. Fixes: f5249461b504 ("xhci: Clear the host side toggle manually when endpoint is soft reset") Cc: stable@vger.kernel.org Signed-off-by: Jonathan Bell Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20211008092547.3996295-2-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index 02a2afd130eb6..4ef7484dff8b2 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -3173,10 +3173,13 @@ static void xhci_endpoint_reset(struct usb_hcd *hcd, return; /* Bail out if toggle is already being cleared by a endpoint reset */ + spin_lock_irqsave(&xhci->lock, flags); if (ep->ep_state & EP_HARD_CLEAR_TOGGLE) { ep->ep_state &= ~EP_HARD_CLEAR_TOGGLE; + spin_unlock_irqrestore(&xhci->lock, flags); return; } + spin_unlock_irqrestore(&xhci->lock, flags); /* Only interrupt and bulk ep's use data toggle, USB2 spec 5.5.4-> */ if (usb_endpoint_xfer_control(&host_ep->desc) || usb_endpoint_xfer_isoc(&host_ep->desc)) @@ -3262,8 +3265,10 @@ static void xhci_endpoint_reset(struct usb_hcd *hcd, xhci_free_command(xhci, cfg_cmd); cleanup: xhci_free_command(xhci, stop_cmd); + spin_lock_irqsave(&xhci->lock, flags); if (ep->ep_state & EP_SOFT_CLEAR_TOGGLE) ep->ep_state &= ~EP_SOFT_CLEAR_TOGGLE; + spin_unlock_irqrestore(&xhci->lock, flags); } static int xhci_check_streams_endpoint(struct xhci_hcd *xhci, From 6b11b6d834dd612c5ab597cbb257d7c796f8e817 Mon Sep 17 00:00:00 2001 From: Pavankumar Kondeti Date: Fri, 8 Oct 2021 12:25:46 +0300 Subject: [PATCH 1155/5344] xhci: Fix command ring pointer corruption while aborting a command commit ff0e50d3564f33b7f4b35cadeabd951d66cfc570 upstream. The command ring pointer is located at [6:63] bits of the command ring control register (CRCR). All the control bits like command stop, abort are located at [0:3] bits. While aborting a command, we read the CRCR and set the abort bit and write to the CRCR. The read will always give command ring pointer as all zeros. So we essentially write only the control bits. Since we split the 64 bit write into two 32 bit writes, there is a possibility of xHC command ring stopped before the upper dword (all zeros) is written. If that happens, xHC updates the upper dword of its internal command ring pointer with all zeros. Next time, when the command ring is restarted, we see xHC memory access failures. Fix this issue by only writing to the lower dword of CRCR where all control bits are located. Cc: stable@vger.kernel.org Signed-off-by: Pavankumar Kondeti Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20211008092547.3996295-5-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-ring.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index 1228b3d92db06..2c92fd19e7822 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -339,16 +339,22 @@ static void xhci_handle_stopped_cmd_ring(struct xhci_hcd *xhci, /* Must be called with xhci->lock held, releases and aquires lock back */ static int xhci_abort_cmd_ring(struct xhci_hcd *xhci, unsigned long flags) { - u64 temp_64; + u32 temp_32; int ret; xhci_dbg(xhci, "Abort command ring\n"); reinit_completion(&xhci->cmd_ring_stop_completion); - temp_64 = xhci_read_64(xhci, &xhci->op_regs->cmd_ring); - xhci_write_64(xhci, temp_64 | CMD_RING_ABORT, - &xhci->op_regs->cmd_ring); + /* + * The control bits like command stop, abort are located in lower + * dword of the command ring control register. Limit the write + * to the lower dword to avoid corrupting the command ring pointer + * in case if the command ring is stopped by the time upper dword + * is written. + */ + temp_32 = readl(&xhci->op_regs->cmd_ring); + writel(temp_32 | CMD_RING_ABORT, &xhci->op_regs->cmd_ring); /* Section 4.6.1.2 of xHCI 1.0 spec says software should also time the * completion of the Command Abort operation. If CRR is not negated in 5 From bf8c503b7e16524134f0c708fb42d50e98dcb42b Mon Sep 17 00:00:00 2001 From: Nikolay Martynov Date: Fri, 8 Oct 2021 12:25:47 +0300 Subject: [PATCH 1156/5344] xhci: Enable trust tx length quirk for Fresco FL11 USB controller commit ea0f69d8211963c4b2cc1998b86779a500adb502 upstream. Tested on SD5200T TB3 dock which has Fresco Logic FL1100 USB 3.0 Host Controller. Before this patch streaming video from USB cam made mouse and keyboard connected to the same USB bus unusable. Also video was jerky. With this patch streaming video doesn't have any effect on other periferals and video is smooth. Cc: stable@vger.kernel.org Signed-off-by: Nikolay Martynov Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20211008092547.3996295-6-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index d242779297ba7..ae8fa4ff05ee3 100644 --- a/drivers/usb/host/xhci-pci.c +++ b/drivers/usb/host/xhci-pci.c @@ -28,6 +28,7 @@ #define PCI_VENDOR_ID_FRESCO_LOGIC 0x1b73 #define PCI_DEVICE_ID_FRESCO_LOGIC_PDK 0x1000 #define PCI_DEVICE_ID_FRESCO_LOGIC_FL1009 0x1009 +#define PCI_DEVICE_ID_FRESCO_LOGIC_FL1100 0x1100 #define PCI_DEVICE_ID_FRESCO_LOGIC_FL1400 0x1400 #define PCI_VENDOR_ID_ETRON 0x1b6f @@ -98,6 +99,7 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci) /* Look for vendor-specific quirks */ if (pdev->vendor == PCI_VENDOR_ID_FRESCO_LOGIC && (pdev->device == PCI_DEVICE_ID_FRESCO_LOGIC_PDK || + pdev->device == PCI_DEVICE_ID_FRESCO_LOGIC_FL1100 || pdev->device == PCI_DEVICE_ID_FRESCO_LOGIC_FL1400)) { if (pdev->device == PCI_DEVICE_ID_FRESCO_LOGIC_PDK && pdev->revision == 0x0) { From 09114c680d27c5da1645d7b3953c7996df534d76 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 27 Sep 2021 14:13:57 +0200 Subject: [PATCH 1157/5344] cb710: avoid NULL pointer subtraction commit 42641042c10c757fe10cc09088cf3f436cec5007 upstream. clang-14 complains about an unusual way of converting a pointer to an integer: drivers/misc/cb710/sgbuf2.c:50:15: error: performing pointer subtraction with a null pointer has undefined behavior [-Werror,-Wnull-pointer-subtraction] return ((ptr - NULL) & 3) != 0; Replace this with a normal cast to uintptr_t. Fixes: 5f5bac8272be ("mmc: Driver for CB710/720 memory card reader (MMC part)") Cc: stable Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20210927121408.939246-1-arnd@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/misc/cb710/sgbuf2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/cb710/sgbuf2.c b/drivers/misc/cb710/sgbuf2.c index dfd2969e36289..d52e079bb324e 100644 --- a/drivers/misc/cb710/sgbuf2.c +++ b/drivers/misc/cb710/sgbuf2.c @@ -47,7 +47,7 @@ static inline bool needs_unaligned_copy(const void *ptr) #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS return false; #else - return ((ptr - NULL) & 3) != 0; + return ((uintptr_t)ptr & 3) != 0; #endif } From a2e75c572a2557ef86e4e32bfc95ee8ea7a90255 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 1 Sep 2021 08:33:19 +0200 Subject: [PATCH 1158/5344] efi/cper: use stack buffer for error record decoding commit b3a72ca80351917cc23f9e24c35f3c3979d3c121 upstream. Joe reports that using a statically allocated buffer for converting CPER error records into human readable text is probably a bad idea. Even though we are not aware of any actual issues, a stack buffer is clearly a better choice here anyway, so let's move the buffer into the stack frames of the two functions that refer to it. Cc: Reported-by: Joe Perches Signed-off-by: Ard Biesheuvel Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/efi/cper.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c index e48298687b76d..6157038114a2a 100644 --- a/drivers/firmware/efi/cper.c +++ b/drivers/firmware/efi/cper.c @@ -25,8 +25,6 @@ #include #include -static char rcd_decode_str[CPER_REC_LEN]; - /* * CPER record ID need to be unique even after reboot, because record * ID is used as index for ERST storage, while CPER records from @@ -299,6 +297,7 @@ const char *cper_mem_err_unpack(struct trace_seq *p, struct cper_mem_err_compact *cmem) { const char *ret = trace_seq_buffer_ptr(p); + char rcd_decode_str[CPER_REC_LEN]; if (cper_mem_err_location(cmem, rcd_decode_str)) trace_seq_printf(p, "%s", rcd_decode_str); @@ -313,6 +312,7 @@ static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem, int len) { struct cper_mem_err_compact cmem; + char rcd_decode_str[CPER_REC_LEN]; /* Don't trust UEFI 2.1/2.2 structure with bad validation bits */ if (len == sizeof(struct cper_sec_mem_err_old) && From c31b9b35625f610a87e91f36bb7db73d15089ef4 Mon Sep 17 00:00:00 2001 From: Zhang Jianhua Date: Thu, 23 Sep 2021 10:53:40 +0800 Subject: [PATCH 1159/5344] efi: Change down_interruptible() in virt_efi_reset_system() to down_trylock() commit 38fa3206bf441911258e5001ac8b6738693f8d82 upstream. While reboot the system by sysrq, the following bug will be occur. BUG: sleeping function called from invalid context at kernel/locking/semaphore.c:90 in_atomic(): 0, irqs_disabled(): 128, non_block: 0, pid: 10052, name: rc.shutdown CPU: 3 PID: 10052 Comm: rc.shutdown Tainted: G W O 5.10.0 #1 Call trace: dump_backtrace+0x0/0x1c8 show_stack+0x18/0x28 dump_stack+0xd0/0x110 ___might_sleep+0x14c/0x160 __might_sleep+0x74/0x88 down_interruptible+0x40/0x118 virt_efi_reset_system+0x3c/0xd0 efi_reboot+0xd4/0x11c machine_restart+0x60/0x9c emergency_restart+0x1c/0x2c sysrq_handle_reboot+0x1c/0x2c __handle_sysrq+0xd0/0x194 write_sysrq_trigger+0xbc/0xe4 proc_reg_write+0xd4/0xf0 vfs_write+0xa8/0x148 ksys_write+0x6c/0xd8 __arm64_sys_write+0x18/0x28 el0_svc_common.constprop.3+0xe4/0x16c do_el0_svc+0x1c/0x2c el0_svc+0x20/0x30 el0_sync_handler+0x80/0x17c el0_sync+0x158/0x180 The reason for this problem is that irq has been disabled in machine_restart() and then it calls down_interruptible() in virt_efi_reset_system(), which would occur sleep in irq context, it is dangerous! Commit 99409b935c9a("locking/semaphore: Add might_sleep() to down_*() family") add might_sleep() in down_interruptible(), so the bug info is here. down_trylock() can solve this problem, cause there is no might_sleep. -------- Cc: Signed-off-by: Zhang Jianhua Signed-off-by: Ard Biesheuvel Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/efi/runtime-wrappers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/efi/runtime-wrappers.c b/drivers/firmware/efi/runtime-wrappers.c index 65fffaa222108..136b9c7f9ac95 100644 --- a/drivers/firmware/efi/runtime-wrappers.c +++ b/drivers/firmware/efi/runtime-wrappers.c @@ -414,7 +414,7 @@ static void virt_efi_reset_system(int reset_type, unsigned long data_size, efi_char16_t *data) { - if (down_interruptible(&efi_runtime_lock)) { + if (down_trylock(&efi_runtime_lock)) { pr_warn("failed to invoke the reset_system() runtime service:\n" "could not get exclusive access to the firmware\n"); return; From a6a55f373c6faf40708ba8632441c0b516fe2fb3 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 6 Oct 2021 00:16:31 +0200 Subject: [PATCH 1160/5344] usb: musb: dsps: Fix the probe error path commit c2115b2b16421d93d4993f3fe4c520e91d6fe801 upstream. Commit 7c75bde329d7 ("usb: musb: musb_dsps: request_irq() after initializing musb") has inverted the calls to dsps_setup_optional_vbus_irq() and dsps_create_musb_pdev() without updating correctly the error path. dsps_create_musb_pdev() allocates and registers a new platform device which must be unregistered and freed with platform_device_unregister(), and this is missing upon dsps_setup_optional_vbus_irq() error. While on the master branch it seems not to trigger any issue, I observed a kernel crash because of a NULL pointer dereference with a v5.10.70 stable kernel where the patch mentioned above was backported. With this kernel version, -EPROBE_DEFER is returned the first time dsps_setup_optional_vbus_irq() is called which triggers the probe to error out without unregistering the platform device. Unfortunately, on the Beagle Bone Black Wireless, the platform device still living in the system is being used by the USB Ethernet gadget driver, which during the boot phase triggers the crash. My limited knowledge of the musb world prevents me to revert this commit which was sent to silence a robot warning which, as far as I understand, does not make sense. The goal of this patch was to prevent an IRQ to fire before the platform device being registered. I think this cannot ever happen due to the fact that enabling the interrupts is done by the ->enable() callback of the platform musb device, and this platform device must be already registered in order for the core or any other user to use this callback. Hence, I decided to fix the error path, which might prevent future errors on mainline kernels while also fixing older ones. Fixes: 7c75bde329d7 ("usb: musb: musb_dsps: request_irq() after initializing musb") Cc: stable@vger.kernel.org Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/r/20211005221631.1529448-1-miquel.raynal@bootlin.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/musb/musb_dsps.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/musb/musb_dsps.c b/drivers/usb/musb/musb_dsps.c index 89d659cef5c63..1b9ebd756f097 100644 --- a/drivers/usb/musb/musb_dsps.c +++ b/drivers/usb/musb/musb_dsps.c @@ -899,11 +899,13 @@ static int dsps_probe(struct platform_device *pdev) if (usb_get_dr_mode(&pdev->dev) == USB_DR_MODE_PERIPHERAL) { ret = dsps_setup_optional_vbus_irq(pdev, glue); if (ret) - goto err; + goto unregister_pdev; } return 0; +unregister_pdev: + platform_device_unregister(glue->musb); err: pm_runtime_disable(&pdev->dev); iounmap(glue->usbss_base); From d3e06b685329c801b07d019244e653ed93841242 Mon Sep 17 00:00:00 2001 From: Michael Cullen Date: Fri, 15 Oct 2021 13:17:50 -0700 Subject: [PATCH 1161/5344] Input: xpad - add support for another USB ID of Nacon GC-100 commit 3378a07daa6cdd11e042797454c706d1c69f9ca6 upstream. The Nacon GX100XF is already mapped, but it seems there is a Nacon GC-100 (identified as NC5136Wht PCGC-100WHITE though I believe other colours exist) with a different USB ID when in XInput mode. Signed-off-by: Michael Cullen Link: https://lore.kernel.org/r/20211015192051.5196-1-michael@michaelcullen.name Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/joystick/xpad.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index e5f1e3cf9179f..ba101afcfc27f 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -331,6 +331,7 @@ static const struct xpad_device { { 0x24c6, 0x5b03, "Thrustmaster Ferrari 458 Racing Wheel", 0, XTYPE_XBOX360 }, { 0x24c6, 0x5d04, "Razer Sabertooth", 0, XTYPE_XBOX360 }, { 0x24c6, 0xfafe, "Rock Candy Gamepad for Xbox 360", 0, XTYPE_XBOX360 }, + { 0x3285, 0x0607, "Nacon GC-100", 0, XTYPE_XBOX360 }, { 0x3767, 0x0101, "Fanatec Speedster 3 Forceshock Wheel", 0, XTYPE_XBOX }, { 0xffff, 0xffff, "Chinese-made Xbox Controller", 0, XTYPE_XBOX }, { 0x0000, 0x0000, "Generic X-Box pad", 0, XTYPE_UNKNOWN } @@ -447,6 +448,7 @@ static const struct usb_device_id xpad_table[] = { XPAD_XBOXONE_VENDOR(0x24c6), /* PowerA Controllers */ XPAD_XBOXONE_VENDOR(0x2e24), /* Hyperkin Duke X-Box One pad */ XPAD_XBOX360_VENDOR(0x2f24), /* GameSir Controllers */ + XPAD_XBOX360_VENDOR(0x3285), /* Nacon GC-100 */ { } }; From 33e66f417c3a38c481a4cd9e1c4201d7c48102c1 Mon Sep 17 00:00:00 2001 From: Aleksander Morgado Date: Thu, 7 Oct 2021 14:25:01 +0200 Subject: [PATCH 1162/5344] USB: serial: qcserial: add EM9191 QDL support commit 11c52d250b34a0862edc29db03fbec23b30db6da upstream. When the module boots into QDL download mode it exposes the 1199:90d2 ids, which can be mapped to the qcserial driver, and used to run firmware upgrades (e.g. with the qmi-firmware-update program). T: Bus=01 Lev=03 Prnt=08 Port=03 Cnt=01 Dev#= 10 Spd=480 MxCh= 0 D: Ver= 2.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1199 ProdID=90d2 Rev=00.00 S: Manufacturer=Sierra Wireless, Incorporated S: Product=Sierra Wireless EM9191 S: SerialNumber=8W0382004102A109 C: #Ifs= 1 Cfg#= 1 Atr=a0 MxPwr=2mA I: If#=0x0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=10 Driver=qcserial Signed-off-by: Aleksander Morgado Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/qcserial.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/serial/qcserial.c b/drivers/usb/serial/qcserial.c index 0f60363c1bbc8..b1b9923162a04 100644 --- a/drivers/usb/serial/qcserial.c +++ b/drivers/usb/serial/qcserial.c @@ -165,6 +165,7 @@ static const struct usb_device_id id_table[] = { {DEVICE_SWI(0x1199, 0x907b)}, /* Sierra Wireless EM74xx */ {DEVICE_SWI(0x1199, 0x9090)}, /* Sierra Wireless EM7565 QDL */ {DEVICE_SWI(0x1199, 0x9091)}, /* Sierra Wireless EM7565 */ + {DEVICE_SWI(0x1199, 0x90d2)}, /* Sierra Wireless EM9191 QDL */ {DEVICE_SWI(0x413c, 0x81a2)}, /* Dell Wireless 5806 Gobi(TM) 4G LTE Mobile Broadband Card */ {DEVICE_SWI(0x413c, 0x81a3)}, /* Dell Wireless 5570 HSPA+ (42Mbps) Mobile Broadband Card */ {DEVICE_SWI(0x413c, 0x81a4)}, /* Dell Wireless 5570e HSPA+ (42Mbps) Mobile Broadband Card */ From e5f638d7ad5b4c97bc2cae191cff8129b8ba0aa1 Mon Sep 17 00:00:00 2001 From: Yu-Tung Chang Date: Thu, 30 Sep 2021 10:11:12 +0800 Subject: [PATCH 1163/5344] USB: serial: option: add Quectel EC200S-CN module support commit 2263eb7370060bdb0013bc14e1a7c9bf33617a55 upstream. Add usb product id of the Quectel EC200S-CN module. usb-devices output for 0x6002: T: Bus=01 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 3 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=ef(misc ) Sub=02 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=2c7c ProdID=6002 Rev=03.18 S: Manufacturer=Android S: Product=Android S: SerialNumber=0000 C: #Ifs= 5 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#=0x0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=06 Prot=00 Driver=cdc_ether I: If#=0x1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=cdc_ether I: If#=0x2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=(none) I: If#=0x3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=(none) I: If#=0x4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=(none) Signed-off-by: Yu-Tung Chang Link: https://lore.kernel.org/r/20210930021112.330396-1-mtwget@gmail.com Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/option.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 82016d9781460..39eafa49e0ac8 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -251,6 +251,7 @@ static void option_instat_callback(struct urb *urb); #define QUECTEL_PRODUCT_EP06 0x0306 #define QUECTEL_PRODUCT_EM12 0x0512 #define QUECTEL_PRODUCT_RM500Q 0x0800 +#define QUECTEL_PRODUCT_EC200S_CN 0x6002 #define QUECTEL_PRODUCT_EC200T 0x6026 #define CMOTECH_VENDOR_ID 0x16d8 @@ -1128,6 +1129,7 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RM500Q, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RM500Q, 0xff, 0xff, 0x10), .driver_info = ZLP }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EC200S_CN, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EC200T, 0xff, 0, 0) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_6001) }, From 00c0e6d8a99b44e2b30a46cab7756b86dab6ebff Mon Sep 17 00:00:00 2001 From: Daniele Palmas Date: Mon, 4 Oct 2021 12:56:55 +0200 Subject: [PATCH 1164/5344] USB: serial: option: add Telit LE910Cx composition 0x1204 commit f5a8a07edafed8bede17a95ef8940fe3a57a77d5 upstream. Add the following Telit LE910Cx composition: 0x1204: tty, adb, mbim, tty, tty, tty, tty Signed-off-by: Daniele Palmas Link: https://lore.kernel.org/r/20211004105655.8515-1-dnlplm@gmail.com Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/option.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 39eafa49e0ac8..b6df428c3b4f3 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1229,6 +1229,8 @@ static const struct usb_device_id option_ids[] = { .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1203, 0xff), /* Telit LE910Cx (RNDIS) */ .driver_info = NCTRL(2) | RSVD(3) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1204, 0xff), /* Telit LE910Cx (MBIM) */ + .driver_info = NCTRL(0) | RSVD(1) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910_USBCFG4), .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) | RSVD(3) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920), From c4e4d854fe9651ebe608fb7f1d902cb2db3ba487 Mon Sep 17 00:00:00 2001 From: Tomaz Solc Date: Wed, 6 Oct 2021 14:57:50 +0200 Subject: [PATCH 1165/5344] USB: serial: option: add prod. id for Quectel EG91 commit c184accc4a42c7872dc8e8d0fc97a740dc61fe24 upstream. Adding support for Quectel EG91 LTE module. The interface layout is same as for EG95. usb-devices output: T: Bus=01 Lev=02 Prnt=02 Port=00 Cnt=01 Dev#= 3 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=ef(misc ) Sub=02 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=2c7c ProdID=0191 Rev=03.18 S: Manufacturer=Android S: Product=Android C: #Ifs= 5 Cfg#= 1 Atr=a0 MxPwr=500mA I: If#=0x0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=(none) I: If#=0x1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=(none) I: If#=0x2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=(none) I: If#=0x3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=(none) I: If#=0x4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan Interfaces: 0: Diag 1: GNSS 2: AT-command interface/modem 3: Modem 4: QMI Signed-off-by: Tomaz Solc Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/option.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index b6df428c3b4f3..a1e9cbe518c74 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -246,6 +246,7 @@ static void option_instat_callback(struct urb *urb); /* These Quectel products use Quectel's vendor ID */ #define QUECTEL_PRODUCT_EC21 0x0121 #define QUECTEL_PRODUCT_EC25 0x0125 +#define QUECTEL_PRODUCT_EG91 0x0191 #define QUECTEL_PRODUCT_EG95 0x0195 #define QUECTEL_PRODUCT_BG96 0x0296 #define QUECTEL_PRODUCT_EP06 0x0306 @@ -1112,6 +1113,9 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EC25, 0xff, 0xff, 0xff), .driver_info = NUMEP2 }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EC25, 0xff, 0, 0) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EG91, 0xff, 0xff, 0xff), + .driver_info = NUMEP2 }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EG91, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EG95, 0xff, 0xff, 0xff), .driver_info = NUMEP2 }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EG95, 0xff, 0, 0) }, From a699107c7225cb595797deeb02baeab483113ca8 Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Mon, 11 Oct 2021 07:39:21 +0200 Subject: [PATCH 1166/5344] virtio: write back F_VERSION_1 before validate commit 2f9a174f918e29608564c7a4e8329893ab604fb4 upstream. The virtio specification virtio-v1.1-cs01 states: "Transitional devices MUST detect Legacy drivers by detecting that VIRTIO_F_VERSION_1 has not been acknowledged by the driver." This is exactly what QEMU as of 6.1 has done relying solely on VIRTIO_F_VERSION_1 for detecting that. However, the specification also says: "... the driver MAY read (but MUST NOT write) the device-specific configuration fields to check that it can support the device ..." before setting FEATURES_OK. In that case, any transitional device relying solely on VIRTIO_F_VERSION_1 for detecting legacy drivers will return data in legacy format. In particular, this implies that it is in big endian format for big endian guests. This naturally confuses the driver which expects little endian in the modern mode. It is probably a good idea to amend the spec to clarify that VIRTIO_F_VERSION_1 can only be relied on after the feature negotiation is complete. Before validate callback existed, config space was only read after FEATURES_OK. However, we already have two regressions, so let's address this here as well. The regressions affect the VIRTIO_NET_F_MTU feature of virtio-net and the VIRTIO_BLK_F_BLK_SIZE feature of virtio-blk for BE guests when virtio 1.0 is used on both sides. The latter renders virtio-blk unusable with DASD backing, because things simply don't work with the default. See Fixes tags for relevant commits. For QEMU, we can work around the issue by writing out the feature bits with VIRTIO_F_VERSION_1 bit set. We (ab)use the finalize_features config op for this. This isn't enough to address all vhost devices since these do not get the features until FEATURES_OK, however it looks like the affected devices actually never handled the endianness for legacy mode correctly, so at least that's not a regression. No devices except virtio net and virtio blk seem to be affected. Long term the right thing to do is to fix the hypervisors. Cc: #v4.11 Signed-off-by: Halil Pasic Fixes: 82e89ea077b9 ("virtio-blk: Add validation for block size in config space") Fixes: fe36cbe0671e ("virtio_net: clear MTU when out of range") Reported-by: markver@us.ibm.com Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/20211011053921.1198936-1-pasic@linux.ibm.com Signed-off-by: Michael S. Tsirkin Signed-off-by: Greg Kroah-Hartman --- drivers/virtio/virtio.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index 59a05f1b81054..91627e7443260 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -225,6 +225,17 @@ static int virtio_dev_probe(struct device *_d) driver_features_legacy = driver_features; } + /* + * Some devices detect legacy solely via F_VERSION_1. Write + * F_VERSION_1 to force LE config space accesses before FEATURES_OK for + * these when needed. + */ + if (drv->validate && !virtio_legacy_is_little_endian() + && device_features & BIT_ULL(VIRTIO_F_VERSION_1)) { + dev->features = BIT_ULL(VIRTIO_F_VERSION_1); + dev->config->finalize_features(dev); + } + if (device_features & (1ULL << VIRTIO_F_VERSION_1)) dev->features = driver_features & device_features; else From e196abe26a7497e2265da1abe904d7c500cfad38 Mon Sep 17 00:00:00 2001 From: Hans Potsch Date: Wed, 6 Oct 2021 14:13:32 +0200 Subject: [PATCH 1167/5344] EDAC/armada-xp: Fix output of uncorrectable error counter commit d9b7748ffc45250b4d7bcf22404383229bc495f5 upstream. The number of correctable errors is displayed as uncorrectable errors because the "SBE" error count is passed to both calls of edac_mc_handle_error(). Pass the correct uncorrectable error count to the second edac_mc_handle_error() call when logging uncorrectable errors. [ bp: Massage commit message. ] Fixes: 7f6998a41257 ("ARM: 8888/1: EDAC: Add driver for the Marvell Armada XP SDRAM and L2 cache ECC") Signed-off-by: Hans Potsch Signed-off-by: Borislav Petkov Cc: Link: https://lkml.kernel.org/r/20211006121332.58788-1-hans.potsch@nokia.com Signed-off-by: Greg Kroah-Hartman --- drivers/edac/armada_xp_edac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/edac/armada_xp_edac.c b/drivers/edac/armada_xp_edac.c index 7f227bdcbc845..8c63447154730 100644 --- a/drivers/edac/armada_xp_edac.c +++ b/drivers/edac/armada_xp_edac.c @@ -178,7 +178,7 @@ static void axp_mc_check(struct mem_ctl_info *mci) "details unavailable (multiple errors)"); if (cnt_dbe) edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, - cnt_sbe, /* error count */ + cnt_dbe, /* error count */ 0, 0, 0, /* pfn, offset, syndrome */ -1, -1, -1, /* top, mid, low layer */ mci->ctl_name, From ebd5538f9a4783d6f283fa56058b8b969a422ca9 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 13 Oct 2021 13:45:11 +0100 Subject: [PATCH 1168/5344] nvmem: Fix shift-out-of-bound (UBSAN) with byte size cells commit 5d388fa01fa6eb310ac023a363a6cb216d9d8fe9 upstream. If a cell has 'nbits' equal to a multiple of BITS_PER_BYTE the logic *p &= GENMASK((cell->nbits%BITS_PER_BYTE) - 1, 0); will become undefined behavior because nbits modulo BITS_PER_BYTE is 0, and we subtract one from that making a large number that is then shifted more than the number of bits that fit into an unsigned long. UBSAN reports this problem: UBSAN: shift-out-of-bounds in drivers/nvmem/core.c:1386:8 shift exponent 64 is too large for 64-bit type 'unsigned long' CPU: 6 PID: 7 Comm: kworker/u16:0 Not tainted 5.15.0-rc3+ #9 Hardware name: Google Lazor (rev3+) with KB Backlight (DT) Workqueue: events_unbound deferred_probe_work_func Call trace: dump_backtrace+0x0/0x170 show_stack+0x24/0x30 dump_stack_lvl+0x64/0x7c dump_stack+0x18/0x38 ubsan_epilogue+0x10/0x54 __ubsan_handle_shift_out_of_bounds+0x180/0x194 __nvmem_cell_read+0x1ec/0x21c nvmem_cell_read+0x58/0x94 nvmem_cell_read_variable_common+0x4c/0xb0 nvmem_cell_read_variable_le_u32+0x40/0x100 a6xx_gpu_init+0x170/0x2f4 adreno_bind+0x174/0x284 component_bind_all+0xf0/0x264 msm_drm_bind+0x1d8/0x7a0 try_to_bring_up_master+0x164/0x1ac __component_add+0xbc/0x13c component_add+0x20/0x2c dp_display_probe+0x340/0x384 platform_probe+0xc0/0x100 really_probe+0x110/0x304 __driver_probe_device+0xb8/0x120 driver_probe_device+0x4c/0xfc __device_attach_driver+0xb0/0x128 bus_for_each_drv+0x90/0xdc __device_attach+0xc8/0x174 device_initial_probe+0x20/0x2c bus_probe_device+0x40/0xa4 deferred_probe_work_func+0x7c/0xb8 process_one_work+0x128/0x21c process_scheduled_works+0x40/0x54 worker_thread+0x1ec/0x2a8 kthread+0x138/0x158 ret_from_fork+0x10/0x20 Fix it by making sure there are any bits to mask out. Fixes: 69aba7948cbe ("nvmem: Add a simple NVMEM framework for consumers") Cc: Douglas Anderson Cc: stable@vger.kernel.org Signed-off-by: Stephen Boyd Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20211013124511.18726-1-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Greg Kroah-Hartman --- drivers/nvmem/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c index 6da270e8c6746..c0f4324d8f7c8 100644 --- a/drivers/nvmem/core.c +++ b/drivers/nvmem/core.c @@ -954,7 +954,8 @@ static void nvmem_shift_read_buffer_in_place(struct nvmem_cell *cell, void *buf) *p-- = 0; /* clear msb bits if any leftover in the last byte */ - *p &= GENMASK((cell->nbits%BITS_PER_BYTE) - 1, 0); + if (cell->nbits % BITS_PER_BYTE) + *p &= GENMASK((cell->nbits % BITS_PER_BYTE) - 1, 0); } static int __nvmem_cell_read(struct nvmem_device *nvmem, From bb9b000af4ef06efba993a110110f73d4f0b78d2 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 6 Oct 2021 19:34:55 +0200 Subject: [PATCH 1169/5344] x86/Kconfig: Do not enable AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT automatically commit 711885906b5c2df90746a51f4cd674f1ab9fbb1d upstream. This Kconfig option was added initially so that memory encryption is enabled by default on machines which support it. However, devices which have DMA masks that are less than the bit position of the encryption bit, aka C-bit, require the use of an IOMMU or the use of SWIOTLB. If the IOMMU is disabled or in passthrough mode, the kernel would switch to SWIOTLB bounce-buffering for those transfers. In order to avoid that, 2cc13bb4f59f ("iommu: Disable passthrough mode when SME is active") disables the default IOMMU passthrough mode so that devices for which the default 256K DMA is insufficient, can use the IOMMU instead. However 2, there are cases where the IOMMU is disabled in the BIOS, etc. (think the usual hardware folk "oops, I dropped the ball there" cases) or a driver doesn't properly use the DMA APIs or a device has a firmware or hardware bug, e.g.: ea68573d408f ("drm/amdgpu: Fail to load on RAVEN if SME is active") However 3, in the above GPU use case, there are APIs like Vulkan and some OpenGL/OpenCL extensions which are under the assumption that user-allocated memory can be passed in to the kernel driver and both the GPU and CPU can do coherent and concurrent access to the same memory. That cannot work with SWIOTLB bounce buffers, of course. So, in order for those devices to function, drop the "default y" for the SME by default active option so that users who want to have SME enabled, will need to either enable it in their config or use "mem_encrypt=on" on the kernel command line. [ tlendacky: Generalize commit message. ] Fixes: 7744ccdbc16f ("x86/mm: Add Secure Memory Encryption (SME) support") Reported-by: Paul Menzel Signed-off-by: Borislav Petkov Acked-by: Alex Deucher Acked-by: Tom Lendacky Cc: Link: https://lkml.kernel.org/r/8bbacd0e-4580-3194-19d2-a0ecad7df09c@molgen.mpg.de Signed-off-by: Greg Kroah-Hartman --- arch/x86/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c581fef615838..828737dbf6138 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1542,7 +1542,6 @@ config AMD_MEM_ENCRYPT config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT bool "Activate AMD Secure Memory Encryption (SME) by default" - default y depends on AMD_MEM_ENCRYPT ---help--- Say yes to have system memory encrypted by default if running on From 8a5fb258ead24867e6da7f99fa987dec076cd5e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Mon, 11 Oct 2021 09:02:03 +0200 Subject: [PATCH 1170/5344] powerpc/xive: Discard disabled interrupts in get_irqchip_state() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 6f779e1d359b8d5801f677c1d49dcfa10bf95674 upstream. When an interrupt is passed through, the KVM XIVE device calls the set_vcpu_affinity() handler which raises the P bit to mask the interrupt and to catch any in-flight interrupts while routing the interrupt to the guest. On the guest side, drivers (like some Intels) can request at probe time some MSIs and call synchronize_irq() to check that there are no in flight interrupts. This will call the XIVE get_irqchip_state() handler which will always return true as the interrupt P bit has been set on the host side and lock the CPU in an infinite loop. Fix that by discarding disabled interrupts in get_irqchip_state(). Fixes: da15c03b047d ("powerpc/xive: Implement get_irqchip_state method for XIVE to fix shutdown race") Cc: stable@vger.kernel.org #v5.4+ Signed-off-by: Cédric Le Goater Tested-by: seeteena Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211011070203.99726-1-clg@kaod.org Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/sysdev/xive/common.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 2d4c09a77910f..d4467da279668 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -990,7 +990,8 @@ static int xive_get_irqchip_state(struct irq_data *data, * interrupt to be inactive in that case. */ *state = (pq != XIVE_ESB_INVALID) && !xd->stale_p && - (xd->saved_p || !!(pq & XIVE_ESB_VAL_P)); + (xd->saved_p || (!!(pq & XIVE_ESB_VAL_P) && + !irqd_irq_disabled(data))); return 0; default: return -EINVAL; From bff22c6913b4e367d7059ea829f0f22a66d5778e Mon Sep 17 00:00:00 2001 From: Billy Tsai Date: Tue, 31 Aug 2021 15:14:44 +0800 Subject: [PATCH 1171/5344] iio: adc: aspeed: set driver data when adc probe. commit eb795cd97365a3d3d9da3926d234a7bc32a3bb15 upstream. Fix the issue when adc remove will get the null driver data. Fixed: commit 573803234e72 ("iio: Aspeed ADC") Signed-off-by: Billy Tsai Link: https://lore.kernel.org/r/20210831071458.2334-2-billy_tsai@aspeedtech.com Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/adc/aspeed_adc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iio/adc/aspeed_adc.c b/drivers/iio/adc/aspeed_adc.c index d3fc39df535dc..552baf327d0c4 100644 --- a/drivers/iio/adc/aspeed_adc.c +++ b/drivers/iio/adc/aspeed_adc.c @@ -184,6 +184,7 @@ static int aspeed_adc_probe(struct platform_device *pdev) data = iio_priv(indio_dev); data->dev = &pdev->dev; + platform_set_drvdata(pdev, indio_dev); res = platform_get_resource(pdev, IORESOURCE_MEM, 0); data->base = devm_ioremap_resource(&pdev->dev, res); From eceb26fd77b81770d868bdc0b9c76cc3cddfc9cd Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 21 Aug 2021 12:37:24 +0200 Subject: [PATCH 1172/5344] iio: adc128s052: Fix the error handling path of 'adc128_probe()' commit bbcf40816b547b3c37af49168950491d20d81ce1 upstream. A successful 'regulator_enable()' call should be balanced by a corresponding 'regulator_disable()' call in the error handling path of the probe, as already done in the remove function. Update the error handling path accordingly. Fixes: 913b86468674 ("iio: adc: Add TI ADC128S052") Signed-off-by: Christophe JAILLET Reviewed-by: Alexandru Ardelean Link: https://lore.kernel.org/r/85189f1cfcf6f5f7b42d8730966f2a074b07b5f5.1629542160.git.christophe.jaillet@wanadoo.fr Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/adc/ti-adc128s052.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/iio/adc/ti-adc128s052.c b/drivers/iio/adc/ti-adc128s052.c index 1e5a936b5b6ad..4267d756cd50e 100644 --- a/drivers/iio/adc/ti-adc128s052.c +++ b/drivers/iio/adc/ti-adc128s052.c @@ -172,7 +172,13 @@ static int adc128_probe(struct spi_device *spi) mutex_init(&adc->lock); ret = iio_device_register(indio_dev); + if (ret) + goto err_disable_regulator; + return 0; + +err_disable_regulator: + regulator_disable(adc->reg); return ret; } From 49b1ab862cea708cb62e7bbc307f55c65e1868a0 Mon Sep 17 00:00:00 2001 From: Hui Liu Date: Sun, 26 Sep 2021 15:30:28 +0800 Subject: [PATCH 1173/5344] iio: mtk-auxadc: fix case IIO_CHAN_INFO_PROCESSED commit c2980c64c7fd4585d684574c92d1624d44961edd upstream. The previous driver does't apply the necessary scaling to take the voltage range into account. We change readback value from raw data to input voltage to fix case IIO_CHAN_INFO_PROCESSED. Fixes: ace4cdfe67be ("iio: adc: mt2701: Add Mediatek auxadc driver for mt2701.") Signed-off-by: Hui Liu Link: https://lore.kernel.org/r/20210926073028.11045-2-hui.liu@mediatek.com Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/adc/mt6577_auxadc.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/iio/adc/mt6577_auxadc.c b/drivers/iio/adc/mt6577_auxadc.c index 2449d91e47665..9cdb9084c64e8 100644 --- a/drivers/iio/adc/mt6577_auxadc.c +++ b/drivers/iio/adc/mt6577_auxadc.c @@ -82,6 +82,10 @@ static const struct iio_chan_spec mt6577_auxadc_iio_channels[] = { MT6577_AUXADC_CHANNEL(15), }; +/* For Voltage calculation */ +#define VOLTAGE_FULL_RANGE 1500 /* VA voltage */ +#define AUXADC_PRECISE 4096 /* 12 bits */ + static int mt_auxadc_get_cali_data(int rawdata, bool enable_cali) { return rawdata; @@ -191,6 +195,10 @@ static int mt6577_auxadc_read_raw(struct iio_dev *indio_dev, } if (adc_dev->dev_comp->sample_data_cali) *val = mt_auxadc_get_cali_data(*val, true); + + /* Convert adc raw data to voltage: 0 - 1500 mV */ + *val = *val * VOLTAGE_FULL_RANGE / AUXADC_PRECISE; + return IIO_VAL_INT; default: From a6ed37a26f1bf3c00eafdc0a91a68bbca65183ec Mon Sep 17 00:00:00 2001 From: Jiri Valek - 2N Date: Mon, 20 Sep 2021 14:53:48 +0200 Subject: [PATCH 1174/5344] iio: light: opt3001: Fixed timeout error when 0 lux commit 26d90b5590579def54382a2fc34cfbe8518a9851 upstream. Reading from sensor returned timeout error under zero light conditions. Signed-off-by: Jiri Valek - 2N Fixes: ac663db3678a ("iio: light: opt3001: enable operation w/o IRQ") Link: https://lore.kernel.org/r/20210920125351.6569-1-valek@2n.cz Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/light/opt3001.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iio/light/opt3001.c b/drivers/iio/light/opt3001.c index 92004a2563ea8..5588066c83246 100644 --- a/drivers/iio/light/opt3001.c +++ b/drivers/iio/light/opt3001.c @@ -275,6 +275,8 @@ static int opt3001_get_lux(struct opt3001 *opt, int *val, int *val2) ret = wait_event_timeout(opt->result_ready_queue, opt->result_ready, msecs_to_jiffies(OPT3001_RESULT_READY_LONG)); + if (ret == 0) + return -ETIMEDOUT; } else { /* Sleep for result ready time */ timeout = (opt->int_time == OPT3001_INT_TIME_SHORT) ? @@ -311,9 +313,7 @@ static int opt3001_get_lux(struct opt3001 *opt, int *val, int *val2) /* Disallow IRQ to access the device while lock is active */ opt->ok_to_ignore_lock = false; - if (ret == 0) - return -ETIMEDOUT; - else if (ret < 0) + if (ret < 0) return ret; if (opt->use_irq) { From b479105adefe89e474edbdc6164321f56a7ee0f8 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 9 Sep 2021 12:13:36 +0300 Subject: [PATCH 1175/5344] iio: ssp_sensors: add more range checking in ssp_parse_dataframe() commit 8167c9a375ccceed19048ad9d68cb2d02ed276e0 upstream. The "idx" is validated at the start of the loop but it gets incremented during the iteration so it needs to be checked again. Fixes: 50dd64d57eee ("iio: common: ssp_sensors: Add sensorhub driver") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/20210909091336.GA26312@kili Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/common/ssp_sensors/ssp_spi.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/iio/common/ssp_sensors/ssp_spi.c b/drivers/iio/common/ssp_sensors/ssp_spi.c index 7db3d5886e3ee..7d886642af6d8 100644 --- a/drivers/iio/common/ssp_sensors/ssp_spi.c +++ b/drivers/iio/common/ssp_sensors/ssp_spi.c @@ -273,6 +273,8 @@ static int ssp_parse_dataframe(struct ssp_data *data, char *dataframe, int len) for (idx = 0; idx < len;) { switch (dataframe[idx++]) { case SSP_MSG2AP_INST_BYPASS_DATA: + if (idx >= len) + return -EPROTO; sd = dataframe[idx++]; if (sd < 0 || sd >= SSP_SENSOR_MAX) { dev_err(SSP_DEV, @@ -282,10 +284,13 @@ static int ssp_parse_dataframe(struct ssp_data *data, char *dataframe, int len) if (indio_devs[sd]) { spd = iio_priv(indio_devs[sd]); - if (spd->process_data) + if (spd->process_data) { + if (idx >= len) + return -EPROTO; spd->process_data(indio_devs[sd], &dataframe[idx], data->timestamp); + } } else { dev_err(SSP_DEV, "no client for frame\n"); } @@ -293,6 +298,8 @@ static int ssp_parse_dataframe(struct ssp_data *data, char *dataframe, int len) idx += ssp_offset_map[sd]; break; case SSP_MSG2AP_INST_DEBUG_DATA: + if (idx >= len) + return -EPROTO; sd = ssp_print_mcu_debug(dataframe, &idx, len); if (sd) { dev_err(SSP_DEV, From fd157ffe823de375c3c11d10138fdfaee3da8915 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 14 Sep 2021 13:53:33 +0300 Subject: [PATCH 1176/5344] iio: ssp_sensors: fix error code in ssp_print_mcu_debug() commit 4170d3dd1467e9d78cb9af374b19357dc324b328 upstream. The ssp_print_mcu_debug() function should return negative error codes on error. Returning "length" is meaningless. This change does not affect runtime because the callers only care about zero/non-zero. Reported-by: Jonathan Cameron Fixes: 50dd64d57eee ("iio: common: ssp_sensors: Add sensorhub driver") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/20210914105333.GA11657@kili Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/common/ssp_sensors/ssp_spi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/common/ssp_sensors/ssp_spi.c b/drivers/iio/common/ssp_sensors/ssp_spi.c index 7d886642af6d8..d98e0a04add05 100644 --- a/drivers/iio/common/ssp_sensors/ssp_spi.c +++ b/drivers/iio/common/ssp_sensors/ssp_spi.c @@ -137,7 +137,7 @@ static int ssp_print_mcu_debug(char *data_frame, int *data_index, if (length > received_len - *data_index || length <= 0) { ssp_dbg("[SSP]: MSG From MCU-invalid debug length(%d/%d)\n", length, received_len); - return length ? length : -EPROTO; + return -EPROTO; } ssp_dbg("[SSP]: MSG From MCU - %s\n", &data_frame[*data_index]); From 432f8cbff6059847cbb061f866c5aa1b60651d0b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 16 Aug 2021 21:39:54 +0300 Subject: [PATCH 1177/5344] iio: dac: ti-dac5571: fix an error code in probe() commit f7a28df7db84eb3410e9eca37832efa5aed93338 upstream. If we have an unexpected number of channels then return -EINVAL instead of returning success. Fixes: df38a4a72a3b ("iio: dac: add TI DAC5571 family support") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/20210816183954.GB2068@kili Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/dac/ti-dac5571.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iio/dac/ti-dac5571.c b/drivers/iio/dac/ti-dac5571.c index 3a2bb0efe50de..51c41ccf00ad9 100644 --- a/drivers/iio/dac/ti-dac5571.c +++ b/drivers/iio/dac/ti-dac5571.c @@ -352,6 +352,7 @@ static int dac5571_probe(struct i2c_client *client, data->dac5571_pwrdwn = dac5571_pwrdwn_quad; break; default: + ret = -EINVAL; goto err; } From ada0dd2becbd5a7537930a0faeb326524a2e8821 Mon Sep 17 00:00:00 2001 From: Eiichi Tsukata Date: Wed, 13 Oct 2021 17:27:29 -0300 Subject: [PATCH 1178/5344] sctp: account stream padding length for reconf chunk commit a2d859e3fc97e79d907761550dbc03ff1b36479c upstream. sctp_make_strreset_req() makes repeated calls to sctp_addto_chunk() which will automatically account for padding on each call. inreq and outreq are already 4 bytes aligned, but the payload is not and doing SCTP_PAD4(a + b) (which _sctp_make_chunk() did implicitly here) is different from SCTP_PAD4(a) + SCTP_PAD4(b) and not enough. It led to possible attempt to use more buffer than it was allocated and triggered a BUG_ON. Cc: Vlad Yasevich Cc: Neil Horman Cc: Greg KH Fixes: cc16f00f6529 ("sctp: add support for generating stream reconf ssn reset request chunk") Reported-by: Eiichi Tsukata Signed-off-by: Eiichi Tsukata Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: Marcelo Ricardo Leitner Reviewed-by: Xin Long Link: https://lore.kernel.org/r/b97c1f8b0c7ff79ac4ed206fc2c49d3612e0850c.1634156849.git.mleitner@redhat.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- net/sctp/sm_make_chunk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 000aa62281f46..4eebe708c8e4e 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -3659,7 +3659,7 @@ struct sctp_chunk *sctp_make_strreset_req( outlen = (sizeof(outreq) + stream_len) * out; inlen = (sizeof(inreq) + stream_len) * in; - retval = sctp_make_reconf(asoc, outlen + inlen); + retval = sctp_make_reconf(asoc, SCTP_PAD4(outlen) + SCTP_PAD4(inlen)); if (!retval) return NULL; From 8e2afbba4df70c61c7036fb8e0b9859458392419 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 24 Sep 2021 01:46:40 +0300 Subject: [PATCH 1179/5344] gpio: pca953x: Improve bias setting commit 55a9968c7e139209a9e93d4ca4321731bea5fc95 upstream. The commit 15add06841a3 ("gpio: pca953x: add ->set_config implementation") introduced support for bias setting. However this, due to being half-baked, brought potential issues: - the turning bias via disabling makes the pin floating for a while; - once enabled, bias can't be disabled. Fix all these by adding support for bias disabling and move the disabling part under the corresponding conditional. While at it, add support for default setting, since it's cheap to add. Fixes: 15add06841a3 ("gpio: pca953x: add ->set_config implementation") Cc: Thomas Petazzoni Signed-off-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski Signed-off-by: Greg Kroah-Hartman --- drivers/gpio/gpio-pca953x.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/gpio/gpio-pca953x.c b/drivers/gpio/gpio-pca953x.c index d9193ffa17a1e..54da66d02b0e5 100644 --- a/drivers/gpio/gpio-pca953x.c +++ b/drivers/gpio/gpio-pca953x.c @@ -583,21 +583,21 @@ static int pca953x_gpio_set_pull_up_down(struct pca953x_chip *chip, mutex_lock(&chip->i2c_lock); - /* Disable pull-up/pull-down */ - ret = regmap_write_bits(chip->regmap, pull_en_reg, bit, 0); - if (ret) - goto exit; - /* Configure pull-up/pull-down */ if (config == PIN_CONFIG_BIAS_PULL_UP) ret = regmap_write_bits(chip->regmap, pull_sel_reg, bit, bit); else if (config == PIN_CONFIG_BIAS_PULL_DOWN) ret = regmap_write_bits(chip->regmap, pull_sel_reg, bit, 0); + else + ret = 0; if (ret) goto exit; - /* Enable pull-up/pull-down */ - ret = regmap_write_bits(chip->regmap, pull_en_reg, bit, bit); + /* Disable/Enable pull-up/pull-down */ + if (config == PIN_CONFIG_BIAS_DISABLE) + ret = regmap_write_bits(chip->regmap, pull_en_reg, bit, 0); + else + ret = regmap_write_bits(chip->regmap, pull_en_reg, bit, bit); exit: mutex_unlock(&chip->i2c_lock); @@ -611,7 +611,9 @@ static int pca953x_gpio_set_config(struct gpio_chip *gc, unsigned int offset, switch (pinconf_to_config_param(config)) { case PIN_CONFIG_BIAS_PULL_UP: + case PIN_CONFIG_BIAS_PULL_PIN_DEFAULT: case PIN_CONFIG_BIAS_PULL_DOWN: + case PIN_CONFIG_BIAS_DISABLE: return pca953x_gpio_set_pull_up_down(chip, offset, config); default: return -ENOTSUPP; From 1d7eacf6af7c629035d15346521b018e4ad0fc48 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Tue, 12 Oct 2021 11:34:46 +0200 Subject: [PATCH 1180/5344] net: arc: select CRC32 commit e599ee234ad4fdfe241d937bbabd96e0d8f9d868 upstream. Fix the following build/link error by adding a dependency on the CRC32 routines: ld: drivers/net/ethernet/arc/emac_main.o: in function `arc_emac_set_rx_mode': emac_main.c:(.text+0xb11): undefined reference to `crc32_le' The crc32_le() call comes through the ether_crc_le() call in arc_emac_set_rx_mode(). [v2: moved the select to ARC_EMAC_CORE; the Makefile is a bit confusing, but the error comes from emac_main.o, which is part of the arc_emac module, which in turn is enabled by CONFIG_ARC_EMAC_CORE. Note that arc_emac is different from emac_arc...] Fixes: 775dd682e2b0ec ("arc_emac: implement promiscuous mode and multicast filtering") Cc: Arnd Bergmann Signed-off-by: Vegard Nossum Link: https://lore.kernel.org/r/20211012093446.1575-1-vegard.nossum@oracle.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/arc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/arc/Kconfig b/drivers/net/ethernet/arc/Kconfig index 45c663d8b9aab..2cd0e45d0b6c1 100644 --- a/drivers/net/ethernet/arc/Kconfig +++ b/drivers/net/ethernet/arc/Kconfig @@ -21,6 +21,7 @@ config ARC_EMAC_CORE depends on ARC || ARCH_ROCKCHIP || COMPILE_TEST select MII select PHYLIB + select CRC32 config ARC_EMAC tristate "ARC EMAC support" From 3b07113195c2d3d6b4283fb9e45afbfcd1186ff3 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Tue, 12 Oct 2021 17:25:09 +0200 Subject: [PATCH 1181/5344] net: korina: select CRC32 commit 427f974d9727ca681085ddcd0530c97ab5811ae0 upstream. Fix the following build/link error by adding a dependency on the CRC32 routines: ld: drivers/net/ethernet/korina.o: in function `korina_multicast_list': korina.c:(.text+0x1af): undefined reference to `crc32_le' Fixes: ef11291bcd5f9 ("Add support the Korina (IDT RC32434) Ethernet MAC") Cc: Arnd Bergmann Signed-off-by: Vegard Nossum Acked-by: Florian fainelli Link: https://lore.kernel.org/r/20211012152509.21771-1-vegard.nossum@oracle.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/Kconfig b/drivers/net/ethernet/Kconfig index e8e9c166185de..2d4eb2f280a12 100644 --- a/drivers/net/ethernet/Kconfig +++ b/drivers/net/ethernet/Kconfig @@ -100,6 +100,7 @@ config JME config KORINA tristate "Korina (IDT RC32434) Ethernet support" depends on MIKROTIK_RB532 + select CRC32 ---help--- If you have a Mikrotik RouterBoard 500 or IDT RC32434 based system say Y. Otherwise say N. From b81e54b4102eceee1a13e9dbfca9d0a06837b4fa Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Sun, 26 Sep 2021 17:55:41 +0300 Subject: [PATCH 1182/5344] net/mlx5e: Mutually exclude RX-FCS and RX-port-timestamp commit 0bc73ad46a76ed6ece4dcacb28858e7b38561e1c upstream. Due to current HW arch limitations, RX-FCS (scattering FCS frame field to software) and RX-port-timestamp (improved timestamp accuracy on the receive side) can't work together. RX-port-timestamp is not controlled by the user and it is enabled by default when supported by the HW/FW. This patch sets RX-port-timestamp opposite to RX-FCS configuration. Fixes: 102722fc6832 ("net/mlx5e: Add support for RXFCS feature flag") Signed-off-by: Aya Levin Reviewed-by: Tariq Toukan Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed Signed-off-by: Greg Kroah-Hartman --- .../net/ethernet/mellanox/mlx5/core/en_main.c | 57 +++++++++++++++++-- include/linux/mlx5/mlx5_ifc.h | 10 +++- 2 files changed, 60 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 622c2d26543d3..2a101ab80ce38 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3796,20 +3796,67 @@ static int set_feature_rx_all(struct net_device *netdev, bool enable) return mlx5_set_port_fcs(mdev, !enable); } +static int mlx5e_set_rx_port_ts(struct mlx5_core_dev *mdev, bool enable) +{ + u32 in[MLX5_ST_SZ_DW(pcmr_reg)] = {}; + bool supported, curr_state; + int err; + + if (!MLX5_CAP_GEN(mdev, ports_check)) + return 0; + + err = mlx5_query_ports_check(mdev, in, sizeof(in)); + if (err) + return err; + + supported = MLX5_GET(pcmr_reg, in, rx_ts_over_crc_cap); + curr_state = MLX5_GET(pcmr_reg, in, rx_ts_over_crc); + + if (!supported || enable == curr_state) + return 0; + + MLX5_SET(pcmr_reg, in, local_port, 1); + MLX5_SET(pcmr_reg, in, rx_ts_over_crc, enable); + + return mlx5_set_ports_check(mdev, in, sizeof(in)); +} + static int set_feature_rx_fcs(struct net_device *netdev, bool enable) { struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_channels *chs = &priv->channels; + struct mlx5_core_dev *mdev = priv->mdev; int err; mutex_lock(&priv->state_lock); - priv->channels.params.scatter_fcs_en = enable; - err = mlx5e_modify_channels_scatter_fcs(&priv->channels, enable); - if (err) - priv->channels.params.scatter_fcs_en = !enable; + if (enable) { + err = mlx5e_set_rx_port_ts(mdev, false); + if (err) + goto out; - mutex_unlock(&priv->state_lock); + chs->params.scatter_fcs_en = true; + err = mlx5e_modify_channels_scatter_fcs(chs, true); + if (err) { + chs->params.scatter_fcs_en = false; + mlx5e_set_rx_port_ts(mdev, true); + } + } else { + chs->params.scatter_fcs_en = false; + err = mlx5e_modify_channels_scatter_fcs(chs, false); + if (err) { + chs->params.scatter_fcs_en = true; + goto out; + } + err = mlx5e_set_rx_port_ts(mdev, true); + if (err) { + mlx5_core_warn(mdev, "Failed to set RX port timestamp %d\n", err); + err = 0; + } + } +out: + mutex_unlock(&priv->state_lock); return err; } diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 44cc6077eb3a2..433854503070c 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -8986,16 +8986,22 @@ struct mlx5_ifc_pcmr_reg_bits { u8 reserved_at_0[0x8]; u8 local_port[0x8]; u8 reserved_at_10[0x10]; + u8 entropy_force_cap[0x1]; u8 entropy_calc_cap[0x1]; u8 entropy_gre_calc_cap[0x1]; - u8 reserved_at_23[0x1b]; + u8 reserved_at_23[0xf]; + u8 rx_ts_over_crc_cap[0x1]; + u8 reserved_at_33[0xb]; u8 fcs_cap[0x1]; u8 reserved_at_3f[0x1]; + u8 entropy_force[0x1]; u8 entropy_calc[0x1]; u8 entropy_gre_calc[0x1]; - u8 reserved_at_43[0x1b]; + u8 reserved_at_43[0xf]; + u8 rx_ts_over_crc[0x1]; + u8 reserved_at_53[0xb]; u8 fcs_chk[0x1]; u8 reserved_at_5f[0x1]; }; From a6bee887e091af054b4559ca6445560d0232ce37 Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Fri, 8 Oct 2021 12:34:37 +0200 Subject: [PATCH 1183/5344] net: stmmac: fix get_hw_feature() on old hardware commit 075da584bae2da6a37428d59a477b6bdad430ac3 upstream. Some old IPs do not provide the hardware feature register. On these IPs, this register is read 0x00000000. In old driver version, this feature was handled but a regression came with the commit f10a6a3541b4 ("stmmac: rework get_hw_feature function"). Indeed, this commit removes the return value in dma->get_hw_feature(). This return value was used to indicate the validity of retrieved information and used later on in stmmac_hw_init() to override priv->plat data if this hardware feature were valid. This patch restores the return code in ->get_hw_feature() in order to indicate the hardware feature validity and override priv->plat data only if this hardware feature is valid. Fixes: f10a6a3541b4 ("stmmac: rework get_hw_feature function") Signed-off-by: Herve Codina Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c | 13 +++++++++++-- drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c | 6 ++++-- drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c | 6 ++++-- drivers/net/ethernet/stmicro/stmmac/hwif.h | 6 +++--- 4 files changed, 22 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c index 2bac49b49f739..fbf2deafe8ba3 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c @@ -218,11 +218,18 @@ static void dwmac1000_dump_dma_regs(void __iomem *ioaddr, u32 *reg_space) readl(ioaddr + DMA_BUS_MODE + i * 4); } -static void dwmac1000_get_hw_feature(void __iomem *ioaddr, - struct dma_features *dma_cap) +static int dwmac1000_get_hw_feature(void __iomem *ioaddr, + struct dma_features *dma_cap) { u32 hw_cap = readl(ioaddr + DMA_HW_FEATURE); + if (!hw_cap) { + /* 0x00000000 is the value read on old hardware that does not + * implement this register + */ + return -EOPNOTSUPP; + } + dma_cap->mbps_10_100 = (hw_cap & DMA_HW_FEAT_MIISEL); dma_cap->mbps_1000 = (hw_cap & DMA_HW_FEAT_GMIISEL) >> 1; dma_cap->half_duplex = (hw_cap & DMA_HW_FEAT_HDSEL) >> 2; @@ -252,6 +259,8 @@ static void dwmac1000_get_hw_feature(void __iomem *ioaddr, dma_cap->number_tx_channel = (hw_cap & DMA_HW_FEAT_TXCHCNT) >> 22; /* Alternate (enhanced) DESC mode */ dma_cap->enh_desc = (hw_cap & DMA_HW_FEAT_ENHDESSEL) >> 24; + + return 0; } static void dwmac1000_rx_watchdog(void __iomem *ioaddr, u32 riwt, diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c index 0d993f4b701c2..ae473d85b7fb8 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c @@ -336,8 +336,8 @@ static void dwmac4_dma_tx_chan_op_mode(void __iomem *ioaddr, int mode, writel(mtl_tx_op, ioaddr + MTL_CHAN_TX_OP_MODE(channel)); } -static void dwmac4_get_hw_feature(void __iomem *ioaddr, - struct dma_features *dma_cap) +static int dwmac4_get_hw_feature(void __iomem *ioaddr, + struct dma_features *dma_cap) { u32 hw_cap = readl(ioaddr + GMAC_HW_FEATURE0); @@ -400,6 +400,8 @@ static void dwmac4_get_hw_feature(void __iomem *ioaddr, dma_cap->frpbs = (hw_cap & GMAC_HW_FEAT_FRPBS) >> 11; dma_cap->frpsel = (hw_cap & GMAC_HW_FEAT_FRPSEL) >> 10; dma_cap->dvlan = (hw_cap & GMAC_HW_FEAT_DVLAN) >> 5; + + return 0; } /* Enable/disable TSO feature and set MSS */ diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c index 4af7271cea561..07ef0ac725b3e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c @@ -356,8 +356,8 @@ static int dwxgmac2_dma_interrupt(void __iomem *ioaddr, return ret; } -static void dwxgmac2_get_hw_feature(void __iomem *ioaddr, - struct dma_features *dma_cap) +static int dwxgmac2_get_hw_feature(void __iomem *ioaddr, + struct dma_features *dma_cap) { u32 hw_cap; @@ -425,6 +425,8 @@ static void dwxgmac2_get_hw_feature(void __iomem *ioaddr, dma_cap->frpes = (hw_cap & XGMAC_HWFEAT_FRPES) >> 11; dma_cap->frpbs = (hw_cap & XGMAC_HWFEAT_FRPPB) >> 9; dma_cap->frpsel = (hw_cap & XGMAC_HWFEAT_FRPSEL) >> 3; + + return 0; } static void dwxgmac2_rx_watchdog(void __iomem *ioaddr, u32 riwt, u32 nchan) diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h index 9010d881b12e5..59a1d8c003721 100644 --- a/drivers/net/ethernet/stmicro/stmmac/hwif.h +++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h @@ -196,8 +196,8 @@ struct stmmac_dma_ops { int (*dma_interrupt) (void __iomem *ioaddr, struct stmmac_extra_stats *x, u32 chan); /* If supported then get the optional core features */ - void (*get_hw_feature)(void __iomem *ioaddr, - struct dma_features *dma_cap); + int (*get_hw_feature)(void __iomem *ioaddr, + struct dma_features *dma_cap); /* Program the HW RX Watchdog */ void (*rx_watchdog)(void __iomem *ioaddr, u32 riwt, u32 number_chan); void (*set_tx_ring_len)(void __iomem *ioaddr, u32 len, u32 chan); @@ -247,7 +247,7 @@ struct stmmac_dma_ops { #define stmmac_dma_interrupt_status(__priv, __args...) \ stmmac_do_callback(__priv, dma, dma_interrupt, __args) #define stmmac_get_hw_feature(__priv, __args...) \ - stmmac_do_void_callback(__priv, dma, get_hw_feature, __args) + stmmac_do_callback(__priv, dma, get_hw_feature, __args) #define stmmac_rx_watchdog(__priv, __args...) \ stmmac_do_void_callback(__priv, dma, rx_watchdog, __args) #define stmmac_set_tx_ring_len(__priv, __args...) \ From 6355b2de572db06e3c57631a6b3aa6f812c2a76c Mon Sep 17 00:00:00 2001 From: Nanyong Sun Date: Tue, 12 Oct 2021 20:59:01 +0800 Subject: [PATCH 1184/5344] net: encx24j600: check error in devm_regmap_init_encx24j600 commit f03dca0c9e2297c84a018e306f8a9cd534ee4287 upstream. devm_regmap_init may return error which caused by like out of memory, this will results in null pointer dereference later when reading or writing register: general protection fault in encx24j600_spi_probe KASAN: null-ptr-deref in range [0x0000000000000090-0x0000000000000097] CPU: 0 PID: 286 Comm: spi-encx24j600- Not tainted 5.15.0-rc2-00142-g9978db750e31-dirty #11 9c53a778c1306b1b02359f3c2bbedc0222cba652 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 RIP: 0010:regcache_cache_bypass drivers/base/regmap/regcache.c:540 Code: 54 41 89 f4 55 53 48 89 fb 48 83 ec 08 e8 26 94 a8 fe 48 8d bb a0 00 00 00 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 4a 03 00 00 4c 8d ab b0 00 00 00 48 8b ab a0 00 RSP: 0018:ffffc900010476b8 EFLAGS: 00010207 RAX: dffffc0000000000 RBX: fffffffffffffff4 RCX: 0000000000000000 RDX: 0000000000000012 RSI: ffff888002de0000 RDI: 0000000000000094 RBP: ffff888013c9a000 R08: 0000000000000000 R09: fffffbfff3f9cc6a R10: ffffc900010476e8 R11: fffffbfff3f9cc69 R12: 0000000000000001 R13: 000000000000000a R14: ffff888013c9af54 R15: ffff888013c9ad08 FS: 00007ffa984ab580(0000) GS:ffff88801fe00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055a6384136c8 CR3: 000000003bbe6003 CR4: 0000000000770ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: encx24j600_spi_probe drivers/net/ethernet/microchip/encx24j600.c:459 spi_probe drivers/spi/spi.c:397 really_probe drivers/base/dd.c:517 __driver_probe_device drivers/base/dd.c:751 driver_probe_device drivers/base/dd.c:782 __device_attach_driver drivers/base/dd.c:899 bus_for_each_drv drivers/base/bus.c:427 __device_attach drivers/base/dd.c:971 bus_probe_device drivers/base/bus.c:487 device_add drivers/base/core.c:3364 __spi_add_device drivers/spi/spi.c:599 spi_add_device drivers/spi/spi.c:641 spi_new_device drivers/spi/spi.c:717 new_device_store+0x18c/0x1f1 [spi_stub 4e02719357f1ff33f5a43d00630982840568e85e] dev_attr_store drivers/base/core.c:2074 sysfs_kf_write fs/sysfs/file.c:139 kernfs_fop_write_iter fs/kernfs/file.c:300 new_sync_write fs/read_write.c:508 (discriminator 4) vfs_write fs/read_write.c:594 ksys_write fs/read_write.c:648 do_syscall_64 arch/x86/entry/common.c:50 entry_SYSCALL_64_after_hwframe arch/x86/entry/entry_64.S:113 Add error check in devm_regmap_init_encx24j600 to avoid this situation. Fixes: 04fbfce7a222 ("net: Microchip encx24j600 driver") Reported-by: Hulk Robot Signed-off-by: Nanyong Sun Link: https://lore.kernel.org/r/20211012125901.3623144-1-sunnanyong@huawei.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/microchip/encx24j600-regmap.c | 10 ++++++++-- drivers/net/ethernet/microchip/encx24j600.c | 5 ++++- drivers/net/ethernet/microchip/encx24j600_hw.h | 4 ++-- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/microchip/encx24j600-regmap.c b/drivers/net/ethernet/microchip/encx24j600-regmap.c index 1f496fac70332..e2528633c09a1 100644 --- a/drivers/net/ethernet/microchip/encx24j600-regmap.c +++ b/drivers/net/ethernet/microchip/encx24j600-regmap.c @@ -502,13 +502,19 @@ static struct regmap_bus phymap_encx24j600 = { .reg_read = regmap_encx24j600_phy_reg_read, }; -void devm_regmap_init_encx24j600(struct device *dev, - struct encx24j600_context *ctx) +int devm_regmap_init_encx24j600(struct device *dev, + struct encx24j600_context *ctx) { mutex_init(&ctx->mutex); regcfg.lock_arg = ctx; ctx->regmap = devm_regmap_init(dev, ®map_encx24j600, ctx, ®cfg); + if (IS_ERR(ctx->regmap)) + return PTR_ERR(ctx->regmap); ctx->phymap = devm_regmap_init(dev, &phymap_encx24j600, ctx, &phycfg); + if (IS_ERR(ctx->phymap)) + return PTR_ERR(ctx->phymap); + + return 0; } EXPORT_SYMBOL_GPL(devm_regmap_init_encx24j600); diff --git a/drivers/net/ethernet/microchip/encx24j600.c b/drivers/net/ethernet/microchip/encx24j600.c index c3a6edc0ddf62..6b43afc4e3246 100644 --- a/drivers/net/ethernet/microchip/encx24j600.c +++ b/drivers/net/ethernet/microchip/encx24j600.c @@ -1027,10 +1027,13 @@ static int encx24j600_spi_probe(struct spi_device *spi) priv->speed = SPEED_100; priv->ctx.spi = spi; - devm_regmap_init_encx24j600(&spi->dev, &priv->ctx); ndev->irq = spi->irq; ndev->netdev_ops = &encx24j600_netdev_ops; + ret = devm_regmap_init_encx24j600(&spi->dev, &priv->ctx); + if (ret) + goto out_free; + mutex_init(&priv->lock); /* Reset device and check if it is connected */ diff --git a/drivers/net/ethernet/microchip/encx24j600_hw.h b/drivers/net/ethernet/microchip/encx24j600_hw.h index f604a260ede79..711147a159aa9 100644 --- a/drivers/net/ethernet/microchip/encx24j600_hw.h +++ b/drivers/net/ethernet/microchip/encx24j600_hw.h @@ -15,8 +15,8 @@ struct encx24j600_context { int bank; }; -void devm_regmap_init_encx24j600(struct device *dev, - struct encx24j600_context *ctx); +int devm_regmap_init_encx24j600(struct device *dev, + struct encx24j600_context *ctx); /* Single-byte instructions */ #define BANK_SELECT(bank) (0xC0 | ((bank & (BANK_MASK >> BANK_SHIFT)) << 1)) From 21728b8f258b5f463630eb42cef7376bb8e24483 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 13 Oct 2021 16:35:49 +0200 Subject: [PATCH 1185/5344] ethernet: s2io: fix setting mac address during resume commit 40507e7aada8422c38aafa0c8a1a09e4623c712a upstream. After recent cleanups, gcc started warning about a suspicious memcpy() call during the s2io_io_resume() function: In function '__dev_addr_set', inlined from 'eth_hw_addr_set' at include/linux/etherdevice.h:318:2, inlined from 's2io_set_mac_addr' at drivers/net/ethernet/neterion/s2io.c:5205:2, inlined from 's2io_io_resume' at drivers/net/ethernet/neterion/s2io.c:8569:7: arch/x86/include/asm/string_32.h:182:25: error: '__builtin_memcpy' accessing 6 bytes at offsets 0 and 2 overlaps 4 bytes at offset 2 [-Werror=restrict] 182 | #define memcpy(t, f, n) __builtin_memcpy(t, f, n) | ^~~~~~~~~~~~~~~~~~~~~~~~~ include/linux/netdevice.h:4648:9: note: in expansion of macro 'memcpy' 4648 | memcpy(dev->dev_addr, addr, len); | ^~~~~~ What apparently happened is that an old cleanup changed the calling conventions for s2io_set_mac_addr() from taking an ethernet address as a character array to taking a struct sockaddr, but one of the callers was not changed at the same time. Change it to instead call the low-level do_s2io_prog_unicast() function that still takes the old argument type. Fixes: 2fd376884558 ("S2io: Added support set_mac_address driver entry point") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20211013143613.2049096-1-arnd@kernel.org Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/neterion/s2io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/neterion/s2io.c b/drivers/net/ethernet/neterion/s2io.c index e0b2bf3279053..71ab4e9c9a171 100644 --- a/drivers/net/ethernet/neterion/s2io.c +++ b/drivers/net/ethernet/neterion/s2io.c @@ -8565,7 +8565,7 @@ static void s2io_io_resume(struct pci_dev *pdev) return; } - if (s2io_set_mac_addr(netdev, netdev->dev_addr) == FAILURE) { + if (do_s2io_prog_unicast(netdev, netdev->dev_addr) == FAILURE) { s2io_card_down(sp); pr_err("Can't restore mac addr after reset.\n"); return; From eef6b817f09f752aaec5b542bce44052a3bf934c Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Wed, 13 Oct 2021 11:49:32 +0800 Subject: [PATCH 1186/5344] nfc: fix error handling of nfc_proto_register() commit 0911ab31896f0e908540746414a77dd63912748d upstream. When nfc proto id is using, nfc_proto_register() return -EBUSY error code, but forgot to unregister proto. Fix it by adding proto_unregister() in the error handling case. Fixes: c7fe3b52c128 ("NFC: add NFC socket family") Signed-off-by: Ziyang Xuan Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20211013034932.2833737-1-william.xuanziyang@huawei.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- net/nfc/af_nfc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/nfc/af_nfc.c b/net/nfc/af_nfc.c index 4a9e72073564a..581358dcbdf8d 100644 --- a/net/nfc/af_nfc.c +++ b/net/nfc/af_nfc.c @@ -60,6 +60,9 @@ int nfc_proto_register(const struct nfc_protocol *nfc_proto) proto_tab[nfc_proto->id] = nfc_proto; write_unlock(&proto_tab_lock); + if (rc) + proto_unregister(nfc_proto->proto); + return rc; } EXPORT_SYMBOL(nfc_proto_register); From 9ad61281b20a0d94c21c542b223d5c7855222450 Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Wed, 13 Oct 2021 15:50:12 +0800 Subject: [PATCH 1187/5344] NFC: digital: fix possible memory leak in digital_tg_listen_mdaa() commit 58e7dcc9ca29c14e44267a4d0ea61e3229124907 upstream. 'params' is allocated in digital_tg_listen_mdaa(), but not free when digital_send_cmd() failed, which will cause memory leak. Fix it by freeing 'params' if digital_send_cmd() return failed. Fixes: 1c7a4c24fbfd ("NFC Digital: Add target NFC-DEP support") Signed-off-by: Ziyang Xuan Reviewed-by: Krzysztof Kozlowski Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- net/nfc/digital_core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c index e3599ed4a7a87..9c9caa307cf16 100644 --- a/net/nfc/digital_core.c +++ b/net/nfc/digital_core.c @@ -277,6 +277,7 @@ int digital_tg_configure_hw(struct nfc_digital_dev *ddev, int type, int param) static int digital_tg_listen_mdaa(struct nfc_digital_dev *ddev, u8 rf_tech) { struct digital_tg_mdaa_params *params; + int rc; params = kzalloc(sizeof(*params), GFP_KERNEL); if (!params) @@ -291,8 +292,12 @@ static int digital_tg_listen_mdaa(struct nfc_digital_dev *ddev, u8 rf_tech) get_random_bytes(params->nfcid2 + 2, NFC_NFCID2_MAXSIZE - 2); params->sc = DIGITAL_SENSF_FELICA_SC; - return digital_send_cmd(ddev, DIGITAL_CMD_TG_LISTEN_MDAA, NULL, params, - 500, digital_tg_recv_atr_req, NULL); + rc = digital_send_cmd(ddev, DIGITAL_CMD_TG_LISTEN_MDAA, NULL, params, + 500, digital_tg_recv_atr_req, NULL); + if (rc) + kfree(params); + + return rc; } static int digital_tg_listen_md(struct nfc_digital_dev *ddev, u8 rf_tech) From a8dfd0710e538e86e4681cfc51416a5b6566aabf Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Wed, 13 Oct 2021 15:50:32 +0800 Subject: [PATCH 1188/5344] NFC: digital: fix possible memory leak in digital_in_send_sdd_req() commit 291c932fc3692e4d211a445ba8aa35663831bac7 upstream. 'skb' is allocated in digital_in_send_sdd_req(), but not free when digital_in_send_cmd() failed, which will cause memory leak. Fix it by freeing 'skb' if digital_in_send_cmd() return failed. Fixes: 2c66daecc409 ("NFC Digital: Add NFC-A technology support") Signed-off-by: Ziyang Xuan Reviewed-by: Krzysztof Kozlowski Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- net/nfc/digital_technology.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/nfc/digital_technology.c b/net/nfc/digital_technology.c index 84d2345c75a3f..3adf4589852af 100644 --- a/net/nfc/digital_technology.c +++ b/net/nfc/digital_technology.c @@ -465,8 +465,12 @@ static int digital_in_send_sdd_req(struct nfc_digital_dev *ddev, skb_put_u8(skb, sel_cmd); skb_put_u8(skb, DIGITAL_SDD_REQ_SEL_PAR); - return digital_in_send_cmd(ddev, skb, 30, digital_in_recv_sdd_res, - target); + rc = digital_in_send_cmd(ddev, skb, 30, digital_in_recv_sdd_res, + target); + if (rc) + kfree_skb(skb); + + return rc; } static void digital_in_recv_sens_res(struct nfc_digital_dev *ddev, void *arg, From 569c6ab4a50d961a0f312606b01fdf88b73a1df7 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 6 Oct 2021 10:34:19 +0300 Subject: [PATCH 1189/5344] pata_legacy: fix a couple uninitialized variable bugs commit 013923477cb311293df9079332cf8b806ed0e6f2 upstream. The last byte of "pad" is used without being initialized. Fixes: 55dba3120fbc ("libata: update ->data_xfer hook for ATAPI") Signed-off-by: Dan Carpenter Signed-off-by: Damien Le Moal Signed-off-by: Greg Kroah-Hartman --- drivers/ata/pata_legacy.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/ata/pata_legacy.c b/drivers/ata/pata_legacy.c index 4fd12b20df239..d91ba47f2fc44 100644 --- a/drivers/ata/pata_legacy.c +++ b/drivers/ata/pata_legacy.c @@ -315,7 +315,8 @@ static unsigned int pdc_data_xfer_vlb(struct ata_queued_cmd *qc, iowrite32_rep(ap->ioaddr.data_addr, buf, buflen >> 2); if (unlikely(slop)) { - __le32 pad; + __le32 pad = 0; + if (rw == READ) { pad = cpu_to_le32(ioread32(ap->ioaddr.data_addr)); memcpy(buf + buflen - slop, &pad, slop); @@ -705,7 +706,8 @@ static unsigned int vlb32_data_xfer(struct ata_queued_cmd *qc, ioread32_rep(ap->ioaddr.data_addr, buf, buflen >> 2); if (unlikely(slop)) { - __le32 pad; + __le32 pad = 0; + if (rw == WRITE) { memcpy(&pad, buf + buflen - slop, slop); iowrite32(le32_to_cpu(pad), ap->ioaddr.data_addr); From 8d990a8acff5aa6c3b5a2e8caa2c3a02dc892f99 Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Wed, 13 Oct 2021 14:16:31 +0800 Subject: [PATCH 1190/5344] ata: ahci_platform: fix null-ptr-deref in ahci_platform_enable_regulators() commit 776c75010803849c1cc4f11031a2b3960ab05202 upstream. I got a null-ptr-deref report: KASAN: null-ptr-deref in range [0x0000000000000090-0x0000000000000097] ... RIP: 0010:regulator_enable+0x84/0x260 ... Call Trace: ahci_platform_enable_regulators+0xae/0x320 ahci_platform_enable_resources+0x1a/0x120 ahci_probe+0x4f/0x1b9 platform_probe+0x10b/0x280 ... entry_SYSCALL_64_after_hwframe+0x44/0xae If devm_regulator_get() in ahci_platform_get_resources() fails, hpriv->phy_regulator will point to NULL, when enabling or disabling it, null-ptr-deref will occur. ahci_probe() ahci_platform_get_resources() devm_regulator_get(, "phy") // failed, let phy_regulator = NULL ahci_platform_enable_resources() ahci_platform_enable_regulators() regulator_enable(hpriv->phy_regulator) // null-ptr-deref commit 962399bb7fbf ("ata: libahci_platform: Fix regulator_get_optional() misuse") replaces devm_regulator_get_optional() with devm_regulator_get(), but PHY regulator omits to delete "hpriv->phy_regulator = NULL;" like AHCI. Delete it like AHCI regulator to fix this bug. Fixes: commit 962399bb7fbf ("ata: libahci_platform: Fix regulator_get_optional() misuse") Reported-by: Hulk Robot Signed-off-by: Wang Hai Reviewed-by: Hans de Goede Signed-off-by: Damien Le Moal Signed-off-by: Greg Kroah-Hartman --- drivers/ata/libahci_platform.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/ata/libahci_platform.c b/drivers/ata/libahci_platform.c index 416cfbf2f1c29..8a963d2a951db 100644 --- a/drivers/ata/libahci_platform.c +++ b/drivers/ata/libahci_platform.c @@ -440,10 +440,7 @@ struct ahci_host_priv *ahci_platform_get_resources(struct platform_device *pdev, hpriv->phy_regulator = devm_regulator_get(dev, "phy"); if (IS_ERR(hpriv->phy_regulator)) { rc = PTR_ERR(hpriv->phy_regulator); - if (rc == -EPROBE_DEFER) - goto err_out; - rc = 0; - hpriv->phy_regulator = NULL; + goto err_out; } if (flags & AHCI_PLATFORM_GET_RESETS) { From acf9b4ec89f2e46829fb2e3d6c8693a4a3f0e129 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 12 Oct 2021 20:49:55 +0300 Subject: [PATCH 1191/5344] mlxsw: thermal: Fix out-of-bounds memory accesses commit 332fdf951df8b870e3da86b122ae304e2aabe88c upstream. Currently, mlxsw allows cooling states to be set above the maximum cooling state supported by the driver: # cat /sys/class/thermal/thermal_zone2/cdev0/type mlxsw_fan # cat /sys/class/thermal/thermal_zone2/cdev0/max_state 10 # echo 18 > /sys/class/thermal/thermal_zone2/cdev0/cur_state # echo $? 0 This results in out-of-bounds memory accesses when thermal state transition statistics are enabled (CONFIG_THERMAL_STATISTICS=y), as the transition table is accessed with a too large index (state) [1]. According to the thermal maintainer, it is the responsibility of the driver to reject such operations [2]. Therefore, return an error when the state to be set exceeds the maximum cooling state supported by the driver. To avoid dead code, as suggested by the thermal maintainer [3], partially revert commit a421ce088ac8 ("mlxsw: core: Extend cooling device with cooling levels") that tried to interpret these invalid cooling states (above the maximum) in a special way. The cooling levels array is not removed in order to prevent the fans going below 20% PWM, which would cause them to get stuck at 0% PWM. [1] BUG: KASAN: slab-out-of-bounds in thermal_cooling_device_stats_update+0x271/0x290 Read of size 4 at addr ffff8881052f7bf8 by task kworker/0:0/5 CPU: 0 PID: 5 Comm: kworker/0:0 Not tainted 5.15.0-rc3-custom-45935-gce1adf704b14 #122 Hardware name: Mellanox Technologies Ltd. "MSN2410-CB2FO"/"SA000874", BIOS 4.6.5 03/08/2016 Workqueue: events_freezable_power_ thermal_zone_device_check Call Trace: dump_stack_lvl+0x8b/0xb3 print_address_description.constprop.0+0x1f/0x140 kasan_report.cold+0x7f/0x11b thermal_cooling_device_stats_update+0x271/0x290 __thermal_cdev_update+0x15e/0x4e0 thermal_cdev_update+0x9f/0xe0 step_wise_throttle+0x770/0xee0 thermal_zone_device_update+0x3f6/0xdf0 process_one_work+0xa42/0x1770 worker_thread+0x62f/0x13e0 kthread+0x3ee/0x4e0 ret_from_fork+0x1f/0x30 Allocated by task 1: kasan_save_stack+0x1b/0x40 __kasan_kmalloc+0x7c/0x90 thermal_cooling_device_setup_sysfs+0x153/0x2c0 __thermal_cooling_device_register.part.0+0x25b/0x9c0 thermal_cooling_device_register+0xb3/0x100 mlxsw_thermal_init+0x5c5/0x7e0 __mlxsw_core_bus_device_register+0xcb3/0x19c0 mlxsw_core_bus_device_register+0x56/0xb0 mlxsw_pci_probe+0x54f/0x710 local_pci_probe+0xc6/0x170 pci_device_probe+0x2b2/0x4d0 really_probe+0x293/0xd10 __driver_probe_device+0x2af/0x440 driver_probe_device+0x51/0x1e0 __driver_attach+0x21b/0x530 bus_for_each_dev+0x14c/0x1d0 bus_add_driver+0x3ac/0x650 driver_register+0x241/0x3d0 mlxsw_sp_module_init+0xa2/0x174 do_one_initcall+0xee/0x5f0 kernel_init_freeable+0x45a/0x4de kernel_init+0x1f/0x210 ret_from_fork+0x1f/0x30 The buggy address belongs to the object at ffff8881052f7800 which belongs to the cache kmalloc-1k of size 1024 The buggy address is located 1016 bytes inside of 1024-byte region [ffff8881052f7800, ffff8881052f7c00) The buggy address belongs to the page: page:0000000052355272 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x1052f0 head:0000000052355272 order:3 compound_mapcount:0 compound_pincount:0 flags: 0x200000000010200(slab|head|node=0|zone=2) raw: 0200000000010200 ffffea0005034800 0000000300000003 ffff888100041dc0 raw: 0000000000000000 0000000000100010 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8881052f7a80: 00 00 00 00 00 00 04 fc fc fc fc fc fc fc fc fc ffff8881052f7b00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff8881052f7b80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ^ ffff8881052f7c00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff8881052f7c80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [2] https://lore.kernel.org/linux-pm/9aca37cb-1629-5c67-1895-1fdc45c0244e@linaro.org/ [3] https://lore.kernel.org/linux-pm/af9857f2-578e-de3a-e62b-6baff7e69fd4@linaro.org/ CC: Daniel Lezcano Fixes: a50c1e35650b ("mlxsw: core: Implement thermal zone") Fixes: a421ce088ac8 ("mlxsw: core: Extend cooling device with cooling levels") Signed-off-by: Ido Schimmel Tested-by: Vadim Pasternak Link: https://lore.kernel.org/r/20211012174955.472928-1-idosch@idosch.org Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- .../ethernet/mellanox/mlxsw/core_thermal.c | 52 ++----------------- 1 file changed, 5 insertions(+), 47 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c index c3fc1cc4bb2b9..882ea12098b7a 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c @@ -25,16 +25,8 @@ #define MLXSW_THERMAL_ZONE_MAX_NAME 16 #define MLXSW_THERMAL_TEMP_SCORE_MAX GENMASK(31, 0) #define MLXSW_THERMAL_MAX_STATE 10 +#define MLXSW_THERMAL_MIN_STATE 2 #define MLXSW_THERMAL_MAX_DUTY 255 -/* Minimum and maximum fan allowed speed in percent: from 20% to 100%. Values - * MLXSW_THERMAL_MAX_STATE + x, where x is between 2 and 10 are used for - * setting fan speed dynamic minimum. For example, if value is set to 14 (40%) - * cooling levels vector will be set to 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10 to - * introduce PWM speed in percent: 40, 40, 40, 40, 40, 50, 60. 70, 80, 90, 100. - */ -#define MLXSW_THERMAL_SPEED_MIN (MLXSW_THERMAL_MAX_STATE + 2) -#define MLXSW_THERMAL_SPEED_MAX (MLXSW_THERMAL_MAX_STATE * 2) -#define MLXSW_THERMAL_SPEED_MIN_LEVEL 2 /* 20% */ /* External cooling devices, allowed for binding to mlxsw thermal zones. */ static char * const mlxsw_thermal_external_allowed_cdev[] = { @@ -703,49 +695,16 @@ static int mlxsw_thermal_set_cur_state(struct thermal_cooling_device *cdev, struct mlxsw_thermal *thermal = cdev->devdata; struct device *dev = thermal->bus_info->dev; char mfsc_pl[MLXSW_REG_MFSC_LEN]; - unsigned long cur_state, i; int idx; - u8 duty; int err; + if (state > MLXSW_THERMAL_MAX_STATE) + return -EINVAL; + idx = mlxsw_get_cooling_device_idx(thermal, cdev); if (idx < 0) return idx; - /* Verify if this request is for changing allowed fan dynamical - * minimum. If it is - update cooling levels accordingly and update - * state, if current state is below the newly requested minimum state. - * For example, if current state is 5, and minimal state is to be - * changed from 4 to 6, thermal->cooling_levels[0 to 5] will be changed - * all from 4 to 6. And state 5 (thermal->cooling_levels[4]) should be - * overwritten. - */ - if (state >= MLXSW_THERMAL_SPEED_MIN && - state <= MLXSW_THERMAL_SPEED_MAX) { - state -= MLXSW_THERMAL_MAX_STATE; - for (i = 0; i <= MLXSW_THERMAL_MAX_STATE; i++) - thermal->cooling_levels[i] = max(state, i); - - mlxsw_reg_mfsc_pack(mfsc_pl, idx, 0); - err = mlxsw_reg_query(thermal->core, MLXSW_REG(mfsc), mfsc_pl); - if (err) - return err; - - duty = mlxsw_reg_mfsc_pwm_duty_cycle_get(mfsc_pl); - cur_state = mlxsw_duty_to_state(duty); - - /* If current fan state is lower than requested dynamical - * minimum, increase fan speed up to dynamical minimum. - */ - if (state < cur_state) - return 0; - - state = cur_state; - } - - if (state > MLXSW_THERMAL_MAX_STATE) - return -EINVAL; - /* Normalize the state to the valid speed range. */ state = thermal->cooling_levels[state]; mlxsw_reg_mfsc_pack(mfsc_pl, idx, mlxsw_state_to_duty(state)); @@ -1040,8 +999,7 @@ int mlxsw_thermal_init(struct mlxsw_core *core, /* Initialize cooling levels per PWM state. */ for (i = 0; i < MLXSW_THERMAL_MAX_STATE; i++) - thermal->cooling_levels[i] = max(MLXSW_THERMAL_SPEED_MIN_LEVEL, - i); + thermal->cooling_levels[i] = max(MLXSW_THERMAL_MIN_STATE, i); thermal->polling_delay = bus_info->low_frequency ? MLXSW_THERMAL_SLOW_POLL_INT : From c595276919cc6ab14aaeddbfc1f96e0cc91c8a27 Mon Sep 17 00:00:00 2001 From: Vadim Pasternak Date: Mon, 27 Sep 2021 17:22:13 +0300 Subject: [PATCH 1192/5344] platform/mellanox: mlxreg-io: Fix argument base in kstrtou32() call commit 9b024201693e397441668cca0d2df7055fe572eb upstream. Change kstrtou32() argument 'base' to be zero instead of 'len'. It works by chance for setting one bit value, but it is not supposed to work in case value passed to mlxreg_io_attr_store() is greater than 1. It works for example, for: echo 1 > /sys/devices/platform/mlxplat/mlxreg-io/hwmon/.../jtag_enable But it will fail for: echo n > /sys/devices/platform/mlxplat/mlxreg-io/hwmon/.../jtag_enable, where n > 1. The flow for input buffer conversion is as below: _kstrtoull(const char *s, unsigned int base, unsigned long long *res) calls: rv = _parse_integer(s, base, &_res); For the second case, where n > 1: - _parse_integer() converts 's' to 'val'. For n=2, 'len' is set to 2 (string buffer is 0x32 0x0a), for n=3 'len' is set to 3 (string buffer 0x33 0x0a), etcetera. - 'base' is equal or greater then '2' (length of input buffer). As a result, _parse_integer() exits with result zero (rv): rv = 0; while (1) { ... if (val >= base)-> (2 >= 2) break; ... rv++; ... } And _kstrtoull() in their turn will fail: if (rv == 0) return -EINVAL; Fixes: 5ec4a8ace06c ("platform/mellanox: Introduce support for Mellanox register access driver") Signed-off-by: Vadim Pasternak Link: https://lore.kernel.org/r/20210927142214.2613929-2-vadimp@nvidia.com Signed-off-by: Hans de Goede Signed-off-by: Greg Kroah-Hartman --- drivers/platform/mellanox/mlxreg-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/mellanox/mlxreg-io.c b/drivers/platform/mellanox/mlxreg-io.c index acfaf64ffde68..1c3760c13f832 100644 --- a/drivers/platform/mellanox/mlxreg-io.c +++ b/drivers/platform/mellanox/mlxreg-io.c @@ -123,7 +123,7 @@ mlxreg_io_attr_store(struct device *dev, struct device_attribute *attr, return -EINVAL; /* Convert buffer to input value. */ - ret = kstrtou32(buf, len, &input_val); + ret = kstrtou32(buf, 0, &input_val); if (ret) return ret; From 251dc18933a6c72e31abd69da6806c57a58eb25e Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Tue, 12 Oct 2021 13:52:42 +0200 Subject: [PATCH 1193/5344] drm/panel: olimex-lcd-olinuxino: select CRC32 commit a14bc107edd0c108bda2245e50daa22f91c95d20 upstream. Fix the following build/link error by adding a dependency on the CRC32 routines: ld: drivers/gpu/drm/panel/panel-olimex-lcd-olinuxino.o: in function `lcd_olinuxino_probe': panel-olimex-lcd-olinuxino.c:(.text+0x303): undefined reference to `crc32_le' Fixes: 17fd7a9d324fd ("drm/panel: Add support for Olimex LCD-OLinuXino panel") Cc: Arnd Bergmann Signed-off-by: Vegard Nossum Signed-off-by: Sam Ravnborg Link: https://patchwork.freedesktop.org/patch/msgid/20211012115242.10325-1-vegard.nossum@oracle.com Signed-off-by: Dave Airlie Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/panel/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/panel/Kconfig b/drivers/gpu/drm/panel/Kconfig index f152bc4eeb535..e70ccadf7541d 100644 --- a/drivers/gpu/drm/panel/Kconfig +++ b/drivers/gpu/drm/panel/Kconfig @@ -141,6 +141,7 @@ config DRM_PANEL_OLIMEX_LCD_OLINUXINO depends on OF depends on I2C depends on BACKLIGHT_CLASS_DEVICE + select CRC32 help The panel is used with different sizes LCDs, from 480x272 to 1280x800, and 24 bit per pixel. From 88902b237c47e3a503c542dcfb8741ecfead360a Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 29 Sep 2021 13:18:57 +0100 Subject: [PATCH 1194/5344] drm/msm: Fix null pointer dereference on pointer edp commit 2133c4fc8e1348dcb752f267a143fe2254613b34 upstream. The initialization of pointer dev dereferences pointer edp before edp is null checked, so there is a potential null pointer deference issue. Fix this by only dereferencing edp after edp has been null checked. Addresses-Coverity: ("Dereference before null check") Fixes: ab5b0107ccf3 ("drm/msm: Initial add eDP support in msm drm driver (v5)") Signed-off-by: Colin Ian King Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20210929121857.213922-1-colin.king@canonical.com Signed-off-by: Rob Clark Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/msm/edp/edp_ctrl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/edp/edp_ctrl.c b/drivers/gpu/drm/msm/edp/edp_ctrl.c index 7f3dd3ffe2c9b..bbbd96f9eaa06 100644 --- a/drivers/gpu/drm/msm/edp/edp_ctrl.c +++ b/drivers/gpu/drm/msm/edp/edp_ctrl.c @@ -1082,7 +1082,7 @@ void msm_edp_ctrl_power(struct edp_ctrl *ctrl, bool on) int msm_edp_ctrl_init(struct msm_edp *edp) { struct edp_ctrl *ctrl = NULL; - struct device *dev = &edp->pdev->dev; + struct device *dev; int ret; if (!edp) { @@ -1090,6 +1090,7 @@ int msm_edp_ctrl_init(struct msm_edp *edp) return -EINVAL; } + dev = &edp->pdev->dev; ctrl = devm_kzalloc(dev, sizeof(*ctrl), GFP_KERNEL); if (!ctrl) return -ENOMEM; From 36419ea0d8b416b5c2a9d9ce21a0c802f16ec87b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 1 Oct 2021 15:33:08 +0300 Subject: [PATCH 1195/5344] drm/msm/dsi: Fix an error code in msm_dsi_modeset_init() commit 739b4e7756d3301dd673ca517afca46a5f635562 upstream. Return an error code if msm_dsi_manager_validate_current_config(). Don't return success. Fixes: 8b03ad30e314 ("drm/msm/dsi: Use one connector for dual DSI mode") Signed-off-by: Dan Carpenter Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20211001123308.GF2283@kili Signed-off-by: Dmitry Baryshkov Signed-off-by: Rob Clark Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/msm/dsi/dsi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/dsi/dsi.c b/drivers/gpu/drm/msm/dsi/dsi.c index 0d37ae5b310c4..a11b98e990019 100644 --- a/drivers/gpu/drm/msm/dsi/dsi.c +++ b/drivers/gpu/drm/msm/dsi/dsi.c @@ -206,8 +206,10 @@ int msm_dsi_modeset_init(struct msm_dsi *msm_dsi, struct drm_device *dev, goto fail; } - if (!msm_dsi_manager_validate_current_config(msm_dsi->id)) + if (!msm_dsi_manager_validate_current_config(msm_dsi->id)) { + ret = -EINVAL; goto fail; + } msm_dsi->encoder = encoder; From 1bad1cc1e6a94b6a48663eaf4fe6b3091e96ae18 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 1 Oct 2021 15:34:09 +0300 Subject: [PATCH 1196/5344] drm/msm/dsi: fix off by one in dsi_bus_clk_enable error handling commit c8f01ffc83923a91e8087aaa077de13354a7aa59 upstream. This disables a lock which wasn't enabled and it does not disable the first lock in the array. Fixes: 6e0eb52eba9e ("drm/msm/dsi: Parse bus clocks from a list") Signed-off-by: Dan Carpenter Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20211001123409.GG2283@kili Signed-off-by: Dmitry Baryshkov Signed-off-by: Rob Clark Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/msm/dsi/dsi_host.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/dsi/dsi_host.c b/drivers/gpu/drm/msm/dsi/dsi_host.c index 1e7b1be25bb07..5613234823f7d 100644 --- a/drivers/gpu/drm/msm/dsi/dsi_host.c +++ b/drivers/gpu/drm/msm/dsi/dsi_host.c @@ -460,7 +460,7 @@ static int dsi_bus_clk_enable(struct msm_dsi_host *msm_host) return 0; err: - for (; i > 0; i--) + while (--i >= 0) clk_disable_unprepare(msm_host->bus_clks[i]); return ret; From 503fd0562e03a7bd1c77102445c22e6ac3f1623b Mon Sep 17 00:00:00 2001 From: Jackie Liu Date: Mon, 23 Aug 2021 17:25:26 +0800 Subject: [PATCH 1197/5344] acpi/arm64: fix next_platform_timer() section mismatch error commit 596143e3aec35c93508d6b7a05ddc999ee209b61 upstream. Fix modpost Section mismatch error in next_platform_timer(). [...] WARNING: modpost: vmlinux.o(.text.unlikely+0x26e60): Section mismatch in reference from the function next_platform_timer() to the variable .init.data:acpi_gtdt_desc The function next_platform_timer() references the variable __initdata acpi_gtdt_desc. This is often because next_platform_timer lacks a __initdata annotation or the annotation of acpi_gtdt_desc is wrong. WARNING: modpost: vmlinux.o(.text.unlikely+0x26e64): Section mismatch in reference from the function next_platform_timer() to the variable .init.data:acpi_gtdt_desc The function next_platform_timer() references the variable __initdata acpi_gtdt_desc. This is often because next_platform_timer lacks a __initdata annotation or the annotation of acpi_gtdt_desc is wrong. ERROR: modpost: Section mismatches detected. Set CONFIG_SECTION_MISMATCH_WARN_ONLY=y to allow them. make[1]: *** [scripts/Makefile.modpost:59: vmlinux.symvers] Error 1 make[1]: *** Deleting file 'vmlinux.symvers' make: *** [Makefile:1176: vmlinux] Error 2 [...] Fixes: a712c3ed9b8a ("acpi/arm64: Add memory-mapped timer support in GTDT driver") Signed-off-by: Jackie Liu Acked-by: Hanjun Guo Link: https://lore.kernel.org/r/20210823092526.2407526-1-liu.yun@linux.dev Signed-off-by: Catalin Marinas Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/arm64/gtdt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/arm64/gtdt.c b/drivers/acpi/arm64/gtdt.c index 7d7497b856020..311419d1d6b05 100644 --- a/drivers/acpi/arm64/gtdt.c +++ b/drivers/acpi/arm64/gtdt.c @@ -36,7 +36,7 @@ struct acpi_gtdt_descriptor { static struct acpi_gtdt_descriptor acpi_gtdt_desc __initdata; -static inline void *next_platform_timer(void *platform_timer) +static inline __init void *next_platform_timer(void *platform_timer) { struct acpi_gtdt_header *gh = platform_timer; From ec25bf9fe6a86ad628ef005d7533fa094b4a1ba6 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 7 Oct 2021 19:49:57 +0200 Subject: [PATCH 1198/5344] mqprio: Correct stats in mqprio_dump_class_stats(). commit 14132690860e4d06aa3e1c4d7d8e9866ba7756dd upstream. Introduction of lockless subqueues broke the class statistics. Before the change stats were accumulated in `bstats' and `qstats' on the stack which was then copied to struct gnet_dump. After the change the `bstats' and `qstats' are initialized to 0 and never updated, yet still fed to gnet_dump. The code updates the global qdisc->cpu_bstats and qdisc->cpu_qstats instead, clobbering them. Most likely a copy-paste error from the code in mqprio_dump(). __gnet_stats_copy_basic() and __gnet_stats_copy_queue() accumulate the values for per-CPU case but for global stats they overwrite the value, so only stats from the last loop iteration / tc end up in sch->[bq]stats. Use the on-stack [bq]stats variables again and add the stats manually in the global case. Fixes: ce679e8df7ed2 ("net: sched: add support for TCQ_F_NOLOCK subqueues to sch_mqprio") Cc: John Fastabend Signed-off-by: Sebastian Andrzej Siewior https://lore.kernel.org/all/20211007175000.2334713-2-bigeasy@linutronix.de/ Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- net/sched/sch_mqprio.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 8766ab5b87880..5eb3b1b7ae5e7 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -529,22 +529,28 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, for (i = tc.offset; i < tc.offset + tc.count; i++) { struct netdev_queue *q = netdev_get_tx_queue(dev, i); struct Qdisc *qdisc = rtnl_dereference(q->qdisc); - struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL; - struct gnet_stats_queue __percpu *cpu_qstats = NULL; spin_lock_bh(qdisc_lock(qdisc)); + if (qdisc_is_percpu_stats(qdisc)) { - cpu_bstats = qdisc->cpu_bstats; - cpu_qstats = qdisc->cpu_qstats; + qlen = qdisc_qlen_sum(qdisc); + + __gnet_stats_copy_basic(NULL, &bstats, + qdisc->cpu_bstats, + &qdisc->bstats); + __gnet_stats_copy_queue(&qstats, + qdisc->cpu_qstats, + &qdisc->qstats, + qlen); + } else { + qlen += qdisc->q.qlen; + bstats.bytes += qdisc->bstats.bytes; + bstats.packets += qdisc->bstats.packets; + qstats.backlog += qdisc->qstats.backlog; + qstats.drops += qdisc->qstats.drops; + qstats.requeues += qdisc->qstats.requeues; + qstats.overlimits += qdisc->qstats.overlimits; } - - qlen = qdisc_qlen_sum(qdisc); - __gnet_stats_copy_basic(NULL, &sch->bstats, - cpu_bstats, &qdisc->bstats); - __gnet_stats_copy_queue(&sch->qstats, - cpu_qstats, - &qdisc->qstats, - qlen); spin_unlock_bh(qdisc_lock(qdisc)); } From d6c3ef6b55309fa75895773e56c7699d2259fd85 Mon Sep 17 00:00:00 2001 From: chongjiapeng Date: Sat, 9 Oct 2021 16:09:26 +0800 Subject: [PATCH 1199/5344] qed: Fix missing error code in qed_slowpath_start() commit a5a14ea7b4e55604acb0dc9d88fdb4cb6945bc77 upstream. The error code is missing in this code scenario, add the error code '-EINVAL' to the return value 'rc'. Eliminate the follow smatch warning: drivers/net/ethernet/qlogic/qed/qed_main.c:1298 qed_slowpath_start() warn: missing error code 'rc'. Reported-by: Abaci Robot Fixes: d51e4af5c209 ("qed: aRFS infrastructure support") Signed-off-by: chongjiapeng Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/qlogic/qed/qed_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index 1db49424aa43c..1cbcc27602780 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -1238,6 +1238,7 @@ static int qed_slowpath_start(struct qed_dev *cdev, } else { DP_NOTICE(cdev, "Failed to acquire PTT for aRFS\n"); + rc = -EINVAL; goto err; } } From 9ed70947dc682747e5b880b7e9527ac76a0f451e Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Mon, 11 Oct 2021 17:22:49 +0200 Subject: [PATCH 1200/5344] r8152: select CRC32 and CRYPTO/CRYPTO_HASH/CRYPTO_SHA256 commit 9973a43012b6ad1720dbc4d5faf5302c28635b8c upstream. Fix the following build/link errors by adding a dependency on CRYPTO, CRYPTO_HASH, CRYPTO_SHA256 and CRC32: ld: drivers/net/usb/r8152.o: in function `rtl8152_fw_verify_checksum': r8152.c:(.text+0x2b2a): undefined reference to `crypto_alloc_shash' ld: r8152.c:(.text+0x2bed): undefined reference to `crypto_shash_digest' ld: r8152.c:(.text+0x2c50): undefined reference to `crypto_destroy_tfm' ld: drivers/net/usb/r8152.o: in function `_rtl8152_set_rx_mode': r8152.c:(.text+0xdcb0): undefined reference to `crc32_le' Fixes: 9370f2d05a2a1 ("r8152: support request_firmware for RTL8153") Fixes: ac718b69301c7 ("net/usb: new driver for RTL8152") Signed-off-by: Vegard Nossum Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/usb/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/usb/Kconfig b/drivers/net/usb/Kconfig index 05bdcc5917f6b..ca234d1a0e3bf 100644 --- a/drivers/net/usb/Kconfig +++ b/drivers/net/usb/Kconfig @@ -99,6 +99,10 @@ config USB_RTL8150 config USB_RTL8152 tristate "Realtek RTL8152/RTL8153 Based USB Ethernet Adapters" select MII + select CRC32 + select CRYPTO + select CRYPTO_HASH + select CRYPTO_SHA256 help This option adds support for Realtek RTL8152 based USB 2.0 10/100 Ethernet adapters and RTL8153 based USB 3.0 10/100/1000 From 84fc37946c146e24fd96e1eba4a8cacbd74d8c01 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Fri, 8 Oct 2021 12:38:01 -0700 Subject: [PATCH 1201/5344] ionic: don't remove netdev->dev_addr when syncing uc list commit 5c976a56570f29aaf4a2f9a1bf99789c252183c9 upstream. Bridging, and possibly other upper stack gizmos, adds the lower device's netdev->dev_addr to its own uc list, and then requests it be deleted when the upper bridge device is removed. This delete request also happens with the bridging vlan_filtering is enabled and then disabled. Bonding has a similar behavior with the uc list, but since it also uses set_mac to manage netdev->dev_addr, it doesn't have the same the failure case. Because we store our netdev->dev_addr in our uc list, we need to ignore the delete request from dev_uc_sync so as to not lose the address and all hope of communicating. Note that ndo_set_mac_address is expressly changing netdev->dev_addr, so no limitation is set there. Fixes: 2a654540be10 ("ionic: Add Rx filter and rx_mode ndo support") Signed-off-by: Shannon Nelson Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/pensando/ionic/ionic_lif.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index e66002251596b..99ba3551458fc 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -912,6 +912,10 @@ static int ionic_addr_add(struct net_device *netdev, const u8 *addr) static int ionic_addr_del(struct net_device *netdev, const u8 *addr) { + /* Don't delete our own address from the uc list */ + if (ether_addr_equal(addr, netdev->dev_addr)) + return 0; + return ionic_lif_addr(netdev_priv(netdev), addr, false); } From dbf565b705c25dd6355cb1a38938d8aa9c9c161b Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 20 Oct 2021 11:40:18 +0200 Subject: [PATCH 1202/5344] Linux 5.4.155 Link: https://lore.kernel.org/r/20211018132329.453964125@linuxfoundation.org Link: https://lore.kernel.org/r/20211018143049.664480980@linuxfoundation.org Tested-by: Shuah Khan Tested-by: Florian Fainelli Tested-by: Jon Hunter Tested-by: Linux Kernel Functional Testing Tested-by: Sudip Mukherjee Tested-by: Guenter Roeck Tested-by: Hulk Robot Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 3358f56a37f06..f7e2bf924463b 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 PATCHLEVEL = 4 -SUBLEVEL = 154 +SUBLEVEL = 155 EXTRAVERSION = NAME = Kleptomaniac Octopus From 068bbf66c09dd78c46b14011d29f17fc062bcb13 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 1 Sep 2021 22:18:18 +0200 Subject: [PATCH 1203/5344] parisc: math-emu: Fix fall-through warnings commit 6f1fce595b78b775d7fb585c15c2dc3a6994f96e upstream. Fix lots of fallthrough warnings, e.g.: arch/parisc/math-emu/fpudispatch.c:323:33: warning: this statement may fall through [-Wimplicit-fallthrough=] Signed-off-by: Helge Deller Signed-off-by: Greg Kroah-Hartman --- arch/parisc/math-emu/fpudispatch.c | 56 ++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 3 deletions(-) diff --git a/arch/parisc/math-emu/fpudispatch.c b/arch/parisc/math-emu/fpudispatch.c index 7c46969ead9b1..01ed133227c25 100644 --- a/arch/parisc/math-emu/fpudispatch.c +++ b/arch/parisc/math-emu/fpudispatch.c @@ -310,12 +310,15 @@ decode_0c(u_int ir, u_int class, u_int subop, u_int fpregs[]) r1 &= ~3; fpregs[t+3] = fpregs[r1+3]; fpregs[t+2] = fpregs[r1+2]; + fallthrough; case 1: /* double */ fpregs[t+1] = fpregs[r1+1]; + fallthrough; case 0: /* single */ fpregs[t] = fpregs[r1]; return(NOEXCEPTION); } + BUG(); case 3: /* FABS */ switch (fmt) { case 2: /* illegal */ @@ -325,13 +328,16 @@ decode_0c(u_int ir, u_int class, u_int subop, u_int fpregs[]) r1 &= ~3; fpregs[t+3] = fpregs[r1+3]; fpregs[t+2] = fpregs[r1+2]; + fallthrough; case 1: /* double */ fpregs[t+1] = fpregs[r1+1]; + fallthrough; case 0: /* single */ /* copy and clear sign bit */ fpregs[t] = fpregs[r1] & 0x7fffffff; return(NOEXCEPTION); } + BUG(); case 6: /* FNEG */ switch (fmt) { case 2: /* illegal */ @@ -341,13 +347,16 @@ decode_0c(u_int ir, u_int class, u_int subop, u_int fpregs[]) r1 &= ~3; fpregs[t+3] = fpregs[r1+3]; fpregs[t+2] = fpregs[r1+2]; + fallthrough; case 1: /* double */ fpregs[t+1] = fpregs[r1+1]; + fallthrough; case 0: /* single */ /* copy and invert sign bit */ fpregs[t] = fpregs[r1] ^ 0x80000000; return(NOEXCEPTION); } + BUG(); case 7: /* FNEGABS */ switch (fmt) { case 2: /* illegal */ @@ -357,13 +366,16 @@ decode_0c(u_int ir, u_int class, u_int subop, u_int fpregs[]) r1 &= ~3; fpregs[t+3] = fpregs[r1+3]; fpregs[t+2] = fpregs[r1+2]; + fallthrough; case 1: /* double */ fpregs[t+1] = fpregs[r1+1]; + fallthrough; case 0: /* single */ /* copy and set sign bit */ fpregs[t] = fpregs[r1] | 0x80000000; return(NOEXCEPTION); } + BUG(); case 4: /* FSQRT */ switch (fmt) { case 0: @@ -376,6 +388,7 @@ decode_0c(u_int ir, u_int class, u_int subop, u_int fpregs[]) case 3: /* quad not implemented */ return(MAJOR_0C_EXCP); } + BUG(); case 5: /* FRND */ switch (fmt) { case 0: @@ -389,7 +402,7 @@ decode_0c(u_int ir, u_int class, u_int subop, u_int fpregs[]) return(MAJOR_0C_EXCP); } } /* end of switch (subop) */ - + BUG(); case 1: /* class 1 */ df = extru(ir,fpdfpos,2); /* get dest format */ if ((df & 2) || (fmt & 2)) { @@ -419,6 +432,7 @@ decode_0c(u_int ir, u_int class, u_int subop, u_int fpregs[]) case 3: /* dbl/dbl */ return(MAJOR_0C_EXCP); } + BUG(); case 1: /* FCNVXF */ switch(fmt) { case 0: /* sgl/sgl */ @@ -434,6 +448,7 @@ decode_0c(u_int ir, u_int class, u_int subop, u_int fpregs[]) return(dbl_to_dbl_fcnvxf(&fpregs[r1],0, &fpregs[t],status)); } + BUG(); case 2: /* FCNVFX */ switch(fmt) { case 0: /* sgl/sgl */ @@ -449,6 +464,7 @@ decode_0c(u_int ir, u_int class, u_int subop, u_int fpregs[]) return(dbl_to_dbl_fcnvfx(&fpregs[r1],0, &fpregs[t],status)); } + BUG(); case 3: /* FCNVFXT */ switch(fmt) { case 0: /* sgl/sgl */ @@ -464,6 +480,7 @@ decode_0c(u_int ir, u_int class, u_int subop, u_int fpregs[]) return(dbl_to_dbl_fcnvfxt(&fpregs[r1],0, &fpregs[t],status)); } + BUG(); case 5: /* FCNVUF (PA2.0 only) */ switch(fmt) { case 0: /* sgl/sgl */ @@ -479,6 +496,7 @@ decode_0c(u_int ir, u_int class, u_int subop, u_int fpregs[]) return(dbl_to_dbl_fcnvuf(&fpregs[r1],0, &fpregs[t],status)); } + BUG(); case 6: /* FCNVFU (PA2.0 only) */ switch(fmt) { case 0: /* sgl/sgl */ @@ -494,6 +512,7 @@ decode_0c(u_int ir, u_int class, u_int subop, u_int fpregs[]) return(dbl_to_dbl_fcnvfu(&fpregs[r1],0, &fpregs[t],status)); } + BUG(); case 7: /* FCNVFUT (PA2.0 only) */ switch(fmt) { case 0: /* sgl/sgl */ @@ -509,10 +528,11 @@ decode_0c(u_int ir, u_int class, u_int subop, u_int fpregs[]) return(dbl_to_dbl_fcnvfut(&fpregs[r1],0, &fpregs[t],status)); } + BUG(); case 4: /* undefined */ return(MAJOR_0C_EXCP); } /* end of switch subop */ - + BUG(); case 2: /* class 2 */ fpu_type_flags=fpregs[FPU_TYPE_FLAG_POS]; r2 = extru(ir, fpr2pos, 5) * sizeof(double)/sizeof(u_int); @@ -590,6 +610,7 @@ decode_0c(u_int ir, u_int class, u_int subop, u_int fpregs[]) case 3: /* quad not implemented */ return(MAJOR_0C_EXCP); } + BUG(); case 1: /* FTEST */ switch (fmt) { case 0: @@ -609,8 +630,10 @@ decode_0c(u_int ir, u_int class, u_int subop, u_int fpregs[]) case 3: return(MAJOR_0C_EXCP); } + BUG(); } /* end of switch subop */ } /* end of else for PA1.0 & PA1.1 */ + BUG(); case 3: /* class 3 */ r2 = extru(ir,fpr2pos,5) * sizeof(double)/sizeof(u_int); if (r2 == 0) @@ -633,6 +656,7 @@ decode_0c(u_int ir, u_int class, u_int subop, u_int fpregs[]) case 3: /* quad not implemented */ return(MAJOR_0C_EXCP); } + BUG(); case 1: /* FSUB */ switch (fmt) { case 0: @@ -645,6 +669,7 @@ decode_0c(u_int ir, u_int class, u_int subop, u_int fpregs[]) case 3: /* quad not implemented */ return(MAJOR_0C_EXCP); } + BUG(); case 2: /* FMPY */ switch (fmt) { case 0: @@ -657,6 +682,7 @@ decode_0c(u_int ir, u_int class, u_int subop, u_int fpregs[]) case 3: /* quad not implemented */ return(MAJOR_0C_EXCP); } + BUG(); case 3: /* FDIV */ switch (fmt) { case 0: @@ -669,6 +695,7 @@ decode_0c(u_int ir, u_int class, u_int subop, u_int fpregs[]) case 3: /* quad not implemented */ return(MAJOR_0C_EXCP); } + BUG(); case 4: /* FREM */ switch (fmt) { case 0: @@ -681,6 +708,7 @@ decode_0c(u_int ir, u_int class, u_int subop, u_int fpregs[]) case 3: /* quad not implemented */ return(MAJOR_0C_EXCP); } + BUG(); } /* end of class 3 switch */ } /* end of switch(class) */ @@ -736,10 +764,12 @@ u_int fpregs[]; return(MAJOR_0E_EXCP); case 1: /* double */ fpregs[t+1] = fpregs[r1+1]; + fallthrough; case 0: /* single */ fpregs[t] = fpregs[r1]; return(NOEXCEPTION); } + BUG(); case 3: /* FABS */ switch (fmt) { case 2: @@ -747,10 +777,12 @@ u_int fpregs[]; return(MAJOR_0E_EXCP); case 1: /* double */ fpregs[t+1] = fpregs[r1+1]; + fallthrough; case 0: /* single */ fpregs[t] = fpregs[r1] & 0x7fffffff; return(NOEXCEPTION); } + BUG(); case 6: /* FNEG */ switch (fmt) { case 2: @@ -758,10 +790,12 @@ u_int fpregs[]; return(MAJOR_0E_EXCP); case 1: /* double */ fpregs[t+1] = fpregs[r1+1]; + fallthrough; case 0: /* single */ fpregs[t] = fpregs[r1] ^ 0x80000000; return(NOEXCEPTION); } + BUG(); case 7: /* FNEGABS */ switch (fmt) { case 2: @@ -769,10 +803,12 @@ u_int fpregs[]; return(MAJOR_0E_EXCP); case 1: /* double */ fpregs[t+1] = fpregs[r1+1]; + fallthrough; case 0: /* single */ fpregs[t] = fpregs[r1] | 0x80000000; return(NOEXCEPTION); } + BUG(); case 4: /* FSQRT */ switch (fmt) { case 0: @@ -785,6 +821,7 @@ u_int fpregs[]; case 3: return(MAJOR_0E_EXCP); } + BUG(); case 5: /* FRMD */ switch (fmt) { case 0: @@ -798,7 +835,7 @@ u_int fpregs[]; return(MAJOR_0E_EXCP); } } /* end of switch (subop */ - + BUG(); case 1: /* class 1 */ df = extru(ir,fpdfpos,2); /* get dest format */ /* @@ -826,6 +863,7 @@ u_int fpregs[]; case 3: /* dbl/dbl */ return(MAJOR_0E_EXCP); } + BUG(); case 1: /* FCNVXF */ switch(fmt) { case 0: /* sgl/sgl */ @@ -841,6 +879,7 @@ u_int fpregs[]; return(dbl_to_dbl_fcnvxf(&fpregs[r1],0, &fpregs[t],status)); } + BUG(); case 2: /* FCNVFX */ switch(fmt) { case 0: /* sgl/sgl */ @@ -856,6 +895,7 @@ u_int fpregs[]; return(dbl_to_dbl_fcnvfx(&fpregs[r1],0, &fpregs[t],status)); } + BUG(); case 3: /* FCNVFXT */ switch(fmt) { case 0: /* sgl/sgl */ @@ -871,6 +911,7 @@ u_int fpregs[]; return(dbl_to_dbl_fcnvfxt(&fpregs[r1],0, &fpregs[t],status)); } + BUG(); case 5: /* FCNVUF (PA2.0 only) */ switch(fmt) { case 0: /* sgl/sgl */ @@ -886,6 +927,7 @@ u_int fpregs[]; return(dbl_to_dbl_fcnvuf(&fpregs[r1],0, &fpregs[t],status)); } + BUG(); case 6: /* FCNVFU (PA2.0 only) */ switch(fmt) { case 0: /* sgl/sgl */ @@ -901,6 +943,7 @@ u_int fpregs[]; return(dbl_to_dbl_fcnvfu(&fpregs[r1],0, &fpregs[t],status)); } + BUG(); case 7: /* FCNVFUT (PA2.0 only) */ switch(fmt) { case 0: /* sgl/sgl */ @@ -916,9 +959,11 @@ u_int fpregs[]; return(dbl_to_dbl_fcnvfut(&fpregs[r1],0, &fpregs[t],status)); } + BUG(); case 4: /* undefined */ return(MAJOR_0C_EXCP); } /* end of switch subop */ + BUG(); case 2: /* class 2 */ /* * Be careful out there. @@ -994,6 +1039,7 @@ u_int fpregs[]; } } /* end of switch subop */ } /* end of else for PA1.0 & PA1.1 */ + BUG(); case 3: /* class 3 */ /* * Be careful out there. @@ -1026,6 +1072,7 @@ u_int fpregs[]; return(dbl_fadd(&fpregs[r1],&fpregs[r2], &fpregs[t],status)); } + BUG(); case 1: /* FSUB */ switch (fmt) { case 0: @@ -1035,6 +1082,7 @@ u_int fpregs[]; return(dbl_fsub(&fpregs[r1],&fpregs[r2], &fpregs[t],status)); } + BUG(); case 2: /* FMPY or XMPYU */ /* * check for integer multiply (x bit set) @@ -1071,6 +1119,7 @@ u_int fpregs[]; &fpregs[r2],&fpregs[t],status)); } } + BUG(); case 3: /* FDIV */ switch (fmt) { case 0: @@ -1080,6 +1129,7 @@ u_int fpregs[]; return(dbl_fdiv(&fpregs[r1],&fpregs[r2], &fpregs[t],status)); } + BUG(); case 4: /* FREM */ switch (fmt) { case 0: From cb0e11e92d0cf515e8552ee623ba4c4958415f01 Mon Sep 17 00:00:00 2001 From: Russell King Date: Wed, 26 Feb 2020 17:14:21 +0000 Subject: [PATCH 1204/5344] net: switchdev: do not propagate bridge updates across bridges MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 07c6f9805f12f1bb538ef165a092b300350384aa upstream. When configuring a tree of independent bridges, propagating changes from the upper bridge across a bridge master to the lower bridge ports brings surprises. For example, a lower bridge may have vlan filtering enabled. It may have a vlan interface attached to the bridge master, which may then be incorporated into another bridge. As soon as the lower bridge vlan interface is attached to the upper bridge, the lower bridge has vlan filtering disabled. This occurs because switchdev recursively applies its changes to all lower devices no matter what. Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Signed-off-by: Russell King Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller Cc: Fabian Bläse Signed-off-by: Greg Kroah-Hartman --- net/switchdev/switchdev.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index ea9ddea35a886..96773555af5ca 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -476,6 +476,9 @@ static int __switchdev_handle_port_obj_add(struct net_device *dev, * necessary to go through this helper. */ netdev_for_each_lower_dev(dev, lower_dev, iter) { + if (netif_is_bridge_master(lower_dev)) + continue; + err = __switchdev_handle_port_obj_add(lower_dev, port_obj_info, check_cb, add_cb); if (err && err != -EOPNOTSUPP) @@ -528,6 +531,9 @@ static int __switchdev_handle_port_obj_del(struct net_device *dev, * necessary to go through this helper. */ netdev_for_each_lower_dev(dev, lower_dev, iter) { + if (netif_is_bridge_master(lower_dev)) + continue; + err = __switchdev_handle_port_obj_del(lower_dev, port_obj_info, check_cb, del_cb); if (err && err != -EOPNOTSUPP) @@ -579,6 +585,9 @@ static int __switchdev_handle_port_attr_set(struct net_device *dev, * necessary to go through this helper. */ netdev_for_each_lower_dev(dev, lower_dev, iter) { + if (netif_is_bridge_master(lower_dev)) + continue; + err = __switchdev_handle_port_attr_set(lower_dev, port_attr_info, check_cb, set_cb); if (err && err != -EOPNOTSUPP) From 0e1c199824c8358be95fac3e69a96e195b29de13 Mon Sep 17 00:00:00 2001 From: Sumit Garg Date: Tue, 12 Oct 2021 13:01:16 +0530 Subject: [PATCH 1205/5344] tee: optee: Fix missing devices unregister during optee_remove commit 7f565d0ead264329749c0da488de9c8dfa2f18ce upstream. When OP-TEE driver is built as a module, OP-TEE client devices registered on TEE bus during probe should be unregistered during optee_remove. So implement optee_unregister_devices() accordingly. Fixes: c3fa24af9244 ("tee: optee: add TEE bus device enumeration support") Reported-by: Sudeep Holla Signed-off-by: Sumit Garg Signed-off-by: Jens Wiklander [SG: backport to 5.4, dev name s/optee-ta/optee-clnt/] Signed-off-by: Sumit Garg Signed-off-by: Greg Kroah-Hartman --- drivers/tee/optee/core.c | 3 +++ drivers/tee/optee/device.c | 22 ++++++++++++++++++++++ drivers/tee/optee/optee_private.h | 1 + 3 files changed, 26 insertions(+) diff --git a/drivers/tee/optee/core.c b/drivers/tee/optee/core.c index 4bb4c8f28cbd7..5eaef45799e61 100644 --- a/drivers/tee/optee/core.c +++ b/drivers/tee/optee/core.c @@ -582,6 +582,9 @@ static struct optee *optee_probe(struct device_node *np) if (sec_caps & OPTEE_SMC_SEC_CAP_DYNAMIC_SHM) pool = optee_config_dyn_shm(); + /* Unregister OP-TEE specific client devices on TEE bus */ + optee_unregister_devices(); + /* * If dynamic shared memory is not available or failed - try static one */ diff --git a/drivers/tee/optee/device.c b/drivers/tee/optee/device.c index e3a148521ec1d..acff7dd677d67 100644 --- a/drivers/tee/optee/device.c +++ b/drivers/tee/optee/device.c @@ -65,6 +65,13 @@ static int get_devices(struct tee_context *ctx, u32 session, return 0; } +static void optee_release_device(struct device *dev) +{ + struct tee_client_device *optee_device = to_tee_client_device(dev); + + kfree(optee_device); +} + static int optee_register_device(const uuid_t *device_uuid, u32 device_id) { struct tee_client_device *optee_device = NULL; @@ -75,6 +82,7 @@ static int optee_register_device(const uuid_t *device_uuid, u32 device_id) return -ENOMEM; optee_device->dev.bus = &tee_bus_type; + optee_device->dev.release = optee_release_device; dev_set_name(&optee_device->dev, "optee-clnt%u", device_id); uuid_copy(&optee_device->id.uuid, device_uuid); @@ -158,3 +166,17 @@ int optee_enumerate_devices(void) return rc; } + +static int __optee_unregister_device(struct device *dev, void *data) +{ + if (!strncmp(dev_name(dev), "optee-clnt", strlen("optee-clnt"))) + device_unregister(dev); + + return 0; +} + +void optee_unregister_devices(void) +{ + bus_for_each_dev(&tee_bus_type, NULL, NULL, + __optee_unregister_device); +} diff --git a/drivers/tee/optee/optee_private.h b/drivers/tee/optee/optee_private.h index 3eeaad2a28686..54c3fa01d0024 100644 --- a/drivers/tee/optee/optee_private.h +++ b/drivers/tee/optee/optee_private.h @@ -175,6 +175,7 @@ void optee_fill_pages_list(u64 *dst, struct page **pages, int num_pages, size_t page_offset); int optee_enumerate_devices(void); +void optee_unregister_devices(void); /* * Small helpers From d884e1e55c1e72c0114af0bf32a5546927181c8d Mon Sep 17 00:00:00 2001 From: Eugen Hristev Date: Thu, 2 Sep 2021 15:13:58 +0300 Subject: [PATCH 1206/5344] ARM: dts: at91: sama5d2_som1_ek: disable ISC node by default [ Upstream commit 4348cc10da6377a86940beb20ad357933b8f91bb ] Without a sensor node, the ISC will simply fail to probe, as the corresponding port node is missing. It is then logical to disable the node in the devicetree. If we add a port with a connection to a sensor endpoint, ISC can be enabled. Signed-off-by: Eugen Hristev Signed-off-by: Nicolas Ferre Link: https://lore.kernel.org/r/20210902121358.503589-1-eugen.hristev@microchip.com Signed-off-by: Sasha Levin --- arch/arm/boot/dts/at91-sama5d27_som1_ek.dts | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts b/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts index 89f0c9979b89c..4f63158d6b9bd 100644 --- a/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts +++ b/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts @@ -69,7 +69,6 @@ isc: isc@f0008000 { pinctrl-names = "default"; pinctrl-0 = <&pinctrl_isc_base &pinctrl_isc_data_8bit &pinctrl_isc_data_9_10 &pinctrl_isc_data_11_12>; - status = "okay"; }; qspi1: spi@f0024000 { From cf7629cc51f01811b6ed8723dc8e4d4c7080be94 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Tue, 5 Oct 2021 11:36:01 -0700 Subject: [PATCH 1207/5344] xtensa: xtfpga: use CONFIG_USE_OF instead of CONFIG_OF [ Upstream commit f3d7c2cdf6dc0d5402ec29c3673893b3542c5ad1 ] Use platform data to initialize xtfpga device drivers when CONFIG_USE_OF is not selected. This fixes xtfpga networking when CONFIG_USE_OF is not selected but CONFIG_OF is. Signed-off-by: Max Filippov Signed-off-by: Sasha Levin --- arch/xtensa/platforms/xtfpga/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/xtensa/platforms/xtfpga/setup.c b/arch/xtensa/platforms/xtfpga/setup.c index 829115bb381f8..61c5e2c454392 100644 --- a/arch/xtensa/platforms/xtfpga/setup.c +++ b/arch/xtensa/platforms/xtfpga/setup.c @@ -81,7 +81,7 @@ void __init platform_calibrate_ccount(void) #endif -#ifdef CONFIG_OF +#ifdef CONFIG_USE_OF static void __init xtfpga_clk_setup(struct device_node *np) { @@ -299,4 +299,4 @@ static int __init xtavnet_init(void) */ arch_initcall(xtavnet_init); -#endif /* CONFIG_OF */ +#endif /* CONFIG_USE_OF */ From 05cc6f3d03039f57f9ce61a52dd0cefe0bcd6a9e Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sun, 1 Aug 2021 10:36:59 -0700 Subject: [PATCH 1208/5344] xtensa: xtfpga: Try software restart before simulating CPU reset [ Upstream commit 012e974501a270d8dfd4ee2039e1fdf7579c907e ] Rebooting xtensa images loaded with the '-kernel' option in qemu does not work. When executing a reboot command, the qemu session either hangs or experiences an endless sequence of error messages. Kernel panic - not syncing: Unrecoverable error in exception handler Reset code jumps to the CPU restart address, but Linux can not recover from there because code and data in the kernel init sections have been discarded and overwritten at this point. XTFPGA platforms have a means to reset the CPU by writing 0xdead into a specific FPGA IO address. When used in QEMU the kernel image loaded with the '-kernel' option gets restored to its original state allowing the machine to boot successfully. Use that mechanism to attempt a platform reset. If it does not work, fall back to the existing mechanism. Signed-off-by: Guenter Roeck Signed-off-by: Max Filippov Signed-off-by: Sasha Levin --- arch/xtensa/platforms/xtfpga/setup.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/xtensa/platforms/xtfpga/setup.c b/arch/xtensa/platforms/xtfpga/setup.c index 61c5e2c454392..4edccb4d4a5ff 100644 --- a/arch/xtensa/platforms/xtfpga/setup.c +++ b/arch/xtensa/platforms/xtfpga/setup.c @@ -50,8 +50,12 @@ void platform_power_off(void) void platform_restart(void) { - /* Flush and reset the mmu, simulate a processor reset, and - * jump to the reset vector. */ + /* Try software reset first. */ + WRITE_ONCE(*(u32 *)XTFPGA_SWRST_VADDR, 0xdead); + + /* If software reset did not work, flush and reset the mmu, + * simulate a processor reset, and jump to the reset vector. + */ cpu_reset(); /* control never gets here */ } From 72990940a53dac0f93ed4bf27e6601cf063fba56 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Wed, 6 Oct 2021 13:20:44 -0400 Subject: [PATCH 1209/5344] NFSD: Keep existing listeners on portlist error [ Upstream commit c20106944eb679fa3ab7e686fe5f6ba30fbc51e5 ] If nfsd has existing listening sockets without any processes, then an error returned from svc_create_xprt() for an additional transport will remove those existing listeners. We're seeing this in practice when userspace attempts to create rpcrdma transports without having the rpcrdma modules present before creating nfsd kernel processes. Fix this by checking for existing sockets before calling nfsd_destroy(). Signed-off-by: Benjamin Coddington Signed-off-by: Chuck Lever Signed-off-by: Sasha Levin --- fs/nfsd/nfsctl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index afd42b4870e34..f1798cbd5c14c 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -792,7 +792,10 @@ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cr svc_xprt_put(xprt); } out_err: - nfsd_destroy(net); + if (!list_empty(&nn->nfsd_serv->sv_permsocks)) + nn->nfsd_serv->sv_nrthreads--; + else + nfsd_destroy(net); return err; } From 538a2d7bbb5ba99992561c88154c4390ba379dc0 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Wed, 6 Oct 2021 22:19:43 +0200 Subject: [PATCH 1210/5344] dma-debug: fix sg checks in debug_dma_map_sg() [ Upstream commit 293d92cbbd2418ca2ba43fed07f1b92e884d1c77 ] The following warning occurred sporadically on s390: DMA-API: nvme 0006:00:00.0: device driver maps memory from kernel text or rodata [addr=0000000048cc5e2f] [len=131072] WARNING: CPU: 4 PID: 825 at kernel/dma/debug.c:1083 check_for_illegal_area+0xa8/0x138 It is a false-positive warning, due to broken logic in debug_dma_map_sg(). check_for_illegal_area() checks for overlay of sg elements with kernel text or rodata. It is called with sg_dma_len(s) instead of s->length as parameter. After the call to ->map_sg(), sg_dma_len() will contain the length of possibly combined sg elements in the DMA address space, and not the individual sg element length, which would be s->length. The check will then use the physical start address of an sg element, and add the DMA length for the overlap check, which could result in the false warning, because the DMA length can be larger than the actual single sg element length. In addition, the call to check_for_illegal_area() happens in the iteration over mapped_ents, which will not include all individual sg elements if any of them were combined in ->map_sg(). Fix this by using s->length instead of sg_dma_len(s). Also put the call to check_for_illegal_area() in a separate loop, iterating over all the individual sg elements ("nents" instead of "mapped_ents"). While at it, as suggested by Robin Murphy, also move check_for_stack() inside the new loop, as it is similarly concerned with validating the individual sg elements. Link: https://lore.kernel.org/lkml/20210705185252.4074653-1-gerald.schaefer@linux.ibm.com Fixes: 884d05970bfb ("dma-debug: use sg_dma_len accessor") Signed-off-by: Gerald Schaefer Signed-off-by: Christoph Hellwig Signed-off-by: Sasha Levin --- kernel/dma/debug.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 01e893cf9b9f7..b28665f4d8c7a 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -1354,6 +1354,12 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, if (unlikely(dma_debug_disabled())) return; + for_each_sg(sg, s, nents, i) { + check_for_stack(dev, sg_page(s), s->offset); + if (!PageHighMem(sg_page(s))) + check_for_illegal_area(dev, sg_virt(s), s->length); + } + for_each_sg(sg, s, mapped_ents, i) { entry = dma_entry_alloc(); if (!entry) @@ -1369,12 +1375,6 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, entry->sg_call_ents = nents; entry->sg_mapped_ents = mapped_ents; - check_for_stack(dev, sg_page(s), s->offset); - - if (!PageHighMem(sg_page(s))) { - check_for_illegal_area(dev, sg_virt(s), sg_dma_len(s)); - } - check_sg_segment(dev, s); add_dma_entry(entry); From 4a4a4b53d79a38f3c228f04b934696c6803e03ad Mon Sep 17 00:00:00 2001 From: Shengjiu Wang Date: Wed, 13 Oct 2021 13:17:04 +0800 Subject: [PATCH 1211/5344] ASoC: wm8960: Fix clock configuration on slave mode [ Upstream commit 6b9b546dc00797c74bef491668ce5431ff54e1e2 ] There is a noise issue for 8kHz sample rate on slave mode. Compared with master mode, the difference is the DACDIV setting, after correcting the DACDIV, the noise is gone. There is no noise issue for 48kHz sample rate, because the default value of DACDIV is correct for 48kHz. So wm8960_configure_clocking() should be functional for ADC and DAC function even if it is slave mode. In order to be compatible for old use case, just add condition for checking that sysclk is zero with slave mode. Fixes: 0e50b51aa22f ("ASoC: wm8960: Let wm8960 driver configure its bit clock and frame clock") Signed-off-by: Shengjiu Wang Acked-by: Charles Keepax Link: https://lore.kernel.org/r/1634102224-3922-1-git-send-email-shengjiu.wang@nxp.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/codecs/wm8960.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/sound/soc/codecs/wm8960.c b/sound/soc/codecs/wm8960.c index 708fc4ed54eda..512f8899dcbbd 100644 --- a/sound/soc/codecs/wm8960.c +++ b/sound/soc/codecs/wm8960.c @@ -752,9 +752,16 @@ static int wm8960_configure_clocking(struct snd_soc_component *component) int i, j, k; int ret; - if (!(iface1 & (1<<6))) { - dev_dbg(component->dev, - "Codec is slave mode, no need to configure clock\n"); + /* + * For Slave mode clocking should still be configured, + * so this if statement should be removed, but some platform + * may not work if the sysclk is not configured, to avoid such + * compatible issue, just add '!wm8960->sysclk' condition in + * this if statement. + */ + if (!(iface1 & (1 << 6)) && !wm8960->sysclk) { + dev_warn(component->dev, + "slave mode, but proceeding with no clock configuration\n"); return 0; } From 8248b128fc60d686842427c25377d36337f543e3 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 12 Oct 2021 16:54:37 +0200 Subject: [PATCH 1212/5344] netfilter: ipvs: make global sysctl readonly in non-init netns [ Upstream commit 174c376278949c44aad89c514a6b5db6cee8db59 ] Because the data pointer of net/ipv4/vs/debug_level is not updated per netns, it must be marked as read-only in non-init netns. Fixes: c6d2d445d8de ("IPVS: netns, final patch enabling network name space.") Signed-off-by: Antoine Tenart Acked-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/ipvs/ip_vs_ctl.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index ee25de5afd0ca..7a607ab5aebda 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -4065,6 +4065,11 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode; tbl[idx++].data = &ipvs->sysctl_schedule_icmp; tbl[idx++].data = &ipvs->sysctl_ignore_tunneled; +#ifdef CONFIG_IP_VS_DEBUG + /* Global sysctls must be ro in non-init netns */ + if (!net_eq(net, &init_net)) + tbl[idx++].mode = 0444; +#endif ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl); if (ipvs->sysctl_hdr == NULL) { From 8a5597f291f8128a7e8b17bca9403c6bf69c2d40 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 15 Oct 2021 15:07:54 +0200 Subject: [PATCH 1213/5344] lan78xx: select CRC32 [ Upstream commit 46393d61a328d7c4e3264252dae891921126c674 ] Fix the following build/link error by adding a dependency on the CRC32 routines: ld: drivers/net/usb/lan78xx.o: in function `lan78xx_set_multicast': lan78xx.c:(.text+0x48cf): undefined reference to `crc32_le' The actual use of crc32_le() comes indirectly through ether_crc(). Fixes: 55d7de9de6c30 ("Microchip's LAN7800 family USB 2/3 to 10/100/1000 Ethernet device driver") Signed-off-by: Vegard Nossum Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/usb/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/Kconfig b/drivers/net/usb/Kconfig index ca234d1a0e3bf..d7005cc76ce91 100644 --- a/drivers/net/usb/Kconfig +++ b/drivers/net/usb/Kconfig @@ -117,6 +117,7 @@ config USB_LAN78XX select PHYLIB select MICROCHIP_PHY select FIXED_PHY + select CRC32 help This option adds support for Microchip LAN78XX based USB 2 & USB 3 10/100/1000 Ethernet adapters. From 1ac1aa69120c01f8708743d0da08b4415d6c03dc Mon Sep 17 00:00:00 2001 From: Aleksander Jan Bajkowski Date: Sat, 16 Oct 2021 00:10:20 +0200 Subject: [PATCH 1214/5344] net: dsa: lantiq_gswip: fix register definition [ Upstream commit 66d262804a2276721eac86cf18fcd61046149193 ] I compared the register definitions with the D-Link DWR-966 GPL sources and found that the PUAFD field definition was incorrect. This definition is unused and causes no issues. Fixes: 14fceff4771e ("net: dsa: Add Lantiq / Intel DSA driver for vrx200") Signed-off-by: Aleksander Jan Bajkowski Acked-by: Hauke Mehrtens Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/dsa/lantiq_gswip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/lantiq_gswip.c b/drivers/net/dsa/lantiq_gswip.c index 3225de0f655f2..60e36f46f8abe 100644 --- a/drivers/net/dsa/lantiq_gswip.c +++ b/drivers/net/dsa/lantiq_gswip.c @@ -229,7 +229,7 @@ #define GSWIP_SDMA_PCTRLp(p) (0xBC0 + ((p) * 0x6)) #define GSWIP_SDMA_PCTRL_EN BIT(0) /* SDMA Port Enable */ #define GSWIP_SDMA_PCTRL_FCEN BIT(1) /* Flow Control Enable */ -#define GSWIP_SDMA_PCTRL_PAUFWD BIT(1) /* Pause Frame Forwarding */ +#define GSWIP_SDMA_PCTRL_PAUFWD BIT(3) /* Pause Frame Forwarding */ #define GSWIP_TABLE_ACTIVE_VLAN 0x01 #define GSWIP_TABLE_VLAN_MAPPING 0x02 From f6ca0b59991ad08a9cf27d792cda8a992e7ea33b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 4 Oct 2021 00:56:06 -0700 Subject: [PATCH 1215/5344] NIOS2: irqflags: rename a redefined register name [ Upstream commit 4cce60f15c04d69eff6ffc539ab09137dbe15070 ] Both arch/nios2/ and drivers/mmc/host/tmio_mmc.c define a macro with the name "CTL_STATUS". Change the one in arch/nios2/ to be "CTL_FSTATUS" (flags status) to eliminate the build warning. In file included from ../drivers/mmc/host/tmio_mmc.c:22: drivers/mmc/host/tmio_mmc.h:31: warning: "CTL_STATUS" redefined 31 | #define CTL_STATUS 0x1c arch/nios2/include/asm/registers.h:14: note: this is the location of the previous definition 14 | #define CTL_STATUS 0 Fixes: b31ebd8055ea ("nios2: Nios2 registers") Signed-off-by: Randy Dunlap Cc: Dinh Nguyen Signed-off-by: Dinh Nguyen Signed-off-by: Sasha Levin --- arch/nios2/include/asm/irqflags.h | 4 ++-- arch/nios2/include/asm/registers.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/nios2/include/asm/irqflags.h b/arch/nios2/include/asm/irqflags.h index b3ec3e510706d..25acf27862f91 100644 --- a/arch/nios2/include/asm/irqflags.h +++ b/arch/nios2/include/asm/irqflags.h @@ -9,7 +9,7 @@ static inline unsigned long arch_local_save_flags(void) { - return RDCTL(CTL_STATUS); + return RDCTL(CTL_FSTATUS); } /* @@ -18,7 +18,7 @@ static inline unsigned long arch_local_save_flags(void) */ static inline void arch_local_irq_restore(unsigned long flags) { - WRCTL(CTL_STATUS, flags); + WRCTL(CTL_FSTATUS, flags); } static inline void arch_local_irq_disable(void) diff --git a/arch/nios2/include/asm/registers.h b/arch/nios2/include/asm/registers.h index 183c720e454d9..95b67dd16f818 100644 --- a/arch/nios2/include/asm/registers.h +++ b/arch/nios2/include/asm/registers.h @@ -11,7 +11,7 @@ #endif /* control register numbers */ -#define CTL_STATUS 0 +#define CTL_FSTATUS 0 #define CTL_ESTATUS 1 #define CTL_BSTATUS 2 #define CTL_IENABLE 3 From d28066bc085fd64169c5929298b4ac0fc86faf95 Mon Sep 17 00:00:00 2001 From: Guangbin Huang Date: Tue, 19 Oct 2021 22:16:29 +0800 Subject: [PATCH 1216/5344] net: hns3: reset DWRR of unused tc to zero [ Upstream commit b63fcaab959807282e9822e659034edf95fc8bd1 ] Currently, DWRR of tc will be initialized to a fixed value when this tc is enabled, but it is not been reset to 0 when this tc is disabled. It cause a problem that the DWRR of unused tc is not 0 after using tc tool to add and delete multi-tc parameters. For examples, after enabling 4 TCs and restoring to 1 TC by follow tc commands: $ tc qdisc add dev eth0 root mqprio num_tc 4 map 0 1 2 3 0 1 2 3 queues \ 8@0 8@8 8@16 8@24 hw 1 mode channel $ tc qdisc del dev eth0 root Now there is just one TC is enabled for eth0, but the tc info querying by debugfs is shown as follow: $ cat /mnt/hns3/0000:7d:00.0/tm/tc_sch_info enabled tc number: 1 weight_offset: 14 TC MODE WEIGHT 0 dwrr 100 1 dwrr 100 2 dwrr 100 3 dwrr 100 4 dwrr 0 5 dwrr 0 6 dwrr 0 7 dwrr 0 This patch fixes it by resetting DWRR of tc to 0 when tc is disabled. Fixes: 848440544b41 ("net: hns3: Add support of TX Scheduler & Shaper to HNS3 driver") Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c index 62399cc1c5a63..d98f0e2ec7aa3 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c @@ -633,6 +633,8 @@ static void hclge_tm_pg_info_init(struct hclge_dev *hdev) hdev->tm_info.pg_info[i].tc_bit_map = hdev->hw_tc_map; for (k = 0; k < hdev->tm_info.num_tc; k++) hdev->tm_info.pg_info[i].tc_dwrr[k] = BW_PERCENT; + for (; k < HNAE3_MAX_TC; k++) + hdev->tm_info.pg_info[i].tc_dwrr[k] = 0; } } From 81dc32bace7fd0ebe7232b7d1c48cd3bd077a84d Mon Sep 17 00:00:00 2001 From: Guangbin Huang Date: Tue, 19 Oct 2021 22:16:30 +0800 Subject: [PATCH 1217/5344] net: hns3: add limit ets dwrr bandwidth cannot be 0 [ Upstream commit 731797fdffa3d083db536e2fdd07ceb050bb40b1 ] If ets dwrr bandwidth of tc is set to 0, the hardware will switch to SP mode. In this case, this tc may occupy all the tx bandwidth if it has huge traffic, so it violates the purpose of the user setting. To fix this problem, limit the ets dwrr bandwidth must greater than 0. Fixes: cacde272dd00 ("net: hns3: Add hclge_dcb module for the support of DCB feature") Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c index d16488bab86f5..9076605403a74 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c @@ -132,6 +132,15 @@ static int hclge_ets_validate(struct hclge_dev *hdev, struct ieee_ets *ets, *changed = true; break; case IEEE_8021QAZ_TSA_ETS: + /* The hardware will switch to sp mode if bandwidth is + * 0, so limit ets bandwidth must be greater than 0. + */ + if (!ets->tc_tx_bw[i]) { + dev_err(&hdev->pdev->dev, + "tc%u ets bw cannot be 0\n", i); + return -EINVAL; + } + if (hdev->tm_info.tc_info[i].tc_sch_mode != HCLGE_SCH_MODE_DWRR) *changed = true; From d30a0b720732c14529b5c7cbaf237837c31fab4c Mon Sep 17 00:00:00 2001 From: Peng Li Date: Tue, 19 Oct 2021 22:16:35 +0800 Subject: [PATCH 1218/5344] net: hns3: disable sriov before unload hclge layer [ Upstream commit 0dd8a25f355b4df2d41c08df1716340854c7d4c5 ] HNS3 driver includes hns3.ko, hnae3.ko and hclge.ko. hns3.ko includes network stack and pci_driver, hclge.ko includes HW device action, algo_ops and timer task, hnae3.ko includes some register function. When SRIOV is enable and hclge.ko is removed, HW device is unloaded but VF still exists, PF will not reply VF mbx messages, and cause errors. This patch fix it by disable SRIOV before remove hclge.ko. Fixes: e2cb1dec9779 ("net: hns3: Add HNS3 VF HCL(Hardware Compatibility Layer) Support") Signed-off-by: Peng Li Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/hisilicon/hns3/hnae3.c | 21 +++++++++++++++++++ drivers/net/ethernet/hisilicon/hns3/hnae3.h | 1 + .../hisilicon/hns3/hns3pf/hclge_main.c | 1 + 3 files changed, 23 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.c b/drivers/net/ethernet/hisilicon/hns3/hnae3.c index 03ca7d925e8e0..2e38c7d214c45 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.c +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.c @@ -10,6 +10,27 @@ static LIST_HEAD(hnae3_ae_algo_list); static LIST_HEAD(hnae3_client_list); static LIST_HEAD(hnae3_ae_dev_list); +void hnae3_unregister_ae_algo_prepare(struct hnae3_ae_algo *ae_algo) +{ + const struct pci_device_id *pci_id; + struct hnae3_ae_dev *ae_dev; + + if (!ae_algo) + return; + + list_for_each_entry(ae_dev, &hnae3_ae_dev_list, node) { + if (!hnae3_get_bit(ae_dev->flag, HNAE3_DEV_INITED_B)) + continue; + + pci_id = pci_match_id(ae_algo->pdev_id_table, ae_dev->pdev); + if (!pci_id) + continue; + if (IS_ENABLED(CONFIG_PCI_IOV)) + pci_disable_sriov(ae_dev->pdev); + } +} +EXPORT_SYMBOL(hnae3_unregister_ae_algo_prepare); + /* we are keeping things simple and using single lock for all the * list. This is a non-critical code so other updations, if happen * in parallel, can wait. diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 0db835d87d09d..6cf8490110642 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -666,6 +666,7 @@ struct hnae3_handle { int hnae3_register_ae_dev(struct hnae3_ae_dev *ae_dev); void hnae3_unregister_ae_dev(struct hnae3_ae_dev *ae_dev); +void hnae3_unregister_ae_algo_prepare(struct hnae3_ae_algo *ae_algo); void hnae3_unregister_ae_algo(struct hnae3_ae_algo *ae_algo); void hnae3_register_ae_algo(struct hnae3_ae_algo *ae_algo); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index f44e8401496b1..8ecfabaefa85b 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -10274,6 +10274,7 @@ static int hclge_init(void) static void hclge_exit(void) { + hnae3_unregister_ae_algo_prepare(&ae_algo); hnae3_unregister_ae_algo(&ae_algo); } module_init(hclge_init); From d2d95f681e85ffb7702f57d88d9bfa4f027e3940 Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Wed, 20 Oct 2021 09:04:33 +0200 Subject: [PATCH 1219/5344] net: stmmac: Fix E2E delay mechanism [ Upstream commit 3cb958027cb8b78d3ee639ce9af54c2ef1bf964f ] When utilizing End to End delay mechanism, the following error messages show up: |root@ehl1:~# ptp4l --tx_timestamp_timeout=50 -H -i eno2 -E -m |ptp4l[950.573]: selected /dev/ptp3 as PTP clock |ptp4l[950.586]: port 1: INITIALIZING to LISTENING on INIT_COMPLETE |ptp4l[950.586]: port 0: INITIALIZING to LISTENING on INIT_COMPLETE |ptp4l[952.879]: port 1: new foreign master 001395.fffe.4897b4-1 |ptp4l[956.879]: selected best master clock 001395.fffe.4897b4 |ptp4l[956.879]: port 1: assuming the grand master role |ptp4l[956.879]: port 1: LISTENING to GRAND_MASTER on RS_GRAND_MASTER |ptp4l[962.017]: port 1: received DELAY_REQ without timestamp |ptp4l[962.273]: port 1: received DELAY_REQ without timestamp |ptp4l[963.090]: port 1: received DELAY_REQ without timestamp Commit f2fb6b6275eb ("net: stmmac: enable timestamp snapshot for required PTP packets in dwmac v5.10a") already addresses this problem for the dwmac v5.10. However, same holds true for all dwmacs above version v4.10. Correct the check accordingly. Afterwards everything works as expected. Tested on Intel Atom(R) x6414RE Processor. Fixes: 14f347334bf2 ("net: stmmac: Correctly take timestamp for PTPv2") Fixes: f2fb6b6275eb ("net: stmmac: enable timestamp snapshot for required PTP packets in dwmac v5.10a") Suggested-by: Ong Boon Leong Signed-off-by: Kurt Kanzenbach Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 835ac178bc8c0..94c652b9a0a8b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -604,7 +604,7 @@ static int stmmac_hwtstamp_set(struct net_device *dev, struct ifreq *ifr) config.rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT; ptp_v2 = PTP_TCR_TSVER2ENA; snap_type_sel = PTP_TCR_SNAPTYPSEL_1; - if (priv->synopsys_id != DWMAC_CORE_5_10) + if (priv->synopsys_id < DWMAC_CORE_4_10) ts_event_en = PTP_TCR_TSEVNTENA; ptp_over_ipv4_udp = PTP_TCR_TSIPV4ENA; ptp_over_ipv6_udp = PTP_TCR_TSIPV6ENA; From 3c2952e69495b5680e49a435ffded508dadb5e2a Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 20 Oct 2021 19:52:06 +0300 Subject: [PATCH 1220/5344] net: enetc: fix ethtool counter name for PM0_TERR [ Upstream commit fb8dc5fc8cbdfd62ecd16493848aee2f42ed84d9 ] There are two counters named "MAC tx frames", one of them is actually incorrect. The correct name for that counter should be "MAC tx error frames", which is symmetric to the existing "MAC rx error frames". Fixes: 16eb4c85c964 ("enetc: Add ethtool statistics") Signed-off-by: Vladimir Oltean Reviewed-by: Link: https://lore.kernel.org/r/20211020165206.1069889-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/freescale/enetc/enetc_ethtool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c index 89121d7ce3e6f..636aa6d81d8fe 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c @@ -155,7 +155,7 @@ static const struct { { ENETC_PM0_TFRM, "MAC tx frames" }, { ENETC_PM0_TFCS, "MAC tx fcs errors" }, { ENETC_PM0_TVLAN, "MAC tx VLAN frames" }, - { ENETC_PM0_TERR, "MAC tx frames" }, + { ENETC_PM0_TERR, "MAC tx frame errors" }, { ENETC_PM0_TUCA, "MAC tx unicast frames" }, { ENETC_PM0_TMCA, "MAC tx multicast frames" }, { ENETC_PM0_TBCA, "MAC tx broadcast frames" }, From 4fc8f0111a4601c2b615a7a327a16a5acf210784 Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Fri, 24 Sep 2021 16:55:56 +0900 Subject: [PATCH 1221/5344] can: rcar_can: fix suspend/resume commit f7c05c3987dcfde9a4e8c2d533db013fabebca0d upstream. If the driver was not opened, rcar_can_suspend() should not call clk_disable() because the clock was not enabled. Fixes: fd1159318e55 ("can: add Renesas R-Car CAN driver") Link: https://lore.kernel.org/all/20210924075556.223685-1-yoshihiro.shimoda.uh@renesas.com Cc: stable@vger.kernel.org Signed-off-by: Yoshihiro Shimoda Tested-by: Ayumi Nakamichi Reviewed-by: Ulrich Hecht Tested-by: Biju Das Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- drivers/net/can/rcar/rcar_can.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/net/can/rcar/rcar_can.c b/drivers/net/can/rcar/rcar_can.c index bf5adea9c0a38..ac52288fa3bfe 100644 --- a/drivers/net/can/rcar/rcar_can.c +++ b/drivers/net/can/rcar/rcar_can.c @@ -848,10 +848,12 @@ static int __maybe_unused rcar_can_suspend(struct device *dev) struct rcar_can_priv *priv = netdev_priv(ndev); u16 ctlr; - if (netif_running(ndev)) { - netif_stop_queue(ndev); - netif_device_detach(ndev); - } + if (!netif_running(ndev)) + return 0; + + netif_stop_queue(ndev); + netif_device_detach(ndev); + ctlr = readw(&priv->regs->ctlr); ctlr |= RCAR_CAN_CTLR_CANM_HALT; writew(ctlr, &priv->regs->ctlr); @@ -870,6 +872,9 @@ static int __maybe_unused rcar_can_resume(struct device *dev) u16 ctlr; int err; + if (!netif_running(ndev)) + return 0; + err = clk_enable(priv->clk); if (err) { netdev_err(ndev, "clk_enable() failed, error %d\n", err); @@ -883,10 +888,9 @@ static int __maybe_unused rcar_can_resume(struct device *dev) writew(ctlr, &priv->regs->ctlr); priv->can.state = CAN_STATE_ERROR_ACTIVE; - if (netif_running(ndev)) { - netif_device_attach(ndev); - netif_start_queue(ndev); - } + netif_device_attach(ndev); + netif_start_queue(ndev); + return 0; } From fd46d2c0d00a8de5de7ab1087fc1b70a706af6bb Mon Sep 17 00:00:00 2001 From: Stephane Grosjean Date: Wed, 29 Sep 2021 16:21:10 +0200 Subject: [PATCH 1222/5344] can: peak_usb: pcan_usb_fd_decode_status(): fix back to ERROR_ACTIVE state notification commit 3d031abc7e7249573148871180c28ecedb5e27df upstream. This corrects the lack of notification of a return to ERROR_ACTIVE state for USB - CANFD devices from PEAK-System. Fixes: 0a25e1f4f185 ("can: peak_usb: add support for PEAK new CANFD USB adapters") Link: https://lore.kernel.org/all/20210929142111.55757-1-s.grosjean@peak-system.com Cc: stable@vger.kernel.org Signed-off-by: Stephane Grosjean Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- drivers/net/can/usb/peak_usb/pcan_usb_fd.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_fd.c b/drivers/net/can/usb/peak_usb/pcan_usb_fd.c index 96bbdef672bc9..571d9b0bfe518 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb_fd.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb_fd.c @@ -551,11 +551,10 @@ static int pcan_usb_fd_decode_status(struct pcan_usb_fd_if *usb_if, } else if (sm->channel_p_w_b & PUCAN_BUS_WARNING) { new_state = CAN_STATE_ERROR_WARNING; } else { - /* no error bit (so, no error skb, back to active state) */ - dev->can.state = CAN_STATE_ERROR_ACTIVE; + /* back to (or still in) ERROR_ACTIVE state */ + new_state = CAN_STATE_ERROR_ACTIVE; pdev->bec.txerr = 0; pdev->bec.rxerr = 0; - return 0; } /* state hasn't changed */ From ea034f61989b642e224f8f204e4e1b8874540494 Mon Sep 17 00:00:00 2001 From: Zheyu Ma Date: Thu, 14 Oct 2021 06:28:33 +0000 Subject: [PATCH 1223/5344] can: peak_pci: peak_pci_remove(): fix UAF commit 949fe9b35570361bc6ee2652f89a0561b26eec98 upstream. When remove the module peek_pci, referencing 'chan' again after releasing 'dev' will cause UAF. Fix this by releasing 'dev' later. The following log reveals it: [ 35.961814 ] BUG: KASAN: use-after-free in peak_pci_remove+0x16f/0x270 [peak_pci] [ 35.963414 ] Read of size 8 at addr ffff888136998ee8 by task modprobe/5537 [ 35.965513 ] Call Trace: [ 35.965718 ] dump_stack_lvl+0xa8/0xd1 [ 35.966028 ] print_address_description+0x87/0x3b0 [ 35.966420 ] kasan_report+0x172/0x1c0 [ 35.966725 ] ? peak_pci_remove+0x16f/0x270 [peak_pci] [ 35.967137 ] ? trace_irq_enable_rcuidle+0x10/0x170 [ 35.967529 ] ? peak_pci_remove+0x16f/0x270 [peak_pci] [ 35.967945 ] __asan_report_load8_noabort+0x14/0x20 [ 35.968346 ] peak_pci_remove+0x16f/0x270 [peak_pci] [ 35.968752 ] pci_device_remove+0xa9/0x250 Fixes: e6d9c80b7ca1 ("can: peak_pci: add support of some new PEAK-System PCI cards") Link: https://lore.kernel.org/all/1634192913-15639-1-git-send-email-zheyuma97@gmail.com Cc: stable@vger.kernel.org Signed-off-by: Zheyu Ma Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- drivers/net/can/sja1000/peak_pci.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/can/sja1000/peak_pci.c b/drivers/net/can/sja1000/peak_pci.c index 8c0244f51059e..41a63777030cb 100644 --- a/drivers/net/can/sja1000/peak_pci.c +++ b/drivers/net/can/sja1000/peak_pci.c @@ -731,16 +731,15 @@ static void peak_pci_remove(struct pci_dev *pdev) struct net_device *prev_dev = chan->prev_dev; dev_info(&pdev->dev, "removing device %s\n", dev->name); + /* do that only for first channel */ + if (!prev_dev && chan->pciec_card) + peak_pciec_remove(chan->pciec_card); unregister_sja1000dev(dev); free_sja1000dev(dev); dev = prev_dev; - if (!dev) { - /* do that only for first channel */ - if (chan->pciec_card) - peak_pciec_remove(chan->pciec_card); + if (!dev) break; - } priv = netdev_priv(dev); chan = priv->priv; } From 447e029c7903477247357701feb984b9beed20df Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Mon, 6 Sep 2021 17:42:19 +0800 Subject: [PATCH 1224/5344] can: j1939: j1939_tp_rxtimer(): fix errant alert in j1939_tp_rxtimer commit b504a884f6b5a77dac7d580ffa08e482f70d1a30 upstream. When the session state is J1939_SESSION_DONE, j1939_tp_rxtimer() will give an alert "rx timeout, send abort", but do nothing actually. Move the alert into session active judgment condition, it is more reasonable. One of the scenarios is that j1939_tp_rxtimer() execute followed by j1939_xtp_rx_abort_one(). After j1939_xtp_rx_abort_one(), the session state is J1939_SESSION_DONE, then j1939_tp_rxtimer() give an alert. Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Link: https://lore.kernel.org/all/20210906094219.95924-1-william.xuanziyang@huawei.com Cc: stable@vger.kernel.org Signed-off-by: Ziyang Xuan Acked-by: Oleksij Rempel Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- net/can/j1939/transport.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 6571895228f01..e1d3a6ba2d0e1 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -1230,12 +1230,11 @@ static enum hrtimer_restart j1939_tp_rxtimer(struct hrtimer *hrtimer) session->err = -ETIME; j1939_session_deactivate(session); } else { - netdev_alert(priv->ndev, "%s: 0x%p: rx timeout, send abort\n", - __func__, session); - j1939_session_list_lock(session->priv); if (session->state >= J1939_SESSION_ACTIVE && session->state < J1939_SESSION_ACTIVE_MAX) { + netdev_alert(priv->ndev, "%s: 0x%p: rx timeout, send abort\n", + __func__, session); j1939_session_get(session); hrtimer_start(&session->rxtimer, ms_to_ktime(J1939_XTP_ABORT_TIMEOUT_MS), From 189d624e05d6e0bd97b87ba5fb95a295ff92ad43 Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Sun, 26 Sep 2021 18:47:57 +0800 Subject: [PATCH 1225/5344] can: j1939: j1939_netdev_start(): fix UAF for rx_kref of j1939_priv commit d9d52a3ebd284882f5562c88e55991add5d01586 upstream. It will trigger UAF for rx_kref of j1939_priv as following. cpu0 cpu1 j1939_sk_bind(socket0, ndev0, ...) j1939_netdev_start j1939_sk_bind(socket1, ndev0, ...) j1939_netdev_start j1939_priv_set j1939_priv_get_by_ndev_locked j1939_jsk_add ..... j1939_netdev_stop kref_put_lock(&priv->rx_kref, ...) kref_get(&priv->rx_kref, ...) REFCOUNT_WARN("addition on 0;...") ==================================================== refcount_t: addition on 0; use-after-free. WARNING: CPU: 1 PID: 20874 at lib/refcount.c:25 refcount_warn_saturate+0x169/0x1e0 RIP: 0010:refcount_warn_saturate+0x169/0x1e0 Call Trace: j1939_netdev_start+0x68b/0x920 j1939_sk_bind+0x426/0xeb0 ? security_socket_bind+0x83/0xb0 The rx_kref's kref_get() and kref_put() should use j1939_netdev_lock to protect. Fixes: 9d71dd0c70099 ("can: add support of SAE J1939 protocol") Link: https://lore.kernel.org/all/20210926104757.2021540-1-william.xuanziyang@huawei.com Cc: stable@vger.kernel.org Reported-by: syzbot+85d9878b19c94f9019ad@syzkaller.appspotmail.com Signed-off-by: Ziyang Xuan Acked-by: Oleksij Rempel Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- net/can/j1939/main.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c index 6884d18f919c7..266c189f1e809 100644 --- a/net/can/j1939/main.c +++ b/net/can/j1939/main.c @@ -249,11 +249,14 @@ struct j1939_priv *j1939_netdev_start(struct net_device *ndev) struct j1939_priv *priv, *priv_new; int ret; - priv = j1939_priv_get_by_ndev(ndev); + spin_lock(&j1939_netdev_lock); + priv = j1939_priv_get_by_ndev_locked(ndev); if (priv) { kref_get(&priv->rx_kref); + spin_unlock(&j1939_netdev_lock); return priv; } + spin_unlock(&j1939_netdev_lock); priv = j1939_priv_create(ndev); if (!priv) @@ -269,10 +272,10 @@ struct j1939_priv *j1939_netdev_start(struct net_device *ndev) /* Someone was faster than us, use their priv and roll * back our's. */ + kref_get(&priv_new->rx_kref); spin_unlock(&j1939_netdev_lock); dev_put(ndev); kfree(priv); - kref_get(&priv_new->rx_kref); return priv_new; } j1939_priv_set(ndev, priv); From 4445e3e4e195c593355b3a3bcf4d17289fdcfdd3 Mon Sep 17 00:00:00 2001 From: Zhang Changzhong Date: Thu, 30 Sep 2021 11:33:20 +0800 Subject: [PATCH 1226/5344] can: j1939: j1939_xtp_rx_dat_one(): cancel session if receive TP.DT with error length commit 379743985ab6cfe2cbd32067cf4ed497baca6d06 upstream. According to SAE-J1939-21, the data length of TP.DT must be 8 bytes, so cancel session when receive unexpected TP.DT message. Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Link: https://lore.kernel.org/all/1632972800-45091-1-git-send-email-zhangchangzhong@huawei.com Cc: stable@vger.kernel.org Signed-off-by: Zhang Changzhong Acked-by: Oleksij Rempel Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- net/can/j1939/transport.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index e1d3a6ba2d0e1..ebe055d7bfb08 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -1770,6 +1770,7 @@ static void j1939_xtp_rx_dpo(struct j1939_priv *priv, struct sk_buff *skb, static void j1939_xtp_rx_dat_one(struct j1939_session *session, struct sk_buff *skb) { + enum j1939_xtp_abort abort = J1939_XTP_ABORT_FAULT; struct j1939_priv *priv = session->priv; struct j1939_sk_buff_cb *skcb; struct sk_buff *se_skb = NULL; @@ -1784,9 +1785,11 @@ static void j1939_xtp_rx_dat_one(struct j1939_session *session, skcb = j1939_skb_to_cb(skb); dat = skb->data; - if (skb->len <= 1) + if (skb->len != 8) { /* makes no sense */ + abort = J1939_XTP_ABORT_UNEXPECTED_DATA; goto out_session_cancel; + } switch (session->last_cmd) { case 0xff: @@ -1884,7 +1887,7 @@ static void j1939_xtp_rx_dat_one(struct j1939_session *session, out_session_cancel: kfree_skb(se_skb); j1939_session_timers_cancel(session); - j1939_session_cancel(session, J1939_XTP_ABORT_FAULT); + j1939_session_cancel(session, abort); j1939_session_put(session); } From 516189dea83d88f0ed5f9654270deceb6adeb868 Mon Sep 17 00:00:00 2001 From: Zhang Changzhong Date: Thu, 14 Oct 2021 17:26:40 +0800 Subject: [PATCH 1227/5344] can: j1939: j1939_xtp_rx_rts_session_new(): abort TP less than 9 bytes commit a4fbe70c5cb746441d56b28cf88161d9e0e25378 upstream. The receiver should abort TP if 'total message size' in TP.CM_RTS and TP.CM_BAM is less than 9 or greater than 1785 [1], but currently the j1939 stack only checks the upper bound and the receiver will accept the following broadcast message: vcan1 18ECFF00 [8] 20 08 00 02 FF 00 23 01 vcan1 18EBFF00 [8] 01 00 00 00 00 00 00 00 vcan1 18EBFF00 [8] 02 00 FF FF FF FF FF FF This patch adds check for the lower bound and abort illegal TP. [1] SAE-J1939-82 A.3.4 Row 2 and A.3.6 Row 6. Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Link: https://lore.kernel.org/all/1634203601-3460-1-git-send-email-zhangchangzhong@huawei.com Cc: stable@vger.kernel.org Signed-off-by: Zhang Changzhong Acked-by: Oleksij Rempel Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- net/can/j1939/j1939-priv.h | 1 + net/can/j1939/transport.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/net/can/j1939/j1939-priv.h b/net/can/j1939/j1939-priv.h index 12369b604ce95..cea712fb2a9e0 100644 --- a/net/can/j1939/j1939-priv.h +++ b/net/can/j1939/j1939-priv.h @@ -326,6 +326,7 @@ int j1939_session_activate(struct j1939_session *session); void j1939_tp_schedule_txtimer(struct j1939_session *session, int msec); void j1939_session_timers_cancel(struct j1939_session *session); +#define J1939_MIN_TP_PACKET_SIZE 9 #define J1939_MAX_TP_PACKET_SIZE (7 * 0xff) #define J1939_MAX_ETP_PACKET_SIZE (7 * 0x00ffffff) diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index ebe055d7bfb08..7a54170da09df 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -1596,6 +1596,8 @@ j1939_session *j1939_xtp_rx_rts_session_new(struct j1939_priv *priv, abort = J1939_XTP_ABORT_FAULT; else if (len > priv->tp_max_packet_size) abort = J1939_XTP_ABORT_RESOURCE; + else if (len < J1939_MIN_TP_PACKET_SIZE) + abort = J1939_XTP_ABORT_FAULT; } if (abort != J1939_XTP_NO_ABORT) { From fefdfc8c1ed35513fdd4246c9c9ea6cfe63499f1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 7 Oct 2021 14:19:49 -0400 Subject: [PATCH 1228/5344] ceph: fix handling of "meta" errors commit 1bd85aa65d0e7b5e4d09240f492f37c569fdd431 upstream. Currently, we check the wb_err too early for directories, before all of the unsafe child requests have been waited on. In order to fix that we need to check the mapping->wb_err later nearer to the end of ceph_fsync. We also have an overly-complex method for tracking errors after blocklisting. The errors recorded in cleanup_session_requests go to a completely separate field in the inode, but we end up reporting them the same way we would for any other error (in fsync). There's no real benefit to tracking these errors in two different places, since the only reporting mechanism for them is in fsync, and we'd need to advance them both every time. Given that, we can just remove i_meta_err, and convert the places that used it to instead just use mapping->wb_err instead. That also fixes the original problem by ensuring that we do a check_and_advance of the wb_err at the end of the fsync op. Cc: stable@vger.kernel.org URL: https://tracker.ceph.com/issues/52864 Reported-by: Patrick Donnelly Signed-off-by: Jeff Layton Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov Signed-off-by: Greg Kroah-Hartman --- fs/ceph/caps.c | 12 +++--------- fs/ceph/file.c | 1 - fs/ceph/inode.c | 2 -- fs/ceph/mds_client.c | 17 +++++------------ fs/ceph/super.h | 3 --- 5 files changed, 8 insertions(+), 27 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 0fad044a5752b..06e9b26bf277a 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2249,7 +2249,6 @@ static int unsafe_request_wait(struct inode *inode) int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) { - struct ceph_file_info *fi = file->private_data; struct inode *inode = file->f_mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); u64 flush_tid; @@ -2280,14 +2279,9 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) if (err < 0) ret = err; - if (errseq_check(&ci->i_meta_err, READ_ONCE(fi->meta_err))) { - spin_lock(&file->f_lock); - err = errseq_check_and_advance(&ci->i_meta_err, - &fi->meta_err); - spin_unlock(&file->f_lock); - if (err < 0) - ret = err; - } + err = file_check_and_advance_wb_err(file); + if (err < 0) + ret = err; out: dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret); return ret; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 34785a203461d..aa1eac6d89f2e 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -234,7 +234,6 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, fi->fmode = fmode; spin_lock_init(&fi->rw_contexts_lock); INIT_LIST_HEAD(&fi->rw_contexts); - fi->meta_err = errseq_sample(&ci->i_meta_err); fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen); return 0; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 5beebbbb42f09..af85a72376040 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -515,8 +515,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ceph_fscache_inode_init(ci); - ci->i_meta_err = 0; - return &ci->vfs_inode; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 1ef370913c007..37fb71797b341 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1272,7 +1272,6 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, { struct ceph_mds_request *req; struct rb_node *p; - struct ceph_inode_info *ci; dout("cleanup_session_requests mds%d\n", session->s_mds); mutex_lock(&mdsc->mutex); @@ -1281,16 +1280,10 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, struct ceph_mds_request, r_unsafe_item); pr_warn_ratelimited(" dropping unsafe request %llu\n", req->r_tid); - if (req->r_target_inode) { - /* dropping unsafe change of inode's attributes */ - ci = ceph_inode(req->r_target_inode); - errseq_set(&ci->i_meta_err, -EIO); - } - if (req->r_unsafe_dir) { - /* dropping unsafe directory operation */ - ci = ceph_inode(req->r_unsafe_dir); - errseq_set(&ci->i_meta_err, -EIO); - } + if (req->r_target_inode) + mapping_set_error(req->r_target_inode->i_mapping, -EIO); + if (req->r_unsafe_dir) + mapping_set_error(req->r_unsafe_dir->i_mapping, -EIO); __unregister_request(mdsc, req); } /* zero r_attempts, so kick_requests() will re-send requests */ @@ -1436,7 +1429,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, spin_unlock(&mdsc->cap_dirty_lock); if (dirty_dropped) { - errseq_set(&ci->i_meta_err, -EIO); + mapping_set_error(inode->i_mapping, -EIO); if (ci->i_wrbuffer_ref_head == 0 && ci->i_wr_ref == 0 && diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 8ffc8e88dd3d2..6db7b3387e1cc 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -402,8 +402,6 @@ struct ceph_inode_info { struct fscache_cookie *fscache; u32 i_fscache_gen; #endif - errseq_t i_meta_err; - struct inode vfs_inode; /* at end */ }; @@ -712,7 +710,6 @@ struct ceph_file_info { spinlock_t rw_contexts_lock; struct list_head rw_contexts; - errseq_t meta_err; u32 filp_gen; atomic_t num_locks; }; From 2000d4fa6a3834969370a824fa792dc6cfd7737e Mon Sep 17 00:00:00 2001 From: Valentin Vidic Date: Mon, 18 Oct 2021 15:15:42 -0700 Subject: [PATCH 1229/5344] ocfs2: mount fails with buffer overflow in strlen commit b15fa9224e6e1239414525d8d556d824701849fc upstream. Starting with kernel 5.11 built with CONFIG_FORTIFY_SOURCE mouting an ocfs2 filesystem with either o2cb or pcmk cluster stack fails with the trace below. Problem seems to be that strings for cluster stack and cluster name are not guaranteed to be null terminated in the disk representation, while strlcpy assumes that the source string is always null terminated. This causes a read outside of the source string triggering the buffer overflow detection. detected buffer overflow in strlen ------------[ cut here ]------------ kernel BUG at lib/string.c:1149! invalid opcode: 0000 [#1] SMP PTI CPU: 1 PID: 910 Comm: mount.ocfs2 Not tainted 5.14.0-1-amd64 #1 Debian 5.14.6-2 RIP: 0010:fortify_panic+0xf/0x11 ... Call Trace: ocfs2_initialize_super.isra.0.cold+0xc/0x18 [ocfs2] ocfs2_fill_super+0x359/0x19b0 [ocfs2] mount_bdev+0x185/0x1b0 legacy_get_tree+0x27/0x40 vfs_get_tree+0x25/0xb0 path_mount+0x454/0xa20 __x64_sys_mount+0x103/0x140 do_syscall_64+0x3b/0xc0 entry_SYSCALL_64_after_hwframe+0x44/0xae Link: https://lkml.kernel.org/r/20210929180654.32460-1-vvidic@valentin-vidic.from.hr Signed-off-by: Valentin Vidic Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/ocfs2/super.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 60b4b6df1ed36..eaec97892dce8 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2150,11 +2150,17 @@ static int ocfs2_initialize_super(struct super_block *sb, } if (ocfs2_clusterinfo_valid(osb)) { + /* + * ci_stack and ci_cluster in ocfs2_cluster_info may not be null + * terminated, so make sure no overflow happens here by using + * memcpy. Destination strings will always be null terminated + * because osb is allocated using kzalloc. + */ osb->osb_stackflags = OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags; - strlcpy(osb->osb_cluster_stack, + memcpy(osb->osb_cluster_stack, OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, - OCFS2_STACK_LABEL_LEN + 1); + OCFS2_STACK_LABEL_LEN); if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) { mlog(ML_ERROR, "couldn't mount because of an invalid " @@ -2163,9 +2169,9 @@ static int ocfs2_initialize_super(struct super_block *sb, status = -EINVAL; goto bail; } - strlcpy(osb->osb_cluster_name, + memcpy(osb->osb_cluster_name, OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster, - OCFS2_CLUSTER_NAME_LEN + 1); + OCFS2_CLUSTER_NAME_LEN); } else { /* The empty string is identical with classic tools that * don't know about s_cluster_info. */ From ba447bfccc34fb7b572ff313c4cc8168be28cda5 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 18 Oct 2021 15:16:09 -0700 Subject: [PATCH 1230/5344] elfcore: correct reference to CONFIG_UML commit b0e901280d9860a0a35055f220e8e457f300f40a upstream. Commit 6e7b64b9dd6d ("elfcore: fix building with clang") introduces special handling for two architectures, ia64 and User Mode Linux. However, the wrong name, i.e., CONFIG_UM, for the intended Kconfig symbol for User-Mode Linux was used. Although the directory for User Mode Linux is ./arch/um; the Kconfig symbol for this architecture is called CONFIG_UML. Luckily, ./scripts/checkkconfigsymbols.py warns on non-existing configs: UM Referencing files: include/linux/elfcore.h Similar symbols: UML, NUMA Correct the name of the config to the intended one. [akpm@linux-foundation.org: fix um/x86_64, per Catalin] Link: https://lkml.kernel.org/r/20211006181119.2851441-1-catalin.marinas@arm.com Link: https://lkml.kernel.org/r/YV6pejGzLy5ppEpt@arm.com Link: https://lkml.kernel.org/r/20211006082209.417-1-lukas.bulwahn@gmail.com Fixes: 6e7b64b9dd6d ("elfcore: fix building with clang") Signed-off-by: Lukas Bulwahn Cc: Arnd Bergmann Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Catalin Marinas Cc: Barret Rhoden Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/elfcore.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/elfcore.h b/include/linux/elfcore.h index b81f9e1d74b0a..9d249dfbab726 100644 --- a/include/linux/elfcore.h +++ b/include/linux/elfcore.h @@ -58,7 +58,7 @@ static inline int elf_core_copy_task_xfpregs(struct task_struct *t, elf_fpxregse } #endif -#if defined(CONFIG_UM) || defined(CONFIG_IA64) +#if (defined(CONFIG_UML) && defined(CONFIG_X86_32)) || defined(CONFIG_IA64) /* * These functions parameterize elf_core_dump in fs/binfmt_elf.c to write out * extra segments containing the gate DSO contents. Dumping its From c7fa75772a0dc1b466316c517988984eada51d89 Mon Sep 17 00:00:00 2001 From: Brendan Grieve Date: Fri, 15 Oct 2021 10:53:35 +0800 Subject: [PATCH 1231/5344] ALSA: usb-audio: Provide quirk for Sennheiser GSP670 Headset commit 3c414eb65c294719a91a746260085363413f91c1 upstream. As per discussion at: https://github.com/szszoke/sennheiser-gsp670-pulseaudio-profile/issues/13 The GSP670 has 2 playback and 1 recording device that by default are detected in an incompatible order for alsa. This may have been done to make it compatible for the console by the manufacturer and only affects the latest firmware which uses its own ID. This quirk will resolve this by reordering the channels. Signed-off-by: Brendan Grieve Cc: Link: https://lore.kernel.org/r/20211015025335.196592-1-brendan@grieve.com.au Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/quirks-table.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/sound/usb/quirks-table.h b/sound/usb/quirks-table.h index 9620ae0003ce4..01dee2074ab36 100644 --- a/sound/usb/quirks-table.h +++ b/sound/usb/quirks-table.h @@ -3806,5 +3806,37 @@ ALC1220_VB_DESKTOP(0x26ce, 0x0a01), /* Asrock TRX40 Creator */ } } }, +{ + /* + * Sennheiser GSP670 + * Change order of interfaces loaded + */ + USB_DEVICE(0x1395, 0x0300), + .bInterfaceClass = USB_CLASS_PER_INTERFACE, + .driver_info = (unsigned long) &(const struct snd_usb_audio_quirk) { + .ifnum = QUIRK_ANY_INTERFACE, + .type = QUIRK_COMPOSITE, + .data = &(const struct snd_usb_audio_quirk[]) { + // Communication + { + .ifnum = 3, + .type = QUIRK_AUDIO_STANDARD_INTERFACE + }, + // Recording + { + .ifnum = 4, + .type = QUIRK_AUDIO_STANDARD_INTERFACE + }, + // Main + { + .ifnum = 1, + .type = QUIRK_AUDIO_STANDARD_INTERFACE + }, + { + .ifnum = -1 + } + } + } +}, #undef USB_DEVICE_VENDOR_SPEC From e817187018de0dbe9112b8e9452a4f0bdf97d101 Mon Sep 17 00:00:00 2001 From: Steven Clarkson Date: Thu, 14 Oct 2021 06:35:54 -0700 Subject: [PATCH 1232/5344] ALSA: hda/realtek: Add quirk for Clevo PC50HS commit aef454b40288158b850aab13e3d2a8c406779401 upstream. Apply existing PCI quirk to the Clevo PC50HS and related models to fix audio output on the built in speakers. Signed-off-by: Steven Clarkson Cc: Link: https://lore.kernel.org/r/20211014133554.1326741-1-sc@lambdal.com Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index c4837c78a8624..49aefb7875e72 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -2537,6 +2537,7 @@ static const struct snd_pci_quirk alc882_fixup_tbl[] = { SND_PCI_QUIRK(0x1558, 0x65d2, "Clevo PB51R[CDF]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x65e1, "Clevo PB51[ED][DF]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x65e5, "Clevo PC50D[PRS](?:-D|-G)?", ALC1220_FIXUP_CLEVO_PB51ED_PINS), + SND_PCI_QUIRK(0x1558, 0x65f1, "Clevo PC50HS", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x67d1, "Clevo PB71[ER][CDF]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x67e1, "Clevo PB71[DE][CDF]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x67e5, "Clevo PC70D[PRS](?:-D|-G)?", ALC1220_FIXUP_CLEVO_PB51ED_PINS), From 23827300cb34969b5f636215974ffa7f5fdbde70 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 6 Oct 2021 16:17:12 +0200 Subject: [PATCH 1233/5344] ASoC: DAPM: Fix missing kctl change notifications commit 5af82c81b2c49cfb1cad84d9eb6eab0e3d1c4842 upstream. The put callback of a kcontrol is supposed to return 1 when the value is changed, and this will be notified to user-space. However, some DAPM kcontrols always return 0 (except for errors), hence the user-space misses the update of a control value. This patch corrects the behavior by properly returning 1 when the value gets updated. Reported-and-tested-by: Hans de Goede Cc: Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/r/20211006141712.2439-1-tiwai@suse.de Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- sound/soc/soc-dapm.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index 7c4d5963692dd..66f6b698a5436 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -2546,6 +2546,7 @@ static int snd_soc_dapm_set_pin(struct snd_soc_dapm_context *dapm, const char *pin, int status) { struct snd_soc_dapm_widget *w = dapm_find_widget(dapm, pin, true); + int ret = 0; dapm_assert_locked(dapm); @@ -2558,13 +2559,14 @@ static int snd_soc_dapm_set_pin(struct snd_soc_dapm_context *dapm, dapm_mark_dirty(w, "pin configuration"); dapm_widget_invalidate_input_paths(w); dapm_widget_invalidate_output_paths(w); + ret = 1; } w->connected = status; if (status == 0) w->force = 0; - return 0; + return ret; } /** @@ -3580,14 +3582,15 @@ int snd_soc_dapm_put_pin_switch(struct snd_kcontrol *kcontrol, { struct snd_soc_card *card = snd_kcontrol_chip(kcontrol); const char *pin = (const char *)kcontrol->private_value; + int ret; if (ucontrol->value.integer.value[0]) - snd_soc_dapm_enable_pin(&card->dapm, pin); + ret = snd_soc_dapm_enable_pin(&card->dapm, pin); else - snd_soc_dapm_disable_pin(&card->dapm, pin); + ret = snd_soc_dapm_disable_pin(&card->dapm, pin); snd_soc_dapm_sync(&card->dapm); - return 0; + return ret; } EXPORT_SYMBOL_GPL(snd_soc_dapm_put_pin_switch); @@ -4029,7 +4032,7 @@ static int snd_soc_dapm_dai_link_put(struct snd_kcontrol *kcontrol, rtd->params_select = ucontrol->value.enumerated.item[0]; - return 0; + return 1; } static void From 897e9decbab4cab1035186c810770d9324c2c2b1 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Sat, 16 Oct 2021 15:23:50 +0800 Subject: [PATCH 1234/5344] audit: fix possible null-pointer dereference in audit_filter_rules commit 6e3ee990c90494561921c756481d0e2125d8b895 upstream. Fix possible null-pointer dereference in audit_filter_rules. audit_filter_rules() error: we previously assumed 'ctx' could be null Cc: stable@vger.kernel.org Fixes: bf361231c295 ("audit: add saddr_fam filter field") Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Gaosheng Cui Signed-off-by: Paul Moore Signed-off-by: Greg Kroah-Hartman --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4effe01ebbe2b..d33c5dccde1c7 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -624,7 +624,7 @@ static int audit_filter_rules(struct task_struct *tsk, result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val); break; case AUDIT_SADDR_FAM: - if (ctx->sockaddr) + if (ctx && ctx->sockaddr) result = audit_comparator(ctx->sockaddr->ss_family, f->op, f->val); break; From a5e83b8ca237728a1537b7f9ea83e5cf6c71499a Mon Sep 17 00:00:00 2001 From: "Christopher M. Riedl" Date: Sat, 6 Feb 2021 01:23:42 -0600 Subject: [PATCH 1235/5344] powerpc64/idle: Fix SP offsets when saving GPRs commit 73287caa9210ded6066833195f4335f7f688a46b upstream. The idle entry/exit code saves/restores GPRs in the stack "red zone" (Protected Zone according to PowerPC64 ELF ABI v2). However, the offset used for the first GPR is incorrect and overwrites the back chain - the Protected Zone actually starts below the current SP. In practice this is probably not an issue, but it's still incorrect so fix it. Also expand the comments to explain why using the stack "red zone" instead of creating a new stackframe is appropriate here. Signed-off-by: Christopher M. Riedl Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210206072342.5067-1-cmr@codefail.de Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/idle_book3s.S | 138 ++++++++++++++++-------------- 1 file changed, 73 insertions(+), 65 deletions(-) diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index d32751994a62c..f2320c78372bf 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -50,28 +50,32 @@ _GLOBAL(isa300_idle_stop_mayloss) std r1,PACAR1(r13) mflr r4 mfcr r5 - /* use stack red zone rather than a new frame for saving regs */ - std r2,-8*0(r1) - std r14,-8*1(r1) - std r15,-8*2(r1) - std r16,-8*3(r1) - std r17,-8*4(r1) - std r18,-8*5(r1) - std r19,-8*6(r1) - std r20,-8*7(r1) - std r21,-8*8(r1) - std r22,-8*9(r1) - std r23,-8*10(r1) - std r24,-8*11(r1) - std r25,-8*12(r1) - std r26,-8*13(r1) - std r27,-8*14(r1) - std r28,-8*15(r1) - std r29,-8*16(r1) - std r30,-8*17(r1) - std r31,-8*18(r1) - std r4,-8*19(r1) - std r5,-8*20(r1) + /* + * Use the stack red zone rather than a new frame for saving regs since + * in the case of no GPR loss the wakeup code branches directly back to + * the caller without deallocating the stack frame first. + */ + std r2,-8*1(r1) + std r14,-8*2(r1) + std r15,-8*3(r1) + std r16,-8*4(r1) + std r17,-8*5(r1) + std r18,-8*6(r1) + std r19,-8*7(r1) + std r20,-8*8(r1) + std r21,-8*9(r1) + std r22,-8*10(r1) + std r23,-8*11(r1) + std r24,-8*12(r1) + std r25,-8*13(r1) + std r26,-8*14(r1) + std r27,-8*15(r1) + std r28,-8*16(r1) + std r29,-8*17(r1) + std r30,-8*18(r1) + std r31,-8*19(r1) + std r4,-8*20(r1) + std r5,-8*21(r1) /* 168 bytes */ PPC_STOP b . /* catch bugs */ @@ -87,8 +91,8 @@ _GLOBAL(isa300_idle_stop_mayloss) */ _GLOBAL(idle_return_gpr_loss) ld r1,PACAR1(r13) - ld r4,-8*19(r1) - ld r5,-8*20(r1) + ld r4,-8*20(r1) + ld r5,-8*21(r1) mtlr r4 mtcr r5 /* @@ -96,25 +100,25 @@ _GLOBAL(idle_return_gpr_loss) * from PACATOC. This could be avoided for that less common case * if KVM saved its r2. */ - ld r2,-8*0(r1) - ld r14,-8*1(r1) - ld r15,-8*2(r1) - ld r16,-8*3(r1) - ld r17,-8*4(r1) - ld r18,-8*5(r1) - ld r19,-8*6(r1) - ld r20,-8*7(r1) - ld r21,-8*8(r1) - ld r22,-8*9(r1) - ld r23,-8*10(r1) - ld r24,-8*11(r1) - ld r25,-8*12(r1) - ld r26,-8*13(r1) - ld r27,-8*14(r1) - ld r28,-8*15(r1) - ld r29,-8*16(r1) - ld r30,-8*17(r1) - ld r31,-8*18(r1) + ld r2,-8*1(r1) + ld r14,-8*2(r1) + ld r15,-8*3(r1) + ld r16,-8*4(r1) + ld r17,-8*5(r1) + ld r18,-8*6(r1) + ld r19,-8*7(r1) + ld r20,-8*8(r1) + ld r21,-8*9(r1) + ld r22,-8*10(r1) + ld r23,-8*11(r1) + ld r24,-8*12(r1) + ld r25,-8*13(r1) + ld r26,-8*14(r1) + ld r27,-8*15(r1) + ld r28,-8*16(r1) + ld r29,-8*17(r1) + ld r30,-8*18(r1) + ld r31,-8*19(r1) blr /* @@ -152,28 +156,32 @@ _GLOBAL(isa206_idle_insn_mayloss) std r1,PACAR1(r13) mflr r4 mfcr r5 - /* use stack red zone rather than a new frame for saving regs */ - std r2,-8*0(r1) - std r14,-8*1(r1) - std r15,-8*2(r1) - std r16,-8*3(r1) - std r17,-8*4(r1) - std r18,-8*5(r1) - std r19,-8*6(r1) - std r20,-8*7(r1) - std r21,-8*8(r1) - std r22,-8*9(r1) - std r23,-8*10(r1) - std r24,-8*11(r1) - std r25,-8*12(r1) - std r26,-8*13(r1) - std r27,-8*14(r1) - std r28,-8*15(r1) - std r29,-8*16(r1) - std r30,-8*17(r1) - std r31,-8*18(r1) - std r4,-8*19(r1) - std r5,-8*20(r1) + /* + * Use the stack red zone rather than a new frame for saving regs since + * in the case of no GPR loss the wakeup code branches directly back to + * the caller without deallocating the stack frame first. + */ + std r2,-8*1(r1) + std r14,-8*2(r1) + std r15,-8*3(r1) + std r16,-8*4(r1) + std r17,-8*5(r1) + std r18,-8*6(r1) + std r19,-8*7(r1) + std r20,-8*8(r1) + std r21,-8*9(r1) + std r22,-8*10(r1) + std r23,-8*11(r1) + std r24,-8*12(r1) + std r25,-8*13(r1) + std r26,-8*14(r1) + std r27,-8*15(r1) + std r28,-8*16(r1) + std r29,-8*17(r1) + std r30,-8*18(r1) + std r31,-8*19(r1) + std r4,-8*20(r1) + std r5,-8*21(r1) cmpwi r3,PNV_THREAD_NAP bne 1f IDLE_STATE_ENTER_SEQ_NORET(PPC_NAP) From 3d53c9ef96041136c4661bf570396fc694e74bf7 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 15 Oct 2021 23:01:48 +1100 Subject: [PATCH 1236/5344] KVM: PPC: Book3S HV: Fix stack handling in idle_kvm_start_guest() commit 9b4416c5095c20e110c82ae602c254099b83b72f upstream. In commit 10d91611f426 ("powerpc/64s: Reimplement book3s idle code in C") kvm_start_guest() became idle_kvm_start_guest(). The old code allocated a stack frame on the emergency stack, but didn't use the frame to store anything, and also didn't store anything in its caller's frame. idle_kvm_start_guest() on the other hand is written more like a normal C function, it creates a frame on entry, and also stores CR/LR into its callers frame (per the ABI). The problem is that there is no caller frame on the emergency stack. The emergency stack for a given CPU is allocated with: paca_ptrs[i]->emergency_sp = alloc_stack(limit, i) + THREAD_SIZE; So emergency_sp actually points to the first address above the emergency stack allocation for a given CPU, we must not store above it without first decrementing it to create a frame. This is different to the regular kernel stack, paca->kstack, which is initialised to point at an initial frame that is ready to use. idle_kvm_start_guest() stores the backchain, CR and LR all of which write outside the allocation for the emergency stack. It then creates a stack frame and saves the non-volatile registers. Unfortunately the frame it creates is not large enough to fit the non-volatiles, and so the saving of the non-volatile registers also writes outside the emergency stack allocation. The end result is that we corrupt whatever is at 0-24 bytes, and 112-248 bytes above the emergency stack allocation. In practice this has gone unnoticed because the memory immediately above the emergency stack happens to be used for other stack allocations, either another CPUs mc_emergency_sp or an IRQ stack. See the order of calls to irqstack_early_init() and emergency_stack_init(). The low addresses of another stack are the top of that stack, and so are only used if that stack is under extreme pressue, which essentially never happens in practice - and if it did there's a high likelyhood we'd crash due to that stack overflowing. Still, we shouldn't be corrupting someone else's stack, and it is purely luck that we aren't corrupting something else. To fix it we save CR/LR into the caller's frame using the existing r1 on entry, we then create a SWITCH_FRAME_SIZE frame (which has space for pt_regs) on the emergency stack with the backchain pointing to the existing stack, and then finally we switch to the new frame on the emergency stack. Fixes: 10d91611f426 ("powerpc/64s: Reimplement book3s idle code in C") Cc: stable@vger.kernel.org # v5.2+ Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211015133929.832061-1-mpe@ellerman.id.au Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index feaf6ca2e76c1..663d219c292b6 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -292,13 +292,15 @@ kvm_novcpu_exit: * r3 contains the SRR1 wakeup value, SRR1 is trashed. */ _GLOBAL(idle_kvm_start_guest) - ld r4,PACAEMERGSP(r13) mfcr r5 mflr r0 - std r1,0(r4) - std r5,8(r4) - std r0,16(r4) - subi r1,r4,STACK_FRAME_OVERHEAD + std r5, 8(r1) // Save CR in caller's frame + std r0, 16(r1) // Save LR in caller's frame + // Create frame on emergency stack + ld r4, PACAEMERGSP(r13) + stdu r1, -SWITCH_FRAME_SIZE(r4) + // Switch to new frame on emergency stack + mr r1, r4 SAVE_NVGPRS(r1) /* @@ -444,10 +446,9 @@ kvm_no_guest: /* set up r3 for return */ mfspr r3,SPRN_SRR1 REST_NVGPRS(r1) - addi r1, r1, STACK_FRAME_OVERHEAD - ld r0, 16(r1) - ld r5, 8(r1) - ld r1, 0(r1) + ld r1, 0(r1) // Switch back to caller stack + ld r0, 16(r1) // Reload LR + ld r5, 8(r1) // Reload CR mtlr r0 mtcr r5 blr From 727376479179bd9f444b401018793c1f922d9dba Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 15 Oct 2021 23:02:08 +1100 Subject: [PATCH 1237/5344] KVM: PPC: Book3S HV: Make idle_kvm_start_guest() return 0 if it went to guest commit cdeb5d7d890e14f3b70e8087e745c4a6a7d9f337 upstream. We call idle_kvm_start_guest() from power7_offline() if the thread has been requested to enter KVM. We pass it the SRR1 value that was returned from power7_idle_insn() which tells us what sort of wakeup we're processing. Depending on the SRR1 value we pass in, the KVM code might enter the guest, or it might return to us to do some host action if the wakeup requires it. If idle_kvm_start_guest() is able to handle the wakeup, and enter the guest it is supposed to indicate that by returning a zero SRR1 value to us. That was the behaviour prior to commit 10d91611f426 ("powerpc/64s: Reimplement book3s idle code in C"), however in that commit the handling of SRR1 was reworked, and the zeroing behaviour was lost. Returning from idle_kvm_start_guest() without zeroing the SRR1 value can confuse the host offline code, causing the guest to crash and other weirdness. Fixes: 10d91611f426 ("powerpc/64s: Reimplement book3s idle code in C") Cc: stable@vger.kernel.org # v5.2+ Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211015133929.832061-2-mpe@ellerman.id.au Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 663d219c292b6..f9c7326672b95 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -301,6 +301,7 @@ _GLOBAL(idle_kvm_start_guest) stdu r1, -SWITCH_FRAME_SIZE(r4) // Switch to new frame on emergency stack mr r1, r4 + std r3, 32(r1) // Save SRR1 wakeup value SAVE_NVGPRS(r1) /* @@ -352,6 +353,10 @@ kvm_unsplit_wakeup: kvm_secondary_got_guest: + // About to go to guest, clear saved SRR1 + li r0, 0 + std r0, 32(r1) + /* Set HSTATE_DSCR(r13) to something sensible */ ld r6, PACA_DSCR_DEFAULT(r13) std r6, HSTATE_DSCR(r13) @@ -443,8 +448,8 @@ kvm_no_guest: mfspr r4, SPRN_LPCR rlwimi r4, r3, 0, LPCR_PECE0 | LPCR_PECE1 mtspr SPRN_LPCR, r4 - /* set up r3 for return */ - mfspr r3,SPRN_SRR1 + // Return SRR1 wakeup value, or 0 if we went into the guest + ld r3, 32(r1) REST_NVGPRS(r1) ld r1, 0(r1) // Switch back to caller stack ld r0, 16(r1) // Reload LR From 941b866ad4690395300f1f73eb9a42b9b52f304c Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 20 Oct 2021 20:48:26 +1100 Subject: [PATCH 1238/5344] powerpc/idle: Don't corrupt back chain when going idle commit 496c5fe25c377ddb7815c4ce8ecfb676f051e9b6 upstream. In isa206_idle_insn_mayloss() we store various registers into the stack red zone, which is allowed. However inside the IDLE_STATE_ENTER_SEQ_NORET macro we save r2 again, to 0(r1), which corrupts the stack back chain. We used to do the same in isa206_idle_insn_mayloss() itself, but we fixed that in 73287caa9210 ("powerpc64/idle: Fix SP offsets when saving GPRs"), however we missed that the macro also corrupts the back chain. Corrupting the back chain is bad for debuggability but doesn't necessarily cause a bug. However we recently changed the stack handling in some KVM code, and it now relies on the stack back chain being valid when it returns. The corruption causes that code to return with r1 pointing somewhere in kernel data, at some point LR is restored from the stack and we branch to NULL or somewhere else invalid. Only affects Power8 hosts running KVM guests, with dynamic_mt_modes enabled (which it is by default). The fixes tag below points to the commit that changed the KVM stack handling, exposing this bug. The actual corruption of the back chain has always existed since 948cf67c4726 ("powerpc: Add NAP mode support on Power7 in HV mode"). Fixes: 9b4416c5095c ("KVM: PPC: Book3S HV: Fix stack handling in idle_kvm_start_guest()") Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211020094826.3222052-1-mpe@ellerman.id.au Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/idle_book3s.S | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index f2320c78372bf..c3f62c9ce7406 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -124,14 +124,16 @@ _GLOBAL(idle_return_gpr_loss) /* * This is the sequence required to execute idle instructions, as * specified in ISA v2.07 (and earlier). MSR[IR] and MSR[DR] must be 0. - * - * The 0(r1) slot is used to save r2 in isa206, so use that here. + * We have to store a GPR somewhere, ptesync, then reload it, and create + * a false dependency on the result of the load. It doesn't matter which + * GPR we store, or where we store it. We have already stored r2 to the + * stack at -8(r1) in isa206_idle_insn_mayloss, so use that. */ #define IDLE_STATE_ENTER_SEQ_NORET(IDLE_INST) \ /* Magic NAP/SLEEP/WINKLE mode enter sequence */ \ - std r2,0(r1); \ + std r2,-8(r1); \ ptesync; \ - ld r2,0(r1); \ + ld r2,-8(r1); \ 236: cmpd cr0,r2,r2; \ bne 236b; \ IDLE_INST; \ From aeff9689ca0cd75bfc62ccabdaeb360773b9ff0a Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Thu, 7 Oct 2021 19:44:30 +0200 Subject: [PATCH 1239/5344] nfc: nci: fix the UAF of rf_conn_info object commit 1b1499a817c90fd1ce9453a2c98d2a01cca0e775 upstream. The nci_core_conn_close_rsp_packet() function will release the conn_info with given conn_id. However, it needs to set the rf_conn_info to NULL to prevent other routines like nci_rf_intf_activated_ntf_packet() to trigger the UAF. Reviewed-by: Krzysztof Kozlowski Signed-off-by: Lin Ma Signed-off-by: Krzysztof Kozlowski Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/nfc/nci/rsp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/nfc/nci/rsp.c b/net/nfc/nci/rsp.c index a48297b79f34f..b0ed2b47ac437 100644 --- a/net/nfc/nci/rsp.c +++ b/net/nfc/nci/rsp.c @@ -277,6 +277,8 @@ static void nci_core_conn_close_rsp_packet(struct nci_dev *ndev, ndev->cur_conn_id); if (conn_info) { list_del(&conn_info->list); + if (conn_info == ndev->rf_conn_info) + ndev->rf_conn_info = NULL; devm_kfree(&ndev->nfc_dev->dev, conn_info); } } From 48d3bc9cf23a829c085407f95d22c9f8af72b400 Mon Sep 17 00:00:00 2001 From: Xiaolong Huang Date: Fri, 8 Oct 2021 14:58:30 +0800 Subject: [PATCH 1240/5344] isdn: cpai: check ctr->cnr to avoid array index out of bound commit 1f3e2e97c003f80c4b087092b225c8787ff91e4d upstream. The cmtp_add_connection() would add a cmtp session to a controller and run a kernel thread to process cmtp. __module_get(THIS_MODULE); session->task = kthread_run(cmtp_session, session, "kcmtpd_ctr_%d", session->num); During this process, the kernel thread would call detach_capi_ctr() to detach a register controller. if the controller was not attached yet, detach_capi_ctr() would trigger an array-index-out-bounds bug. [ 46.866069][ T6479] UBSAN: array-index-out-of-bounds in drivers/isdn/capi/kcapi.c:483:21 [ 46.867196][ T6479] index -1 is out of range for type 'capi_ctr *[32]' [ 46.867982][ T6479] CPU: 1 PID: 6479 Comm: kcmtpd_ctr_0 Not tainted 5.15.0-rc2+ #8 [ 46.869002][ T6479] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 [ 46.870107][ T6479] Call Trace: [ 46.870473][ T6479] dump_stack_lvl+0x57/0x7d [ 46.870974][ T6479] ubsan_epilogue+0x5/0x40 [ 46.871458][ T6479] __ubsan_handle_out_of_bounds.cold+0x43/0x48 [ 46.872135][ T6479] detach_capi_ctr+0x64/0xc0 [ 46.872639][ T6479] cmtp_session+0x5c8/0x5d0 [ 46.873131][ T6479] ? __init_waitqueue_head+0x60/0x60 [ 46.873712][ T6479] ? cmtp_add_msgpart+0x120/0x120 [ 46.874256][ T6479] kthread+0x147/0x170 [ 46.874709][ T6479] ? set_kthread_struct+0x40/0x40 [ 46.875248][ T6479] ret_from_fork+0x1f/0x30 [ 46.875773][ T6479] Signed-off-by: Xiaolong Huang Acked-by: Arnd Bergmann Link: https://lore.kernel.org/r/20211008065830.305057-1-butterflyhuangxx@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/isdn/capi/kcapi.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/isdn/capi/kcapi.c b/drivers/isdn/capi/kcapi.c index 18de41a266ebe..aa625b7ddcce2 100644 --- a/drivers/isdn/capi/kcapi.c +++ b/drivers/isdn/capi/kcapi.c @@ -565,6 +565,11 @@ int detach_capi_ctr(struct capi_ctr *ctr) ctr_down(ctr, CAPI_CTR_DETACHED); + if (ctr->cnr < 1 || ctr->cnr - 1 >= CAPI_MAXCONTR) { + err = -EINVAL; + goto unlock_out; + } + if (capi_controller[ctr->cnr - 1] != ctr) { err = -EINVAL; goto unlock_out; From 1a28994ef87a8d74e40c688e53a4b0b4d9aab84e Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Tue, 5 Oct 2021 22:54:54 +0200 Subject: [PATCH 1241/5344] netfilter: Kconfig: use 'default y' instead of 'm' for bool config option commit 77076934afdcd46516caf18ed88b2f88025c9ddb upstream. This option, NF_CONNTRACK_SECMARK, is a bool, so it can never be 'm'. Fixes: 33b8e77605620 ("[NETFILTER]: Add CONFIG_NETFILTER_ADVANCED option") Signed-off-by: Vegard Nossum Signed-off-by: Pablo Neira Ayuso Signed-off-by: Greg Kroah-Hartman --- net/netfilter/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 91efae88e8c2a..ef72819d9d315 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -94,7 +94,7 @@ config NF_CONNTRACK_MARK config NF_CONNTRACK_SECMARK bool 'Connection tracking security mark support' depends on NETWORK_SECMARK - default m if NETFILTER_ADVANCED=n + default y if NETFILTER_ADVANCED=n help This option enables security markings to be applied to connections. Typically they are copied to connections from From cc21c9dff73e3fe3189efc4cf1cfccb6bb6d72b6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 12 Oct 2021 18:37:09 +0200 Subject: [PATCH 1242/5344] selftests: netfilter: remove stray bash debug line commit 3e6ed7703dae6838c104d73d3e76e9b79f5c0528 upstream. This should not be there. Fixes: 2de03b45236f ("selftests: netfilter: add flowtable test script") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Greg Kroah-Hartman --- tools/testing/selftests/netfilter/nft_flowtable.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/netfilter/nft_flowtable.sh index 16571ac1dab40..1ccb12fe2511a 100755 --- a/tools/testing/selftests/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/netfilter/nft_flowtable.sh @@ -174,7 +174,6 @@ fi ip netns exec ns1 ping -c 1 -q 10.0.2.99 > /dev/null if [ $? -ne 0 ];then echo "ERROR: ns1 cannot reach ns2" 1>&2 - bash exit 1 fi From 7857d6663d91058c12fbda566ff31248cde6ab79 Mon Sep 17 00:00:00 2001 From: Brendan Higgins Date: Wed, 29 Sep 2021 14:27:09 -0700 Subject: [PATCH 1243/5344] gcc-plugins/structleak: add makefile var for disabling structleak [ Upstream commit 554afc3b9797511e3245864e32aebeb6abbab1e3 ] KUnit and structleak don't play nice, so add a makefile variable for enabling structleak when it complains. Co-developed-by: Kees Cook Signed-off-by: Kees Cook Signed-off-by: Brendan Higgins Reviewed-by: David Gow Signed-off-by: Shuah Khan Signed-off-by: Sasha Levin --- scripts/Makefile.gcc-plugins | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/Makefile.gcc-plugins b/scripts/Makefile.gcc-plugins index 5f7df50cfe7aa..63d21489216b0 100644 --- a/scripts/Makefile.gcc-plugins +++ b/scripts/Makefile.gcc-plugins @@ -19,6 +19,10 @@ gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF) \ += -fplugin-arg-structleak_plugin-byref gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL) \ += -fplugin-arg-structleak_plugin-byref-all +ifdef CONFIG_GCC_PLUGIN_STRUCTLEAK + DISABLE_STRUCTLEAK_PLUGIN += -fplugin-arg-structleak_plugin-disable +endif +export DISABLE_STRUCTLEAK_PLUGIN gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_STRUCTLEAK) \ += -DSTRUCTLEAK_PLUGIN From a20e14d3ca043e075f88d664af3709bdb74c7d84 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 1 Oct 2021 13:52:30 +0100 Subject: [PATCH 1244/5344] btrfs: deal with errors when checking if a dir entry exists during log replay [ Upstream commit 77a5b9e3d14cbce49ceed2766b2003c034c066dc ] Currently inode_in_dir() ignores errors returned from btrfs_lookup_dir_index_item() and from btrfs_lookup_dir_item(), treating any errors as if the directory entry does not exists in the fs/subvolume tree, which is obviously not correct, as we can get errors such as -EIO when reading extent buffers while searching the fs/subvolume's tree. Fix that by making inode_in_dir() return the errors and making its only caller, add_inode_ref(), deal with returned errors as well. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/tree-log.c | 47 ++++++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9d358dafef367..f342055699870 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -900,9 +900,11 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, } /* - * helper function to see if a given name and sequence number found - * in an inode back reference are already in a directory and correctly - * point to this inode + * See if a given name and sequence number found in an inode back reference are + * already in a directory and correctly point to this inode. + * + * Returns: < 0 on error, 0 if the directory entry does not exists and 1 if it + * exists. */ static noinline int inode_in_dir(struct btrfs_root *root, struct btrfs_path *path, @@ -911,29 +913,35 @@ static noinline int inode_in_dir(struct btrfs_root *root, { struct btrfs_dir_item *di; struct btrfs_key location; - int match = 0; + int ret = 0; di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, index, name, name_len, 0); - if (di && !IS_ERR(di)) { + if (IS_ERR(di)) { + if (PTR_ERR(di) != -ENOENT) + ret = PTR_ERR(di); + goto out; + } else if (di) { btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); if (location.objectid != objectid) goto out; - } else + } else { goto out; - btrfs_release_path(path); + } + btrfs_release_path(path); di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); - if (di && !IS_ERR(di)) { - btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); - if (location.objectid != objectid) - goto out; - } else + if (IS_ERR(di)) { + ret = PTR_ERR(di); goto out; - match = 1; + } else if (di) { + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); + if (location.objectid == objectid) + ret = 1; + } out: btrfs_release_path(path); - return match; + return ret; } /* @@ -1500,10 +1508,12 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, if (ret) goto out; - /* if we already have a perfect match, we're done */ - if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), - btrfs_ino(BTRFS_I(inode)), ref_index, - name, namelen)) { + ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), + btrfs_ino(BTRFS_I(inode)), ref_index, + name, namelen); + if (ret < 0) { + goto out; + } else if (ret == 0) { /* * look for a conflicting back reference in the * metadata. if we find one we have to unlink that name @@ -1561,6 +1571,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, btrfs_update_inode(trans, root, inode); } + /* Else, ret == 1, we already have a perfect match, we're done. */ ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; kfree(name); From 99a7a3f3514fe337cfea8c914439756ba5ddd129 Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Fri, 8 Oct 2021 12:34:39 +0200 Subject: [PATCH 1245/5344] net: stmmac: add support for dwmac 3.40a [ Upstream commit 9cb1d19f47fafad7dcf7c8564e633440c946cfd7 ] dwmac 3.40a is an old ip version that can be found on SPEAr3xx soc. Signed-off-by: Herve Codina Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c | 1 + drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c index fad503820e040..b3365b34cac7c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c @@ -71,6 +71,7 @@ static int dwmac_generic_probe(struct platform_device *pdev) static const struct of_device_id dwmac_generic_match[] = { { .compatible = "st,spear600-gmac"}, + { .compatible = "snps,dwmac-3.40a"}, { .compatible = "snps,dwmac-3.50a"}, { .compatible = "snps,dwmac-3.610"}, { .compatible = "snps,dwmac-3.70a"}, diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index 678aa2b001e01..a46fea472bc46 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -505,6 +505,14 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac) plat->pmt = 1; } + if (of_device_is_compatible(np, "snps,dwmac-3.40a")) { + plat->has_gmac = 1; + plat->enh_desc = 1; + plat->tx_coe = 1; + plat->bugged_jumbo = 1; + plat->pmt = 1; + } + if (of_device_is_compatible(np, "snps,dwmac-4.00") || of_device_is_compatible(np, "snps,dwmac-4.10a") || of_device_is_compatible(np, "snps,dwmac-4.20a")) { From f550dabfb9cfda90be95a4cd71a019376d2a94b2 Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Fri, 8 Oct 2021 12:34:40 +0200 Subject: [PATCH 1246/5344] ARM: dts: spear3xx: Fix gmac node [ Upstream commit 6636fec29cdf6665bd219564609e8651f6ddc142 ] On SPEAr3xx, ethernet driver is not compatible with the SPEAr600 one. Indeed, SPEAr3xx uses an earlier version of this IP (v3.40) and needs some driver tuning compare to SPEAr600. The v3.40 IP support was added to stmmac driver and this patch fixes this issue and use the correct compatible string for SPEAr3xx Signed-off-by: Herve Codina Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- arch/arm/boot/dts/spear3xx.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/spear3xx.dtsi b/arch/arm/boot/dts/spear3xx.dtsi index f266b7b034823..cc88ebe7a60ce 100644 --- a/arch/arm/boot/dts/spear3xx.dtsi +++ b/arch/arm/boot/dts/spear3xx.dtsi @@ -47,7 +47,7 @@ }; gmac: eth@e0800000 { - compatible = "st,spear600-gmac"; + compatible = "snps,dwmac-3.40a"; reg = <0xe0800000 0x8000>; interrupts = <23 22>; interrupt-names = "macirq", "eth_wake_irq"; From 8c113212f5e4934a2e4f2e84c9e880cdf276f6e6 Mon Sep 17 00:00:00 2001 From: Zheyu Ma Date: Sat, 9 Oct 2021 11:33:49 +0000 Subject: [PATCH 1247/5344] isdn: mISDN: Fix sleeping function called from invalid context [ Upstream commit 6510e80a0b81b5d814e3aea6297ba42f5e76f73c ] The driver can call card->isac.release() function from an atomic context. Fix this by calling this function after releasing the lock. The following log reveals it: [ 44.168226 ] BUG: sleeping function called from invalid context at kernel/workqueue.c:3018 [ 44.168941 ] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 5475, name: modprobe [ 44.169574 ] INFO: lockdep is turned off. [ 44.169899 ] irq event stamp: 0 [ 44.170160 ] hardirqs last enabled at (0): [<0000000000000000>] 0x0 [ 44.170627 ] hardirqs last disabled at (0): [] copy_process+0x132d/0x3e00 [ 44.171240 ] softirqs last enabled at (0): [] copy_process+0x135a/0x3e00 [ 44.171852 ] softirqs last disabled at (0): [<0000000000000000>] 0x0 [ 44.172318 ] Preemption disabled at: [ 44.172320 ] [] nj_release+0x69/0x500 [netjet] [ 44.174441 ] Call Trace: [ 44.174630 ] dump_stack_lvl+0xa8/0xd1 [ 44.174912 ] dump_stack+0x15/0x17 [ 44.175166 ] ___might_sleep+0x3a2/0x510 [ 44.175459 ] ? nj_release+0x69/0x500 [netjet] [ 44.175791 ] __might_sleep+0x82/0xe0 [ 44.176063 ] ? start_flush_work+0x20/0x7b0 [ 44.176375 ] start_flush_work+0x33/0x7b0 [ 44.176672 ] ? trace_irq_enable_rcuidle+0x85/0x170 [ 44.177034 ] ? kasan_quarantine_put+0xaa/0x1f0 [ 44.177372 ] ? kasan_quarantine_put+0xaa/0x1f0 [ 44.177711 ] __flush_work+0x11a/0x1a0 [ 44.177991 ] ? flush_work+0x20/0x20 [ 44.178257 ] ? lock_release+0x13c/0x8f0 [ 44.178550 ] ? __kasan_check_write+0x14/0x20 [ 44.178872 ] ? do_raw_spin_lock+0x148/0x360 [ 44.179187 ] ? read_lock_is_recursive+0x20/0x20 [ 44.179530 ] ? __kasan_check_read+0x11/0x20 [ 44.179846 ] ? do_raw_spin_unlock+0x55/0x900 [ 44.180168 ] ? ____kasan_slab_free+0x116/0x140 [ 44.180505 ] ? _raw_spin_unlock_irqrestore+0x41/0x60 [ 44.180878 ] ? skb_queue_purge+0x1a3/0x1c0 [ 44.181189 ] ? kfree+0x13e/0x290 [ 44.181438 ] flush_work+0x17/0x20 [ 44.181695 ] mISDN_freedchannel+0xe8/0x100 [ 44.182006 ] isac_release+0x210/0x260 [mISDNipac] [ 44.182366 ] nj_release+0xf6/0x500 [netjet] [ 44.182685 ] nj_remove+0x48/0x70 [netjet] [ 44.182989 ] pci_device_remove+0xa9/0x250 Signed-off-by: Zheyu Ma Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/isdn/hardware/mISDN/netjet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/isdn/hardware/mISDN/netjet.c b/drivers/isdn/hardware/mISDN/netjet.c index 9e6aab04f9d61..8299defff55ae 100644 --- a/drivers/isdn/hardware/mISDN/netjet.c +++ b/drivers/isdn/hardware/mISDN/netjet.c @@ -949,8 +949,8 @@ nj_release(struct tiger_hw *card) nj_disable_hwirq(card); mode_tiger(&card->bc[0], ISDN_P_NONE); mode_tiger(&card->bc[1], ISDN_P_NONE); - card->isac.release(&card->isac); spin_unlock_irqrestore(&card->lock, flags); + card->isac.release(&card->isac); release_region(card->base, card->base_s); card->base_s = 0; } From bb5edfb023171e2bace265840745b44e1beca9cc Mon Sep 17 00:00:00 2001 From: Prashant Malani Date: Tue, 28 Sep 2021 03:19:34 -0700 Subject: [PATCH 1248/5344] platform/x86: intel_scu_ipc: Update timeout value in comment [ Upstream commit a0c5814b9933f25ecb6de169483c5b88cf632bca ] The comment decribing the IPC timeout hadn't been updated when the actual timeout was changed from 3 to 5 seconds in commit a7d53dbbc70a ("platform/x86: intel_scu_ipc: Increase virtual timeout from 3 to 5 seconds") . Since the value is anyway updated to 10s now, take this opportunity to update the value in the comment too. Signed-off-by: Prashant Malani Cc: Benson Leung Reviewed-by: Mika Westerberg Link: https://lore.kernel.org/r/20210928101932.2543937-4-pmalani@chromium.org Signed-off-by: Hans de Goede Signed-off-by: Sasha Levin --- drivers/platform/x86/intel_scu_ipc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/intel_scu_ipc.c b/drivers/platform/x86/intel_scu_ipc.c index e330ec73c465d..bcb516892d01d 100644 --- a/drivers/platform/x86/intel_scu_ipc.c +++ b/drivers/platform/x86/intel_scu_ipc.c @@ -181,7 +181,7 @@ static inline int busy_loop(struct intel_scu_ipc_dev *scu) return 0; } -/* Wait till ipc ioc interrupt is received or timeout in 3 HZ */ +/* Wait till ipc ioc interrupt is received or timeout in 10 HZ */ static inline int ipc_wait_for_interrupt(struct intel_scu_ipc_dev *scu) { int status; From f7c0a7fd2b1801ff10f410adc0551ffffd02bf1c Mon Sep 17 00:00:00 2001 From: Kai Vehmanen Date: Tue, 12 Oct 2021 17:29:35 +0300 Subject: [PATCH 1249/5344] ALSA: hda: avoid write to STATESTS if controller is in reset [ Upstream commit b37a15188eae9d4c49c5bb035e0c8d4058e4d9b3 ] The snd_hdac_bus_reset_link() contains logic to clear STATESTS register before performing controller reset. This code dates back to an old bugfix in commit e8a7f136f5ed ("[ALSA] hda-intel - Improve HD-audio codec probing robustness"). Originally the code was added to azx_reset(). The code was moved around in commit a41d122449be ("ALSA: hda - Embed bus into controller object") and ended up to snd_hdac_bus_reset_link() and called primarily via snd_hdac_bus_init_chip(). The logic to clear STATESTS is correct when snd_hdac_bus_init_chip() is called when controller is not in reset. In this case, STATESTS can be cleared. This can be useful e.g. when forcing a controller reset to retry codec probe. A normal non-power-on reset will not clear the bits. However, this old logic is problematic when controller is already in reset. The HDA specification states that controller must be taken out of reset before writing to registers other than GCTL.CRST (1.0a spec, 3.3.7). The write to STATESTS in snd_hdac_bus_reset_link() will be lost if the controller is already in reset per the HDA specification mentioned. This has been harmless on older hardware. On newer generation of Intel PCIe based HDA controllers, if configured to report issues, this write will emit an unsupported request error. If ACPI Platform Error Interface (APEI) is enabled in kernel, this will end up to kernel log. Fix the code in snd_hdac_bus_reset_link() to only clear the STATESTS if the function is called when controller is not in reset. Otherwise clearing the bits is not possible and should be skipped. Signed-off-by: Kai Vehmanen Link: https://lore.kernel.org/r/20211012142935.3731820-1-kai.vehmanen@linux.intel.com Signed-off-by: Takashi Iwai Signed-off-by: Sasha Levin --- sound/hda/hdac_controller.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sound/hda/hdac_controller.c b/sound/hda/hdac_controller.c index 7e7be8e4dcf9c..87ba66dcfd478 100644 --- a/sound/hda/hdac_controller.c +++ b/sound/hda/hdac_controller.c @@ -395,8 +395,9 @@ int snd_hdac_bus_reset_link(struct hdac_bus *bus, bool full_reset) if (!full_reset) goto skip_reset; - /* clear STATESTS */ - snd_hdac_chip_writew(bus, STATESTS, STATESTS_INT_MASK); + /* clear STATESTS if not in reset */ + if (snd_hdac_chip_readb(bus, GCTL) & AZX_GCTL_RESET) + snd_hdac_chip_writew(bus, STATESTS, STATESTS_INT_MASK); /* reset controller */ snd_hdac_bus_enter_link_reset(bus); From 690fbc3fc3e05f5edc6d861eeba94b3fc227bf27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 15 Oct 2021 21:19:33 -0700 Subject: [PATCH 1250/5344] Input: snvs_pwrkey - add clk handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit d997cc1715df7b6c3df798881fb9941acf0079f8 ] On i.MX7S and i.MX8M* (but not i.MX6*) the pwrkey device has an associated clock. Accessing the registers requires that this clock is enabled. Binding the driver on at least i.MX7S and i.MX8MP while not having the clock enabled results in a complete hang of the machine. (This usually only happens if snvs_pwrkey is built as a module and the rtc-snvs driver isn't already bound because at bootup the required clk is on and only gets disabled when the clk framework disables unused clks late during boot.) This completes the fix in commit 135be16d3505 ("ARM: dts: imx7s: add snvs clock to pwrkey"). Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20211013062848.2667192-1-u.kleine-koenig@pengutronix.de Signed-off-by: Dmitry Torokhov Signed-off-by: Sasha Levin --- drivers/input/keyboard/snvs_pwrkey.c | 29 ++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/drivers/input/keyboard/snvs_pwrkey.c b/drivers/input/keyboard/snvs_pwrkey.c index e76b7a400a1c1..248bb86f4b3f4 100644 --- a/drivers/input/keyboard/snvs_pwrkey.c +++ b/drivers/input/keyboard/snvs_pwrkey.c @@ -3,6 +3,7 @@ // Driver for the IMX SNVS ON/OFF Power Key // Copyright (C) 2015 Freescale Semiconductor, Inc. All Rights Reserved. +#include #include #include #include @@ -81,6 +82,11 @@ static irqreturn_t imx_snvs_pwrkey_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } +static void imx_snvs_pwrkey_disable_clk(void *data) +{ + clk_disable_unprepare(data); +} + static void imx_snvs_pwrkey_act(void *pdata) { struct pwrkey_drv_data *pd = pdata; @@ -93,6 +99,7 @@ static int imx_snvs_pwrkey_probe(struct platform_device *pdev) struct pwrkey_drv_data *pdata = NULL; struct input_dev *input = NULL; struct device_node *np; + struct clk *clk; int error; /* Get SNVS register Page */ @@ -115,6 +122,28 @@ static int imx_snvs_pwrkey_probe(struct platform_device *pdev) dev_warn(&pdev->dev, "KEY_POWER without setting in dts\n"); } + clk = devm_clk_get_optional(&pdev->dev, NULL); + if (IS_ERR(clk)) { + dev_err(&pdev->dev, "Failed to get snvs clock (%pe)\n", clk); + return PTR_ERR(clk); + } + + error = clk_prepare_enable(clk); + if (error) { + dev_err(&pdev->dev, "Failed to enable snvs clock (%pe)\n", + ERR_PTR(error)); + return error; + } + + error = devm_add_action_or_reset(&pdev->dev, + imx_snvs_pwrkey_disable_clk, clk); + if (error) { + dev_err(&pdev->dev, + "Failed to register clock cleanup handler (%pe)\n", + ERR_PTR(error)); + return error; + } + pdata->wakeup = of_property_read_bool(np, "wakeup-source"); pdata->irq = platform_get_irq(pdev, 0); From e53f87c777dfb5451de4ee747c907aebf1beacf9 Mon Sep 17 00:00:00 2001 From: Yanfei Xu Date: Sun, 26 Sep 2021 12:53:13 +0800 Subject: [PATCH 1251/5344] net: mdiobus: Fix memory leak in __mdiobus_register commit ab609f25d19858513919369ff3d9a63c02cd9e2e upstream. Once device_register() failed, we should call put_device() to decrement reference count for cleanup. Or it will cause memory leak. BUG: memory leak unreferenced object 0xffff888114032e00 (size 256): comm "kworker/1:3", pid 2960, jiffies 4294943572 (age 15.920s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 08 2e 03 14 81 88 ff ff ................ 08 2e 03 14 81 88 ff ff 90 76 65 82 ff ff ff ff .........ve..... backtrace: [] kmalloc include/linux/slab.h:591 [inline] [] kzalloc include/linux/slab.h:721 [inline] [] device_private_init drivers/base/core.c:3203 [inline] [] device_add+0x89b/0xdf0 drivers/base/core.c:3253 [] __mdiobus_register+0xc3/0x450 drivers/net/phy/mdio_bus.c:537 [] __devm_mdiobus_register+0x75/0xf0 drivers/net/phy/mdio_devres.c:87 [] ax88772_init_mdio drivers/net/usb/asix_devices.c:676 [inline] [] ax88772_bind+0x330/0x480 drivers/net/usb/asix_devices.c:786 [] usbnet_probe+0x3ff/0xdf0 drivers/net/usb/usbnet.c:1745 [] usb_probe_interface+0x177/0x370 drivers/usb/core/driver.c:396 [] call_driver_probe drivers/base/dd.c:517 [inline] [] really_probe.part.0+0xe7/0x380 drivers/base/dd.c:596 [] really_probe drivers/base/dd.c:558 [inline] [] __driver_probe_device+0x10c/0x1e0 drivers/base/dd.c:751 [] driver_probe_device+0x2a/0x120 drivers/base/dd.c:781 [] __device_attach_driver+0xf6/0x140 drivers/base/dd.c:898 [] bus_for_each_drv+0xb7/0x100 drivers/base/bus.c:427 [] __device_attach+0x122/0x260 drivers/base/dd.c:969 [] bus_probe_device+0xc6/0xe0 drivers/base/bus.c:487 [] device_add+0x5fb/0xdf0 drivers/base/core.c:3359 [] usb_set_configuration+0x9d9/0xb90 drivers/usb/core/message.c:2170 [] usb_generic_driver_probe+0x8c/0xc0 drivers/usb/core/generic.c:238 BUG: memory leak unreferenced object 0xffff888116f06900 (size 32): comm "kworker/0:2", pid 2670, jiffies 4294944448 (age 7.160s) hex dump (first 32 bytes): 75 73 62 2d 30 30 31 3a 30 30 33 00 00 00 00 00 usb-001:003..... 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kstrdup+0x36/0x70 mm/util.c:60 [] kstrdup_const+0x53/0x80 mm/util.c:83 [] kvasprintf_const+0xc2/0x110 lib/kasprintf.c:48 [] kobject_set_name_vargs+0x3b/0xe0 lib/kobject.c:289 [] dev_set_name+0x63/0x90 drivers/base/core.c:3147 [] __mdiobus_register+0xbb/0x450 drivers/net/phy/mdio_bus.c:535 [] __devm_mdiobus_register+0x75/0xf0 drivers/net/phy/mdio_devres.c:87 [] ax88772_init_mdio drivers/net/usb/asix_devices.c:676 [inline] [] ax88772_bind+0x330/0x480 drivers/net/usb/asix_devices.c:786 [] usbnet_probe+0x3ff/0xdf0 drivers/net/usb/usbnet.c:1745 [] usb_probe_interface+0x177/0x370 drivers/usb/core/driver.c:396 [] call_driver_probe drivers/base/dd.c:517 [inline] [] really_probe.part.0+0xe7/0x380 drivers/base/dd.c:596 [] really_probe drivers/base/dd.c:558 [inline] [] __driver_probe_device+0x10c/0x1e0 drivers/base/dd.c:751 [] driver_probe_device+0x2a/0x120 drivers/base/dd.c:781 [] __device_attach_driver+0xf6/0x140 drivers/base/dd.c:898 [] bus_for_each_drv+0xb7/0x100 drivers/base/bus.c:427 [] __device_attach+0x122/0x260 drivers/base/dd.c:969 Reported-by: syzbot+398e7dc692ddbbb4cfec@syzkaller.appspotmail.com Signed-off-by: Yanfei Xu Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/phy/mdio_bus.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index bec73f0640d03..8377ff229a303 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -395,6 +395,7 @@ int __mdiobus_register(struct mii_bus *bus, struct module *owner) err = device_register(&bus->dev); if (err) { pr_err("mii_bus %s failed to register\n", bus->id); + put_device(&bus->dev); return -EINVAL; } From 263950494267f2bdd434fe769b9aaaf610fb20cb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 18 Oct 2021 15:44:12 -0400 Subject: [PATCH 1252/5344] tracing: Have all levels of checks prevent recursion commit ed65df63a39a3f6ed04f7258de8b6789e5021c18 upstream. While writing an email explaining the "bit = 0" logic for a discussion on making ftrace_test_recursion_trylock() disable preemption, I discovered a path that makes the "not do the logic if bit is zero" unsafe. The recursion logic is done in hot paths like the function tracer. Thus, any code executed causes noticeable overhead. Thus, tricks are done to try to limit the amount of code executed. This included the recursion testing logic. Having recursion testing is important, as there are many paths that can end up in an infinite recursion cycle when tracing every function in the kernel. Thus protection is needed to prevent that from happening. Because it is OK to recurse due to different running context levels (e.g. an interrupt preempts a trace, and then a trace occurs in the interrupt handler), a set of bits are used to know which context one is in (normal, softirq, irq and NMI). If a recursion occurs in the same level, it is prevented*. Then there are infrastructure levels of recursion as well. When more than one callback is attached to the same function to trace, it calls a loop function to iterate over all the callbacks. Both the callbacks and the loop function have recursion protection. The callbacks use the "ftrace_test_recursion_trylock()" which has a "function" set of context bits to test, and the loop function calls the internal trace_test_and_set_recursion() directly, with an "internal" set of bits. If an architecture does not implement all the features supported by ftrace then the callbacks are never called directly, and the loop function is called instead, which will implement the features of ftrace. Since both the loop function and the callbacks do recursion protection, it was seemed unnecessary to do it in both locations. Thus, a trick was made to have the internal set of recursion bits at a more significant bit location than the function bits. Then, if any of the higher bits were set, the logic of the function bits could be skipped, as any new recursion would first have to go through the loop function. This is true for architectures that do not support all the ftrace features, because all functions being traced must first go through the loop function before going to the callbacks. But this is not true for architectures that support all the ftrace features. That's because the loop function could be called due to two callbacks attached to the same function, but then a recursion function inside the callback could be called that does not share any other callback, and it will be called directly. i.e. traced_function_1: [ more than one callback tracing it ] call loop_func loop_func: trace_recursion set internal bit call callback callback: trace_recursion [ skipped because internal bit is set, return 0 ] call traced_function_2 traced_function_2: [ only traced by above callback ] call callback callback: trace_recursion [ skipped because internal bit is set, return 0 ] call traced_function_2 [ wash, rinse, repeat, BOOM! out of shampoo! ] Thus, the "bit == 0 skip" trick is not safe, unless the loop function is call for all functions. Since we want to encourage architectures to implement all ftrace features, having them slow down due to this extra logic may encourage the maintainers to update to the latest ftrace features. And because this logic is only safe for them, remove it completely. [*] There is on layer of recursion that is allowed, and that is to allow for the transition between interrupt context (normal -> softirq -> irq -> NMI), because a trace may occur before the context update is visible to the trace recursion logic. Link: https://lore.kernel.org/all/609b565a-ed6e-a1da-f025-166691b5d994@linux.alibaba.com/ Link: https://lkml.kernel.org/r/20211018154412.09fcad3c@gandalf.local.home Cc: Linus Torvalds Cc: Petr Mladek Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Thomas Gleixner Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: Miroslav Benes Cc: Joe Lawrence Cc: Colin Ian King Cc: Masami Hiramatsu Cc: "Peter Zijlstra (Intel)" Cc: Nicholas Piggin Cc: Jisheng Zhang Cc: =?utf-8?b?546L6LSH?= Cc: Guo Ren Cc: stable@vger.kernel.org Fixes: edc15cafcbfa3 ("tracing: Avoid unnecessary multiple recursion checks") Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/ftrace.c | 4 +-- kernel/trace/trace.h | 64 +++++++++++----------------------- kernel/trace/trace_functions.c | 2 +- 3 files changed, 23 insertions(+), 47 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 15f7e7c614210..e594d273de20b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6338,7 +6338,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op; int bit; - bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); + bit = trace_test_and_set_recursion(TRACE_LIST_START); if (bit < 0) return; @@ -6413,7 +6413,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, { int bit; - bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); + bit = trace_test_and_set_recursion(TRACE_LIST_START); if (bit < 0) return; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fc3aa81a43e3c..35e9a01b54800 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -518,23 +518,8 @@ struct tracer { * When function tracing occurs, the following steps are made: * If arch does not support a ftrace feature: * call internal function (uses INTERNAL bits) which calls... - * If callback is registered to the "global" list, the list - * function is called and recursion checks the GLOBAL bits. - * then this function calls... * The function callback, which can use the FTRACE bits to * check for recursion. - * - * Now if the arch does not suppport a feature, and it calls - * the global list function which calls the ftrace callback - * all three of these steps will do a recursion protection. - * There's no reason to do one if the previous caller already - * did. The recursion that we are protecting against will - * go through the same steps again. - * - * To prevent the multiple recursion checks, if a recursion - * bit is set that is higher than the MAX bit of the current - * check, then we know that the check was made by the previous - * caller, and we can skip the current check. */ enum { TRACE_BUFFER_BIT, @@ -547,12 +532,14 @@ enum { TRACE_FTRACE_NMI_BIT, TRACE_FTRACE_IRQ_BIT, TRACE_FTRACE_SIRQ_BIT, + TRACE_FTRACE_TRANSITION_BIT, - /* INTERNAL_BITs must be greater than FTRACE_BITs */ + /* Internal use recursion bits */ TRACE_INTERNAL_BIT, TRACE_INTERNAL_NMI_BIT, TRACE_INTERNAL_IRQ_BIT, TRACE_INTERNAL_SIRQ_BIT, + TRACE_INTERNAL_TRANSITION_BIT, TRACE_BRANCH_BIT, /* @@ -592,12 +579,6 @@ enum { * function is called to clear it. */ TRACE_GRAPH_NOTRACE_BIT, - - /* - * When transitioning between context, the preempt_count() may - * not be correct. Allow for a single recursion to cover this case. - */ - TRACE_TRANSITION_BIT, }; #define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0) @@ -617,12 +598,18 @@ enum { #define TRACE_CONTEXT_BITS 4 #define TRACE_FTRACE_START TRACE_FTRACE_BIT -#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) #define TRACE_LIST_START TRACE_INTERNAL_BIT -#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) -#define TRACE_CONTEXT_MASK TRACE_LIST_MAX +#define TRACE_CONTEXT_MASK ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) + +enum { + TRACE_CTX_NMI, + TRACE_CTX_IRQ, + TRACE_CTX_SOFTIRQ, + TRACE_CTX_NORMAL, + TRACE_CTX_TRANSITION, +}; static __always_inline int trace_get_context_bit(void) { @@ -630,59 +617,48 @@ static __always_inline int trace_get_context_bit(void) if (in_interrupt()) { if (in_nmi()) - bit = 0; + bit = TRACE_CTX_NMI; else if (in_irq()) - bit = 1; + bit = TRACE_CTX_IRQ; else - bit = 2; + bit = TRACE_CTX_SOFTIRQ; } else - bit = 3; + bit = TRACE_CTX_NORMAL; return bit; } -static __always_inline int trace_test_and_set_recursion(int start, int max) +static __always_inline int trace_test_and_set_recursion(int start) { unsigned int val = current->trace_recursion; int bit; - /* A previous recursion check was made */ - if ((val & TRACE_CONTEXT_MASK) > max) - return 0; - bit = trace_get_context_bit() + start; if (unlikely(val & (1 << bit))) { /* * It could be that preempt_count has not been updated during * a switch between contexts. Allow for a single recursion. */ - bit = TRACE_TRANSITION_BIT; + bit = start + TRACE_CTX_TRANSITION; if (trace_recursion_test(bit)) return -1; trace_recursion_set(bit); barrier(); - return bit + 1; + return bit; } - /* Normal check passed, clear the transition to allow it again */ - trace_recursion_clear(TRACE_TRANSITION_BIT); - val |= 1 << bit; current->trace_recursion = val; barrier(); - return bit + 1; + return bit; } static __always_inline void trace_clear_recursion(int bit) { unsigned int val = current->trace_recursion; - if (!bit) - return; - - bit--; bit = 1 << bit; val &= ~bit; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index b611cd36e22db..4e8acfe3437fd 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -138,7 +138,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, pc = preempt_count(); preempt_disable_notrace(); - bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX); + bit = trace_test_and_set_recursion(TRACE_FTRACE_START); if (bit < 0) goto out; From 1a9ee9a15a87c47b2c2b04087fe5eae96d38794a Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Wed, 8 Sep 2021 19:25:59 +0100 Subject: [PATCH 1253/5344] ARM: 9122/1: select HAVE_FUTEX_CMPXCHG commit 9d417cbe36eee7afdd85c2e871685f8dab7c2dba upstream. tglx notes: This function [futex_detect_cmpxchg] is only needed when an architecture has to runtime discover whether the CPU supports it or not. ARM has unconditional support for this, so the obvious thing to do is the below. Fixes linkage failure from Clang randconfigs: kernel/futex.o:(.text.fixup+0x5c): relocation truncated to fit: R_ARM_JUMP24 against `.init.text' and boot failures for CONFIG_THUMB2_KERNEL. Link: https://github.com/ClangBuiltLinux/linux/issues/325 Comments from Nick Desaulniers: See-also: 03b8c7b623c8 ("futex: Allow architectures to skip futex_atomic_cmpxchg_inatomic() test") Reported-by: Arnd Bergmann Reported-by: Nathan Chancellor Suggested-by: Thomas Gleixner Signed-off-by: Nick Desaulniers Reviewed-by: Thomas Gleixner Tested-by: Nathan Chancellor Reviewed-by: Linus Walleij Cc: stable@vger.kernel.org # v3.14+ Reviewed-by: Arnd Bergmann Signed-off-by: Russell King (Oracle) Signed-off-by: Greg Kroah-Hartman --- arch/arm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 9aa88715f196c..4b36bbcf5a5b4 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -85,6 +85,7 @@ config ARM select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL select HAVE_FUNCTION_GRAPH_TRACER if !THUMB2_KERNEL && !CC_IS_CLANG select HAVE_FUNCTION_TRACER if !XIP_KERNEL && (CC_IS_GCC || CLANG_VERSION >= 100000) + select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_GCC_PLUGINS select HAVE_HW_BREAKPOINT if PERF_EVENTS && (CPU_V6 || CPU_V6K || CPU_V7) select HAVE_IDE if PCI || ISA || PCMCIA From 999e907dc697105eec6fd28bbd02ed31b77f4017 Mon Sep 17 00:00:00 2001 From: Fabien Dessenne Date: Fri, 8 Oct 2021 14:25:17 +0200 Subject: [PATCH 1254/5344] pinctrl: stm32: use valid pin identifier in stm32_pinctrl_resume() commit c370bb474016ab9edfdabd7c08a88dd13a71ddbd upstream. When resuming from low power, the driver attempts to restore the configuration of some pins. This is done by a call to: stm32_pinctrl_restore_gpio_regs(struct stm32_pinctrl *pctl, u32 pin) where 'pin' must be a valid pin value (i.e. matching some 'groups->pin'). Fix the current implementation which uses some wrong 'pin' value. Fixes: e2f3cf18c3e2 ("pinctrl: stm32: add suspend/resume management") Signed-off-by: Fabien Dessenne Link: https://lore.kernel.org/r/20211008122517.617633-1-fabien.dessenne@foss.st.com Signed-off-by: Linus Walleij Signed-off-by: Greg Kroah-Hartman --- drivers/pinctrl/stm32/pinctrl-stm32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pinctrl/stm32/pinctrl-stm32.c b/drivers/pinctrl/stm32/pinctrl-stm32.c index 3c2a60f7b4172..f9abd4364fbaa 100644 --- a/drivers/pinctrl/stm32/pinctrl-stm32.c +++ b/drivers/pinctrl/stm32/pinctrl-stm32.c @@ -1554,8 +1554,8 @@ int __maybe_unused stm32_pinctrl_resume(struct device *dev) struct stm32_pinctrl_group *g = pctl->groups; int i; - for (i = g->pin; i < g->pin + pctl->ngroups; i++) - stm32_pinctrl_restore_gpio_regs(pctl, i); + for (i = 0; i < pctl->ngroups; i++, g++) + stm32_pinctrl_restore_gpio_regs(pctl, g->pin); return 0; } From 4100cecc94f3807efe70bdb9ce1fdb7e075efbda Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 27 Oct 2021 09:54:30 +0200 Subject: [PATCH 1255/5344] Linux 5.4.156 Link: https://lore.kernel.org/r/20211025190937.555108060@linuxfoundation.org Tested-by: Florian Fainelli Tested-by: Linux Kernel Functional Testing Tested-by: Shuah Khan Tested-by: Sudip Mukherjee Tested-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index f7e2bf924463b..ced1f0fd48dc6 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 PATCHLEVEL = 4 -SUBLEVEL = 155 +SUBLEVEL = 156 EXTRAVERSION = NAME = Kleptomaniac Octopus From bfdec055e8021f25d708f671a774d9208419db8d Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Mon, 4 Oct 2021 18:03:28 +0100 Subject: [PATCH 1256/5344] ARM: 9133/1: mm: proc-macros: ensure *_tlb_fns are 4B aligned commit e6a0c958bdf9b2e1b57501fc9433a461f0a6aadd upstream. A kernel built with CONFIG_THUMB2_KERNEL=y and using clang as the assembler could generate non-naturally-aligned v7wbi_tlb_fns which results in a boot failure. The original commit adding the macro missed the .align directive on this data. Link: https://github.com/ClangBuiltLinux/linux/issues/1447 Link: https://lore.kernel.org/all/0699da7b-354f-aecc-a62f-e25693209af4@linaro.org/ Debugged-by: Ard Biesheuvel Debugged-by: Nathan Chancellor Debugged-by: Richard Henderson Fixes: 66a625a88174 ("ARM: mm: proc-macros: Add generic proc/cache/tlb struct definition macros") Suggested-by: Ard Biesheuvel Acked-by: Ard Biesheuvel Signed-off-by: Nick Desaulniers Tested-by: Nathan Chancellor Signed-off-by: Russell King (Oracle) Signed-off-by: Greg Kroah-Hartman --- arch/arm/mm/proc-macros.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/mm/proc-macros.S b/arch/arm/mm/proc-macros.S index 60ac7c5999a98..86e54447dc916 100644 --- a/arch/arm/mm/proc-macros.S +++ b/arch/arm/mm/proc-macros.S @@ -342,6 +342,7 @@ ENTRY(\name\()_cache_fns) .macro define_tlb_functions name:req, flags_up:req, flags_smp .type \name\()_tlb_fns, #object + .align 2 ENTRY(\name\()_tlb_fns) .long \name\()_flush_user_tlb_range .long \name\()_flush_kern_tlb_range From 0b77c9654fab174e124beffef488223030977de3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 18 Oct 2021 15:30:04 +0100 Subject: [PATCH 1257/5344] ARM: 9134/1: remove duplicate memcpy() definition commit eaf6cc7165c9c5aa3c2f9faa03a98598123d0afb upstream. Both the decompressor code and the kasan logic try to override the memcpy() and memmove() definitions, which leading to a clash in a KASAN-enabled kernel with XZ decompression: arch/arm/boot/compressed/decompress.c:50:9: error: 'memmove' macro redefined [-Werror,-Wmacro-redefined] #define memmove memmove ^ arch/arm/include/asm/string.h:59:9: note: previous definition is here #define memmove(dst, src, len) __memmove(dst, src, len) ^ arch/arm/boot/compressed/decompress.c:51:9: error: 'memcpy' macro redefined [-Werror,-Wmacro-redefined] #define memcpy memcpy ^ arch/arm/include/asm/string.h:58:9: note: previous definition is here #define memcpy(dst, src, len) __memcpy(dst, src, len) ^ Here we want the set of functions from the decompressor, so undefine the other macros before the override. Link: https://lore.kernel.org/linux-arm-kernel/CACRpkdZYJogU_SN3H9oeVq=zJkRgRT1gDz3xp59gdqWXxw-B=w@mail.gmail.com/ Link: https://lore.kernel.org/lkml/202105091112.F5rmd4By-lkp@intel.com/ Fixes: d6d51a96c7d6 ("ARM: 9014/2: Replace string mem* functions for KASan") Fixes: a7f464f3db93 ("ARM: 7001/2: Wire up support for the XZ decompressor") Reported-by: kernel test robot Reviewed-by: Linus Walleij Signed-off-by: Arnd Bergmann Signed-off-by: Russell King (Oracle) Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/compressed/decompress.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm/boot/compressed/decompress.c b/arch/arm/boot/compressed/decompress.c index aa075d8372ea2..74255e8198314 100644 --- a/arch/arm/boot/compressed/decompress.c +++ b/arch/arm/boot/compressed/decompress.c @@ -47,7 +47,10 @@ extern char * strchrnul(const char *, int); #endif #ifdef CONFIG_KERNEL_XZ +/* Prevent KASAN override of string helpers in decompressor */ +#undef memmove #define memmove memmove +#undef memcpy #define memcpy memcpy #include "../../../../lib/decompress_unxz.c" #endif From 1fbc5597620f882fd63335f2a83aab320cdea3a5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 18 Oct 2021 15:30:09 +0100 Subject: [PATCH 1258/5344] ARM: 9139/1: kprobes: fix arch_init_kprobes() prototype commit 1f323127cab086e4fd618981b1e5edc396eaf0f4 upstream. With extra warnings enabled, gcc complains about this function definition: arch/arm/probes/kprobes/core.c: In function 'arch_init_kprobes': arch/arm/probes/kprobes/core.c:465:12: warning: old-style function definition [-Wold-style-definition] 465 | int __init arch_init_kprobes() Link: https://lore.kernel.org/all/20201027093057.c685a14b386acacb3c449e3d@kernel.org/ Fixes: 24ba613c9d6c ("ARM kprobes: core code") Acked-by: Masami Hiramatsu Signed-off-by: Arnd Bergmann Signed-off-by: Russell King (Oracle) Signed-off-by: Greg Kroah-Hartman --- arch/arm/probes/kprobes/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/probes/kprobes/core.c b/arch/arm/probes/kprobes/core.c index 90b5bc723c83f..0a783bd4641c5 100644 --- a/arch/arm/probes/kprobes/core.c +++ b/arch/arm/probes/kprobes/core.c @@ -534,7 +534,7 @@ static struct undef_hook kprobes_arm_break_hook = { #endif /* !CONFIG_THUMB2_KERNEL */ -int __init arch_init_kprobes() +int __init arch_init_kprobes(void) { arm_probes_decode_init(); #ifdef CONFIG_THUMB2_KERNEL From ff9d2858bd127d7ccb15d11f38eb48bfb924782e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 18 Oct 2021 15:30:37 +0100 Subject: [PATCH 1259/5344] ARM: 9141/1: only warn about XIP address when not compile testing commit 48ccc8edf5b90622cdc4f8878e0042ab5883e2ca upstream. In randconfig builds, we sometimes come across this warning: arm-linux-gnueabi-ld: XIP start address may cause MPU programming issues While this is helpful for actual systems to figure out why it fails, the warning does not provide any benefit for build testing, so guard it in a check for CONFIG_COMPILE_TEST, which is usually set on randconfig builds. Fixes: 216218308cfb ("ARM: 8713/1: NOMMU: Support MPU in XIP configuration") Signed-off-by: Arnd Bergmann Signed-off-by: Russell King (Oracle) Signed-off-by: Greg Kroah-Hartman --- arch/arm/kernel/vmlinux-xip.lds.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/kernel/vmlinux-xip.lds.S b/arch/arm/kernel/vmlinux-xip.lds.S index 8c74037ade229..b1f366df620b0 100644 --- a/arch/arm/kernel/vmlinux-xip.lds.S +++ b/arch/arm/kernel/vmlinux-xip.lds.S @@ -180,7 +180,7 @@ ASSERT(__hyp_idmap_text_end - (__hyp_idmap_text_start & PAGE_MASK) <= PAGE_SIZE, ASSERT((_end - __bss_start) >= 12288, ".bss too small for CONFIG_XIP_DEFLATED_DATA") #endif -#ifdef CONFIG_ARM_MPU +#if defined(CONFIG_ARM_MPU) && !defined(CONFIG_COMPILE_TEST) /* * Due to PMSAv7 restriction on base address and size we have to * enforce minimal alignment restrictions. It was seen that weaker From 36f7a2129c882f194a8090fade09049cdc4797ef Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Wed, 6 Oct 2021 01:55:22 +0530 Subject: [PATCH 1260/5344] powerpc/bpf: Fix BPF_MOD when imm == 1 commit 8bbc9d822421d9ac8ff9ed26a3713c9afc69d6c8 upstream. Only ignore the operation if dividing by 1. Fixes: 156d0e290e969c ("powerpc/ebpf/jit: Implement JIT compiler for extended BPF") Signed-off-by: Naveen N. Rao Tested-by: Johan Almbladh Reviewed-by: Christophe Leroy Acked-by: Song Liu Acked-by: Johan Almbladh Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/c674ca18c3046885602caebb326213731c675d06.1633464148.git.naveen.n.rao@linux.vnet.ibm.com [cascardo: use PPC_LI instead of EMIT(PPC_RAW_LI)] Signed-off-by: Thadeu Lima de Souza Cascardo Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/net/bpf_jit_comp64.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 20bfd753bcba6..81e6279c9874f 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -408,8 +408,14 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, case BPF_ALU64 | BPF_DIV | BPF_K: /* dst /= imm */ if (imm == 0) return -EINVAL; - else if (imm == 1) - goto bpf_alu32_trunc; + if (imm == 1) { + if (BPF_OP(code) == BPF_DIV) { + goto bpf_alu32_trunc; + } else { + PPC_LI(dst_reg, 0); + break; + } + } PPC_LI32(b2p[TMP_REG_1], imm); switch (BPF_CLASS(code)) { From d4138831b7fea1be7423d06fc55f14397a244f2c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Oct 2021 20:56:30 +0300 Subject: [PATCH 1261/5344] ipv6: use siphash in rt6_exception_hash() commit 4785305c05b25a242e5314cc821f54ade4c18810 upstream. A group of security researchers brought to our attention the weakness of hash function used in rt6_exception_hash() Lets use siphash instead of Jenkins Hash, to considerably reduce security risks. Following patch deals with IPv4. Fixes: 35732d01fe31 ("ipv6: introduce a hash table to store dst cache") Signed-off-by: Eric Dumazet Reported-by: Keyu Man Cc: Wei Wang Cc: Martin KaFai Lau Acked-by: Wei Wang Signed-off-by: David S. Miller [OP: adjusted context for 5.4 stable] Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman --- net/ipv6/route.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 3fb259c20546e..daa876c6ae8db 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -1502,17 +1503,24 @@ static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket) static u32 rt6_exception_hash(const struct in6_addr *dst, const struct in6_addr *src) { - static u32 seed __read_mostly; - u32 val; + static siphash_key_t rt6_exception_key __read_mostly; + struct { + struct in6_addr dst; + struct in6_addr src; + } __aligned(SIPHASH_ALIGNMENT) combined = { + .dst = *dst, + }; + u64 val; - net_get_random_once(&seed, sizeof(seed)); - val = jhash(dst, sizeof(*dst), seed); + net_get_random_once(&rt6_exception_key, sizeof(rt6_exception_key)); #ifdef CONFIG_IPV6_SUBTREES if (src) - val = jhash(src, sizeof(*src), val); + combined.src = *src; #endif - return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT); + val = siphash(&combined, sizeof(combined), &rt6_exception_key); + + return hash_64(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT); } /* Helper function to find the cached rt in the hash table From 67460b588cd27af5942c8a440d8dc4988a025eb1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Oct 2021 20:56:31 +0300 Subject: [PATCH 1262/5344] ipv4: use siphash instead of Jenkins in fnhe_hashfun() commit 6457378fe796815c973f631a1904e147d6ee33b1 upstream. A group of security researchers brought to our attention the weakness of hash function used in fnhe_hashfun(). Lets use siphash instead of Jenkins Hash, to considerably reduce security risks. Also remove the inline keyword, this really is distracting. Fixes: d546c621542d ("ipv4: harden fnhe_hashfun()") Signed-off-by: Eric Dumazet Reported-by: Keyu Man Cc: Willy Tarreau Signed-off-by: David S. Miller [OP: adjusted context for 5.4 stable] Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman --- net/ipv4/route.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 539492998864e..d1feec97fa062 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -631,14 +631,14 @@ static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash) kfree_rcu(oldest, rcu); } -static inline u32 fnhe_hashfun(__be32 daddr) +static u32 fnhe_hashfun(__be32 daddr) { - static u32 fnhe_hashrnd __read_mostly; - u32 hval; + static siphash_key_t fnhe_hash_key __read_mostly; + u64 hval; - net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd)); - hval = jhash_1word((__force u32) daddr, fnhe_hashrnd); - return hash_32(hval, FNHE_HASH_SHIFT); + net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key)); + hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key); + return hash_64(hval, FNHE_HASH_SHIFT); } static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe) From f222333321596ce74fa308b9e095b1682214734c Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Thu, 21 Oct 2021 14:29:44 +0200 Subject: [PATCH 1263/5344] usbnet: sanity check for maxpacket commit 397430b50a363d8b7bdda00522123f82df6adc5e upstream. maxpacket of 0 makes no sense and oopses as we need to divide by it. Give up. V2: fixed typo in log and stylistic issues Signed-off-by: Oliver Neukum Reported-by: syzbot+76bb1d34ffa0adc03baa@syzkaller.appspotmail.com Reviewed-by: Johan Hovold Link: https://lore.kernel.org/r/20211021122944.21816-1-oneukum@suse.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/net/usb/usbnet.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index dde05e2fdc3e6..be8b4321ffaad 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -1773,6 +1773,10 @@ usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod) if (!dev->rx_urb_size) dev->rx_urb_size = dev->hard_mtu; dev->maxpacket = usb_maxpacket (dev->udev, dev->out, 1); + if (dev->maxpacket == 0) { + /* that is a broken device */ + goto out4; + } /* let userspace know we have a random address */ if (ether_addr_equal(net->dev_addr, node_id)) From d9acf38b9c9dc17cc47b6868cc16da1b3aa5014a Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Tue, 26 Oct 2021 20:40:15 +0800 Subject: [PATCH 1264/5344] usbnet: fix error return code in usbnet_probe() commit 6f7c88691191e6c52ef2543d6f1da8d360b27a24 upstream. Return error code if usb_maxpacket() returns 0 in usbnet_probe() Fixes: 397430b50a36 ("usbnet: sanity check for maxpacket") Reported-by: Hulk Robot Signed-off-by: Wang Hai Reviewed-by: Johan Hovold Link: https://lore.kernel.org/r/20211026124015.3025136-1-wanghai38@huawei.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/net/usb/usbnet.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index be8b4321ffaad..b8b9df82f51ef 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -1775,6 +1775,7 @@ usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod) dev->maxpacket = usb_maxpacket (dev->udev, dev->out, 1); if (dev->maxpacket == 0) { /* that is a broken device */ + status = -ENODEV; goto out4; } From 036a0995cda1dbc5680b200956e6010e91e9702f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Fri, 8 Oct 2021 22:59:38 +0200 Subject: [PATCH 1265/5344] Revert "pinctrl: bcm: ns: support updated DT binding as syscon subnode" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 6dba4bdfd7a30e77b848a45404b224588bf989e5 upstream. This reverts commit a49d784d5a8272d0f63c448fe8dc69e589db006e. The updated binding was wrong / invalid and has been reverted. There isn't any upstream kernel DTS using it and Broadcom isn't known to use it neither. There is close to zero chance this will cause regression for anyone. Actually in-kernel bcm5301x.dtsi still uses the old good binding and so it's broken since the driver update. This revert fixes it. Signed-off-by: Rafał Miłecki Link: https://lore.kernel.org/r/20211008205938.29925-3-zajec5@gmail.com Signed-off-by: Linus Walleij Signed-off-by: Greg Kroah-Hartman --- drivers/pinctrl/bcm/pinctrl-ns.c | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/drivers/pinctrl/bcm/pinctrl-ns.c b/drivers/pinctrl/bcm/pinctrl-ns.c index e79690bd8b85f..d7f8175d2c1c8 100644 --- a/drivers/pinctrl/bcm/pinctrl-ns.c +++ b/drivers/pinctrl/bcm/pinctrl-ns.c @@ -5,7 +5,6 @@ #include #include -#include #include #include #include @@ -13,7 +12,6 @@ #include #include #include -#include #include #define FLAG_BCM4708 BIT(1) @@ -24,8 +22,7 @@ struct ns_pinctrl { struct device *dev; unsigned int chipset_flag; struct pinctrl_dev *pctldev; - struct regmap *regmap; - u32 offset; + void __iomem *base; struct pinctrl_desc pctldesc; struct ns_pinctrl_group *groups; @@ -232,9 +229,9 @@ static int ns_pinctrl_set_mux(struct pinctrl_dev *pctrl_dev, unset |= BIT(pin_number); } - regmap_read(ns_pinctrl->regmap, ns_pinctrl->offset, &tmp); + tmp = readl(ns_pinctrl->base); tmp &= ~unset; - regmap_write(ns_pinctrl->regmap, ns_pinctrl->offset, tmp); + writel(tmp, ns_pinctrl->base); return 0; } @@ -266,13 +263,13 @@ static const struct of_device_id ns_pinctrl_of_match_table[] = { static int ns_pinctrl_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; - struct device_node *np = dev->of_node; const struct of_device_id *of_id; struct ns_pinctrl *ns_pinctrl; struct pinctrl_desc *pctldesc; struct pinctrl_pin_desc *pin; struct ns_pinctrl_group *group; struct ns_pinctrl_function *function; + struct resource *res; int i; ns_pinctrl = devm_kzalloc(dev, sizeof(*ns_pinctrl), GFP_KERNEL); @@ -290,18 +287,12 @@ static int ns_pinctrl_probe(struct platform_device *pdev) return -EINVAL; ns_pinctrl->chipset_flag = (uintptr_t)of_id->data; - ns_pinctrl->regmap = syscon_node_to_regmap(of_get_parent(np)); - if (IS_ERR(ns_pinctrl->regmap)) { - int err = PTR_ERR(ns_pinctrl->regmap); - - dev_err(dev, "Failed to map pinctrl regs: %d\n", err); - - return err; - } - - if (of_property_read_u32(np, "offset", &ns_pinctrl->offset)) { - dev_err(dev, "Failed to get register offset\n"); - return -ENOENT; + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, + "cru_gpio_control"); + ns_pinctrl->base = devm_ioremap_resource(dev, res); + if (IS_ERR(ns_pinctrl->base)) { + dev_err(dev, "Failed to map pinctrl regs\n"); + return PTR_ERR(ns_pinctrl->base); } memcpy(pctldesc, &ns_pinctrl_desc, sizeof(*pctldesc)); From 15749662627aa37b430f9f3d0afd7de2881c7da3 Mon Sep 17 00:00:00 2001 From: Zheyu Ma Date: Fri, 22 Oct 2021 09:12:26 +0000 Subject: [PATCH 1266/5344] ata: sata_mv: Fix the error handling of mv_chip_id() commit a0023bb9dd9bc439d44604eeec62426a990054cd upstream. mv_init_host() propagates the value returned by mv_chip_id() which in turn gets propagated by mv_pci_init_one() and hits local_pci_probe(). During the process of driver probing, the probe function should return < 0 for failure, otherwise, the kernel will treat value > 0 as success. Since this is a bug rather than a recoverable runtime error we should use dev_alert() instead of dev_err(). Signed-off-by: Zheyu Ma Signed-off-by: Damien Le Moal Signed-off-by: Greg Kroah-Hartman --- drivers/ata/sata_mv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c index d1ed9679c7177..3fca3e13ed6ae 100644 --- a/drivers/ata/sata_mv.c +++ b/drivers/ata/sata_mv.c @@ -3892,8 +3892,8 @@ static int mv_chip_id(struct ata_host *host, unsigned int board_idx) break; default: - dev_err(host->dev, "BUG: invalid board index %u\n", board_idx); - return 1; + dev_alert(host->dev, "BUG: invalid board index %u\n", board_idx); + return -EINVAL; } hpriv->hp_flags = hp_flags; From 992f9c2b7da172d76c039a30c2c8e44cff3cce9d Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 25 Oct 2021 16:49:36 +0200 Subject: [PATCH 1267/5344] nfc: port100: fix using -ERRNO as command type mask commit 2195f2062e4cc93870da8e71c318ef98a1c51cef upstream. During probing, the driver tries to get a list (mask) of supported command types in port100_get_command_type_mask() function. The value is u64 and 0 is treated as invalid mask (no commands supported). The function however returns also -ERRNO as u64 which will be interpret as valid command mask. Return 0 on every error case of port100_get_command_type_mask(), so the probing will stop. Cc: Fixes: 0347a6ab300a ("NFC: port100: Commands mechanism implementation") Signed-off-by: Krzysztof Kozlowski Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/nfc/port100.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nfc/port100.c b/drivers/nfc/port100.c index 8e4d355dc3aec..1caebefb25ff1 100644 --- a/drivers/nfc/port100.c +++ b/drivers/nfc/port100.c @@ -1003,11 +1003,11 @@ static u64 port100_get_command_type_mask(struct port100 *dev) skb = port100_alloc_skb(dev, 0); if (!skb) - return -ENOMEM; + return 0; resp = port100_send_cmd_sync(dev, PORT100_CMD_GET_COMMAND_TYPE, skb); if (IS_ERR(resp)) - return PTR_ERR(resp); + return 0; if (resp->len < 8) mask = 0; From 87cc31f22cc1f26fd0336b3de33adeb72e47fff4 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Thu, 30 Sep 2021 20:49:42 +0300 Subject: [PATCH 1268/5344] Revert "net: mdiobus: Fix memory leak in __mdiobus_register" commit 10eff1f5788b6ffac212c254e2f3666219576889 upstream. This reverts commit ab609f25d19858513919369ff3d9a63c02cd9e2e. This patch is correct in the sense that we _should_ call device_put() in case of device_register() failure, but the problem in this code is more vast. We need to set bus->state to UNMDIOBUS_REGISTERED before calling device_register() to correctly release the device in mdiobus_free(). This patch prevents us from doing it, since in case of device_register() failure put_device() will be called 2 times and it will cause UAF or something else. Also, Reported-by: tag in revered commit was wrong, since syzbot reported different leak in same function. Link: https://lore.kernel.org/netdev/20210928092657.GI2048@kadam/ Acked-by: Yanfei Xu Signed-off-by: Pavel Skripkin Link: https://lore.kernel.org/r/f12fb1faa4eccf0f355788225335eb4309ff2599.1633024062.git.paskripkin@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/net/phy/mdio_bus.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 8377ff229a303..bec73f0640d03 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -395,7 +395,6 @@ int __mdiobus_register(struct mii_bus *bus, struct module *owner) err = device_register(&bus->dev); if (err) { pr_err("mii_bus %s failed to register\n", bus->id); - put_device(&bus->dev); return -EINVAL; } From bdbe4ab9f04d1744d04882c0c9fd1e27d69a4218 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Wed, 27 Oct 2021 17:59:20 -0400 Subject: [PATCH 1269/5344] net/tls: Fix flipped sign in tls_err_abort() calls commit da353fac65fede6b8b4cfe207f0d9408e3121105 upstream. sk->sk_err appears to expect a positive value, a convention that ktls doesn't always follow and that leads to memory corruption in other code. For instance, [kworker] tls_encrypt_done(..., err=) tls_err_abort(.., err) sk->sk_err = err; [task] splice_from_pipe_feed ... tls_sw_do_sendpage if (sk->sk_err) { ret = -sk->sk_err; // ret is positive splice_from_pipe_feed (continued) ret = actor(...) // ret is still positive and interpreted as bytes // written, resulting in underflow of buf->len and // sd->len, leading to huge buf->offset and bogus // addresses computed in later calls to actor() Fix all tls_err_abort() callers to pass a negative error code consistently and centralize the error-prone sign flip there, throwing in a warning to catch future misuse and uninlining the function so it really does only warn once. Cc: stable@vger.kernel.org Fixes: c46234ebb4d1e ("tls: RX path for ktls") Reported-by: syzbot+b187b77c8474f9648fae@syzkaller.appspotmail.com Signed-off-by: Daniel Jordan Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/tls.h | 9 ++------- net/tls/tls_sw.c | 17 +++++++++++++---- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index 697df45c0bcee..7f220e03ebb2d 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -360,6 +360,7 @@ int tls_sk_query(struct sock *sk, int optname, char __user *optval, int __user *optlen); int tls_sk_attach(struct sock *sk, int optname, char __user *optval, unsigned int optlen); +void tls_err_abort(struct sock *sk, int err); int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx); void tls_sw_strparser_arm(struct sock *sk, struct tls_context *ctx); @@ -465,12 +466,6 @@ static inline bool tls_is_sk_tx_device_offloaded(struct sock *sk) #endif } -static inline void tls_err_abort(struct sock *sk, int err) -{ - sk->sk_err = err; - sk->sk_error_report(sk); -} - static inline bool tls_bigint_increment(unsigned char *seq, int len) { int i; @@ -499,7 +494,7 @@ static inline void tls_advance_record_sn(struct sock *sk, struct cipher_context *ctx) { if (tls_bigint_increment(ctx->rec_seq, prot->rec_seq_size)) - tls_err_abort(sk, EBADMSG); + tls_err_abort(sk, -EBADMSG); if (prot->version != TLS_1_3_VERSION) tls_bigint_increment(ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 7fb5c067f4293..8c9d2055d92b0 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -35,6 +35,7 @@ * SOFTWARE. */ +#include #include #include #include @@ -43,6 +44,14 @@ #include #include +noinline void tls_err_abort(struct sock *sk, int err) +{ + WARN_ON_ONCE(err >= 0); + /* sk->sk_err should contain a positive error code. */ + sk->sk_err = -err; + sk->sk_error_report(sk); +} + static int __skb_nsg(struct sk_buff *skb, int offset, int len, unsigned int recursion_level) { @@ -416,7 +425,7 @@ int tls_tx_records(struct sock *sk, int flags) tx_err: if (rc < 0 && rc != -EAGAIN) - tls_err_abort(sk, EBADMSG); + tls_err_abort(sk, -EBADMSG); return rc; } @@ -761,7 +770,7 @@ static int tls_push_record(struct sock *sk, int flags, msg_pl->sg.size + prot->tail_size, i); if (rc < 0) { if (rc != -EINPROGRESS) { - tls_err_abort(sk, EBADMSG); + tls_err_abort(sk, -EBADMSG); if (split) { tls_ctx->pending_open_record_frags = true; tls_merge_open_record(sk, rec, tmp, orig_end); @@ -1822,7 +1831,7 @@ int tls_sw_recvmsg(struct sock *sk, err = decrypt_skb_update(sk, skb, &msg->msg_iter, &chunk, &zc, async_capable); if (err < 0 && err != -EINPROGRESS) { - tls_err_abort(sk, EBADMSG); + tls_err_abort(sk, -EBADMSG); goto recv_end; } @@ -2002,7 +2011,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, } if (err < 0) { - tls_err_abort(sk, EBADMSG); + tls_err_abort(sk, -EBADMSG); goto splice_read_end; } ctx->decrypted = true; From 64ba8c21ceca741d8455349546c2d6ac4ab51d88 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 25 Oct 2021 13:56:08 +0200 Subject: [PATCH 1270/5344] mmc: vub300: fix control-message timeouts commit 8c8171929116cc23f74743d99251eedadf62341a upstream. USB control-message timeouts are specified in milliseconds and should specifically not vary with CONFIG_HZ. Fixes: 88095e7b473a ("mmc: Add new VUB300 USB-to-SD/SDIO/MMC driver") Cc: stable@vger.kernel.org # 3.0 Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20211025115608.5287-1-johan@kernel.org Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/host/vub300.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/mmc/host/vub300.c b/drivers/mmc/host/vub300.c index 156046e56a584..5e1d7025dbf78 100644 --- a/drivers/mmc/host/vub300.c +++ b/drivers/mmc/host/vub300.c @@ -576,7 +576,7 @@ static void check_vub300_port_status(struct vub300_mmc_host *vub300) GET_SYSTEM_PORT_STATUS, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0x0000, 0x0000, &vub300->system_port_status, - sizeof(vub300->system_port_status), HZ); + sizeof(vub300->system_port_status), 1000); if (sizeof(vub300->system_port_status) == retval) new_system_port_status(vub300); } @@ -1241,7 +1241,7 @@ static void __download_offload_pseudocode(struct vub300_mmc_host *vub300, SET_INTERRUPT_PSEUDOCODE, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0x0000, 0x0000, - xfer_buffer, xfer_length, HZ); + xfer_buffer, xfer_length, 1000); kfree(xfer_buffer); if (retval < 0) goto copy_error_message; @@ -1284,7 +1284,7 @@ static void __download_offload_pseudocode(struct vub300_mmc_host *vub300, SET_TRANSFER_PSEUDOCODE, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0x0000, 0x0000, - xfer_buffer, xfer_length, HZ); + xfer_buffer, xfer_length, 1000); kfree(xfer_buffer); if (retval < 0) goto copy_error_message; @@ -1991,7 +1991,7 @@ static void __set_clock_speed(struct vub300_mmc_host *vub300, u8 buf[8], usb_control_msg(vub300->udev, usb_sndctrlpipe(vub300->udev, 0), SET_CLOCK_SPEED, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, - 0x00, 0x00, buf, buf_array_size, HZ); + 0x00, 0x00, buf, buf_array_size, 1000); if (retval != 8) { dev_err(&vub300->udev->dev, "SET_CLOCK_SPEED" " %dkHz failed with retval=%d\n", kHzClock, retval); @@ -2013,14 +2013,14 @@ static void vub300_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) usb_control_msg(vub300->udev, usb_sndctrlpipe(vub300->udev, 0), SET_SD_POWER, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, - 0x0000, 0x0000, NULL, 0, HZ); + 0x0000, 0x0000, NULL, 0, 1000); /* must wait for the VUB300 u-proc to boot up */ msleep(600); } else if ((ios->power_mode == MMC_POWER_UP) && !vub300->card_powered) { usb_control_msg(vub300->udev, usb_sndctrlpipe(vub300->udev, 0), SET_SD_POWER, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, - 0x0001, 0x0000, NULL, 0, HZ); + 0x0001, 0x0000, NULL, 0, 1000); msleep(600); vub300->card_powered = 1; } else if (ios->power_mode == MMC_POWER_ON) { @@ -2282,14 +2282,14 @@ static int vub300_probe(struct usb_interface *interface, GET_HC_INF0, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0x0000, 0x0000, &vub300->hc_info, - sizeof(vub300->hc_info), HZ); + sizeof(vub300->hc_info), 1000); if (retval < 0) goto error5; retval = usb_control_msg(vub300->udev, usb_sndctrlpipe(vub300->udev, 0), SET_ROM_WAIT_STATES, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, - firmware_rom_wait_states, 0x0000, NULL, 0, HZ); + firmware_rom_wait_states, 0x0000, NULL, 0, 1000); if (retval < 0) goto error5; dev_info(&vub300->udev->dev, @@ -2304,7 +2304,7 @@ static int vub300_probe(struct usb_interface *interface, GET_SYSTEM_PORT_STATUS, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0x0000, 0x0000, &vub300->system_port_status, - sizeof(vub300->system_port_status), HZ); + sizeof(vub300->system_port_status), 1000); if (retval < 0) { goto error4; } else if (sizeof(vub300->system_port_status) == retval) { From 9a69d2c79785a08abdde62eed568043b501ec4d7 Mon Sep 17 00:00:00 2001 From: Wenbin Mei Date: Tue, 26 Oct 2021 15:08:12 +0800 Subject: [PATCH 1271/5344] mmc: cqhci: clear HALT state after CQE enable commit 92b18252b91de567cd875f2e84722b10ab34ee28 upstream. While mmc0 enter suspend state, we need halt CQE to send legacy cmd(flush cache) and disable cqe, for resume back, we enable CQE and not clear HALT state. In this case MediaTek mmc host controller will keep the value for HALT state after CQE disable/enable flow, so the next CQE transfer after resume will be timeout due to CQE is in HALT state, the log as below: <4>.(4)[318:kworker/4:1H]mmc0: cqhci: timeout for tag 2 <4>.(4)[318:kworker/4:1H]mmc0: cqhci: ============ CQHCI REGISTER DUMP =========== <4>.(4)[318:kworker/4:1H]mmc0: cqhci: Caps: 0x100020b6 | Version: 0x00000510 <4>.(4)[318:kworker/4:1H]mmc0: cqhci: Config: 0x00001103 | Control: 0x00000001 <4>.(4)[318:kworker/4:1H]mmc0: cqhci: Int stat: 0x00000000 | Int enab: 0x00000006 <4>.(4)[318:kworker/4:1H]mmc0: cqhci: Int sig: 0x00000006 | Int Coal: 0x00000000 <4>.(4)[318:kworker/4:1H]mmc0: cqhci: TDL base: 0xfd05f000 | TDL up32: 0x00000000 <4>.(4)[318:kworker/4:1H]mmc0: cqhci: Doorbell: 0x8000203c | TCN: 0x00000000 <4>.(4)[318:kworker/4:1H]mmc0: cqhci: Dev queue: 0x00000000 | Dev Pend: 0x00000000 <4>.(4)[318:kworker/4:1H]mmc0: cqhci: Task clr: 0x00000000 | SSC1: 0x00001000 <4>.(4)[318:kworker/4:1H]mmc0: cqhci: SSC2: 0x00000001 | DCMD rsp: 0x00000000 <4>.(4)[318:kworker/4:1H]mmc0: cqhci: RED mask: 0xfdf9a080 | TERRI: 0x00000000 <4>.(4)[318:kworker/4:1H]mmc0: cqhci: Resp idx: 0x00000000 | Resp arg: 0x00000000 <4>.(4)[318:kworker/4:1H]mmc0: cqhci: CRNQP: 0x00000000 | CRNQDUN: 0x00000000 <4>.(4)[318:kworker/4:1H]mmc0: cqhci: CRNQIS: 0x00000000 | CRNQIE: 0x00000000 This change check HALT state after CQE enable, if CQE is in HALT state, we will clear it. Signed-off-by: Wenbin Mei Cc: stable@vger.kernel.org Acked-by: Adrian Hunter Fixes: a4080225f51d ("mmc: cqhci: support for command queue enabled host") Link: https://lore.kernel.org/r/20211026070812.9359-1-wenbin.mei@mediatek.com Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/host/cqhci.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/mmc/host/cqhci.c b/drivers/mmc/host/cqhci.c index 2d65b32d205a5..ec56464eaf23e 100644 --- a/drivers/mmc/host/cqhci.c +++ b/drivers/mmc/host/cqhci.c @@ -273,6 +273,9 @@ static void __cqhci_enable(struct cqhci_host *cq_host) cqhci_writel(cq_host, cqcfg, CQHCI_CFG); + if (cqhci_readl(cq_host, CQHCI_CTL) & CQHCI_HALT) + cqhci_writel(cq_host, 0, CQHCI_CTL); + mmc->cqe_on = true; if (cq_host->ops->enable) From b8a643aa6353b6629f3df914d718958547427f35 Mon Sep 17 00:00:00 2001 From: Jaehoon Chung Date: Fri, 22 Oct 2021 17:21:06 +0900 Subject: [PATCH 1272/5344] mmc: dw_mmc: exynos: fix the finding clock sample value commit 697542bceae51f7620af333b065dd09d213629fb upstream. Even though there are candiates value if can't find best value, it's returned -EIO. It's not proper behavior. If there is not best value, use a first candiate value to work eMMC. Signed-off-by: Jaehoon Chung Tested-by: Marek Szyprowski Tested-by: Christian Hewitt Cc: stable@vger.kernel.org Fixes: c537a1c5ff63 ("mmc: dw_mmc: exynos: add variable delay tuning sequence") Link: https://lore.kernel.org/r/20211022082106.1557-1-jh80.chung@samsung.com Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/host/dw_mmc-exynos.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/mmc/host/dw_mmc-exynos.c b/drivers/mmc/host/dw_mmc-exynos.c index 5e3d95b636769..ae2c74186e1d2 100644 --- a/drivers/mmc/host/dw_mmc-exynos.c +++ b/drivers/mmc/host/dw_mmc-exynos.c @@ -462,6 +462,18 @@ static s8 dw_mci_exynos_get_best_clksmpl(u8 candiates) } } + /* + * If there is no cadiates value, then it needs to return -EIO. + * If there are candiates values and don't find bset clk sample value, + * then use a first candiates clock sample value. + */ + for (i = 0; i < iter; i++) { + __c = ror8(candiates, i); + if ((__c & 0x1) == 0x1) { + loc = i; + goto out; + } + } out: return loc; } @@ -492,6 +504,8 @@ static int dw_mci_exynos_execute_tuning(struct dw_mci_slot *slot, u32 opcode) priv->tuned_sample = found; } else { ret = -EIO; + dev_warn(&mmc->class_dev, + "There is no candiates value about clksmpl!\n"); } return ret; From b4934639cea68dc65857571fc97a05c126e4b5cc Mon Sep 17 00:00:00 2001 From: Shawn Guo Date: Mon, 4 Oct 2021 10:49:35 +0800 Subject: [PATCH 1273/5344] mmc: sdhci: Map more voltage level to SDHCI_POWER_330 commit 4217d07b9fb328751f877d3bd9550122014860a2 upstream. On Thundercomm TurboX CM2290, the eMMC OCR reports vdd = 23 (3.5 ~ 3.6 V), which is being treated as an invalid value by sdhci_set_power_noreg(). And thus eMMC is totally broken on the platform. [ 1.436599] ------------[ cut here ]------------ [ 1.436606] mmc0: Invalid vdd 0x17 [ 1.436640] WARNING: CPU: 2 PID: 69 at drivers/mmc/host/sdhci.c:2048 sdhci_set_power_noreg+0x168/0x2b4 [ 1.436655] Modules linked in: [ 1.436662] CPU: 2 PID: 69 Comm: kworker/u8:1 Tainted: G W 5.15.0-rc1+ #137 [ 1.436669] Hardware name: Thundercomm TurboX CM2290 (DT) [ 1.436674] Workqueue: events_unbound async_run_entry_fn [ 1.436685] pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 1.436692] pc : sdhci_set_power_noreg+0x168/0x2b4 [ 1.436698] lr : sdhci_set_power_noreg+0x168/0x2b4 [ 1.436703] sp : ffff800010803a60 [ 1.436705] x29: ffff800010803a60 x28: ffff6a9102465f00 x27: ffff6a9101720a70 [ 1.436715] x26: ffff6a91014de1c0 x25: ffff6a91014de010 x24: ffff6a91016af280 [ 1.436724] x23: ffffaf7b1b276640 x22: 0000000000000000 x21: ffff6a9101720000 [ 1.436733] x20: ffff6a9101720370 x19: ffff6a9101720580 x18: 0000000000000020 [ 1.436743] x17: 0000000000000000 x16: 0000000000000004 x15: ffffffffffffffff [ 1.436751] x14: 0000000000000000 x13: 00000000fffffffd x12: ffffaf7b1b84b0bc [ 1.436760] x11: ffffaf7b1b720d10 x10: 000000000000000a x9 : ffff800010803a60 [ 1.436769] x8 : 000000000000000a x7 : 000000000000000f x6 : 00000000fffff159 [ 1.436778] x5 : 0000000000000000 x4 : 0000000000000000 x3 : 00000000ffffffff [ 1.436787] x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffff6a9101718d80 [ 1.436797] Call trace: [ 1.436800] sdhci_set_power_noreg+0x168/0x2b4 [ 1.436805] sdhci_set_ios+0xa0/0x7fc [ 1.436811] mmc_power_up.part.0+0xc4/0x164 [ 1.436818] mmc_start_host+0xa0/0xb0 [ 1.436824] mmc_add_host+0x60/0x90 [ 1.436830] __sdhci_add_host+0x174/0x330 [ 1.436836] sdhci_msm_probe+0x7c0/0x920 [ 1.436842] platform_probe+0x68/0xe0 [ 1.436850] really_probe.part.0+0x9c/0x31c [ 1.436857] __driver_probe_device+0x98/0x144 [ 1.436863] driver_probe_device+0xc8/0x15c [ 1.436869] __device_attach_driver+0xb4/0x120 [ 1.436875] bus_for_each_drv+0x78/0xd0 [ 1.436881] __device_attach_async_helper+0xac/0xd0 [ 1.436888] async_run_entry_fn+0x34/0x110 [ 1.436895] process_one_work+0x1d0/0x354 [ 1.436903] worker_thread+0x13c/0x470 [ 1.436910] kthread+0x150/0x160 [ 1.436915] ret_from_fork+0x10/0x20 [ 1.436923] ---[ end trace fcfac44cb045c3a8 ]--- Fix the issue by mapping MMC_VDD_35_36 (and MMC_VDD_34_35) to SDHCI_POWER_330 as well. Signed-off-by: Shawn Guo Acked-by: Adrian Hunter Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20211004024935.15326-1-shawn.guo@linaro.org Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/host/sdhci.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 2ecd9acebb2f0..cb54fa2120d72 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -1741,6 +1741,12 @@ void sdhci_set_power_noreg(struct sdhci_host *host, unsigned char mode, break; case MMC_VDD_32_33: case MMC_VDD_33_34: + /* + * 3.4 ~ 3.6V are valid only for those platforms where it's + * known that the voltage range is supported by hardware. + */ + case MMC_VDD_34_35: + case MMC_VDD_35_36: pwr = SDHCI_POWER_330; break; default: From 579bdce03360948e52c218f1dcbf3f67214a6da2 Mon Sep 17 00:00:00 2001 From: Haibo Chen Date: Fri, 15 Oct 2021 10:00:36 +0800 Subject: [PATCH 1274/5344] mmc: sdhci-esdhc-imx: clear the buffer_read_ready to reset standard tuning circuit commit 9af372dc70e9fdcbb70939dac75365e7b88580b4 upstream. To reset standard tuning circuit completely, after clear ESDHC_MIX_CTRL_EXE_TUNE, also need to clear bit buffer_read_ready, this operation will finally clear the USDHC IP internal logic flag execute_tuning_with_clr_buf, make sure the following normal data transfer will not be impacted by standard tuning logic used before. Find this issue when do quick SD card insert/remove stress test. During standard tuning prodedure, if remove SD card, USDHC standard tuning logic can't clear the internal flag execute_tuning_with_clr_buf. Next time when insert SD card, all data related commands can't get any data related interrupts, include data transfer complete interrupt, data timeout interrupt, data CRC interrupt, data end bit interrupt. Always trigger software timeout issue. Even reset the USDHC through bits in register SYS_CTRL (0x2C, bit28 reset tuning, bit26 reset data, bit 25 reset command, bit 24 reset all) can't recover this. From the user's point of view, USDHC stuck, SD can't be recognized any more. Fixes: d9370424c948 ("mmc: sdhci-esdhc-imx: reset tuning circuit when power on mmc card") Signed-off-by: Haibo Chen Acked-by: Adrian Hunter Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/1634263236-6111-1-git-send-email-haibo.chen@nxp.com Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/host/sdhci-esdhc-imx.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/mmc/host/sdhci-esdhc-imx.c b/drivers/mmc/host/sdhci-esdhc-imx.c index 771676209005b..2c01e2ebef7aa 100644 --- a/drivers/mmc/host/sdhci-esdhc-imx.c +++ b/drivers/mmc/host/sdhci-esdhc-imx.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "sdhci-pltfm.h" #include "sdhci-esdhc.h" #include "cqhci.h" @@ -1022,6 +1023,7 @@ static void esdhc_reset_tuning(struct sdhci_host *host) struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); struct pltfm_imx_data *imx_data = sdhci_pltfm_priv(pltfm_host); u32 ctrl; + int ret; /* Reset the tuning circuit */ if (esdhc_is_usdhc(imx_data)) { @@ -1034,7 +1036,22 @@ static void esdhc_reset_tuning(struct sdhci_host *host) } else if (imx_data->socdata->flags & ESDHC_FLAG_STD_TUNING) { ctrl = readl(host->ioaddr + SDHCI_AUTO_CMD_STATUS); ctrl &= ~ESDHC_MIX_CTRL_SMPCLK_SEL; + ctrl &= ~ESDHC_MIX_CTRL_EXE_TUNE; writel(ctrl, host->ioaddr + SDHCI_AUTO_CMD_STATUS); + /* Make sure ESDHC_MIX_CTRL_EXE_TUNE cleared */ + ret = readl_poll_timeout(host->ioaddr + SDHCI_AUTO_CMD_STATUS, + ctrl, !(ctrl & ESDHC_MIX_CTRL_EXE_TUNE), 1, 50); + if (ret == -ETIMEDOUT) + dev_warn(mmc_dev(host->mmc), + "Warning! clear execute tuning bit failed\n"); + /* + * SDHCI_INT_DATA_AVAIL is W1C bit, set this bit will clear the + * usdhc IP internal logic flag execute_tuning_with_clr_buf, which + * will finally make sure the normal data transfer logic correct. + */ + ctrl = readl(host->ioaddr + SDHCI_INT_STATUS); + ctrl |= SDHCI_INT_DATA_AVAIL; + writel(ctrl, host->ioaddr + SDHCI_INT_STATUS); } } } From 266311e107a4c02527283249c101dcaf32d79bf1 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 30 Sep 2021 13:11:21 +0200 Subject: [PATCH 1275/5344] cfg80211: scan: fix RCU in cfg80211_add_nontrans_list() commit a2083eeb119fb9307258baea9b7c243ca9a2e0b6 upstream. The SSID pointer is pointing to RCU protected data, so we need to have it under rcu_read_lock() for the entire use. Fix this. Cc: stable@vger.kernel.org Fixes: 0b8fb8235be8 ("cfg80211: Parsing of Multiple BSSID information in scanning") Link: https://lore.kernel.org/r/20210930131120.6ddfc603aa1d.I2137344c4e2426525b1a8e4ce5fca82f8ecbfe7e@changeid Signed-off-by: Johannes Berg Signed-off-by: Greg Kroah-Hartman --- net/wireless/scan.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 1580535d53f86..6cefaad3b7f84 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -379,14 +379,17 @@ cfg80211_add_nontrans_list(struct cfg80211_bss *trans_bss, } ssid_len = ssid[1]; ssid = ssid + 2; - rcu_read_unlock(); /* check if nontrans_bss is in the list */ list_for_each_entry(bss, &trans_bss->nontrans_list, nontrans_list) { - if (is_bss(bss, nontrans_bss->bssid, ssid, ssid_len)) + if (is_bss(bss, nontrans_bss->bssid, ssid, ssid_len)) { + rcu_read_unlock(); return 0; + } } + rcu_read_unlock(); + /* add to the list */ list_add_tail(&nontrans_bss->nontrans_list, &trans_bss->nontrans_list); return 0; From 122ae75a48ec2e07fddb34cff0e1a9c7dcc839bd Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 26 Oct 2021 12:36:17 +0200 Subject: [PATCH 1276/5344] net: lan78xx: fix division by zero in send path commit db6c3c064f5d55fa9969f33eafca3cdbefbb3541 upstream. Add the missing endpoint max-packet sanity check to probe() to avoid division by zero in lan78xx_tx_bh() in case a malicious device has broken descriptors (or when doing descriptor fuzz testing). Note that USB core will reject URBs submitted for endpoints with zero wMaxPacketSize but that drivers doing packet-size calculations still need to handle this (cf. commit 2548288b4fb0 ("USB: Fix: Don't skip endpoint descriptors with maxpacket=0")). Fixes: 55d7de9de6c3 ("Microchip's LAN7800 family USB 2/3 to 10/100/1000 Ethernet device driver") Cc: stable@vger.kernel.org # 4.3 Cc: Woojung.Huh@microchip.com Signed-off-by: Johan Hovold Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/usb/lan78xx.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 92d9d3407b79b..fe830b72c3b0f 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -3753,6 +3753,12 @@ static int lan78xx_probe(struct usb_interface *intf, dev->maxpacket = usb_maxpacket(dev->udev, dev->pipe_out, 1); + /* Reject broken descriptors. */ + if (dev->maxpacket == 0) { + ret = -ENODEV; + goto out4; + } + /* driver requires remote-wakeup capability during autosuspend. */ intf->needs_remote_wakeup = 1; From cfcef31e6093981feb9b1db88d88df9f16c2a639 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Wed, 20 Oct 2021 19:19:46 +0200 Subject: [PATCH 1277/5344] drm/ttm: fix memleak in ttm_transfered_destroy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 0db55f9a1bafbe3dac750ea669de9134922389b5 upstream. We need to cleanup the fences for ghost objects as well. Signed-off-by: Christian König Reported-by: Erhard F. Tested-by: Erhard F. Reviewed-by: Huang Rui Bug: https://bugzilla.kernel.org/show_bug.cgi?id=214029 Bug: https://bugzilla.kernel.org/show_bug.cgi?id=214447 CC: Link: https://patchwork.freedesktop.org/patch/msgid/20211020173211.2247-1-christian.koenig@amd.com Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/ttm/ttm_bo_util.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c index fe81c565e7ef8..a7b88ca8b97b3 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_util.c +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c @@ -463,6 +463,7 @@ static void ttm_transfered_destroy(struct ttm_buffer_object *bo) struct ttm_transfer_obj *fbo; fbo = container_of(bo, struct ttm_transfer_obj, base); + dma_resv_fini(&fbo->base.base._resv); ttm_bo_put(fbo->bo); kfree(fbo); } From 93551d5b38043dc5543550408c59b78a945b23ea Mon Sep 17 00:00:00 2001 From: Liu Jian Date: Tue, 12 Oct 2021 13:20:19 +0800 Subject: [PATCH 1278/5344] tcp_bpf: Fix one concurrency problem in the tcp_bpf_send_verdict function commit cd9733f5d75c94a32544d6ce5be47e14194cf137 upstream. With two Msgs, msgA and msgB and a user doing nonblocking sendmsg calls (or multiple cores) on a single socket 'sk' we could get the following flow. msgA, sk msgB, sk ----------- --------------- tcp_bpf_sendmsg() lock(sk) psock = sk->psock tcp_bpf_sendmsg() lock(sk) ... blocking tcp_bpf_send_verdict if (psock->eval == NONE) psock->eval = sk_psock_msg_verdict .. < handle SK_REDIRECT case > release_sock(sk) < lock dropped so grab here > ret = tcp_bpf_sendmsg_redir psock = sk->psock tcp_bpf_send_verdict lock_sock(sk) ... blocking on B if (psock->eval == NONE) <- boom. psock->eval will have msgA state The problem here is we dropped the lock on msgA and grabbed it with msgB. Now we have old state in psock and importantly psock->eval has not been cleared. So msgB will run whatever action was done on A and the verdict program may never see it. Fixes: 604326b41a6fb ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Liu Jian Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20211012052019.184398-1-liujian56@huawei.com Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_bpf.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 6a0c4326d9cf2..7df7ec74807ac 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -313,6 +313,7 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, bool cork = false, enospc = sk_msg_full(msg); struct sock *sk_redir; u32 tosend, delta = 0; + u32 eval = __SK_NONE; int ret; more_data: @@ -356,13 +357,24 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, case __SK_REDIRECT: sk_redir = psock->sk_redir; sk_msg_apply_bytes(psock, tosend); + if (!psock->apply_bytes) { + /* Clean up before releasing the sock lock. */ + eval = psock->eval; + psock->eval = __SK_NONE; + psock->sk_redir = NULL; + } if (psock->cork) { cork = true; psock->cork = NULL; } sk_msg_return(sk, msg, tosend); release_sock(sk); + ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags); + + if (eval == __SK_REDIRECT) + sock_put(sk_redir); + lock_sock(sk); if (unlikely(ret < 0)) { int free = sk_msg_free_nocharge(sk, msg); From f860a0c626f9ac710d36102159b12f81ef528f8f Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Tue, 12 Oct 2021 13:55:19 -0400 Subject: [PATCH 1279/5344] IB/qib: Protect from buffer overflow in struct qib_user_sdma_pkt fields commit d39bf40e55e666b5905fdbd46a0dced030ce87be upstream. Overflowing either addrlimit or bytes_togo can allow userspace to trigger a buffer overflow of kernel memory. Check for overflows in all the places doing math on user controlled buffers. Fixes: f931551bafe1 ("IB/qib: Add new qib driver for QLogic PCIe InfiniBand adapters") Link: https://lore.kernel.org/r/20211012175519.7298.77738.stgit@awfm-01.cornelisnetworks.com Reported-by: Ilja Van Sprundel Reviewed-by: Dennis Dalessandro Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/hw/qib/qib_user_sdma.c | 33 ++++++++++++++++------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.c b/drivers/infiniband/hw/qib/qib_user_sdma.c index 05190edc2611e..5fd28574124fb 100644 --- a/drivers/infiniband/hw/qib/qib_user_sdma.c +++ b/drivers/infiniband/hw/qib/qib_user_sdma.c @@ -602,7 +602,7 @@ static int qib_user_sdma_coalesce(const struct qib_devdata *dd, /* * How many pages in this iovec element? */ -static int qib_user_sdma_num_pages(const struct iovec *iov) +static size_t qib_user_sdma_num_pages(const struct iovec *iov) { const unsigned long addr = (unsigned long) iov->iov_base; const unsigned long len = iov->iov_len; @@ -658,7 +658,7 @@ static void qib_user_sdma_free_pkt_frag(struct device *dev, static int qib_user_sdma_pin_pages(const struct qib_devdata *dd, struct qib_user_sdma_queue *pq, struct qib_user_sdma_pkt *pkt, - unsigned long addr, int tlen, int npages) + unsigned long addr, int tlen, size_t npages) { struct page *pages[8]; int i, j; @@ -722,7 +722,7 @@ static int qib_user_sdma_pin_pkt(const struct qib_devdata *dd, unsigned long idx; for (idx = 0; idx < niov; idx++) { - const int npages = qib_user_sdma_num_pages(iov + idx); + const size_t npages = qib_user_sdma_num_pages(iov + idx); const unsigned long addr = (unsigned long) iov[idx].iov_base; ret = qib_user_sdma_pin_pages(dd, pq, pkt, addr, @@ -824,8 +824,8 @@ static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd, unsigned pktnw; unsigned pktnwc; int nfrags = 0; - int npages = 0; - int bytes_togo = 0; + size_t npages = 0; + size_t bytes_togo = 0; int tiddma = 0; int cfur; @@ -885,7 +885,11 @@ static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd, npages += qib_user_sdma_num_pages(&iov[idx]); - bytes_togo += slen; + if (check_add_overflow(bytes_togo, slen, &bytes_togo) || + bytes_togo > type_max(typeof(pkt->bytes_togo))) { + ret = -EINVAL; + goto free_pbc; + } pktnwc += slen >> 2; idx++; nfrags++; @@ -904,8 +908,7 @@ static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd, } if (frag_size) { - int tidsmsize, n; - size_t pktsize; + size_t tidsmsize, n, pktsize, sz, addrlimit; n = npages*((2*PAGE_SIZE/frag_size)+1); pktsize = struct_size(pkt, addr, n); @@ -923,14 +926,24 @@ static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd, else tidsmsize = 0; - pkt = kmalloc(pktsize+tidsmsize, GFP_KERNEL); + if (check_add_overflow(pktsize, tidsmsize, &sz)) { + ret = -EINVAL; + goto free_pbc; + } + pkt = kmalloc(sz, GFP_KERNEL); if (!pkt) { ret = -ENOMEM; goto free_pbc; } pkt->largepkt = 1; pkt->frag_size = frag_size; - pkt->addrlimit = n + ARRAY_SIZE(pkt->addr); + if (check_add_overflow(n, ARRAY_SIZE(pkt->addr), + &addrlimit) || + addrlimit > type_max(typeof(pkt->addrlimit))) { + ret = -EINVAL; + goto free_pbc; + } + pkt->addrlimit = addrlimit; if (tiddma) { char *tidsm = (char *)pkt + pktsize; From 752eaa0eb813adfe2a8e7b0000c6cd7eb60c1753 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Wed, 13 Oct 2021 10:18:52 -0400 Subject: [PATCH 1280/5344] IB/hfi1: Fix abba locking issue with sc_disable() commit 13bac861952a78664907a0f927d3e874e9a59034 upstream. sc_disable() after having disabled the send context wakes up any waiters by calling hfi1_qp_wakeup() while holding the waitlock for the sc. This is contrary to the model for all other calls to hfi1_qp_wakeup() where the waitlock is dropped and a local is used to drive calls to hfi1_qp_wakeup(). Fix by moving the sc->piowait into a local list and driving the wakeup calls from the list. Fixes: 099a884ba4c0 ("IB/hfi1: Handle wakeup of orphaned QPs for pio") Link: https://lore.kernel.org/r/20211013141852.128104.2682.stgit@awfm-01.cornelisnetworks.com Signed-off-by: Mike Marciniszyn Reported-by: TOTE Robot Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/hw/hfi1/pio.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c index 79126b2b14ab0..1a82ea73a0fc2 100644 --- a/drivers/infiniband/hw/hfi1/pio.c +++ b/drivers/infiniband/hw/hfi1/pio.c @@ -920,6 +920,7 @@ void sc_disable(struct send_context *sc) { u64 reg; struct pio_buf *pbuf; + LIST_HEAD(wake_list); if (!sc) return; @@ -954,19 +955,21 @@ void sc_disable(struct send_context *sc) spin_unlock(&sc->release_lock); write_seqlock(&sc->waitlock); - while (!list_empty(&sc->piowait)) { + if (!list_empty(&sc->piowait)) + list_move(&sc->piowait, &wake_list); + write_sequnlock(&sc->waitlock); + while (!list_empty(&wake_list)) { struct iowait *wait; struct rvt_qp *qp; struct hfi1_qp_priv *priv; - wait = list_first_entry(&sc->piowait, struct iowait, list); + wait = list_first_entry(&wake_list, struct iowait, list); qp = iowait_to_qp(wait); priv = qp->priv; list_del_init(&priv->s_iowait.list); priv->s_iowait.lock = NULL; hfi1_qp_wakeup(qp, RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN); } - write_sequnlock(&sc->waitlock); spin_unlock_irq(&sc->alloc_lock); } From 89b6a851983fbcdef8853344afd85c62b41b3cee Mon Sep 17 00:00:00 2001 From: Varun Prakash Date: Mon, 25 Oct 2021 22:46:54 +0530 Subject: [PATCH 1281/5344] nvmet-tcp: fix data digest pointer calculation commit e790de54e94a7a15fb725b34724d41d41cbaa60c upstream. exp_ddgst is of type __le32, &cmd->exp_ddgst + cmd->offset increases &cmd->exp_ddgst by 4 * cmd->offset, fix this by type casting &cmd->exp_ddgst to u8 *. Fixes: 872d26a391da ("nvmet-tcp: add NVMe over TCP target driver") Signed-off-by: Varun Prakash Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Greg Kroah-Hartman --- drivers/nvme/target/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 7cafc2f41ce2c..522cb908e1c30 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -633,7 +633,7 @@ static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd) struct nvmet_tcp_queue *queue = cmd->queue; struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; struct kvec iov = { - .iov_base = &cmd->exp_ddgst + cmd->offset, + .iov_base = (u8 *)&cmd->exp_ddgst + cmd->offset, .iov_len = NVME_TCP_DIGEST_LENGTH - cmd->offset }; int ret; From 971dcb655d5227bb7156e3d196ea89d8a6583520 Mon Sep 17 00:00:00 2001 From: Varun Prakash Date: Mon, 25 Oct 2021 22:47:30 +0530 Subject: [PATCH 1282/5344] nvme-tcp: fix data digest pointer calculation commit d89b9f3bbb58e9e378881209756b0723694f22ff upstream. ddgst is of type __le32, &req->ddgst + req->offset increases &req->ddgst by 4 * req->offset, fix this by type casting &req->ddgst to u8 *. Fixes: 3f2304f8c6d6 ("nvme-tcp: add NVMe over TCP host driver") Signed-off-by: Varun Prakash Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Greg Kroah-Hartman --- drivers/nvme/host/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 38bbbbbc6f47f..ff0d06e8ebb53 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -962,7 +962,7 @@ static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req) int ret; struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR }; struct kvec iov = { - .iov_base = &req->ddgst + req->offset, + .iov_base = (u8 *)&req->ddgst + req->offset, .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset }; From a7eb69f932b4a083e738caa54bdb906737a6ef5a Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Wed, 6 Oct 2021 12:31:53 +0300 Subject: [PATCH 1283/5344] RDMA/mlx5: Set user priority for DCT commit 1ab52ac1e9bc9391f592c9fa8340a6e3e9c36286 upstream. Currently, the driver doesn't set the PCP-based priority for DCT, hence DCT response packets are transmitted without user priority. Fix it by setting user provided priority in the eth_prio field in the DCT context, which in turn sets the value in the transmitted packet. Fixes: 776a3906b692 ("IB/mlx5: Add support for DC target QP") Link: https://lore.kernel.org/r/5fd2d94a13f5742d8803c218927322257d53205c.1633512672.git.leonro@nvidia.com Signed-off-by: Patrisious Haddad Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/hw/mlx5/qp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 4540835e05bda..634f29cb7395c 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -3865,6 +3865,8 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, MLX5_SET(dctc, dctc, mtu, attr->path_mtu); MLX5_SET(dctc, dctc, my_addr_index, attr->ah_attr.grh.sgid_index); MLX5_SET(dctc, dctc, hop_limit, attr->ah_attr.grh.hop_limit); + if (attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE) + MLX5_SET(dctc, dctc, eth_prio, attr->ah_attr.sl & 0x7); err = mlx5_core_create_dct(dev->mdev, &qp->dct.mdct, qp->dct.in, MLX5_ST_SZ_BYTES(create_dct_in), out, From 4c48c31db5cee1923ad97ac8dd9a1d54a4875a5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20B=C5=93sch?= Date: Sun, 5 Sep 2021 02:20:27 +0200 Subject: [PATCH 1284/5344] arm64: dts: allwinner: h5: NanoPI Neo 2: Fix ethernet node MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 0764e365dacd0b8f75c1736f9236be280649bd18 upstream. RX and TX delay are provided by ethernet PHY. Reflect that in ethernet node. Fixes: 44a94c7ef989 ("arm64: dts: allwinner: H5: Restore EMAC changes") Signed-off-by: Clément Bœsch Reviewed-by: Jernej Skrabec Reviewed-by: Andrew Lunn Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20210905002027.171984-1-u@pkh.me Signed-off-by: Greg Kroah-Hartman --- arch/arm64/boot/dts/allwinner/sun50i-h5-nanopi-neo2.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h5-nanopi-neo2.dts b/arch/arm64/boot/dts/allwinner/sun50i-h5-nanopi-neo2.dts index 57a6f45036c1f..d7177465b0968 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h5-nanopi-neo2.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-h5-nanopi-neo2.dts @@ -114,7 +114,7 @@ pinctrl-0 = <&emac_rgmii_pins>; phy-supply = <®_gmac_3v3>; phy-handle = <&ext_rgmii_phy>; - phy-mode = "rgmii"; + phy-mode = "rgmii-id"; status = "okay"; }; From f89d2064b6715caa7797118f464e7c9009036da1 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 12 Oct 2021 10:37:35 +0800 Subject: [PATCH 1285/5344] regmap: Fix possible double-free in regcache_rbtree_exit() commit 55e6d8037805b3400096d621091dfbf713f97e83 upstream. In regcache_rbtree_insert_to_block(), when 'present' realloc failed, the 'blk' which is supposed to assign to 'rbnode->block' will be freed, so 'rbnode->block' points a freed memory, in the error handling path of regcache_rbtree_init(), 'rbnode->block' will be freed again in regcache_rbtree_exit(), KASAN will report double-free as follows: BUG: KASAN: double-free or invalid-free in kfree+0xce/0x390 Call Trace: slab_free_freelist_hook+0x10d/0x240 kfree+0xce/0x390 regcache_rbtree_exit+0x15d/0x1a0 regcache_rbtree_init+0x224/0x2c0 regcache_init+0x88d/0x1310 __regmap_init+0x3151/0x4a80 __devm_regmap_init+0x7d/0x100 madera_spi_probe+0x10f/0x333 [madera_spi] spi_probe+0x183/0x210 really_probe+0x285/0xc30 To fix this, moving up the assignment of rbnode->block to immediately after the reallocation has succeeded so that the data structure stays valid even if the second reallocation fails. Reported-by: Hulk Robot Fixes: 3f4ff561bc88b ("regmap: rbtree: Make cache_present bitmap per node") Signed-off-by: Yang Yingliang Link: https://lore.kernel.org/r/20211012023735.1632786-1-yangyingliang@huawei.com Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- drivers/base/regmap/regcache-rbtree.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/base/regmap/regcache-rbtree.c b/drivers/base/regmap/regcache-rbtree.c index cfa29dc89bbff..fabf87058d80b 100644 --- a/drivers/base/regmap/regcache-rbtree.c +++ b/drivers/base/regmap/regcache-rbtree.c @@ -281,14 +281,14 @@ static int regcache_rbtree_insert_to_block(struct regmap *map, if (!blk) return -ENOMEM; + rbnode->block = blk; + if (BITS_TO_LONGS(blklen) > BITS_TO_LONGS(rbnode->blklen)) { present = krealloc(rbnode->cache_present, BITS_TO_LONGS(blklen) * sizeof(*present), GFP_KERNEL); - if (!present) { - kfree(blk); + if (!present) return -ENOMEM; - } memset(present + BITS_TO_LONGS(rbnode->blklen), 0, (BITS_TO_LONGS(blklen) - BITS_TO_LONGS(rbnode->blklen)) @@ -305,7 +305,6 @@ static int regcache_rbtree_insert_to_block(struct regmap *map, } /* update the rbnode block, its size and the base register */ - rbnode->block = blk; rbnode->blklen = blklen; rbnode->base_reg = base_reg; rbnode->cache_present = present; From 10fb86671dffe77a7233a6d01724a156f14ce23e Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Sun, 24 Oct 2021 16:13:56 +0300 Subject: [PATCH 1286/5344] net: batman-adv: fix error handling commit 6f68cd634856f8ca93bafd623ba5357e0f648c68 upstream. Syzbot reported ODEBUG warning in batadv_nc_mesh_free(). The problem was in wrong error handling in batadv_mesh_init(). Before this patch batadv_mesh_init() was calling batadv_mesh_free() in case of any batadv_*_init() calls failure. This approach may work well, when there is some kind of indicator, which can tell which parts of batadv are initialized; but there isn't any. All written above lead to cleaning up uninitialized fields. Even if we hide ODEBUG warning by initializing bat_priv->nc.work, syzbot was able to hit GPF in batadv_nc_purge_paths(), because hash pointer in still NULL. [1] To fix these bugs we can unwind batadv_*_init() calls one by one. It is good approach for 2 reasons: 1) It fixes bugs on error handling path 2) It improves the performance, since we won't call unneeded batadv_*_free() functions. So, this patch makes all batadv_*_init() clean up all allocated memory before returning with an error to no call correspoing batadv_*_free() and open-codes batadv_mesh_free() with proper order to avoid touching uninitialized fields. Link: https://lore.kernel.org/netdev/000000000000c87fbd05cef6bcb0@google.com/ [1] Reported-and-tested-by: syzbot+28b0702ada0bf7381f58@syzkaller.appspotmail.com Fixes: c6c8fea29769 ("net: Add batman-adv meshing protocol") Signed-off-by: Pavel Skripkin Acked-by: Sven Eckelmann Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/batman-adv/bridge_loop_avoidance.c | 8 +++- net/batman-adv/main.c | 56 ++++++++++++++++++-------- net/batman-adv/network-coding.c | 4 +- net/batman-adv/translation-table.c | 4 +- 4 files changed, 52 insertions(+), 20 deletions(-) diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index a6b26ca5c6973..2e818eca3e1c6 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -1561,10 +1561,14 @@ int batadv_bla_init(struct batadv_priv *bat_priv) return 0; bat_priv->bla.claim_hash = batadv_hash_new(128); - bat_priv->bla.backbone_hash = batadv_hash_new(32); + if (!bat_priv->bla.claim_hash) + return -ENOMEM; - if (!bat_priv->bla.claim_hash || !bat_priv->bla.backbone_hash) + bat_priv->bla.backbone_hash = batadv_hash_new(32); + if (!bat_priv->bla.backbone_hash) { + batadv_hash_destroy(bat_priv->bla.claim_hash); return -ENOMEM; + } batadv_hash_set_lock_class(bat_priv->bla.claim_hash, &batadv_claim_hash_lock_class_key); diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 4a89177def647..6a183c94cdeb4 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -197,29 +197,41 @@ int batadv_mesh_init(struct net_device *soft_iface) bat_priv->gw.generation = 0; - ret = batadv_v_mesh_init(bat_priv); - if (ret < 0) - goto err; - ret = batadv_originator_init(bat_priv); - if (ret < 0) - goto err; + if (ret < 0) { + atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); + goto err_orig; + } ret = batadv_tt_init(bat_priv); - if (ret < 0) - goto err; + if (ret < 0) { + atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); + goto err_tt; + } + + ret = batadv_v_mesh_init(bat_priv); + if (ret < 0) { + atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); + goto err_v; + } ret = batadv_bla_init(bat_priv); - if (ret < 0) - goto err; + if (ret < 0) { + atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); + goto err_bla; + } ret = batadv_dat_init(bat_priv); - if (ret < 0) - goto err; + if (ret < 0) { + atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); + goto err_dat; + } ret = batadv_nc_mesh_init(bat_priv); - if (ret < 0) - goto err; + if (ret < 0) { + atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); + goto err_nc; + } batadv_gw_init(bat_priv); batadv_mcast_init(bat_priv); @@ -229,8 +241,20 @@ int batadv_mesh_init(struct net_device *soft_iface) return 0; -err: - batadv_mesh_free(soft_iface); +err_nc: + batadv_dat_free(bat_priv); +err_dat: + batadv_bla_free(bat_priv); +err_bla: + batadv_v_mesh_free(bat_priv); +err_v: + batadv_tt_free(bat_priv); +err_tt: + batadv_originator_free(bat_priv); +err_orig: + batadv_purge_outstanding_packets(bat_priv, NULL); + atomic_set(&bat_priv->mesh_state, BATADV_MESH_INACTIVE); + return ret; } diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index 70e3b161c6635..850f927f33de2 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -155,8 +155,10 @@ int batadv_nc_mesh_init(struct batadv_priv *bat_priv) &batadv_nc_coding_hash_lock_class_key); bat_priv->nc.decoding_hash = batadv_hash_new(128); - if (!bat_priv->nc.decoding_hash) + if (!bat_priv->nc.decoding_hash) { + batadv_hash_destroy(bat_priv->nc.coding_hash); goto err; + } batadv_hash_set_lock_class(bat_priv->nc.decoding_hash, &batadv_nc_decoding_hash_lock_class_key); diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index c5271ea4dc832..515205d7b650f 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -4405,8 +4405,10 @@ int batadv_tt_init(struct batadv_priv *bat_priv) return ret; ret = batadv_tt_global_init(bat_priv); - if (ret < 0) + if (ret < 0) { + batadv_tt_local_table_free(bat_priv); return ret; + } batadv_tvlv_handler_register(bat_priv, batadv_tt_tvlv_ogm_handler_v1, batadv_tt_tvlv_unicast_handler_v1, From ca80cb9cdab8c6e270aa6222a6a887111118360a Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Mon, 25 Oct 2021 05:05:28 -0400 Subject: [PATCH 1287/5344] net: Prevent infinite while loop in skb_tx_hash() commit 0c57eeecc559ca6bc18b8c4e2808bc78dbe769b0 upstream. Drivers call netdev_set_num_tc() and then netdev_set_tc_queue() to set the queue count and offset for each TC. So the queue count and offset for the TCs may be zero for a short period after dev->num_tc has been set. If a TX packet is being transmitted at this time in the code path netdev_pick_tx() -> skb_tx_hash(), skb_tx_hash() may see nonzero dev->num_tc but zero qcount for the TC. The while loop that keeps looping while hash >= qcount will not end. Fix it by checking the TC's qcount to be nonzero before using it. Fixes: eadec877ce9c ("net: Add support for subordinate traffic classes to netdev_pick_tx") Reviewed-by: Andy Gospodarek Signed-off-by: Michael Chan Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/dev.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/core/dev.c b/net/core/dev.c index 52ac625c82265..54e7e1490b8d1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2787,6 +2787,12 @@ static u16 skb_tx_hash(const struct net_device *dev, qoffset = sb_dev->tc_to_txq[tc].offset; qcount = sb_dev->tc_to_txq[tc].count; + if (unlikely(!qcount)) { + net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n", + sb_dev->name, qoffset, tc); + qoffset = 0; + qcount = dev->real_num_tx_queues; + } } if (skb_rx_queue_recorded(skb)) { From 8466b39a9c0ca5b82f989c01c6005d6085e47c60 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Sun, 24 Oct 2021 09:08:20 +0300 Subject: [PATCH 1288/5344] RDMA/sa_query: Use strscpy_pad instead of memcpy to copy a string commit 64733956ebba7cc629856f4a6ee35a52bc9c023f upstream. When copying the device name, the length of the data memcpy copied exceeds the length of the source buffer, which cause the KASAN issue below. Use strscpy_pad() instead. BUG: KASAN: slab-out-of-bounds in ib_nl_set_path_rec_attrs+0x136/0x320 [ib_core] Read of size 64 at addr ffff88811a10f5e0 by task rping/140263 CPU: 3 PID: 140263 Comm: rping Not tainted 5.15.0-rc1+ #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x57/0x7d print_address_description.constprop.0+0x1d/0xa0 kasan_report+0xcb/0x110 kasan_check_range+0x13d/0x180 memcpy+0x20/0x60 ib_nl_set_path_rec_attrs+0x136/0x320 [ib_core] ib_nl_make_request+0x1c6/0x380 [ib_core] send_mad+0x20a/0x220 [ib_core] ib_sa_path_rec_get+0x3e3/0x800 [ib_core] cma_query_ib_route+0x29b/0x390 [rdma_cm] rdma_resolve_route+0x308/0x3e0 [rdma_cm] ucma_resolve_route+0xe1/0x150 [rdma_ucm] ucma_write+0x17b/0x1f0 [rdma_ucm] vfs_write+0x142/0x4d0 ksys_write+0x133/0x160 do_syscall_64+0x43/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f26499aa90f Code: 89 54 24 18 48 89 74 24 10 89 7c 24 08 e8 29 fd ff ff 48 8b 54 24 18 48 8b 74 24 10 41 89 c0 8b 7c 24 08 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 31 44 89 c7 48 89 44 24 08 e8 5c fd ff ff 48 RSP: 002b:00007f26495f2dc0 EFLAGS: 00000293 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 00000000000007d0 RCX: 00007f26499aa90f RDX: 0000000000000010 RSI: 00007f26495f2e00 RDI: 0000000000000003 RBP: 00005632a8315440 R08: 0000000000000000 R09: 0000000000000001 R10: 0000000000000000 R11: 0000000000000293 R12: 00007f26495f2e00 R13: 00005632a83154e0 R14: 00005632a8315440 R15: 00005632a830a810 Allocated by task 131419: kasan_save_stack+0x1b/0x40 __kasan_kmalloc+0x7c/0x90 proc_self_get_link+0x8b/0x100 pick_link+0x4f1/0x5c0 step_into+0x2eb/0x3d0 walk_component+0xc8/0x2c0 link_path_walk+0x3b8/0x580 path_openat+0x101/0x230 do_filp_open+0x12e/0x240 do_sys_openat2+0x115/0x280 __x64_sys_openat+0xce/0x140 do_syscall_64+0x43/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae Fixes: 2ca546b92a02 ("IB/sa: Route SA pathrecord query through netlink") Link: https://lore.kernel.org/r/72ede0f6dab61f7f23df9ac7a70666e07ef314b0.1635055496.git.leonro@nvidia.com Signed-off-by: Mark Zhang Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/core/sa_query.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index d2d70c89193ff..11ab6390eda4d 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -760,8 +760,9 @@ static void ib_nl_set_path_rec_attrs(struct sk_buff *skb, /* Construct the family header first */ header = skb_put(skb, NLMSG_ALIGN(sizeof(*header))); - memcpy(header->device_name, dev_name(&query->port->agent->device->dev), - LS_DEVICE_NAME_MAX); + strscpy_pad(header->device_name, + dev_name(&query->port->agent->device->dev), + LS_DEVICE_NAME_MAX); header->port_num = query->port->port_num; if ((comp_mask & IB_SA_PATH_REC_REVERSIBLE) && From 7b892a4289770df157e3beb4322a57c82b110aba Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Wed, 20 Oct 2021 12:11:16 -0700 Subject: [PATCH 1289/5344] nios2: Make NIOS2_DTB_SOURCE_BOOL depend on !COMPILE_TEST commit 4a089e95b4d6bb625044d47aed0c442a8f7bd093 upstream. nios2:allmodconfig builds fail with make[1]: *** No rule to make target 'arch/nios2/boot/dts/""', needed by 'arch/nios2/boot/dts/built-in.a'. Stop. make: [Makefile:1868: arch/nios2/boot/dts] Error 2 (ignored) This is seen with compile tests since those enable NIOS2_DTB_SOURCE_BOOL, which in turn enables NIOS2_DTB_SOURCE. This causes the build error because the default value for NIOS2_DTB_SOURCE is an empty string. Disable NIOS2_DTB_SOURCE_BOOL for compile tests to avoid the error. Fixes: 2fc8483fdcde ("nios2: Build infrastructure") Signed-off-by: Guenter Roeck Reviewed-by: Randy Dunlap Signed-off-by: Dinh Nguyen Signed-off-by: Greg Kroah-Hartman --- arch/nios2/platform/Kconfig.platform | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/nios2/platform/Kconfig.platform b/arch/nios2/platform/Kconfig.platform index 9e32fb7f3d4ce..e849daff6fd16 100644 --- a/arch/nios2/platform/Kconfig.platform +++ b/arch/nios2/platform/Kconfig.platform @@ -37,6 +37,7 @@ config NIOS2_DTB_PHYS_ADDR config NIOS2_DTB_SOURCE_BOOL bool "Compile and link device tree into kernel image" + depends on !COMPILE_TEST help This allows you to specify a dts (device tree source) file which will be compiled and linked into the kernel image. From b2565b00bcd48663720735e78648a0d373e397d0 Mon Sep 17 00:00:00 2001 From: Yuiko Oshino Date: Fri, 22 Oct 2021 11:13:53 -0400 Subject: [PATCH 1290/5344] net: ethernet: microchip: lan743x: Fix driver crash when lan743x_pm_resume fails commit d6423d2ec39cce2bfca418c81ef51792891576bc upstream. The driver needs to clean up and return when the initialization fails on resume. Fixes: 23f0703c125b ("lan743x: Add main source files for new lan743x driver") Signed-off-by: Yuiko Oshino Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/microchip/lan743x_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c index dfa0ded169ee9..db8de80672958 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.c +++ b/drivers/net/ethernet/microchip/lan743x_main.c @@ -3001,6 +3001,8 @@ static int lan743x_pm_resume(struct device *dev) if (ret) { netif_err(adapter, probe, adapter->netdev, "lan743x_hardware_init returned %d\n", ret); + lan743x_pci_cleanup(adapter); + return ret; } /* open netdev when netdev is at running state while resume. From 94e2deebbfdf3403798ec234821b199699f856f1 Mon Sep 17 00:00:00 2001 From: Yuiko Oshino Date: Fri, 22 Oct 2021 11:53:43 -0400 Subject: [PATCH 1291/5344] net: ethernet: microchip: lan743x: Fix dma allocation failure by using dma_set_mask_and_coherent commit 95a359c9553342d36d408d35331ff0bfce75272f upstream. The dma failure was reported in the raspberry pi github (issue #4117). https://github.com/raspberrypi/linux/issues/4117 The use of dma_set_mask_and_coherent fixes the issue. Tested on 32/64-bit raspberry pi CM4 and 64-bit ubuntu x86 PC with EVB-LAN7430. Fixes: 23f0703c125b ("lan743x: Add main source files for new lan743x driver") Signed-off-by: Yuiko Oshino Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/microchip/lan743x_main.c | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c index db8de80672958..a109120da0e7c 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.c +++ b/drivers/net/ethernet/microchip/lan743x_main.c @@ -1706,6 +1706,16 @@ static int lan743x_tx_ring_init(struct lan743x_tx *tx) ret = -EINVAL; goto cleanup; } + if (dma_set_mask_and_coherent(&tx->adapter->pdev->dev, + DMA_BIT_MASK(64))) { + if (dma_set_mask_and_coherent(&tx->adapter->pdev->dev, + DMA_BIT_MASK(32))) { + dev_warn(&tx->adapter->pdev->dev, + "lan743x_: No suitable DMA available\n"); + ret = -ENOMEM; + goto cleanup; + } + } ring_allocation_size = ALIGN(tx->ring_size * sizeof(struct lan743x_tx_descriptor), PAGE_SIZE); @@ -2256,6 +2266,16 @@ static int lan743x_rx_ring_init(struct lan743x_rx *rx) ret = -EINVAL; goto cleanup; } + if (dma_set_mask_and_coherent(&rx->adapter->pdev->dev, + DMA_BIT_MASK(64))) { + if (dma_set_mask_and_coherent(&rx->adapter->pdev->dev, + DMA_BIT_MASK(32))) { + dev_warn(&rx->adapter->pdev->dev, + "lan743x_: No suitable DMA available\n"); + ret = -ENOMEM; + goto cleanup; + } + } ring_allocation_size = ALIGN(rx->ring_size * sizeof(struct lan743x_rx_descriptor), PAGE_SIZE); From aee2930cd6193fd5f95861ed3cfb475a87b7acd2 Mon Sep 17 00:00:00 2001 From: Trevor Woerner Date: Sun, 24 Oct 2021 13:50:02 -0400 Subject: [PATCH 1292/5344] net: nxp: lpc_eth.c: avoid hang when bringing interface down commit ace19b992436a257d9a793672e57abc28fe83e2e upstream. A hard hang is observed whenever the ethernet interface is brought down. If the PHY is stopped before the LPC core block is reset, the SoC will hang. Comparing lpc_eth_close() and lpc_eth_open() I re-arranged the ordering of the functions calls in lpc_eth_close() to reset the hardware before stopping the PHY. Fixes: b7370112f519 ("lpc32xx: Added ethernet driver") Signed-off-by: Trevor Woerner Acked-by: Vladimir Zapolskiy Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/nxp/lpc_eth.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/nxp/lpc_eth.c b/drivers/net/ethernet/nxp/lpc_eth.c index 1d59ef367a85c..3b177421651f1 100644 --- a/drivers/net/ethernet/nxp/lpc_eth.c +++ b/drivers/net/ethernet/nxp/lpc_eth.c @@ -1007,9 +1007,6 @@ static int lpc_eth_close(struct net_device *ndev) napi_disable(&pldat->napi); netif_stop_queue(ndev); - if (ndev->phydev) - phy_stop(ndev->phydev); - spin_lock_irqsave(&pldat->lock, flags); __lpc_eth_reset(pldat); netif_carrier_off(ndev); @@ -1017,6 +1014,8 @@ static int lpc_eth_close(struct net_device *ndev) writel(0, LPC_ENET_MAC2(pldat->net_base)); spin_unlock_irqrestore(&pldat->lock, flags); + if (ndev->phydev) + phy_stop(ndev->phydev); clk_disable_unprepare(pldat->clk); return 0; From 27fb23b90d5fb8d0c113b22956fa3c1ddd8649d7 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Wed, 27 Oct 2021 17:59:21 -0400 Subject: [PATCH 1293/5344] net/tls: Fix flipped sign in async_wait.err assignment commit 1d9d6fd21ad4a28b16ed9ee5432ae738b9dc58aa upstream. sk->sk_err contains a positive number, yet async_wait.err wants the opposite. Fix the missed sign flip, which Jakub caught by inspection. Fixes: a42055e8d2c3 ("net/tls: Add support for async encryption of records for performance") Suggested-by: Jakub Kicinski Signed-off-by: Daniel Jordan Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/tls/tls_sw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 8c9d2055d92b0..02821b9140546 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -456,7 +456,7 @@ static void tls_encrypt_done(struct crypto_async_request *req, int err) /* If err is already set on socket, return the same code */ if (sk->sk_err) { - ctx->async_wait.err = sk->sk_err; + ctx->async_wait.err = -sk->sk_err; } else { ctx->async_wait.err = err; tls_err_abort(sk, err); From 635a7f91c2af0aee0a6e5c3510186f72184ff2c0 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 24 Oct 2021 21:48:02 +0200 Subject: [PATCH 1294/5344] phy: phy_ethtool_ksettings_get: Lock the phy for consistency commit c10a485c3de5ccbf1fff65a382cebcb2730c6b06 upstream. The PHY structure should be locked while copying information out if it, otherwise there is no guarantee of self consistency. Without the lock the PHY state machine could be updating the structure. Fixes: 2d55173e71b0 ("phy: add generic function to support ksetting support") Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/phy/phy.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index b0b8a3ce82b68..168a36857036d 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -367,6 +367,7 @@ EXPORT_SYMBOL(phy_ethtool_ksettings_set); void phy_ethtool_ksettings_get(struct phy_device *phydev, struct ethtool_link_ksettings *cmd) { + mutex_lock(&phydev->lock); linkmode_copy(cmd->link_modes.supported, phydev->supported); linkmode_copy(cmd->link_modes.advertising, phydev->advertising); linkmode_copy(cmd->link_modes.lp_advertising, phydev->lp_advertising); @@ -383,6 +384,7 @@ void phy_ethtool_ksettings_get(struct phy_device *phydev, cmd->base.autoneg = phydev->autoneg; cmd->base.eth_tp_mdix_ctrl = phydev->mdix_ctrl; cmd->base.eth_tp_mdix = phydev->mdix; + mutex_unlock(&phydev->lock); } EXPORT_SYMBOL(phy_ethtool_ksettings_get); From 4d5818cd91d8751cf197414707611c19f576cd96 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 24 Oct 2021 21:48:04 +0200 Subject: [PATCH 1295/5344] phy: phy_start_aneg: Add an unlocked version commit 707293a56f95f8e7e0cfae008010c7933fb68973 upstream. Split phy_start_aneg into a wrapper which takes the PHY lock, and a helper doing the real work. This will be needed when phy_ethtook_ksettings_set takes the lock. Fixes: 2d55173e71b0 ("phy: add generic function to support ksetting support") Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/phy/phy.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 168a36857036d..6c52ff8c0d2eb 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -555,7 +555,7 @@ static int phy_check_link_status(struct phy_device *phydev) } /** - * phy_start_aneg - start auto-negotiation for this PHY device + * _phy_start_aneg - start auto-negotiation for this PHY device * @phydev: the phy_device struct * * Description: Sanitizes the settings (if we're not autonegotiating @@ -563,25 +563,43 @@ static int phy_check_link_status(struct phy_device *phydev) * If the PHYCONTROL Layer is operating, we change the state to * reflect the beginning of Auto-negotiation or forcing. */ -int phy_start_aneg(struct phy_device *phydev) +static int _phy_start_aneg(struct phy_device *phydev) { int err; + lockdep_assert_held(&phydev->lock); + if (!phydev->drv) return -EIO; - mutex_lock(&phydev->lock); - if (AUTONEG_DISABLE == phydev->autoneg) phy_sanitize_settings(phydev); err = phy_config_aneg(phydev); if (err < 0) - goto out_unlock; + return err; if (phy_is_started(phydev)) err = phy_check_link_status(phydev); -out_unlock: + + return err; +} + +/** + * phy_start_aneg - start auto-negotiation for this PHY device + * @phydev: the phy_device struct + * + * Description: Sanitizes the settings (if we're not autonegotiating + * them), and then calls the driver's config_aneg function. + * If the PHYCONTROL Layer is operating, we change the state to + * reflect the beginning of Auto-negotiation or forcing. + */ +int phy_start_aneg(struct phy_device *phydev) +{ + int err; + + mutex_lock(&phydev->lock); + err = _phy_start_aneg(phydev); mutex_unlock(&phydev->lock); return err; From f429981a68fc5bf12a62aaeee0f06d2b6f385fb6 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 20 Oct 2021 07:42:41 -0400 Subject: [PATCH 1296/5344] sctp: use init_tag from inithdr for ABORT chunk [ Upstream commit 4f7019c7eb33967eb87766e0e4602b5576873680 ] Currently Linux SCTP uses the verification tag of the existing SCTP asoc when failing to process and sending the packet with the ABORT chunk. This will result in the peer accepting the ABORT chunk and removing the SCTP asoc. One could exploit this to terminate a SCTP asoc. This patch is to fix it by always using the initiate tag of the received INIT chunk for the ABORT chunk to be sent. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/sctp/sm_statefuns.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 82a202d71a31e..962b848459f5d 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -6248,6 +6248,7 @@ static struct sctp_packet *sctp_ootb_pkt_new( * yet. */ switch (chunk->chunk_hdr->type) { + case SCTP_CID_INIT: case SCTP_CID_INIT_ACK: { struct sctp_initack_chunk *initack; From a8a1a6c196bd34d548dfce0ebaeb49ca3b624e9e Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 20 Oct 2021 07:42:43 -0400 Subject: [PATCH 1297/5344] sctp: fix the processing for INIT_ACK chunk [ Upstream commit 438b95a7c98f77d51cbf4db021f41b602d750a3f ] Currently INIT_ACK chunk in non-cookie_echoed state is processed in sctp_sf_discard_chunk() to send an abort with the existent asoc's vtag if the chunk length is not valid. But the vtag in the chunk's sctphdr is not verified, which may be exploited by one to cook a malicious chunk to terminal a SCTP asoc. sctp_sf_discard_chunk() also is called in many other places to send an abort, and most of those have this problem. This patch is to fix it by sending abort with the existent asoc's vtag only if the vtag from the chunk's sctphdr is verified in sctp_sf_discard_chunk(). Note on sctp_sf_do_9_1_abort() and sctp_sf_shutdown_pending_abort(), the chunk length has been verified before sctp_sf_discard_chunk(), so replace it with sctp_sf_discard(). On sctp_sf_do_asconf_ack() and sctp_sf_do_asconf(), move the sctp_chunk_length_valid check ahead of sctp_sf_discard_chunk(), then replace it with sctp_sf_discard(). Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/sctp/sm_statefuns.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 962b848459f5d..80e19f5d17384 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -2280,7 +2280,7 @@ enum sctp_disposition sctp_sf_shutdown_pending_abort( */ if (SCTP_ADDR_DEL == sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest)) - return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands); + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); if (!sctp_err_chunk_valid(chunk)) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); @@ -2326,7 +2326,7 @@ enum sctp_disposition sctp_sf_shutdown_sent_abort( */ if (SCTP_ADDR_DEL == sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest)) - return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands); + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); if (!sctp_err_chunk_valid(chunk)) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); @@ -2596,7 +2596,7 @@ enum sctp_disposition sctp_sf_do_9_1_abort( */ if (SCTP_ADDR_DEL == sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest)) - return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands); + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); if (!sctp_err_chunk_valid(chunk)) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); @@ -3745,6 +3745,11 @@ enum sctp_disposition sctp_sf_do_asconf(struct net *net, return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); } + /* Make sure that the ASCONF ADDIP chunk has a valid length. */ + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_addip_chunk))) + return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, + commands); + /* ADD-IP: Section 4.1.1 * This chunk MUST be sent in an authenticated way by using * the mechanism defined in [I-D.ietf-tsvwg-sctp-auth]. If this chunk @@ -3753,13 +3758,7 @@ enum sctp_disposition sctp_sf_do_asconf(struct net *net, */ if (!asoc->peer.asconf_capable || (!net->sctp.addip_noauth && !chunk->auth)) - return sctp_sf_discard_chunk(net, ep, asoc, type, arg, - commands); - - /* Make sure that the ASCONF ADDIP chunk has a valid length. */ - if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_addip_chunk))) - return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, - commands); + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); hdr = (struct sctp_addiphdr *)chunk->skb->data; serial = ntohl(hdr->serial); @@ -3888,6 +3887,12 @@ enum sctp_disposition sctp_sf_do_asconf_ack(struct net *net, return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); } + /* Make sure that the ADDIP chunk has a valid length. */ + if (!sctp_chunk_length_valid(asconf_ack, + sizeof(struct sctp_addip_chunk))) + return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, + commands); + /* ADD-IP, Section 4.1.2: * This chunk MUST be sent in an authenticated way by using * the mechanism defined in [I-D.ietf-tsvwg-sctp-auth]. If this chunk @@ -3896,14 +3901,7 @@ enum sctp_disposition sctp_sf_do_asconf_ack(struct net *net, */ if (!asoc->peer.asconf_capable || (!net->sctp.addip_noauth && !asconf_ack->auth)) - return sctp_sf_discard_chunk(net, ep, asoc, type, arg, - commands); - - /* Make sure that the ADDIP chunk has a valid length. */ - if (!sctp_chunk_length_valid(asconf_ack, - sizeof(struct sctp_addip_chunk))) - return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, - commands); + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); addip_hdr = (struct sctp_addiphdr *)asconf_ack->skb->data; rcvd_serial = ntohl(addip_hdr->serial); @@ -4475,6 +4473,9 @@ enum sctp_disposition sctp_sf_discard_chunk(struct net *net, { struct sctp_chunk *chunk = arg; + if (asoc && !sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + /* Make sure that the chunk has a valid length. * Since we don't know the chunk type, we use a general * chunkhdr structure to make a comparison. From ec4214f10b4e62441246519fa7b7f4af03c25df9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 20 Oct 2021 07:42:44 -0400 Subject: [PATCH 1298/5344] sctp: fix the processing for COOKIE_ECHO chunk [ Upstream commit a64b341b8695e1c744dd972b39868371b4f68f83 ] 1. In closed state: in sctp_sf_do_5_1D_ce(): When asoc is NULL, making packet for abort will use chunk's vtag in sctp_ootb_pkt_new(). But when asoc exists, vtag from the chunk should be verified before using peer.i.init_tag to make packet for abort in sctp_ootb_pkt_new(), and just discard it if vtag is not correct. 2. In the other states: in sctp_sf_do_5_2_4_dupcook(): asoc always exists, but duplicate cookie_echo's vtag will be handled by sctp_tietags_compare() and then take actions, so before that we only verify the vtag for the abort sent for invalid chunk length. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/sctp/sm_statefuns.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 80e19f5d17384..1e3f6be5bab9b 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -697,6 +697,9 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net, struct sock *sk; int error = 0; + if (asoc && !sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + /* If the packet is an OOTB packet which is temporarily on the * control endpoint, respond with an ABORT. */ @@ -711,7 +714,8 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net, * in sctp_unpack_cookie(). */ if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr))) - return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, + commands); /* If the endpoint is not listening or if the number of associations * on the TCP-style socket exceed the max backlog, respond with an @@ -2141,9 +2145,11 @@ enum sctp_disposition sctp_sf_do_5_2_4_dupcook( * enough for the chunk header. Cookie length verification is * done later. */ - if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr))) - return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, - commands); + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr))) { + if (!sctp_vtag_verify(chunk, asoc)) + asoc = NULL; + return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); + } /* "Decode" the chunk. We have no optional parameters so we * are in good shape. From 59cedf41bcedad6291e8160f18b6621a7166e31d Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 20 Oct 2021 07:42:45 -0400 Subject: [PATCH 1299/5344] sctp: add vtag check in sctp_sf_violation [ Upstream commit aa0f697e45286a6b5f0ceca9418acf54b9099d99 ] sctp_sf_violation() is called when processing HEARTBEAT_ACK chunk in cookie_wait state, and some other places are also using it. The vtag in the chunk's sctphdr should be verified, otherwise, as later in chunk length check, it may send abort with the existent asoc's vtag, which can be exploited by one to cook a malicious chunk to terminate a SCTP asoc. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/sctp/sm_statefuns.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 1e3f6be5bab9b..35701acbed739 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -4549,6 +4549,9 @@ enum sctp_disposition sctp_sf_violation(struct net *net, { struct sctp_chunk *chunk = arg; + if (!sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + /* Make sure that the chunk has a valid length. */ if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr))) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, From 5e59193fc1b12abfcc7aed6c2d7af4003a062a9f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 20 Oct 2021 07:42:46 -0400 Subject: [PATCH 1300/5344] sctp: add vtag check in sctp_sf_do_8_5_1_E_sa [ Upstream commit ef16b1734f0a176277b7bb9c71a6d977a6ef3998 ] sctp_sf_do_8_5_1_E_sa() is called when processing SHUTDOWN_ACK chunk in cookie_wait and cookie_echoed state. The vtag in the chunk's sctphdr should be verified, otherwise, as later in chunk length check, it may send abort with the existent asoc's vtag, which can be exploited by one to cook a malicious chunk to terminate a SCTP asoc. Note that when fails to verify the vtag from SHUTDOWN-ACK chunk, SHUTDOWN COMPLETE message will still be sent back to peer, but with the vtag from SHUTDOWN-ACK chunk, as said in 5) of rfc4960#section-8.4. While at it, also remove the unnecessary chunk length check from sctp_sf_shut_8_4_5(), as it's already done in both places where it calls sctp_sf_shut_8_4_5(). Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/sctp/sm_statefuns.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 35701acbed739..877420868a420 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -3683,12 +3683,6 @@ static enum sctp_disposition sctp_sf_shut_8_4_5( SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS); - /* If the chunk length is invalid, we don't want to process - * the reset of the packet. - */ - if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr))) - return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); - /* We need to discard the rest of the packet to prevent * potential bomming attacks from additional bundled chunks. * This is documented in SCTP Threats ID. @@ -3716,6 +3710,9 @@ enum sctp_disposition sctp_sf_do_8_5_1_E_sa(struct net *net, { struct sctp_chunk *chunk = arg; + if (!sctp_vtag_verify(chunk, asoc)) + asoc = NULL; + /* Make sure that the SHUTDOWN_ACK chunk has a valid length. */ if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr))) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, From f9d7cb94933372f6a27198e282ecf41335c3dc41 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 20 Oct 2021 07:42:47 -0400 Subject: [PATCH 1301/5344] sctp: add vtag check in sctp_sf_ootb [ Upstream commit 9d02831e517aa36ee6bdb453a0eb47bd49923fe3 ] sctp_sf_ootb() is called when processing DATA chunk in closed state, and many other places are also using it. The vtag in the chunk's sctphdr should be verified, otherwise, as later in chunk length check, it may send abort with the existent asoc's vtag, which can be exploited by one to cook a malicious chunk to terminate a SCTP asoc. When fails to verify the vtag from the chunk, this patch sets asoc to NULL, so that the abort will be made with the vtag from the received chunk later. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/sctp/sm_statefuns.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 877420868a420..7c6dcbc8e98ba 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -3568,6 +3568,9 @@ enum sctp_disposition sctp_sf_ootb(struct net *net, SCTP_INC_STATS(net, SCTP_MIB_OUTOFBLUES); + if (asoc && !sctp_vtag_verify(chunk, asoc)) + asoc = NULL; + ch = (struct sctp_chunkhdr *)chunk->chunk_hdr; do { /* Report violation if the chunk is less then minimal */ From 914655fbf2a040cc047b0347af72be3a4f92e342 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Thu, 20 Feb 2020 09:00:07 +0100 Subject: [PATCH 1302/5344] net: use netif_is_bridge_port() to check for IFF_BRIDGE_PORT [ Upstream commit 2e92a2d0e450740ebe7e7a816162327ad1fde94b ] Trivial cleanup, so that all bridge port-specific code can be found in one go. CC: Johannes Berg CC: Roopa Prabhu CC: Nikolay Aleksandrov Signed-off-by: Julian Wiedmann Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/bonding/bond_main.c | 2 +- drivers/net/ethernet/micrel/ksz884x.c | 2 +- net/core/rtnetlink.c | 12 ++++++------ net/wireless/nl80211.c | 2 +- net/wireless/util.c | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 1949f631e1bc5..a7eaf80f500c0 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1219,7 +1219,7 @@ static rx_handler_result_t bond_handle_frame(struct sk_buff **pskb) skb->dev = bond->dev; if (BOND_MODE(bond) == BOND_MODE_ALB && - bond->dev->priv_flags & IFF_BRIDGE_PORT && + netif_is_bridge_port(bond->dev) && skb->pkt_type == PACKET_HOST) { if (unlikely(skb_cow_head(skb, diff --git a/drivers/net/ethernet/micrel/ksz884x.c b/drivers/net/ethernet/micrel/ksz884x.c index 7dc451fdaf35e..2431723bc2fb4 100644 --- a/drivers/net/ethernet/micrel/ksz884x.c +++ b/drivers/net/ethernet/micrel/ksz884x.c @@ -5693,7 +5693,7 @@ static void dev_set_promiscuous(struct net_device *dev, struct dev_priv *priv, * from the bridge. */ if ((hw->features & STP_SUPPORT) && !promiscuous && - (dev->priv_flags & IFF_BRIDGE_PORT)) { + netif_is_bridge_port(dev)) { struct ksz_switch *sw = hw->ksz_switch; int port = priv->port.first_port; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a53b101ce41ae..55c0f32b9375b 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3729,7 +3729,7 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, /* Support fdb on master device the net/bridge default case */ if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) && - (dev->priv_flags & IFF_BRIDGE_PORT)) { + netif_is_bridge_port(dev)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops = br_dev->netdev_ops; @@ -3840,7 +3840,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, /* Support fdb on master device the net/bridge default case */ if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) && - (dev->priv_flags & IFF_BRIDGE_PORT)) { + netif_is_bridge_port(dev)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops = br_dev->netdev_ops; @@ -4066,13 +4066,13 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) continue; if (!br_idx) { /* user did not specify a specific bridge */ - if (dev->priv_flags & IFF_BRIDGE_PORT) { + if (netif_is_bridge_port(dev)) { br_dev = netdev_master_upper_dev_get(dev); cops = br_dev->netdev_ops; } } else { if (dev != br_dev && - !(dev->priv_flags & IFF_BRIDGE_PORT)) + !netif_is_bridge_port(dev)) continue; if (br_dev != netdev_master_upper_dev_get(dev) && @@ -4084,7 +4084,7 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) if (idx < s_idx) goto cont; - if (dev->priv_flags & IFF_BRIDGE_PORT) { + if (netif_is_bridge_port(dev)) { if (cops && cops->ndo_fdb_dump) { err = cops->ndo_fdb_dump(skb, cb, br_dev, dev, @@ -4234,7 +4234,7 @@ static int rtnl_fdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (dev) { if (!ndm_flags || (ndm_flags & NTF_MASTER)) { - if (!(dev->priv_flags & IFF_BRIDGE_PORT)) { + if (!netif_is_bridge_port(dev)) { NL_SET_ERR_MSG(extack, "Device is not a bridge port"); return -EINVAL; } diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 7b170ed6923e7..7633d6a74bc2b 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3480,7 +3480,7 @@ static int nl80211_valid_4addr(struct cfg80211_registered_device *rdev, enum nl80211_iftype iftype) { if (!use_4addr) { - if (netdev && (netdev->priv_flags & IFF_BRIDGE_PORT)) + if (netdev && netif_is_bridge_port(netdev)) return -EBUSY; return 0; } diff --git a/net/wireless/util.c b/net/wireless/util.c index f0247eab5bc94..82b3baed2c7dd 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -976,7 +976,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, return -EOPNOTSUPP; /* if it's part of a bridge, reject changing type to station/ibss */ - if ((dev->priv_flags & IFF_BRIDGE_PORT) && + if (netif_is_bridge_port(dev) && (ntype == NL80211_IFTYPE_ADHOC || ntype == NL80211_IFTYPE_STATION || ntype == NL80211_IFTYPE_P2P_CLIENT)) From 8c0113ed0589da9cf8f3521983cecec70ef2ee10 Mon Sep 17 00:00:00 2001 From: Janusz Dziedzic Date: Sun, 24 Oct 2021 22:15:46 +0200 Subject: [PATCH 1303/5344] cfg80211: correct bridge/4addr mode check [ Upstream commit 689a0a9f505f7bffdefe6f17fddb41c8ab6344f6 ] Without the patch we fail: $ sudo brctl addbr br0 $ sudo brctl addif br0 wlp1s0 $ sudo iw wlp1s0 set 4addr on command failed: Device or resource busy (-16) Last command failed but iface was already in 4addr mode. Fixes: ad4bb6f8883a ("cfg80211: disallow bridging managed/adhoc interfaces") Signed-off-by: Janusz Dziedzic Link: https://lore.kernel.org/r/20211024201546.614379-1-janusz.dziedzic@gmail.com [add fixes tag, fix indentation, edit commit log] Signed-off-by: Johannes Berg Signed-off-by: Sasha Levin --- net/wireless/util.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/net/wireless/util.c b/net/wireless/util.c index 82b3baed2c7dd..aaefaf3422a1a 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -975,14 +975,14 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, !(rdev->wiphy.interface_modes & (1 << ntype))) return -EOPNOTSUPP; - /* if it's part of a bridge, reject changing type to station/ibss */ - if (netif_is_bridge_port(dev) && - (ntype == NL80211_IFTYPE_ADHOC || - ntype == NL80211_IFTYPE_STATION || - ntype == NL80211_IFTYPE_P2P_CLIENT)) - return -EBUSY; - if (ntype != otype) { + /* if it's part of a bridge, reject changing type to station/ibss */ + if (netif_is_bridge_port(dev) && + (ntype == NL80211_IFTYPE_ADHOC || + ntype == NL80211_IFTYPE_STATION || + ntype == NL80211_IFTYPE_P2P_CLIENT)) + return -EBUSY; + dev->ieee80211_ptr->use_4addr = false; dev->ieee80211_ptr->mesh_id_up_len = 0; wdev_lock(dev->ieee80211_ptr); From 59cb0a6ea2f1b21b771d2fa4c35fc180c0db6778 Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Tue, 19 Oct 2021 19:53:59 +0200 Subject: [PATCH 1304/5344] KVM: s390: clear kicked_mask before sleeping again [ Upstream commit 9b57e9d5010bbed7c0d9d445085840f7025e6f9a ] The idea behind kicked mask is that we should not re-kick a vcpu that is already in the "kick" process, i.e. that was kicked and is is about to be dispatched if certain conditions are met. The problem with the current implementation is, that it assumes the kicked vcpu is going to enter SIE shortly. But under certain circumstances, the vcpu we just kicked will be deemed non-runnable and will remain in wait state. This can happen, if the interrupt(s) this vcpu got kicked to deal with got already cleared (because the interrupts got delivered to another vcpu). In this case kvm_arch_vcpu_runnable() would return false, and the vcpu would remain in kvm_vcpu_block(), but this time with its kicked_mask bit set. So next time around we wouldn't kick the vcpu form __airqs_kick_single_vcpu(), but would assume that we just kicked it. Let us make sure the kicked_mask is cleared before we give up on re-dispatching the vcpu. Fixes: 9f30f6216378 ("KVM: s390: add gib_alert_irq_handler()") Reported-by: Matthew Rosato Signed-off-by: Halil Pasic Reviewed-by: Christian Borntraeger Reviewed-by: Michael Mueller Reviewed-by: Claudio Imbrenda Link: https://lore.kernel.org/r/20211019175401.3757927-2-pasic@linux.ibm.com Signed-off-by: Christian Borntraeger Signed-off-by: Sasha Levin --- arch/s390/kvm/kvm-s390.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 04967831e0be1..8986935d9e4a8 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -3092,6 +3092,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { + clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.gisa_int.kicked_mask); return kvm_s390_vcpu_has_irq(vcpu, 0); } From 26c2447c99a86de3783b7b57434b659a8bd9e148 Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Tue, 19 Oct 2021 19:54:00 +0200 Subject: [PATCH 1305/5344] KVM: s390: preserve deliverable_mask in __airqs_kick_single_vcpu [ Upstream commit 0e9ff65f455dfd0a8aea5e7843678ab6fe097e21 ] Changing the deliverable mask in __airqs_kick_single_vcpu() is a bug. If one idle vcpu can't take the interrupts we want to deliver, we should look for another vcpu that can, instead of saying that we don't want to deliver these interrupts by clearing the bits from the deliverable_mask. Fixes: 9f30f6216378 ("KVM: s390: add gib_alert_irq_handler()") Signed-off-by: Halil Pasic Reviewed-by: Christian Borntraeger Reviewed-by: Michael Mueller Reviewed-by: Claudio Imbrenda Link: https://lore.kernel.org/r/20211019175401.3757927-3-pasic@linux.ibm.com Signed-off-by: Christian Borntraeger Signed-off-by: Sasha Levin --- arch/s390/kvm/interrupt.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index fa9483aa4f575..fd73a8aa89d23 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2987,13 +2987,14 @@ static void __airqs_kick_single_vcpu(struct kvm *kvm, u8 deliverable_mask) int vcpu_idx, online_vcpus = atomic_read(&kvm->online_vcpus); struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; struct kvm_vcpu *vcpu; + u8 vcpu_isc_mask; for_each_set_bit(vcpu_idx, kvm->arch.idle_mask, online_vcpus) { vcpu = kvm_get_vcpu(kvm, vcpu_idx); if (psw_ioint_disabled(vcpu)) continue; - deliverable_mask &= (u8)(vcpu->arch.sie_block->gcr[6] >> 24); - if (deliverable_mask) { + vcpu_isc_mask = (u8)(vcpu->arch.sie_block->gcr[6] >> 24); + if (deliverable_mask & vcpu_isc_mask) { /* lately kicked but not yet running */ if (test_and_set_bit(vcpu_idx, gi->kicked_mask)) return; From 89ccff5ca574d82653a6f707e7623a9bbcd847ba Mon Sep 17 00:00:00 2001 From: Song Liu Date: Sun, 3 Oct 2021 22:32:38 -0700 Subject: [PATCH 1306/5344] perf script: Check session->header.env.arch before using it commit 29c77550eef31b0d72a45b49eeab03b8963264e8 upstream. When perf.data is not written cleanly, we would like to process existing data as much as possible (please see f_header.data.size == 0 condition in perf_session__read_header). However, perf.data with partial data may crash perf. Specifically, we see crash in 'perf script' for NULL session->header.env.arch. Fix this by checking session->header.env.arch before using it to determine native_arch. Also split the if condition so it is easier to read. Committer notes: If it is a pipe, we already assume is a native arch, so no need to check session->header.env.arch. Signed-off-by: Song Liu Cc: Peter Zijlstra Cc: kernel-team@fb.com Cc: stable@vger.kernel.org Link: http://lore.kernel.org/lkml/20211004053238.514936-1-songliubraving@fb.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Greg Kroah-Hartman --- tools/perf/builtin-script.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index f3ff825d9dd33..3a169a026635d 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -3779,11 +3779,15 @@ int cmd_script(int argc, const char **argv) goto out_delete; uname(&uts); - if (data.is_pipe || /* assume pipe_mode indicates native_arch */ - !strcmp(uts.machine, session->header.env.arch) || - (!strcmp(uts.machine, "x86_64") && - !strcmp(session->header.env.arch, "i386"))) + if (data.is_pipe) { /* Assume pipe_mode indicates native_arch */ native_arch = true; + } else if (session->header.env.arch) { + if (!strcmp(uts.machine, session->header.env.arch)) + native_arch = true; + else if (!strcmp(uts.machine, "x86_64") && + !strcmp(session->header.env.arch, "i386")) + native_arch = true; + } script.session = session; script__setup_sample_type(&script); From 72ad9bfa4ca1378a3d7117116905c5e123eb7d27 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 2 Nov 2021 19:46:16 +0100 Subject: [PATCH 1307/5344] Linux 5.4.157 Link: https://lore.kernel.org/r/20211101082500.203657870@linuxfoundation.org Link: https://lore.kernel.org/r/20211101114235.515637019@linuxfoundation.org Tested-by: Jon Hunter Tested-by: Shuah Khan Tested-by: Guenter Roeck Tested-by: Linux Kernel Functional Testing Tested-by: Hulk Robot Tested-by: Sudip Mukherjee Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index ced1f0fd48dc6..49d639fe5a801 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 PATCHLEVEL = 4 -SUBLEVEL = 156 +SUBLEVEL = 157 EXTRAVERSION = NAME = Kleptomaniac Octopus From 9cc56afd556a375f2e76705717986a0fd33fc9fe Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 8 Oct 2021 13:01:18 +0800 Subject: [PATCH 1308/5344] scsi: core: Put LLD module refcnt after SCSI device is released commit f2b85040acec9a928b4eb1b57a989324e8e38d3f upstream. SCSI host release is triggered when SCSI device is freed. We have to make sure that the low-level device driver module won't be unloaded before SCSI host instance is released because shost->hostt is required in the release handler. Make sure to put LLD module refcnt after SCSI device is released. Fixes a kernel panic of 'BUG: unable to handle page fault for address' reported by Changhui and Yi. Link: https://lore.kernel.org/r/20211008050118.1440686-1-ming.lei@redhat.com Cc: Greg Kroah-Hartman Reported-by: Changhui Zhong Reported-by: Yi Zhang Tested-by: Yi Zhang Signed-off-by: Ming Lei Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/scsi.c | 4 +++- drivers/scsi/scsi_sysfs.c | 9 +++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index 1f5b5c8a7f726..1ce3f90f782fd 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -555,8 +555,10 @@ EXPORT_SYMBOL(scsi_device_get); */ void scsi_device_put(struct scsi_device *sdev) { - module_put(sdev->host->hostt->module); + struct module *mod = sdev->host->hostt->module; + put_device(&sdev->sdev_gendev); + module_put(mod); } EXPORT_SYMBOL(scsi_device_put); diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c index 6aeb79e744e0b..12064ce76777d 100644 --- a/drivers/scsi/scsi_sysfs.c +++ b/drivers/scsi/scsi_sysfs.c @@ -438,9 +438,12 @@ static void scsi_device_dev_release_usercontext(struct work_struct *work) struct list_head *this, *tmp; struct scsi_vpd *vpd_pg80 = NULL, *vpd_pg83 = NULL; unsigned long flags; + struct module *mod; sdev = container_of(work, struct scsi_device, ew.work); + mod = sdev->host->hostt->module; + scsi_dh_release_device(sdev); parent = sdev->sdev_gendev.parent; @@ -481,11 +484,17 @@ static void scsi_device_dev_release_usercontext(struct work_struct *work) if (parent) put_device(parent); + module_put(mod); } static void scsi_device_dev_release(struct device *dev) { struct scsi_device *sdp = to_scsi_device(dev); + + /* Set module pointer as NULL in case of module unloading */ + if (!try_module_get(sdp->host->hostt->module)) + sdp->host->hostt->module = NULL; + execute_in_process_context(scsi_device_dev_release_usercontext, &sdp->ew); } From dc6552c330aaccde05a58fa9e594e8b4fabd9abb Mon Sep 17 00:00:00 2001 From: Eugene Crosser Date: Mon, 18 Oct 2021 20:22:50 +0200 Subject: [PATCH 1309/5344] vrf: Revert "Reset skb conntrack connection..." commit 55161e67d44fdd23900be166a81e996abd6e3be9 upstream. This reverts commit 09e856d54bda5f288ef8437a90ab2b9b3eab83d1. When an interface is enslaved in a VRF, prerouting conntrack hook is called twice: once in the context of the original input interface, and once in the context of the VRF interface. If no special precausions are taken, this leads to creation of two conntrack entries instead of one, and breaks SNAT. Commit above was intended to avoid creation of extra conntrack entries when input interface is enslaved in a VRF. It did so by resetting conntrack related data associated with the skb when it enters VRF context. However it breaks netfilter operation. Imagine a use case when conntrack zone must be assigned based on the original input interface, rather than VRF interface (that would make original interfaces indistinguishable). One could create netfilter rules similar to these: chain rawprerouting { type filter hook prerouting priority raw; iif realiface1 ct zone set 1 return iif realiface2 ct zone set 2 return } This works before the mentioned commit, but not after: zone assignment is "forgotten", and any subsequent NAT or filtering that is dependent on the conntrack zone does not work. Here is a reproducer script that demonstrates the difference in behaviour. ========== #!/bin/sh # This script demonstrates unexpected change of nftables behaviour # caused by commit 09e856d54bda5f28 ""vrf: Reset skb conntrack # connection on VRF rcv" # https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=09e856d54bda5f288ef8437a90ab2b9b3eab83d1 # # Before the commit, it was possible to assign conntrack zone to a # packet (or mark it for `notracking`) in the prerouting chanin, raw # priority, based on the `iif` (interface from which the packet # arrived). # After the change, # if the interface is enslaved in a VRF, such # assignment is lost. Instead, assignment based on the `iif` matching # the VRF master interface is honored. Thus it is impossible to # distinguish packets based on the original interface. # # This script demonstrates this change of behaviour: conntrack zone 1 # or 2 is assigned depending on the match with the original interface # or the vrf master interface. It can be observed that conntrack entry # appears in different zone in the kernel versions before and after # the commit. IPIN=172.30.30.1 IPOUT=172.30.30.2 PFXL=30 ip li sh vein >/dev/null 2>&1 && ip li del vein ip li sh tvrf >/dev/null 2>&1 && ip li del tvrf nft list table testct >/dev/null 2>&1 && nft delete table testct ip li add vein type veth peer veout ip li add tvrf type vrf table 9876 ip li set veout master tvrf ip li set vein up ip li set veout up ip li set tvrf up /sbin/sysctl -w net.ipv4.conf.veout.accept_local=1 /sbin/sysctl -w net.ipv4.conf.veout.rp_filter=0 ip addr add $IPIN/$PFXL dev vein ip addr add $IPOUT/$PFXL dev veout nft -f - <<__END__ table testct { chain rawpre { type filter hook prerouting priority raw; iif { veout, tvrf } meta nftrace set 1 iif veout ct zone set 1 return iif tvrf ct zone set 2 return notrack } chain rawout { type filter hook output priority raw; notrack } } __END__ uname -rv conntrack -F ping -W 1 -c 1 -I vein $IPOUT conntrack -L Signed-off-by: Eugene Crosser Acked-by: David Ahern Signed-off-by: David S. Miller Cc: Florian Westphal Signed-off-by: Greg Kroah-Hartman --- drivers/net/vrf.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 9b626c169554f..f08ed52d51f3f 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1036,8 +1036,6 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, bool need_strict = rt6_need_strict(&ipv6_hdr(skb)->daddr); bool is_ndisc = ipv6_ndisc_frame(skb); - nf_reset_ct(skb); - /* loopback, multicast & non-ND link-local traffic; do not push through * packet taps again. Reset pkt_type for upper layers to process skb. * For strict packets with a source LLA, determine the dst using the @@ -1094,8 +1092,6 @@ static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev, skb->skb_iif = vrf_dev->ifindex; IPCB(skb)->flags |= IPSKB_L3SLAVE; - nf_reset_ct(skb); - if (ipv4_is_multicast(ip_hdr(skb)->daddr)) goto out; From 4e54b6c193ca24b5d00905833575fa5e99ad4b8d Mon Sep 17 00:00:00 2001 From: Yuiko Oshino Date: Wed, 27 Oct 2021 14:23:02 -0400 Subject: [PATCH 1310/5344] net: ethernet: microchip: lan743x: Fix skb allocation failure commit e8684db191e4164f3f5f3ad7dec04a6734c25f1c upstream. The driver allocates skb during ndo_open with GFP_ATOMIC which has high chance of failure when there are multiple instances. GFP_KERNEL is enough while open and use GFP_ATOMIC only from interrupt context. Fixes: 23f0703c125b ("lan743x: Add main source files for new lan743x driver") Signed-off-by: Yuiko Oshino Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/microchip/lan743x_main.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c index a109120da0e7c..22beeb5be9c41 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.c +++ b/drivers/net/ethernet/microchip/lan743x_main.c @@ -1898,13 +1898,13 @@ static int lan743x_rx_next_index(struct lan743x_rx *rx, int index) return ((++index) % rx->ring_size); } -static struct sk_buff *lan743x_rx_allocate_skb(struct lan743x_rx *rx) +static struct sk_buff *lan743x_rx_allocate_skb(struct lan743x_rx *rx, gfp_t gfp) { int length = 0; length = (LAN743X_MAX_FRAME_SIZE + ETH_HLEN + 4 + RX_HEAD_PADDING); return __netdev_alloc_skb(rx->adapter->netdev, - length, GFP_ATOMIC | GFP_DMA); + length, gfp); } static void lan743x_rx_update_tail(struct lan743x_rx *rx, int index) @@ -2077,7 +2077,8 @@ static int lan743x_rx_process_packet(struct lan743x_rx *rx) struct sk_buff *new_skb = NULL; int packet_length; - new_skb = lan743x_rx_allocate_skb(rx); + new_skb = lan743x_rx_allocate_skb(rx, + GFP_ATOMIC | GFP_DMA); if (!new_skb) { /* failed to allocate next skb. * Memory is very low. @@ -2314,7 +2315,8 @@ static int lan743x_rx_ring_init(struct lan743x_rx *rx) rx->last_head = 0; for (index = 0; index < rx->ring_size; index++) { - struct sk_buff *new_skb = lan743x_rx_allocate_skb(rx); + struct sk_buff *new_skb = lan743x_rx_allocate_skb(rx, + GFP_KERNEL); ret = lan743x_rx_init_ring_element(rx, index, new_skb); if (ret) From 29e41bc90c60f0286f37612549d6c6a1e6c3b4ee Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 7 Jun 2021 17:23:48 +0200 Subject: [PATCH 1311/5344] media: firewire: firedtv-avc: fix a buffer overflow in avc_ca_pmt() commit 35d2969ea3c7d32aee78066b1f3cf61a0d935a4e upstream. The bounds checking in avc_ca_pmt() is not strict enough. It should be checking "read_pos + 4" because it's reading 5 bytes. If the "es_info_length" is non-zero then it reads a 6th byte so there needs to be an additional check for that. I also added checks for the "write_pos". I don't think these are required because "read_pos" and "write_pos" are tied together so checking one ought to be enough. But they make the code easier to understand for me. The check on write_pos is: if (write_pos + 4 >= sizeof(c->operand) - 4) { The first "+ 4" is because we're writing 5 bytes and the last " - 4" is to leave space for the CRC. The other problem is that "length" can be invalid. It comes from "data_length" in fdtv_ca_pmt(). Cc: stable@vger.kernel.org Reported-by: Luo Likang Signed-off-by: Dan Carpenter Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/firewire/firedtv-avc.c | 14 +++++++++++--- drivers/media/firewire/firedtv-ci.c | 2 ++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/media/firewire/firedtv-avc.c b/drivers/media/firewire/firedtv-avc.c index 2bf9467b917d1..71991f8638e6b 100644 --- a/drivers/media/firewire/firedtv-avc.c +++ b/drivers/media/firewire/firedtv-avc.c @@ -1165,7 +1165,11 @@ int avc_ca_pmt(struct firedtv *fdtv, char *msg, int length) read_pos += program_info_length; write_pos += program_info_length; } - while (read_pos < length) { + while (read_pos + 4 < length) { + if (write_pos + 4 >= sizeof(c->operand) - 4) { + ret = -EINVAL; + goto out; + } c->operand[write_pos++] = msg[read_pos++]; c->operand[write_pos++] = msg[read_pos++]; c->operand[write_pos++] = msg[read_pos++]; @@ -1177,13 +1181,17 @@ int avc_ca_pmt(struct firedtv *fdtv, char *msg, int length) c->operand[write_pos++] = es_info_length >> 8; c->operand[write_pos++] = es_info_length & 0xff; if (es_info_length > 0) { + if (read_pos >= length) { + ret = -EINVAL; + goto out; + } pmt_cmd_id = msg[read_pos++]; if (pmt_cmd_id != 1 && pmt_cmd_id != 4) dev_err(fdtv->device, "invalid pmt_cmd_id %d at stream level\n", pmt_cmd_id); - if (es_info_length > sizeof(c->operand) - 4 - - write_pos) { + if (es_info_length > sizeof(c->operand) - 4 - write_pos || + es_info_length > length - read_pos) { ret = -EINVAL; goto out; } diff --git a/drivers/media/firewire/firedtv-ci.c b/drivers/media/firewire/firedtv-ci.c index 9363d005e2b61..e0d57e09dab0c 100644 --- a/drivers/media/firewire/firedtv-ci.c +++ b/drivers/media/firewire/firedtv-ci.c @@ -134,6 +134,8 @@ static int fdtv_ca_pmt(struct firedtv *fdtv, void *arg) } else { data_length = msg->msg[3]; } + if (data_length > sizeof(msg->msg) - data_pos) + return -EINVAL; return avc_ca_pmt(fdtv, &msg->msg[data_pos], data_length); } From d315bca846ce32137f3ea4aa145c089d51d5a9ce Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 3 Nov 2021 16:51:12 +0100 Subject: [PATCH 1312/5344] Revert "xhci: Set HCD flag to defer primary roothub registration" This reverts commit 2d7c20db7220bc8dbc560de6e58f024696c790e5 which is commit b7a0a792f864583207c593b50fd1b752ed89f4c1 upstream. It has been reported to be causing problems in Arch and Fedora bug reports. Reported-by: Hans de Goede Link: https://bbs.archlinux.org/viewtopic.php?pid=2000956#p2000956 Link: https://bugzilla.redhat.com/show_bug.cgi?id=2019542 Link: https://bugzilla.redhat.com/show_bug.cgi?id=2019576 Link: https://lore.kernel.org/r/42bcbea6-5eb8-16c7-336a-2cb72e71bc36@redhat.com Cc: Mathias Nyman Cc: Chris Chiu Cc: Alan Stern Cc: Kishon Vijay Abraham I Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index 4ef7484dff8b2..4bb850370bb6b 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -693,7 +693,6 @@ int xhci_run(struct usb_hcd *hcd) if (ret) xhci_free_command(xhci, command); } - set_bit(HCD_FLAG_DEFER_RH_REGISTER, &hcd->flags); xhci_dbg_trace(xhci, trace_xhci_dbg_init, "Finished xhci_run for USB2 roothub"); From 618426c4313ec8f70b8e18aab4c67cc673545662 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 3 Nov 2021 16:51:36 +0100 Subject: [PATCH 1313/5344] Revert "usb: core: hcd: Add support for deferring roothub registration" This reverts commit 20c9fdde30fbe797aec0e0a04fb77013fe473886 which is commit 58877b0824da15698bd85a0a9dbfa8c354e6ecb7 upstream. It has been reported to be causing problems in Arch and Fedora bug reports. Reported-by: Hans de Goede Link: https://bbs.archlinux.org/viewtopic.php?pid=2000956#p2000956 Link: https://bugzilla.redhat.com/show_bug.cgi?id=2019542 Link: https://bugzilla.redhat.com/show_bug.cgi?id=2019576 Link: https://lore.kernel.org/r/42bcbea6-5eb8-16c7-336a-2cb72e71bc36@redhat.com Cc: Mathias Nyman Cc: Chris Chiu Cc: Alan Stern Cc: Kishon Vijay Abraham I Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hcd.c | 29 ++++++----------------------- include/linux/usb/hcd.h | 2 -- 2 files changed, 6 insertions(+), 25 deletions(-) diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 48ff9c66ae46d..d0f45600b6698 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -2636,7 +2636,6 @@ int usb_add_hcd(struct usb_hcd *hcd, { int retval; struct usb_device *rhdev; - struct usb_hcd *shared_hcd; if (!hcd->skip_phy_initialization && usb_hcd_is_primary_hcd(hcd)) { hcd->phy_roothub = usb_phy_roothub_alloc(hcd->self.sysdev); @@ -2793,26 +2792,13 @@ int usb_add_hcd(struct usb_hcd *hcd, goto err_hcd_driver_start; } - /* starting here, usbcore will pay attention to the shared HCD roothub */ - shared_hcd = hcd->shared_hcd; - if (!usb_hcd_is_primary_hcd(hcd) && shared_hcd && HCD_DEFER_RH_REGISTER(shared_hcd)) { - retval = register_root_hub(shared_hcd); - if (retval != 0) - goto err_register_root_hub; - - if (shared_hcd->uses_new_polling && HCD_POLL_RH(shared_hcd)) - usb_hcd_poll_rh_status(shared_hcd); - } - /* starting here, usbcore will pay attention to this root hub */ - if (!HCD_DEFER_RH_REGISTER(hcd)) { - retval = register_root_hub(hcd); - if (retval != 0) - goto err_register_root_hub; + retval = register_root_hub(hcd); + if (retval != 0) + goto err_register_root_hub; - if (hcd->uses_new_polling && HCD_POLL_RH(hcd)) - usb_hcd_poll_rh_status(hcd); - } + if (hcd->uses_new_polling && HCD_POLL_RH(hcd)) + usb_hcd_poll_rh_status(hcd); return retval; @@ -2855,7 +2841,6 @@ EXPORT_SYMBOL_GPL(usb_add_hcd); void usb_remove_hcd(struct usb_hcd *hcd) { struct usb_device *rhdev = hcd->self.root_hub; - bool rh_registered; dev_info(hcd->self.controller, "remove, state %x\n", hcd->state); @@ -2866,7 +2851,6 @@ void usb_remove_hcd(struct usb_hcd *hcd) dev_dbg(hcd->self.controller, "roothub graceful disconnect\n"); spin_lock_irq (&hcd_root_hub_lock); - rh_registered = hcd->rh_registered; hcd->rh_registered = 0; spin_unlock_irq (&hcd_root_hub_lock); @@ -2876,8 +2860,7 @@ void usb_remove_hcd(struct usb_hcd *hcd) cancel_work_sync(&hcd->died_work); mutex_lock(&usb_bus_idr_lock); - if (rh_registered) - usb_disconnect(&rhdev); /* Sets rhdev to NULL */ + usb_disconnect(&rhdev); /* Sets rhdev to NULL */ mutex_unlock(&usb_bus_idr_lock); /* diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index c0eb85b2981e0..712b2a603645f 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -124,7 +124,6 @@ struct usb_hcd { #define HCD_FLAG_RH_RUNNING 5 /* root hub is running? */ #define HCD_FLAG_DEAD 6 /* controller has died? */ #define HCD_FLAG_INTF_AUTHORIZED 7 /* authorize interfaces? */ -#define HCD_FLAG_DEFER_RH_REGISTER 8 /* Defer roothub registration */ /* The flags can be tested using these macros; they are likely to * be slightly faster than test_bit(). @@ -135,7 +134,6 @@ struct usb_hcd { #define HCD_WAKEUP_PENDING(hcd) ((hcd)->flags & (1U << HCD_FLAG_WAKEUP_PENDING)) #define HCD_RH_RUNNING(hcd) ((hcd)->flags & (1U << HCD_FLAG_RH_RUNNING)) #define HCD_DEAD(hcd) ((hcd)->flags & (1U << HCD_FLAG_DEAD)) -#define HCD_DEFER_RH_REGISTER(hcd) ((hcd)->flags & (1U << HCD_FLAG_DEFER_RH_REGISTER)) /* * Specifies if interfaces are authorized by default From da39df072f6e5c512acfdc25089fa071fea338cd Mon Sep 17 00:00:00 2001 From: Erik Ekman Date: Sun, 17 Oct 2021 19:16:57 +0200 Subject: [PATCH 1314/5344] sfc: Fix reading non-legacy supported link modes commit 041c61488236a5a84789083e3d9f0a51139b6edf upstream. Everything except the first 32 bits was lost when the pause flags were added. This makes the 50000baseCR2 mode flag (bit 34) not appear. I have tested this with a 10G card (SFN5122F-R7) by modifying it to return a non-legacy link mode (10000baseCR). Signed-off-by: Erik Ekman Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/sfc/ethtool.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/sfc/ethtool.c b/drivers/net/ethernet/sfc/ethtool.c index 86b965875540d..d53e945dd08fc 100644 --- a/drivers/net/ethernet/sfc/ethtool.c +++ b/drivers/net/ethernet/sfc/ethtool.c @@ -128,20 +128,14 @@ efx_ethtool_get_link_ksettings(struct net_device *net_dev, { struct efx_nic *efx = netdev_priv(net_dev); struct efx_link_state *link_state = &efx->link_state; - u32 supported; mutex_lock(&efx->mac_lock); efx->phy_op->get_link_ksettings(efx, cmd); mutex_unlock(&efx->mac_lock); /* Both MACs support pause frames (bidirectional and respond-only) */ - ethtool_convert_link_mode_to_legacy_u32(&supported, - cmd->link_modes.supported); - - supported |= SUPPORTED_Pause | SUPPORTED_Asym_Pause; - - ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.supported, - supported); + ethtool_link_ksettings_add_link_mode(cmd, supported, Pause); + ethtool_link_ksettings_add_link_mode(cmd, supported, Asym_Pause); if (LOOPBACK_INTERNAL(efx)) { cmd->base.speed = link_state->speed; From fd768b23200da6d7c964449c587e8da0b5e03f58 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 4 Nov 2021 09:35:57 +0100 Subject: [PATCH 1315/5344] Revert "drm/ttm: fix memleak in ttm_transfered_destroy" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit bd99782f3ca491879e8524c89b1c0f40071903bd which is commit 0db55f9a1bafbe3dac750ea669de9134922389b5 upstream. Seems that the older kernels can not handle this fix because, to quote Christian: The problem is this memory leak could potentially happen with 5.10 as wel, just much much much less likely. But my guess is that 5.10 is so buggy that when the leak does NOT happen we double free and obviously causing a crash. So it needs to be reverted. Link: https://lore.kernel.org/r/1a1cc125-9314-f569-a6c4-40fc4509a377@amd.com Cc: Christian König Cc: Erhard F. Cc: Erhard F. Cc: Huang Rui Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/ttm/ttm_bo_util.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c index a7b88ca8b97b3..fe81c565e7ef8 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_util.c +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c @@ -463,7 +463,6 @@ static void ttm_transfered_destroy(struct ttm_buffer_object *bo) struct ttm_transfer_obj *fbo; fbo = container_of(bo, struct ttm_transfer_obj, base); - dma_resv_fini(&fbo->base.base._resv); ttm_bo_put(fbo->bo); kfree(fbo); } From 9a27d36a8fdcb49ed53b818dcb9be1a6ca2c8563 Mon Sep 17 00:00:00 2001 From: Wang Kefeng Date: Mon, 23 Aug 2021 10:41:42 +0100 Subject: [PATCH 1316/5344] ARM: 9120/1: Revert "amba: make use of -1 IRQs warn" commit eb4f756915875b0ea0757751cd29841f0504d547 upstream. After commit 77a7300abad7 ("of/irq: Get rid of NO_IRQ usage"), no irq case has been removed, irq_of_parse_and_map() will return 0 in all cases when get error from parse and map an interrupt into linux virq space. amba_device_register() is only used on no-DT initialization, see s3c64xx_pl080_init() arch/arm/mach-s3c/pl080.c ep93xx_init_devices() arch/arm/mach-ep93xx/core.c They won't set -1 to irq[0], so no need the warn. This reverts commit 2eac58d5026e4ec8b17ff8b62877fea9e1d2f1b3. Reviewed-by: Rob Herring Signed-off-by: Kefeng Wang Signed-off-by: Russell King (Oracle) Signed-off-by: Greg Kroah-Hartman --- drivers/amba/bus.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/amba/bus.c b/drivers/amba/bus.c index af58768a03937..702284bcd467c 100644 --- a/drivers/amba/bus.c +++ b/drivers/amba/bus.c @@ -375,9 +375,6 @@ static int amba_device_try_add(struct amba_device *dev, struct resource *parent) void __iomem *tmp; int i, ret; - WARN_ON(dev->irq[0] == (unsigned int)-1); - WARN_ON(dev->irq[1] == (unsigned int)-1); - ret = request_resource(parent, &dev->res); if (ret) goto err_out; From 1862dbde43ef8953aa406f6dd408a575865a1493 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 6 Nov 2021 13:59:45 +0100 Subject: [PATCH 1317/5344] Linux 5.4.158 Link: https://lore.kernel.org/r/20211104141158.384397574@linuxfoundation.org Tested-by: Florian Fainelli Tested-by: Shuah Khan Tested-by: Jon Hunter Tested-by: Linux Kernel Functional Testing Tested-by: Sudip Mukherjee Tested-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 49d639fe5a801..cef1d2704c410 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 PATCHLEVEL = 4 -SUBLEVEL = 157 +SUBLEVEL = 158 EXTRAVERSION = NAME = Kleptomaniac Octopus From 5bb3321c16963c3bb3aebb241f0b825ab824f2b0 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 13 Sep 2021 15:57:43 +0200 Subject: [PATCH 1318/5344] Revert "x86/kvm: fix vcpu-id indexed array sizes" commit 1e254d0d86a0f2efd4190a89d5204b37c18c6381 upstream. This reverts commit 76b4f357d0e7d8f6f0013c733e6cba1773c266d3. The commit has the wrong reasoning, as KVM_MAX_VCPU_ID is not defining the maximum allowed vcpu-id as its name suggests, but the number of vcpu-ids. So revert this patch again. Suggested-by: Eduardo Habkost Signed-off-by: Juergen Gross Signed-off-by: Paolo Bonzini Message-Id: <20210913135745.13944-2-jgross@suse.com> Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/ioapic.c | 2 +- arch/x86/kvm/ioapic.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 642031b896f64..24a6905d60ee2 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -91,7 +91,7 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic) { ioapic->rtc_status.pending_eoi = 0; - bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_ID + 1); + bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_ID); } static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 283f1f489bcac..ea1a4e0297dae 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -43,13 +43,13 @@ struct kvm_vcpu; struct dest_map { /* vcpu bitmap where IRQ has been sent */ - DECLARE_BITMAP(map, KVM_MAX_VCPU_ID + 1); + DECLARE_BITMAP(map, KVM_MAX_VCPU_ID); /* * Vector sent to a given vcpu, only valid when * the vcpu's bit in map is set */ - u8 vectors[KVM_MAX_VCPU_ID + 1]; + u8 vectors[KVM_MAX_VCPU_ID]; }; From 40f98a16d977e60de15aa97d3131c5a327579ac0 Mon Sep 17 00:00:00 2001 From: Neal Liu Date: Fri, 10 Sep 2021 15:36:19 +0800 Subject: [PATCH 1319/5344] usb: ehci: handshake CMD_RUN instead of STS_HALT commit 7f2d73788d9067fd4f677ac5f60ffd25945af7af upstream. For Aspeed, HCHalted status depends on not only Run/Stop but also ASS/PSS status. Handshake CMD_RUN on startup instead. Tested-by: Tao Ren Reviewed-by: Tao Ren Acked-by: Alan Stern Signed-off-by: Neal Liu Link: https://lore.kernel.org/r/20210910073619.26095-1-neal_liu@aspeedtech.com Cc: Joel Stanley Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/ehci-hcd.c | 11 ++++++++++- drivers/usb/host/ehci-platform.c | 6 ++++++ drivers/usb/host/ehci.h | 1 + 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c index 5c6ce1ef3f4b0..c0e344c3a8487 100644 --- a/drivers/usb/host/ehci-hcd.c +++ b/drivers/usb/host/ehci-hcd.c @@ -634,7 +634,16 @@ static int ehci_run (struct usb_hcd *hcd) /* Wait until HC become operational */ ehci_readl(ehci, &ehci->regs->command); /* unblock posted writes */ msleep(5); - rc = ehci_handshake(ehci, &ehci->regs->status, STS_HALT, 0, 100 * 1000); + + /* For Aspeed, STS_HALT also depends on ASS/PSS status. + * Check CMD_RUN instead. + */ + if (ehci->is_aspeed) + rc = ehci_handshake(ehci, &ehci->regs->command, CMD_RUN, + 1, 100 * 1000); + else + rc = ehci_handshake(ehci, &ehci->regs->status, STS_HALT, + 0, 100 * 1000); up_write(&ehci_cf_port_reset_rwsem); diff --git a/drivers/usb/host/ehci-platform.c b/drivers/usb/host/ehci-platform.c index e4fc3f66d43bf..accbcc989c503 100644 --- a/drivers/usb/host/ehci-platform.c +++ b/drivers/usb/host/ehci-platform.c @@ -286,6 +286,12 @@ static int ehci_platform_probe(struct platform_device *dev) "has-transaction-translator")) hcd->has_tt = 1; + if (of_device_is_compatible(dev->dev.of_node, + "aspeed,ast2500-ehci") || + of_device_is_compatible(dev->dev.of_node, + "aspeed,ast2600-ehci")) + ehci->is_aspeed = 1; + if (soc_device_match(quirk_poll_match)) priv->quirk_poll = true; diff --git a/drivers/usb/host/ehci.h b/drivers/usb/host/ehci.h index ac5e967907d14..356738462ce45 100644 --- a/drivers/usb/host/ehci.h +++ b/drivers/usb/host/ehci.h @@ -218,6 +218,7 @@ struct ehci_hcd { /* one per controller */ unsigned frame_index_bug:1; /* MosChip (AKA NetMos) */ unsigned need_oc_pp_cycle:1; /* MPC834X port power */ unsigned imx28_write_fix:1; /* For Freescale i.MX28 */ + unsigned is_aspeed:1; /* required for usb32 quirk */ #define OHCI_CTRL_HCFS (3 << 6) From c94ca5d2942759a93a9228a1631bdfcb7ceda3c0 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 27 Oct 2021 10:08:49 +0200 Subject: [PATCH 1320/5344] usb: gadget: Mark USB_FSL_QE broken on 64-bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit a0548b26901f082684ad1fb3ba397d2de3a1406a upstream. On 64-bit: drivers/usb/gadget/udc/fsl_qe_udc.c: In function ‘qe_ep0_rx’: drivers/usb/gadget/udc/fsl_qe_udc.c:842:13: error: cast from pointer to integer of different size [-Werror=pointer-to-int-cast] 842 | vaddr = (u32)phys_to_virt(in_be32(&bd->buf)); | ^ In file included from drivers/usb/gadget/udc/fsl_qe_udc.c:41: drivers/usb/gadget/udc/fsl_qe_udc.c:843:28: error: cast to pointer from integer of different size [-Werror=int-to-pointer-cast] 843 | frame_set_data(pframe, (u8 *)vaddr); | ^ The driver assumes physical and virtual addresses are 32-bit, hence it cannot work on 64-bit platforms. Acked-by: Li Yang Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20211027080849.3276289-1-geert@linux-m68k.org Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/gadget/udc/Kconfig b/drivers/usb/gadget/udc/Kconfig index d354036ff6c86..f985bb4a42db2 100644 --- a/drivers/usb/gadget/udc/Kconfig +++ b/drivers/usb/gadget/udc/Kconfig @@ -330,6 +330,7 @@ config USB_AMD5536UDC config USB_FSL_QE tristate "Freescale QE/CPM USB Device Controller" depends on FSL_SOC && (QUICC_ENGINE || CPM) + depends on !64BIT || BROKEN help Some of Freescale PowerPC processors have a Full Speed QE/CPM2 USB controller, which support device mode with 4 From fe3dee27b4fc34e0c83ff6e94bebeeb4dd96d5bd Mon Sep 17 00:00:00 2001 From: Viraj Shah Date: Thu, 21 Oct 2021 11:36:44 +0200 Subject: [PATCH 1321/5344] usb: musb: Balance list entry in musb_gadget_queue commit 21b5fcdccb32ff09b6b63d4a83c037150665a83f upstream. musb_gadget_queue() adds the passed request to musb_ep::req_list. If the endpoint is idle and it is the first request then it invokes musb_queue_resume_work(). If the function returns an error then the error is passed to the caller without any clean-up and the request remains enqueued on the list. If the caller enqueues the request again then the list corrupts. Remove the request from the list on error. Fixes: ea2f35c01d5ea ("usb: musb: Fix sleeping function called from invalid context for hdrc glue") Cc: stable Signed-off-by: Viraj Shah Link: https://lore.kernel.org/r/20211021093644.4734-1-viraj.shah@linutronix.de Signed-off-by: Greg Kroah-Hartman --- drivers/usb/musb/musb_gadget.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/musb/musb_gadget.c b/drivers/usb/musb/musb_gadget.c index ffe462a657b15..4622400ba4ddb 100644 --- a/drivers/usb/musb/musb_gadget.c +++ b/drivers/usb/musb/musb_gadget.c @@ -1248,9 +1248,11 @@ static int musb_gadget_queue(struct usb_ep *ep, struct usb_request *req, status = musb_queue_resume_work(musb, musb_ep_restart_resume_work, request); - if (status < 0) + if (status < 0) { dev_err(musb->controller, "%s resume work: %i\n", __func__, status); + list_del(&request->list); + } } unlock: From 37d960a2b2d40551c5ddddd6f7dd8c324cf8eb44 Mon Sep 17 00:00:00 2001 From: James Buren Date: Wed, 13 Oct 2021 20:55:04 -0500 Subject: [PATCH 1322/5344] usb-storage: Add compatibility quirk flags for iODD 2531/2541 commit 05c8f1b67e67dcd786ae3fe44492bbc617b4bd12 upstream. These drive enclosures have firmware bugs that make it impossible to mount a new virtual ISO image after Linux ejects the old one if the device is locked by Linux. Windows bypasses this problem by the fact that they do not lock the device. Add a quirk to disable device locking for these drive enclosures. Acked-by: Alan Stern Signed-off-by: James Buren Cc: stable Link: https://lore.kernel.org/r/20211014015504.2695089-1-braewoods+lkml@braewoods.net Signed-off-by: Greg Kroah-Hartman --- drivers/usb/storage/unusual_devs.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h index 7442793fe0502..3ba4e060fd051 100644 --- a/drivers/usb/storage/unusual_devs.h +++ b/drivers/usb/storage/unusual_devs.h @@ -406,6 +406,16 @@ UNUSUAL_DEV( 0x04b8, 0x0602, 0x0110, 0x0110, "785EPX Storage", USB_SC_SCSI, USB_PR_BULK, NULL, US_FL_SINGLE_LUN), +/* + * Reported by James Buren + * Virtual ISOs cannot be remounted if ejected while the device is locked + * Disable locking to mimic Windows behavior that bypasses the issue + */ +UNUSUAL_DEV( 0x04c5, 0x2028, 0x0001, 0x0001, + "iODD", + "2531/2541", + USB_SC_DEVICE, USB_PR_DEVICE, NULL, US_FL_NOT_LOCKABLE), + /* * Not sure who reported this originally but * Pavel Machek reported that the extra US_FL_SINGLE_LUN From 08847fa03193c1be9f92166b65309f2eb5c5476d Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 15 Oct 2021 16:38:11 -0700 Subject: [PATCH 1323/5344] binder: don't detect sender/target during buffer cleanup commit 32e9f56a96d8d0f23cb2aeb2a3cd18d40393e787 upstream. When freeing txn buffers, binder_transaction_buffer_release() attempts to detect whether the current context is the target by comparing current->group_leader to proc->tsk. This is an unreliable test. Instead explicitly pass an 'is_failure' boolean. Detecting the sender was being used as a way to tell if the transaction failed to be sent. When cleaning up after failing to send a transaction, there is no need to close the fds associated with a BINDER_TYPE_FDA object. Now 'is_failure' can be used to accurately detect this case. Fixes: 44d8047f1d87 ("binder: use standard functions to allocate fds") Cc: stable Acked-by: Christian Brauner Signed-off-by: Todd Kjos Link: https://lore.kernel.org/r/20211015233811.3532235-1-tkjos@google.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 721218a0f7bac..96006696ad534 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2259,7 +2259,7 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, binder_dec_node(buffer->target_node, 1, 0); off_start_offset = ALIGN(buffer->data_size, sizeof(void *)); - off_end_offset = is_failure ? failed_at : + off_end_offset = is_failure && failed_at ? failed_at : off_start_offset + buffer->offsets_size; for (buffer_offset = off_start_offset; buffer_offset < off_end_offset; buffer_offset += sizeof(binder_size_t)) { @@ -2345,9 +2345,8 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, binder_size_t fd_buf_size; binder_size_t num_valid; - if (proc->tsk != current->group_leader) { + if (is_failure) { /* - * Nothing to do if running in sender context * The fd fixups have not been applied so no * fds need to be closed. */ @@ -3550,6 +3549,7 @@ static void binder_transaction(struct binder_proc *proc, * binder_free_buf() - free the specified buffer * @proc: binder proc that owns buffer * @buffer: buffer to be freed + * @is_failure: failed to send transaction * * If buffer for an async transaction, enqueue the next async * transaction from the node. @@ -3559,7 +3559,7 @@ static void binder_transaction(struct binder_proc *proc, static void binder_free_buf(struct binder_proc *proc, struct binder_thread *thread, - struct binder_buffer *buffer) + struct binder_buffer *buffer, bool is_failure) { binder_inner_proc_lock(proc); if (buffer->transaction) { @@ -3587,7 +3587,7 @@ binder_free_buf(struct binder_proc *proc, binder_node_inner_unlock(buf_node); } trace_binder_transaction_buffer_release(buffer); - binder_transaction_buffer_release(proc, thread, buffer, 0, false); + binder_transaction_buffer_release(proc, thread, buffer, 0, is_failure); binder_alloc_free_buf(&proc->alloc, buffer); } @@ -3788,7 +3788,7 @@ static int binder_thread_write(struct binder_proc *proc, proc->pid, thread->pid, (u64)data_ptr, buffer->debug_id, buffer->transaction ? "active" : "finished"); - binder_free_buf(proc, thread, buffer); + binder_free_buf(proc, thread, buffer, false); break; } @@ -4476,7 +4476,7 @@ static int binder_thread_read(struct binder_proc *proc, buffer->transaction = NULL; binder_cleanup_transaction(t, "fd fixups failed", BR_FAILED_REPLY); - binder_free_buf(proc, thread, buffer); + binder_free_buf(proc, thread, buffer, true); binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, "%d:%d %stransaction %d fd fixups failed %d/%d, line %d\n", proc->pid, thread->pid, From 17a72a62d8c2bd81a41937f3238d72603749ca7b Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 11 Nov 2020 14:54:50 +0100 Subject: [PATCH 1324/5344] printk/console: Allow to disable console output by using console="" or console=null commit 3cffa06aeef7ece30f6b5ac0ea51f264e8fea4d0 upstream. The commit 48021f98130880dd74 ("printk: handle blank console arguments passed in.") prevented crash caused by empty console= parameter value. Unfortunately, this value is widely used on Chromebooks to disable the console output. The above commit caused performance regression because the messages were pushed on slow console even though nobody was watching it. Use ttynull driver explicitly for console="" and console=null parameters. It has been created for exactly this purpose. It causes that preferred_console is set. As a result, ttySX and ttyX are not used as a fallback. And only ttynull console gets registered by default. It still allows to register other consoles either by additional console= parameters or SPCR. It prevents regression because it worked this way even before. Also it is a sane semantic. Preventing output on all consoles should be done another way, for example, by introducing mute_console parameter. Link: https://lore.kernel.org/r/20201006025935.GA597@jagdpanzerIV.localdomain Suggested-by: Sergey Senozhatsky Reviewed-by: Guenter Roeck Tested-by: Guenter Roeck Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20201111135450.11214-3-pmladek@suse.com Cc: Yi Fan Signed-off-by: Greg Kroah-Hartman --- kernel/printk/printk.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5569ef6bc1839..23e26a203a9e9 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2193,8 +2193,15 @@ static int __init console_setup(char *str) char *s, *options, *brl_options = NULL; int idx; - if (str[0] == 0) + /* + * console="" or console=null have been suggested as a way to + * disable console output. Use ttynull that has been created + * for exacly this purpose. + */ + if (str[0] == 0 || strcmp(str, "null") == 0) { + __add_preferred_console("ttynull", 0, NULL, NULL); return 1; + } if (_braille_console_setup(&str, &brl_options)) return 1; From 27c8231544db2262531775ee9953669ce461a3db Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 18 Oct 2021 12:37:41 +0200 Subject: [PATCH 1325/5344] isofs: Fix out of bound access for corrupted isofs image commit e96a1866b40570b5950cda8602c2819189c62a48 upstream. When isofs image is suitably corrupted isofs_read_inode() can read data beyond the end of buffer. Sanity-check the directory entry length before using it. Reported-and-tested-by: syzbot+6fc7fb214625d82af7d1@syzkaller.appspotmail.com CC: stable@vger.kernel.org Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/isofs/inode.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index bf30f6ce8dd10..74e487d63c62c 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -1328,6 +1328,8 @@ static int isofs_read_inode(struct inode *inode, int relocated) de = (struct iso_directory_record *) (bh->b_data + offset); de_len = *(unsigned char *) de; + if (de_len < sizeof(struct iso_directory_record)) + goto fail; if (offset + de_len > bufsize) { int frag1 = bufsize - offset; From 5db1f4112744f932fabee219333f8378cc4afa99 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 27 Oct 2021 11:35:29 +0200 Subject: [PATCH 1326/5344] comedi: dt9812: fix DMA buffers on stack commit 536de747bc48262225889a533db6650731ab25d3 upstream. USB transfer buffers are typically mapped for DMA and must not be allocated on the stack or transfers will fail. Allocate proper transfer buffers in the various command helpers and return an error on short transfers instead of acting on random stack data. Note that this also fixes a stack info leak on systems where DMA is not used as 32 bytes are always sent to the device regardless of how short the command is. Fixes: 63274cd7d38a ("Staging: comedi: add usb dt9812 driver") Cc: stable@vger.kernel.org # 2.6.29 Reviewed-by: Ian Abbott Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20211027093529.30896-3-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/staging/comedi/drivers/dt9812.c | 115 ++++++++++++++++++------ 1 file changed, 86 insertions(+), 29 deletions(-) diff --git a/drivers/staging/comedi/drivers/dt9812.c b/drivers/staging/comedi/drivers/dt9812.c index 634f57730c1e0..704b04d2980d3 100644 --- a/drivers/staging/comedi/drivers/dt9812.c +++ b/drivers/staging/comedi/drivers/dt9812.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include "../comedi_usb.h" @@ -237,22 +238,42 @@ static int dt9812_read_info(struct comedi_device *dev, { struct usb_device *usb = comedi_to_usb_dev(dev); struct dt9812_private *devpriv = dev->private; - struct dt9812_usb_cmd cmd; + struct dt9812_usb_cmd *cmd; + size_t tbuf_size; int count, ret; + void *tbuf; - cmd.cmd = cpu_to_le32(DT9812_R_FLASH_DATA); - cmd.u.flash_data_info.address = + tbuf_size = max(sizeof(*cmd), buf_size); + + tbuf = kzalloc(tbuf_size, GFP_KERNEL); + if (!tbuf) + return -ENOMEM; + + cmd = tbuf; + + cmd->cmd = cpu_to_le32(DT9812_R_FLASH_DATA); + cmd->u.flash_data_info.address = cpu_to_le16(DT9812_DIAGS_BOARD_INFO_ADDR + offset); - cmd.u.flash_data_info.numbytes = cpu_to_le16(buf_size); + cmd->u.flash_data_info.numbytes = cpu_to_le16(buf_size); /* DT9812 only responds to 32 byte writes!! */ ret = usb_bulk_msg(usb, usb_sndbulkpipe(usb, devpriv->cmd_wr.addr), - &cmd, 32, &count, DT9812_USB_TIMEOUT); + cmd, sizeof(*cmd), &count, DT9812_USB_TIMEOUT); if (ret) - return ret; + goto out; + + ret = usb_bulk_msg(usb, usb_rcvbulkpipe(usb, devpriv->cmd_rd.addr), + tbuf, buf_size, &count, DT9812_USB_TIMEOUT); + if (!ret) { + if (count == buf_size) + memcpy(buf, tbuf, buf_size); + else + ret = -EREMOTEIO; + } +out: + kfree(tbuf); - return usb_bulk_msg(usb, usb_rcvbulkpipe(usb, devpriv->cmd_rd.addr), - buf, buf_size, &count, DT9812_USB_TIMEOUT); + return ret; } static int dt9812_read_multiple_registers(struct comedi_device *dev, @@ -261,22 +282,42 @@ static int dt9812_read_multiple_registers(struct comedi_device *dev, { struct usb_device *usb = comedi_to_usb_dev(dev); struct dt9812_private *devpriv = dev->private; - struct dt9812_usb_cmd cmd; + struct dt9812_usb_cmd *cmd; int i, count, ret; + size_t buf_size; + void *buf; - cmd.cmd = cpu_to_le32(DT9812_R_MULTI_BYTE_REG); - cmd.u.read_multi_info.count = reg_count; + buf_size = max_t(size_t, sizeof(*cmd), reg_count); + + buf = kzalloc(buf_size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + cmd = buf; + + cmd->cmd = cpu_to_le32(DT9812_R_MULTI_BYTE_REG); + cmd->u.read_multi_info.count = reg_count; for (i = 0; i < reg_count; i++) - cmd.u.read_multi_info.address[i] = address[i]; + cmd->u.read_multi_info.address[i] = address[i]; /* DT9812 only responds to 32 byte writes!! */ ret = usb_bulk_msg(usb, usb_sndbulkpipe(usb, devpriv->cmd_wr.addr), - &cmd, 32, &count, DT9812_USB_TIMEOUT); + cmd, sizeof(*cmd), &count, DT9812_USB_TIMEOUT); if (ret) - return ret; + goto out; + + ret = usb_bulk_msg(usb, usb_rcvbulkpipe(usb, devpriv->cmd_rd.addr), + buf, reg_count, &count, DT9812_USB_TIMEOUT); + if (!ret) { + if (count == reg_count) + memcpy(value, buf, reg_count); + else + ret = -EREMOTEIO; + } +out: + kfree(buf); - return usb_bulk_msg(usb, usb_rcvbulkpipe(usb, devpriv->cmd_rd.addr), - value, reg_count, &count, DT9812_USB_TIMEOUT); + return ret; } static int dt9812_write_multiple_registers(struct comedi_device *dev, @@ -285,19 +326,27 @@ static int dt9812_write_multiple_registers(struct comedi_device *dev, { struct usb_device *usb = comedi_to_usb_dev(dev); struct dt9812_private *devpriv = dev->private; - struct dt9812_usb_cmd cmd; + struct dt9812_usb_cmd *cmd; int i, count; + int ret; + + cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); + if (!cmd) + return -ENOMEM; - cmd.cmd = cpu_to_le32(DT9812_W_MULTI_BYTE_REG); - cmd.u.read_multi_info.count = reg_count; + cmd->cmd = cpu_to_le32(DT9812_W_MULTI_BYTE_REG); + cmd->u.read_multi_info.count = reg_count; for (i = 0; i < reg_count; i++) { - cmd.u.write_multi_info.write[i].address = address[i]; - cmd.u.write_multi_info.write[i].value = value[i]; + cmd->u.write_multi_info.write[i].address = address[i]; + cmd->u.write_multi_info.write[i].value = value[i]; } /* DT9812 only responds to 32 byte writes!! */ - return usb_bulk_msg(usb, usb_sndbulkpipe(usb, devpriv->cmd_wr.addr), - &cmd, 32, &count, DT9812_USB_TIMEOUT); + ret = usb_bulk_msg(usb, usb_sndbulkpipe(usb, devpriv->cmd_wr.addr), + cmd, sizeof(*cmd), &count, DT9812_USB_TIMEOUT); + kfree(cmd); + + return ret; } static int dt9812_rmw_multiple_registers(struct comedi_device *dev, @@ -306,17 +355,25 @@ static int dt9812_rmw_multiple_registers(struct comedi_device *dev, { struct usb_device *usb = comedi_to_usb_dev(dev); struct dt9812_private *devpriv = dev->private; - struct dt9812_usb_cmd cmd; + struct dt9812_usb_cmd *cmd; int i, count; + int ret; + + cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); + if (!cmd) + return -ENOMEM; - cmd.cmd = cpu_to_le32(DT9812_RMW_MULTI_BYTE_REG); - cmd.u.rmw_multi_info.count = reg_count; + cmd->cmd = cpu_to_le32(DT9812_RMW_MULTI_BYTE_REG); + cmd->u.rmw_multi_info.count = reg_count; for (i = 0; i < reg_count; i++) - cmd.u.rmw_multi_info.rmw[i] = rmw[i]; + cmd->u.rmw_multi_info.rmw[i] = rmw[i]; /* DT9812 only responds to 32 byte writes!! */ - return usb_bulk_msg(usb, usb_sndbulkpipe(usb, devpriv->cmd_wr.addr), - &cmd, 32, &count, DT9812_USB_TIMEOUT); + ret = usb_bulk_msg(usb, usb_sndbulkpipe(usb, devpriv->cmd_wr.addr), + cmd, sizeof(*cmd), &count, DT9812_USB_TIMEOUT); + kfree(cmd); + + return ret; } static int dt9812_digital_in(struct comedi_device *dev, u8 *bits) From 8699aa1b53de898c0e171a0f012251e2d595f63e Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 27 Oct 2021 11:35:28 +0200 Subject: [PATCH 1327/5344] comedi: ni_usb6501: fix NULL-deref in command paths commit 907767da8f3a925b060c740e0b5c92ea7dbec440 upstream. The driver uses endpoint-sized USB transfer buffers but had no sanity checks on the sizes. This can lead to zero-size-pointer dereferences or overflowed transfer buffers in ni6501_port_command() and ni6501_counter_command() if a (malicious) device has smaller max-packet sizes than expected (or when doing descriptor fuzz testing). Add the missing sanity checks to probe(). Fixes: a03bb00e50ab ("staging: comedi: add NI USB-6501 support") Cc: stable@vger.kernel.org # 3.18 Cc: Luca Ellero Reviewed-by: Ian Abbott Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20211027093529.30896-2-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/staging/comedi/drivers/ni_usb6501.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/staging/comedi/drivers/ni_usb6501.c b/drivers/staging/comedi/drivers/ni_usb6501.c index 360e86a19fe32..311632004f08a 100644 --- a/drivers/staging/comedi/drivers/ni_usb6501.c +++ b/drivers/staging/comedi/drivers/ni_usb6501.c @@ -144,6 +144,10 @@ static const u8 READ_COUNTER_RESPONSE[] = {0x00, 0x01, 0x00, 0x10, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00}; +/* Largest supported packets */ +static const size_t TX_MAX_SIZE = sizeof(SET_PORT_DIR_REQUEST); +static const size_t RX_MAX_SIZE = sizeof(READ_PORT_RESPONSE); + enum commands { READ_PORT, WRITE_PORT, @@ -501,6 +505,12 @@ static int ni6501_find_endpoints(struct comedi_device *dev) if (!devpriv->ep_rx || !devpriv->ep_tx) return -ENODEV; + if (usb_endpoint_maxp(devpriv->ep_rx) < RX_MAX_SIZE) + return -ENODEV; + + if (usb_endpoint_maxp(devpriv->ep_tx) < TX_MAX_SIZE) + return -ENODEV; + return 0; } From 9702c291085e51b530ad284ae7333cfc0a381543 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 25 Oct 2021 13:45:30 +0200 Subject: [PATCH 1328/5344] comedi: vmk80xx: fix transfer-buffer overflows commit a23461c47482fc232ffc9b819539d1f837adf2b1 upstream. The driver uses endpoint-sized USB transfer buffers but up until recently had no sanity checks on the sizes. Commit e1f13c879a7c ("staging: comedi: check validity of wMaxPacketSize of usb endpoints found") inadvertently fixed NULL-pointer dereferences when accessing the transfer buffers in case a malicious device has a zero wMaxPacketSize. Make sure to allocate buffers large enough to handle also the other accesses that are done without a size check (e.g. byte 18 in vmk80xx_cnt_insn_read() for the VMK8061_MODEL) to avoid writing beyond the buffers, for example, when doing descriptor fuzzing. The original driver was for a low-speed device with 8-byte buffers. Support was later added for a device that uses bulk transfers and is presumably a full-speed device with a maximum 64-byte wMaxPacketSize. Fixes: 985cafccbf9b ("Staging: Comedi: vmk80xx: Add k8061 support") Cc: stable@vger.kernel.org # 2.6.31 Signed-off-by: Johan Hovold Reviewed-by: Ian Abbott Link: https://lore.kernel.org/r/20211025114532.4599-4-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/staging/comedi/drivers/vmk80xx.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/staging/comedi/drivers/vmk80xx.c b/drivers/staging/comedi/drivers/vmk80xx.c index 7956abcbae22b..2787757cf75f8 100644 --- a/drivers/staging/comedi/drivers/vmk80xx.c +++ b/drivers/staging/comedi/drivers/vmk80xx.c @@ -90,6 +90,8 @@ enum { #define IC3_VERSION BIT(0) #define IC6_VERSION BIT(1) +#define MIN_BUF_SIZE 64 + enum vmk80xx_model { VMK8055_MODEL, VMK8061_MODEL @@ -678,12 +680,12 @@ static int vmk80xx_alloc_usb_buffers(struct comedi_device *dev) struct vmk80xx_private *devpriv = dev->private; size_t size; - size = usb_endpoint_maxp(devpriv->ep_rx); + size = max(usb_endpoint_maxp(devpriv->ep_rx), MIN_BUF_SIZE); devpriv->usb_rx_buf = kzalloc(size, GFP_KERNEL); if (!devpriv->usb_rx_buf) return -ENOMEM; - size = usb_endpoint_maxp(devpriv->ep_tx); + size = max(usb_endpoint_maxp(devpriv->ep_rx), MIN_BUF_SIZE); devpriv->usb_tx_buf = kzalloc(size, GFP_KERNEL); if (!devpriv->usb_tx_buf) return -ENOMEM; From 6a10e511041827efb665af2e7435f3644e93e9f1 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 25 Oct 2021 13:45:31 +0200 Subject: [PATCH 1329/5344] comedi: vmk80xx: fix bulk-buffer overflow commit 78cdfd62bd54af615fba9e3ca1ba35de39d3871d upstream. The driver is using endpoint-sized buffers but must not assume that the tx and rx buffers are of equal size or a malicious device could overflow the slab-allocated receive buffer when doing bulk transfers. Fixes: 985cafccbf9b ("Staging: Comedi: vmk80xx: Add k8061 support") Cc: stable@vger.kernel.org # 2.6.31 Signed-off-by: Johan Hovold Reviewed-by: Ian Abbott Link: https://lore.kernel.org/r/20211025114532.4599-5-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/staging/comedi/drivers/vmk80xx.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/staging/comedi/drivers/vmk80xx.c b/drivers/staging/comedi/drivers/vmk80xx.c index 2787757cf75f8..78c0383b8088a 100644 --- a/drivers/staging/comedi/drivers/vmk80xx.c +++ b/drivers/staging/comedi/drivers/vmk80xx.c @@ -159,22 +159,20 @@ static void vmk80xx_do_bulk_msg(struct comedi_device *dev) __u8 rx_addr; unsigned int tx_pipe; unsigned int rx_pipe; - size_t size; + size_t tx_size; + size_t rx_size; tx_addr = devpriv->ep_tx->bEndpointAddress; rx_addr = devpriv->ep_rx->bEndpointAddress; tx_pipe = usb_sndbulkpipe(usb, tx_addr); rx_pipe = usb_rcvbulkpipe(usb, rx_addr); - - /* - * The max packet size attributes of the K8061 - * input/output endpoints are identical - */ - size = usb_endpoint_maxp(devpriv->ep_tx); + tx_size = usb_endpoint_maxp(devpriv->ep_tx); + rx_size = usb_endpoint_maxp(devpriv->ep_rx); usb_bulk_msg(usb, tx_pipe, devpriv->usb_tx_buf, - size, NULL, devpriv->ep_tx->bInterval); - usb_bulk_msg(usb, rx_pipe, devpriv->usb_rx_buf, size, NULL, HZ * 10); + tx_size, NULL, devpriv->ep_tx->bInterval); + + usb_bulk_msg(usb, rx_pipe, devpriv->usb_rx_buf, rx_size, NULL, HZ * 10); } static int vmk80xx_read_packet(struct comedi_device *dev) From 1ade935808ae8b39544071b86d334f0e27832bd4 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 25 Oct 2021 13:45:32 +0200 Subject: [PATCH 1330/5344] comedi: vmk80xx: fix bulk and interrupt message timeouts commit a56d3e40bda460edf3f8d6aac00ec0b322b4ab83 upstream. USB bulk and interrupt message timeouts are specified in milliseconds and should specifically not vary with CONFIG_HZ. Note that the bulk-out transfer timeout was set to the endpoint bInterval value, which should be ignored for bulk endpoints and is typically set to zero. This meant that a failing bulk-out transfer would never time out. Assume that the 10 second timeout used for all other transfers is more than enough also for the bulk-out endpoint. Fixes: 985cafccbf9b ("Staging: Comedi: vmk80xx: Add k8061 support") Fixes: 951348b37738 ("staging: comedi: vmk80xx: wait for URBs to complete") Cc: stable@vger.kernel.org # 2.6.31 Signed-off-by: Johan Hovold Reviewed-by: Ian Abbott Link: https://lore.kernel.org/r/20211025114532.4599-6-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/staging/comedi/drivers/vmk80xx.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/staging/comedi/drivers/vmk80xx.c b/drivers/staging/comedi/drivers/vmk80xx.c index 78c0383b8088a..7769eadfaf61d 100644 --- a/drivers/staging/comedi/drivers/vmk80xx.c +++ b/drivers/staging/comedi/drivers/vmk80xx.c @@ -91,6 +91,7 @@ enum { #define IC6_VERSION BIT(1) #define MIN_BUF_SIZE 64 +#define PACKET_TIMEOUT 10000 /* ms */ enum vmk80xx_model { VMK8055_MODEL, @@ -169,10 +170,11 @@ static void vmk80xx_do_bulk_msg(struct comedi_device *dev) tx_size = usb_endpoint_maxp(devpriv->ep_tx); rx_size = usb_endpoint_maxp(devpriv->ep_rx); - usb_bulk_msg(usb, tx_pipe, devpriv->usb_tx_buf, - tx_size, NULL, devpriv->ep_tx->bInterval); + usb_bulk_msg(usb, tx_pipe, devpriv->usb_tx_buf, tx_size, NULL, + PACKET_TIMEOUT); - usb_bulk_msg(usb, rx_pipe, devpriv->usb_rx_buf, rx_size, NULL, HZ * 10); + usb_bulk_msg(usb, rx_pipe, devpriv->usb_rx_buf, rx_size, NULL, + PACKET_TIMEOUT); } static int vmk80xx_read_packet(struct comedi_device *dev) @@ -191,7 +193,7 @@ static int vmk80xx_read_packet(struct comedi_device *dev) pipe = usb_rcvintpipe(usb, ep->bEndpointAddress); return usb_interrupt_msg(usb, pipe, devpriv->usb_rx_buf, usb_endpoint_maxp(ep), NULL, - HZ * 10); + PACKET_TIMEOUT); } static int vmk80xx_write_packet(struct comedi_device *dev, int cmd) @@ -212,7 +214,7 @@ static int vmk80xx_write_packet(struct comedi_device *dev, int cmd) pipe = usb_sndintpipe(usb, ep->bEndpointAddress); return usb_interrupt_msg(usb, pipe, devpriv->usb_tx_buf, usb_endpoint_maxp(ep), NULL, - HZ * 10); + PACKET_TIMEOUT); } static int vmk80xx_reset_device(struct comedi_device *dev) From 396baa9d0e0e89962aef23902839ac01d89686b6 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 25 Oct 2021 14:09:10 +0200 Subject: [PATCH 1331/5344] staging: r8712u: fix control-message timeout commit ce4940525f36ffdcf4fa623bcedab9c2a6db893a upstream. USB control-message timeouts are specified in milliseconds and should specifically not vary with CONFIG_HZ. Fixes: 2865d42c78a9 ("staging: r8712u: Add the new driver to the mainline kernel") Cc: stable@vger.kernel.org # 2.6.37 Acked-by: Larry Finger Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20211025120910.6339-3-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/staging/rtl8712/usb_ops_linux.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/rtl8712/usb_ops_linux.c b/drivers/staging/rtl8712/usb_ops_linux.c index 9d290bc2fdb7f..2202a0caa252e 100644 --- a/drivers/staging/rtl8712/usb_ops_linux.c +++ b/drivers/staging/rtl8712/usb_ops_linux.c @@ -493,7 +493,7 @@ int r8712_usbctrl_vendorreq(struct intf_priv *pintfpriv, u8 request, u16 value, memcpy(pIo_buf, pdata, len); } status = usb_control_msg(udev, pipe, request, reqtype, value, index, - pIo_buf, len, HZ / 2); + pIo_buf, len, 500); if (status > 0) { /* Success this control transfer. */ if (requesttype == 0x01) { /* For Control read transfer, we have to copy the read From 1009cc3c094ad781176bb4897d6e0b174132baa6 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 25 Oct 2021 14:09:09 +0200 Subject: [PATCH 1332/5344] staging: rtl8192u: fix control-message timeouts commit 4cfa36d312d6789448b59a7aae770ac8425017a3 upstream. USB control-message timeouts are specified in milliseconds and should specifically not vary with CONFIG_HZ. Fixes: 8fc8598e61f6 ("Staging: Added Realtek rtl8192u driver to staging") Cc: stable@vger.kernel.org # 2.6.33 Acked-by: Larry Finger Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20211025120910.6339-2-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/staging/rtl8192u/r8192U_core.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/staging/rtl8192u/r8192U_core.c b/drivers/staging/rtl8192u/r8192U_core.c index e739d1979c877..6f65a9c9d6cf3 100644 --- a/drivers/staging/rtl8192u/r8192U_core.c +++ b/drivers/staging/rtl8192u/r8192U_core.c @@ -236,7 +236,7 @@ int write_nic_byte_E(struct net_device *dev, int indx, u8 data) status = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), RTL8187_REQ_SET_REGS, RTL8187_REQT_WRITE, - indx | 0xfe00, 0, usbdata, 1, HZ / 2); + indx | 0xfe00, 0, usbdata, 1, 500); kfree(usbdata); if (status < 0) { @@ -258,7 +258,7 @@ int read_nic_byte_E(struct net_device *dev, int indx, u8 *data) status = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), RTL8187_REQ_GET_REGS, RTL8187_REQT_READ, - indx | 0xfe00, 0, usbdata, 1, HZ / 2); + indx | 0xfe00, 0, usbdata, 1, 500); *data = *usbdata; kfree(usbdata); @@ -286,7 +286,7 @@ int write_nic_byte(struct net_device *dev, int indx, u8 data) status = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), RTL8187_REQ_SET_REGS, RTL8187_REQT_WRITE, (indx & 0xff) | 0xff00, (indx >> 8) & 0x0f, - usbdata, 1, HZ / 2); + usbdata, 1, 500); kfree(usbdata); if (status < 0) { @@ -313,7 +313,7 @@ int write_nic_word(struct net_device *dev, int indx, u16 data) status = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), RTL8187_REQ_SET_REGS, RTL8187_REQT_WRITE, (indx & 0xff) | 0xff00, (indx >> 8) & 0x0f, - usbdata, 2, HZ / 2); + usbdata, 2, 500); kfree(usbdata); if (status < 0) { @@ -340,7 +340,7 @@ int write_nic_dword(struct net_device *dev, int indx, u32 data) status = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), RTL8187_REQ_SET_REGS, RTL8187_REQT_WRITE, (indx & 0xff) | 0xff00, (indx >> 8) & 0x0f, - usbdata, 4, HZ / 2); + usbdata, 4, 500); kfree(usbdata); @@ -367,7 +367,7 @@ int read_nic_byte(struct net_device *dev, int indx, u8 *data) status = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), RTL8187_REQ_GET_REGS, RTL8187_REQT_READ, (indx & 0xff) | 0xff00, (indx >> 8) & 0x0f, - usbdata, 1, HZ / 2); + usbdata, 1, 500); *data = *usbdata; kfree(usbdata); @@ -394,7 +394,7 @@ int read_nic_word(struct net_device *dev, int indx, u16 *data) status = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), RTL8187_REQ_GET_REGS, RTL8187_REQT_READ, (indx & 0xff) | 0xff00, (indx >> 8) & 0x0f, - usbdata, 2, HZ / 2); + usbdata, 2, 500); *data = *usbdata; kfree(usbdata); @@ -418,7 +418,7 @@ static int read_nic_word_E(struct net_device *dev, int indx, u16 *data) status = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), RTL8187_REQ_GET_REGS, RTL8187_REQT_READ, - indx | 0xfe00, 0, usbdata, 2, HZ / 2); + indx | 0xfe00, 0, usbdata, 2, 500); *data = *usbdata; kfree(usbdata); @@ -444,7 +444,7 @@ int read_nic_dword(struct net_device *dev, int indx, u32 *data) status = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), RTL8187_REQ_GET_REGS, RTL8187_REQT_READ, (indx & 0xff) | 0xff00, (indx >> 8) & 0x0f, - usbdata, 4, HZ / 2); + usbdata, 4, 500); *data = *usbdata; kfree(usbdata); From 345232423dfb3eaf683de0c752e6fae0613739b0 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 10 Aug 2021 19:09:55 +0200 Subject: [PATCH 1333/5344] media: staging/intel-ipu3: css: Fix wrong size comparison imgu_css_fw_init commit a44f9d6f9dc1fb314a3f1ed2dcd4fbbcc3d9f892 upstream. There is a wrong comparison of the total size of the loaded firmware css->fw->size with the size of a pointer to struct imgu_fw_header. Turn binary_header into a flexible-array member[1][2], use the struct_size() helper and fix the wrong size comparison. Notice that the loaded firmware needs to contain at least one 'struct imgu_fw_info' item in the binary_header[] array. It's also worth mentioning that "css->fw->size < struct_size(css->fwp, binary_header, 1)" with binary_header declared as a flexible-array member is equivalent to "css->fw->size < sizeof(struct imgu_fw_header)" with binary_header declared as a one-element array (as in the original code). The replacement of the one-element array with a flexible-array member also helps with the ongoing efforts to globally enable -Warray-bounds and get us closer to being able to tighten the FORTIFY_SOURCE routines on memcpy(). [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.10/process/deprecated.html#zero-length-and-one-element-arrays Link: https://github.com/KSPP/linux/issues/79 Link: https://github.com/KSPP/linux/issues/109 Fixes: 09d290f0ba21 ("media: staging/intel-ipu3: css: Add support for firmware management") Cc: stable@vger.kernel.org Signed-off-by: Gustavo A. R. Silva Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/staging/media/ipu3/ipu3-css-fw.c | 7 +++---- drivers/staging/media/ipu3/ipu3-css-fw.h | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/staging/media/ipu3/ipu3-css-fw.c b/drivers/staging/media/ipu3/ipu3-css-fw.c index 45aff76198e2c..981693eed8155 100644 --- a/drivers/staging/media/ipu3/ipu3-css-fw.c +++ b/drivers/staging/media/ipu3/ipu3-css-fw.c @@ -124,12 +124,11 @@ int imgu_css_fw_init(struct imgu_css *css) /* Check and display fw header info */ css->fwp = (struct imgu_fw_header *)css->fw->data; - if (css->fw->size < sizeof(struct imgu_fw_header *) || + if (css->fw->size < struct_size(css->fwp, binary_header, 1) || css->fwp->file_header.h_size != sizeof(struct imgu_fw_bi_file_h)) goto bad_fw; - if (sizeof(struct imgu_fw_bi_file_h) + - css->fwp->file_header.binary_nr * sizeof(struct imgu_fw_info) > - css->fw->size) + if (struct_size(css->fwp, binary_header, + css->fwp->file_header.binary_nr) > css->fw->size) goto bad_fw; dev_info(dev, "loaded firmware version %.64s, %u binaries, %zu bytes\n", diff --git a/drivers/staging/media/ipu3/ipu3-css-fw.h b/drivers/staging/media/ipu3/ipu3-css-fw.h index 79ffa70451390..650fd25fc79ee 100644 --- a/drivers/staging/media/ipu3/ipu3-css-fw.h +++ b/drivers/staging/media/ipu3/ipu3-css-fw.h @@ -170,7 +170,7 @@ struct imgu_fw_bi_file_h { struct imgu_fw_header { struct imgu_fw_bi_file_h file_header; - struct imgu_fw_info binary_header[1]; /* binary_nr items */ + struct imgu_fw_info binary_header[]; /* binary_nr items */ }; /******************* Firmware functions *******************/ From 816e45a84c218a8bedae8adad7ffab20a58eb625 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 25 Oct 2021 14:05:22 +0200 Subject: [PATCH 1334/5344] rsi: fix control-message timeout commit 541fd20c3ce5b0bc39f0c6a52414b6b92416831c upstream. USB control-message timeouts are specified in milliseconds and should specifically not vary with CONFIG_HZ. Use the common control-message timeout define for the five-second timeout. Fixes: dad0d04fa7ba ("rsi: Add RS9113 wireless driver") Cc: stable@vger.kernel.org # 3.15 Signed-off-by: Johan Hovold Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20211025120522.6045-5-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/rsi/rsi_91x_usb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/rsi/rsi_91x_usb.c b/drivers/net/wireless/rsi/rsi_91x_usb.c index e8aa3d4bda885..1e5a2a0cc6700 100644 --- a/drivers/net/wireless/rsi/rsi_91x_usb.c +++ b/drivers/net/wireless/rsi/rsi_91x_usb.c @@ -61,7 +61,7 @@ static int rsi_usb_card_write(struct rsi_hw *adapter, (void *)seg, (int)len, &transfer, - HZ * 5); + USB_CTRL_SET_TIMEOUT); if (status < 0) { rsi_dbg(ERR_ZONE, From 7324202d0987c18189de7211f4ef66b94cb2b7a1 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 12 Nov 2021 14:43:05 +0100 Subject: [PATCH 1335/5344] Linux 5.4.159 Link: https://lore.kernel.org/r/20211110182002.206203228@linuxfoundation.org Tested-by: Florian Fainelli Tested-by: Linux Kernel Functional Testing Tested-by: Sudip Mukherjee Tested-by: Shuah Khan Tested-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index cef1d2704c410..602b5167dacd7 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 PATCHLEVEL = 4 -SUBLEVEL = 158 +SUBLEVEL = 159 EXTRAVERSION = NAME = Kleptomaniac Octopus From ad2a8a46ea6aa23e79ff8901b715dafc821fe59e Mon Sep 17 00:00:00 2001 From: Nehal Bakulchandra Shah Date: Thu, 14 Oct 2021 15:12:00 +0300 Subject: [PATCH 1336/5344] usb: xhci: Enable runtime-pm by default on AMD Yellow Carp platform commit 660a92a59b9e831a0407e41ff62875656d30006e upstream. AMD's Yellow Carp platform supports runtime power management for XHCI Controllers, so enable the same by default for all XHCI Controllers. [ regrouped and aligned the PCI_DEVICE_ID definitions -Mathias] Cc: stable Reviewed-by: Shyam Sundar S K Reviewed-by: Mario Limonciello Reviewed-by: Basavaraj Natikar Signed-off-by: Nehal Bakulchandra Shah Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20211014121200.75433-2-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-pci.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index ae8fa4ff05ee3..beee3543950fe 100644 --- a/drivers/usb/host/xhci-pci.c +++ b/drivers/usb/host/xhci-pci.c @@ -59,6 +59,13 @@ #define PCI_DEVICE_ID_AMD_PROMONTORYA_3 0x43ba #define PCI_DEVICE_ID_AMD_PROMONTORYA_2 0x43bb #define PCI_DEVICE_ID_AMD_PROMONTORYA_1 0x43bc +#define PCI_DEVICE_ID_AMD_YELLOW_CARP_XHCI_1 0x161a +#define PCI_DEVICE_ID_AMD_YELLOW_CARP_XHCI_2 0x161b +#define PCI_DEVICE_ID_AMD_YELLOW_CARP_XHCI_3 0x161d +#define PCI_DEVICE_ID_AMD_YELLOW_CARP_XHCI_4 0x161e +#define PCI_DEVICE_ID_AMD_YELLOW_CARP_XHCI_5 0x15d6 +#define PCI_DEVICE_ID_AMD_YELLOW_CARP_XHCI_6 0x15d7 + #define PCI_DEVICE_ID_ASMEDIA_1042_XHCI 0x1042 #define PCI_DEVICE_ID_ASMEDIA_1042A_XHCI 0x1142 #define PCI_DEVICE_ID_ASMEDIA_1142_XHCI 0x1242 @@ -290,6 +297,15 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci) pdev->device == PCI_DEVICE_ID_AMD_PROMONTORYA_4)) xhci->quirks |= XHCI_NO_SOFT_RETRY; + if (pdev->vendor == PCI_VENDOR_ID_AMD && + (pdev->device == PCI_DEVICE_ID_AMD_YELLOW_CARP_XHCI_1 || + pdev->device == PCI_DEVICE_ID_AMD_YELLOW_CARP_XHCI_2 || + pdev->device == PCI_DEVICE_ID_AMD_YELLOW_CARP_XHCI_3 || + pdev->device == PCI_DEVICE_ID_AMD_YELLOW_CARP_XHCI_4 || + pdev->device == PCI_DEVICE_ID_AMD_YELLOW_CARP_XHCI_5 || + pdev->device == PCI_DEVICE_ID_AMD_YELLOW_CARP_XHCI_6)) + xhci->quirks |= XHCI_DEFAULT_PM_RUNTIME_ALLOW; + if (xhci->quirks & XHCI_RESET_ON_RESUME) xhci_dbg_trace(xhci, trace_xhci_dbg_quirks, "QUIRK: Resetting on resume"); From 083d81459e21a0bd06d4a49731e9059dbc4bf9d4 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Wed, 10 Nov 2021 15:00:23 -0800 Subject: [PATCH 1337/5344] binder: use euid from cred instead of using task commit 29bc22ac5e5bc63275e850f0c8fc549e3d0e306b upstream. Save the 'struct cred' associated with a binder process at initial open to avoid potential race conditions when converting to an euid. Set a transaction's sender_euid from the 'struct cred' saved at binder_open() instead of looking up the euid from the binder proc's 'struct task'. This ensures the euid is associated with the security context that of the task that opened binder. Cc: stable@vger.kernel.org # 4.4+ Fixes: 457b9a6f09f0 ("Staging: android: add binder driver") Signed-off-by: Todd Kjos Suggested-by: Stephen Smalley Suggested-by: Jann Horn Acked-by: Casey Schaufler Signed-off-by: Paul Moore Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 96006696ad534..536159e418ebf 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -424,6 +424,9 @@ enum binder_deferred_state { * (invariant after initialized) * @tsk task_struct for group_leader of process * (invariant after initialized) + * @cred struct cred associated with the `struct file` + * in binder_open() + * (invariant after initialized) * @deferred_work_node: element for binder_deferred_list * (protected by binder_deferred_lock) * @deferred_work: bitmap of deferred work to perform @@ -469,6 +472,7 @@ struct binder_proc { struct list_head waiting_threads; int pid; struct task_struct *tsk; + const struct cred *cred; struct hlist_node deferred_work_node; int deferred_work; bool is_dead; @@ -3093,7 +3097,7 @@ static void binder_transaction(struct binder_proc *proc, t->from = thread; else t->from = NULL; - t->sender_euid = task_euid(proc->tsk); + t->sender_euid = proc->cred->euid; t->to_proc = target_proc; t->to_thread = target_thread; t->code = tr->code; @@ -4709,6 +4713,7 @@ static void binder_free_proc(struct binder_proc *proc) } binder_alloc_deferred_release(&proc->alloc); put_task_struct(proc->tsk); + put_cred(proc->cred); binder_stats_deleted(BINDER_STAT_PROC); kfree(proc); } @@ -5236,6 +5241,7 @@ static int binder_open(struct inode *nodp, struct file *filp) spin_lock_init(&proc->outer_lock); get_task_struct(current->group_leader); proc->tsk = current->group_leader; + proc->cred = get_cred(filp->f_cred); INIT_LIST_HEAD(&proc->todo); proc->default_priority = task_nice(current); /* binderfs stashes devices in i_private */ From 8dd04a82c98af99006c835af2428415880d323de Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Wed, 10 Nov 2021 15:00:24 -0800 Subject: [PATCH 1338/5344] binder: use cred instead of task for selinux checks commit 52f88693378a58094c538662ba652aff0253c4fe upstream. Since binder was integrated with selinux, it has passed 'struct task_struct' associated with the binder_proc to represent the source and target of transactions. The conversion of task to SID was then done in the hook implementations. It turns out that there are race conditions which can result in an incorrect security context being used. Fix by using the 'struct cred' saved during binder_open and pass it to the selinux subsystem. Cc: stable@vger.kernel.org # 5.14 (need backport for earlier stables) Fixes: 79af73079d75 ("Add security hooks to binder and implement the hooks for SELinux.") Suggested-by: Jann Horn Signed-off-by: Todd Kjos Acked-by: Casey Schaufler Signed-off-by: Paul Moore Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 12 ++++++------ include/linux/lsm_hooks.h | 28 ++++++++++++++-------------- include/linux/security.h | 28 ++++++++++++++-------------- security/security.c | 14 +++++++------- security/selinux/hooks.c | 36 +++++++++++++++--------------------- 5 files changed, 56 insertions(+), 62 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 536159e418ebf..c5b7d8d81a5a5 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2448,7 +2448,7 @@ static int binder_translate_binder(struct flat_binder_object *fp, ret = -EINVAL; goto done; } - if (security_binder_transfer_binder(proc->tsk, target_proc->tsk)) { + if (security_binder_transfer_binder(proc->cred, target_proc->cred)) { ret = -EPERM; goto done; } @@ -2494,7 +2494,7 @@ static int binder_translate_handle(struct flat_binder_object *fp, proc->pid, thread->pid, fp->handle); return -EINVAL; } - if (security_binder_transfer_binder(proc->tsk, target_proc->tsk)) { + if (security_binder_transfer_binder(proc->cred, target_proc->cred)) { ret = -EPERM; goto done; } @@ -2582,7 +2582,7 @@ static int binder_translate_fd(u32 fd, binder_size_t fd_offset, ret = -EBADF; goto err_fget; } - ret = security_binder_transfer_file(proc->tsk, target_proc->tsk, file); + ret = security_binder_transfer_file(proc->cred, target_proc->cred, file); if (ret < 0) { ret = -EPERM; goto err_security; @@ -2981,8 +2981,8 @@ static void binder_transaction(struct binder_proc *proc, return_error_line = __LINE__; goto err_invalid_target_handle; } - if (security_binder_transaction(proc->tsk, - target_proc->tsk) < 0) { + if (security_binder_transaction(proc->cred, + target_proc->cred) < 0) { return_error = BR_FAILED_REPLY; return_error_param = -EPERM; return_error_line = __LINE__; @@ -4924,7 +4924,7 @@ static int binder_ioctl_set_ctx_mgr(struct file *filp, ret = -EBUSY; goto out; } - ret = security_binder_set_context_mgr(proc->tsk); + ret = security_binder_set_context_mgr(proc->cred); if (ret < 0) goto out; if (uid_valid(context->binder_context_mgr_uid)) { diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index a3763247547c3..a21dc5413653e 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1241,22 +1241,22 @@ * * @binder_set_context_mgr: * Check whether @mgr is allowed to be the binder context manager. - * @mgr contains the task_struct for the task being registered. + * @mgr contains the struct cred for the current binder process. * Return 0 if permission is granted. * @binder_transaction: * Check whether @from is allowed to invoke a binder transaction call * to @to. - * @from contains the task_struct for the sending task. - * @to contains the task_struct for the receiving task. + * @from contains the struct cred for the sending process. + * @to contains the struct cred for the receiving process. * @binder_transfer_binder: * Check whether @from is allowed to transfer a binder reference to @to. - * @from contains the task_struct for the sending task. - * @to contains the task_struct for the receiving task. + * @from contains the struct cred for the sending process. + * @to contains the struct cred for the receiving process. * @binder_transfer_file: * Check whether @from is allowed to transfer @file to @to. - * @from contains the task_struct for the sending task. + * @from contains the struct cred for the sending process. * @file contains the struct file being transferred. - * @to contains the task_struct for the receiving task. + * @to contains the struct cred for the receiving process. * * @ptrace_access_check: * Check permission before allowing the current process to trace the @@ -1456,13 +1456,13 @@ * @what: kernel feature being accessed */ union security_list_options { - int (*binder_set_context_mgr)(struct task_struct *mgr); - int (*binder_transaction)(struct task_struct *from, - struct task_struct *to); - int (*binder_transfer_binder)(struct task_struct *from, - struct task_struct *to); - int (*binder_transfer_file)(struct task_struct *from, - struct task_struct *to, + int (*binder_set_context_mgr)(const struct cred *mgr); + int (*binder_transaction)(const struct cred *from, + const struct cred *to); + int (*binder_transfer_binder)(const struct cred *from, + const struct cred *to); + int (*binder_transfer_file)(const struct cred *from, + const struct cred *to, struct file *file); int (*ptrace_access_check)(struct task_struct *child, diff --git a/include/linux/security.h b/include/linux/security.h index df90399a8af98..0d4cb64cae1f0 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -249,13 +249,13 @@ extern int security_init(void); extern int early_security_init(void); /* Security operations */ -int security_binder_set_context_mgr(struct task_struct *mgr); -int security_binder_transaction(struct task_struct *from, - struct task_struct *to); -int security_binder_transfer_binder(struct task_struct *from, - struct task_struct *to); -int security_binder_transfer_file(struct task_struct *from, - struct task_struct *to, struct file *file); +int security_binder_set_context_mgr(const struct cred *mgr); +int security_binder_transaction(const struct cred *from, + const struct cred *to); +int security_binder_transfer_binder(const struct cred *from, + const struct cred *to); +int security_binder_transfer_file(const struct cred *from, + const struct cred *to, struct file *file); int security_ptrace_access_check(struct task_struct *child, unsigned int mode); int security_ptrace_traceme(struct task_struct *parent); int security_capget(struct task_struct *target, @@ -481,25 +481,25 @@ static inline int early_security_init(void) return 0; } -static inline int security_binder_set_context_mgr(struct task_struct *mgr) +static inline int security_binder_set_context_mgr(const struct cred *mgr) { return 0; } -static inline int security_binder_transaction(struct task_struct *from, - struct task_struct *to) +static inline int security_binder_transaction(const struct cred *from, + const struct cred *to) { return 0; } -static inline int security_binder_transfer_binder(struct task_struct *from, - struct task_struct *to) +static inline int security_binder_transfer_binder(const struct cred *from, + const struct cred *to) { return 0; } -static inline int security_binder_transfer_file(struct task_struct *from, - struct task_struct *to, +static inline int security_binder_transfer_file(const struct cred *from, + const struct cred *to, struct file *file) { return 0; diff --git a/security/security.c b/security/security.c index 1bc000f834e2d..c34ec4c7d98cc 100644 --- a/security/security.c +++ b/security/security.c @@ -670,25 +670,25 @@ static void __init lsm_early_task(struct task_struct *task) /* Security operations */ -int security_binder_set_context_mgr(struct task_struct *mgr) +int security_binder_set_context_mgr(const struct cred *mgr) { return call_int_hook(binder_set_context_mgr, 0, mgr); } -int security_binder_transaction(struct task_struct *from, - struct task_struct *to) +int security_binder_transaction(const struct cred *from, + const struct cred *to) { return call_int_hook(binder_transaction, 0, from, to); } -int security_binder_transfer_binder(struct task_struct *from, - struct task_struct *to) +int security_binder_transfer_binder(const struct cred *from, + const struct cred *to) { return call_int_hook(binder_transfer_binder, 0, from, to); } -int security_binder_transfer_file(struct task_struct *from, - struct task_struct *to, struct file *file) +int security_binder_transfer_file(const struct cred *from, + const struct cred *to, struct file *file) { return call_int_hook(binder_transfer_file, 0, from, to, file); } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 717a398ef4d05..8b9cbe1e8ee22 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2050,22 +2050,19 @@ static inline u32 open_file_to_av(struct file *file) /* Hook functions begin here. */ -static int selinux_binder_set_context_mgr(struct task_struct *mgr) +static int selinux_binder_set_context_mgr(const struct cred *mgr) { - u32 mysid = current_sid(); - u32 mgrsid = task_sid(mgr); - return avc_has_perm(&selinux_state, - mysid, mgrsid, SECCLASS_BINDER, + current_sid(), cred_sid(mgr), SECCLASS_BINDER, BINDER__SET_CONTEXT_MGR, NULL); } -static int selinux_binder_transaction(struct task_struct *from, - struct task_struct *to) +static int selinux_binder_transaction(const struct cred *from, + const struct cred *to) { u32 mysid = current_sid(); - u32 fromsid = task_sid(from); - u32 tosid = task_sid(to); + u32 fromsid = cred_sid(from); + u32 tosid = cred_sid(to); int rc; if (mysid != fromsid) { @@ -2076,27 +2073,24 @@ static int selinux_binder_transaction(struct task_struct *from, return rc; } - return avc_has_perm(&selinux_state, - fromsid, tosid, SECCLASS_BINDER, BINDER__CALL, - NULL); + return avc_has_perm(&selinux_state, fromsid, tosid, + SECCLASS_BINDER, BINDER__CALL, NULL); } -static int selinux_binder_transfer_binder(struct task_struct *from, - struct task_struct *to) +static int selinux_binder_transfer_binder(const struct cred *from, + const struct cred *to) { - u32 fromsid = task_sid(from); - u32 tosid = task_sid(to); - return avc_has_perm(&selinux_state, - fromsid, tosid, SECCLASS_BINDER, BINDER__TRANSFER, + cred_sid(from), cred_sid(to), + SECCLASS_BINDER, BINDER__TRANSFER, NULL); } -static int selinux_binder_transfer_file(struct task_struct *from, - struct task_struct *to, +static int selinux_binder_transfer_file(const struct cred *from, + const struct cred *to, struct file *file) { - u32 sid = task_sid(to); + u32 sid = cred_sid(to); struct file_security_struct *fsec = selinux_file(file); struct dentry *dentry = file->f_path.dentry; struct inode_security_struct *isec; From 1ba4dd0b9178dd8851b27bf1ba7bf121084ea4e0 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Wed, 10 Nov 2021 15:00:25 -0800 Subject: [PATCH 1339/5344] binder: use cred instead of task for getsecid commit 4d5b5539742d2554591751b4248b0204d20dcc9d upstream. Use the 'struct cred' saved at binder_open() to lookup the security ID via security_cred_getsecid(). This ensures that the security context that opened binder is the one used to generate the secctx. Cc: stable@vger.kernel.org # 5.4+ Fixes: ec74136ded79 ("binder: create node flag to request sender's security context") Signed-off-by: Todd Kjos Suggested-by: Stephen Smalley Reported-by: kernel test robot Acked-by: Casey Schaufler Signed-off-by: Paul Moore Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 2 +- include/linux/security.h | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index c5b7d8d81a5a5..33765b85c3366 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -3108,7 +3108,7 @@ static void binder_transaction(struct binder_proc *proc, u32 secid; size_t added_size; - security_task_getsecid(proc->tsk, &secid); + security_cred_getsecid(proc->cred, &secid); ret = security_secid_to_secctx(secid, &secctx, &secctx_sz); if (ret) { return_error = BR_FAILED_REPLY; diff --git a/include/linux/security.h b/include/linux/security.h index 0d4cb64cae1f0..3f6b8195ae9eb 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -985,6 +985,11 @@ static inline void security_transfer_creds(struct cred *new, { } +static inline void security_cred_getsecid(const struct cred *c, u32 *secid) +{ + *secid = 0; +} + static inline int security_kernel_act_as(struct cred *cred, u32 secid) { return 0; From 4e0d19b54c27f54b90da9892abf4d78168c87dfd Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 9 Nov 2021 22:58:01 -0800 Subject: [PATCH 1340/5344] Input: iforce - fix control-message timeout commit 744d0090a5f6dfa4c81b53402ccdf08313100429 upstream. USB control-message timeouts are specified in milliseconds and should specifically not vary with CONFIG_HZ. Fixes: 487358627825 ("Input: iforce - use DMA-safe buffer when getting IDs from USB") Signed-off-by: Johan Hovold Cc: stable@vger.kernel.org # 5.3 Link: https://lore.kernel.org/r/20211025115501.5190-1-johan@kernel.org Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/joystick/iforce/iforce-usb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/joystick/iforce/iforce-usb.c b/drivers/input/joystick/iforce/iforce-usb.c index 6c554c11a7ac3..ea58805c480fa 100644 --- a/drivers/input/joystick/iforce/iforce-usb.c +++ b/drivers/input/joystick/iforce/iforce-usb.c @@ -92,7 +92,7 @@ static int iforce_usb_get_id(struct iforce *iforce, u8 id, id, USB_TYPE_VENDOR | USB_DIR_IN | USB_RECIP_INTERFACE, - 0, 0, buf, IFORCE_MAX_LENGTH, HZ); + 0, 0, buf, IFORCE_MAX_LENGTH, 1000); if (status < 0) { dev_err(&iforce_usb->intf->dev, "usb_submit_urb failed: %d\n", status); From 60325b22ff6697f7cc05dd1f9a568bb74c1a4094 Mon Sep 17 00:00:00 2001 From: Phoenix Huang Date: Sun, 7 Nov 2021 22:00:03 -0800 Subject: [PATCH 1341/5344] Input: elantench - fix misreporting trackpoint coordinates commit be896bd3b72b44126c55768f14c22a8729b0992e upstream. Some firmwares occasionally report bogus data from trackpoint, with X or Y displacement being too large (outside of [-127, 127] range). Let's drop such packets so that we do not generate jumps. Signed-off-by: Phoenix Huang Tested-by: Yufei Du Link: https://lore.kernel.org/r/20210729010940.5752-1-phoenix@emc.com.tw Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/mouse/elantech.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/input/mouse/elantech.c b/drivers/input/mouse/elantech.c index 8533f1f625942..3e78c26025815 100644 --- a/drivers/input/mouse/elantech.c +++ b/drivers/input/mouse/elantech.c @@ -517,6 +517,19 @@ static void elantech_report_trackpoint(struct psmouse *psmouse, case 0x16008020U: case 0x26800010U: case 0x36808000U: + + /* + * This firmware misreport coordinates for trackpoint + * occasionally. Discard packets outside of [-127, 127] range + * to prevent cursor jumps. + */ + if (packet[4] == 0x80 || packet[5] == 0x80 || + packet[1] >> 7 == packet[4] >> 7 || + packet[2] >> 7 == packet[5] >> 7) { + elantech_debug("discarding packet [%6ph]\n", packet); + break; + + } x = packet[4] - (int)((packet[1]^0x80) << 1); y = (int)((packet[2]^0x80) << 1) - packet[5]; From a7609eb8de67c40cdcaa6b1324ddb6e1526743db Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 3 Nov 2021 08:00:19 +0100 Subject: [PATCH 1342/5344] Input: i8042 - Add quirk for Fujitsu Lifebook T725 commit 16e28abb7290c4ca3b3a0f333ba067f34bb18c86 upstream. Fujitsu Lifebook T725 laptop requires, like a few other similar models, the nomux and notimeout options to probe the touchpad properly. This patch adds the corresponding quirk entries. BugLink: https://bugzilla.suse.com/show_bug.cgi?id=1191980 Tested-by: Neal Gompa Cc: Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/r/20211103070019.13374-1-tiwai@suse.de Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/serio/i8042-x86ia64io.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/input/serio/i8042-x86ia64io.h b/drivers/input/serio/i8042-x86ia64io.h index 23442a144b834..202e43a6ffae2 100644 --- a/drivers/input/serio/i8042-x86ia64io.h +++ b/drivers/input/serio/i8042-x86ia64io.h @@ -272,6 +272,13 @@ static const struct dmi_system_id __initconst i8042_dmi_nomux_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "LifeBook S6230"), }, }, + { + /* Fujitsu Lifebook T725 laptop */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU"), + DMI_MATCH(DMI_PRODUCT_NAME, "LIFEBOOK T725"), + }, + }, { /* Fujitsu Lifebook U745 */ .matches = { @@ -840,6 +847,13 @@ static const struct dmi_system_id __initconst i8042_dmi_notimeout_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "LIFEBOOK AH544"), }, }, + { + /* Fujitsu Lifebook T725 laptop */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU"), + DMI_MATCH(DMI_PRODUCT_NAME, "LIFEBOOK T725"), + }, + }, { /* Fujitsu U574 laptop */ /* https://bugzilla.kernel.org/show_bug.cgi?id=69731 */ From db1296f67b94b4bb1f81e8c7a48fe8029992f9c3 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 4 Nov 2021 17:31:58 +0900 Subject: [PATCH 1343/5344] libata: fix read log timeout value commit 68dbbe7d5b4fde736d104cbbc9a2fce875562012 upstream. Some ATA drives are very slow to respond to READ_LOG_EXT and READ_LOG_DMA_EXT commands issued from ata_dev_configure() when the device is revalidated right after resuming a system or inserting the ATA adapter driver (e.g. ahci). The default 5s timeout (ATA_EH_CMD_DFL_TIMEOUT) used for these commands is too short, causing errors during the device configuration. Ex: ... ata9: SATA max UDMA/133 abar m524288@0x9d200000 port 0x9d200400 irq 209 ata9: SATA link up 6.0 Gbps (SStatus 133 SControl 300) ata9.00: ATA-9: XXX XXXXXXXXXXXXXXX, XXXXXXXX, max UDMA/133 ata9.00: qc timeout (cmd 0x2f) ata9.00: Read log page 0x00 failed, Emask 0x4 ata9.00: Read log page 0x00 failed, Emask 0x40 ata9.00: NCQ Send/Recv Log not supported ata9.00: Read log page 0x08 failed, Emask 0x40 ata9.00: 27344764928 sectors, multi 16: LBA48 NCQ (depth 32), AA ata9.00: Read log page 0x00 failed, Emask 0x40 ata9.00: ATA Identify Device Log not supported ata9.00: failed to set xfermode (err_mask=0x40) ata9: SATA link up 6.0 Gbps (SStatus 133 SControl 300) ata9.00: configured for UDMA/133 ... The timeout error causes a soft reset of the drive link, followed in most cases by a successful revalidation as that give enough time to the drive to become fully ready to quickly process the read log commands. However, in some cases, this also fails resulting in the device being dropped. Fix this by using adding the ata_eh_revalidate_timeouts entries for the READ_LOG_EXT and READ_LOG_DMA_EXT commands. This defines a timeout increased to 15s, retriable one time. Reported-by: Geert Uytterhoeven Tested-by: Geert Uytterhoeven Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Signed-off-by: Greg Kroah-Hartman --- drivers/ata/libata-eh.c | 8 ++++++++ include/linux/libata.h | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index 3bfd9da584734..c6a21a784ec65 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -97,6 +97,12 @@ static const unsigned long ata_eh_identify_timeouts[] = { ULONG_MAX, }; +static const unsigned long ata_eh_revalidate_timeouts[] = { + 15000, /* Some drives are slow to read log pages when waking-up */ + 15000, /* combined time till here is enough even for media access */ + ULONG_MAX, +}; + static const unsigned long ata_eh_flush_timeouts[] = { 15000, /* be generous with flush */ 15000, /* ditto */ @@ -133,6 +139,8 @@ static const struct ata_eh_cmd_timeout_ent ata_eh_cmd_timeout_table[ATA_EH_CMD_TIMEOUT_TABLE_SIZE] = { { .commands = CMDS(ATA_CMD_ID_ATA, ATA_CMD_ID_ATAPI), .timeouts = ata_eh_identify_timeouts, }, + { .commands = CMDS(ATA_CMD_READ_LOG_EXT, ATA_CMD_READ_LOG_DMA_EXT), + .timeouts = ata_eh_revalidate_timeouts, }, { .commands = CMDS(ATA_CMD_READ_NATIVE_MAX, ATA_CMD_READ_NATIVE_MAX_EXT), .timeouts = ata_eh_other_timeouts, }, { .commands = CMDS(ATA_CMD_SET_MAX, ATA_CMD_SET_MAX_EXT), diff --git a/include/linux/libata.h b/include/linux/libata.h index 3d5adbaf8214f..2e448d65a04c7 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -391,7 +391,7 @@ enum { /* This should match the actual table size of * ata_eh_cmd_timeout_table in libata-eh.c. */ - ATA_EH_CMD_TIMEOUT_TABLE_SIZE = 6, + ATA_EH_CMD_TIMEOUT_TABLE_SIZE = 7, /* Horkage types. May be set by libata or controller on drives (some horkage may be drive/controller pair dependent */ From 61bceeb4e3ea63af99642886ea104e89944ab599 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 5 Nov 2021 13:34:55 -0700 Subject: [PATCH 1344/5344] ocfs2: fix data corruption on truncate commit 839b63860eb3835da165642923120d305925561d upstream. Patch series "ocfs2: Truncate data corruption fix". As further testing has shown, commit 5314454ea3f ("ocfs2: fix data corruption after conversion from inline format") didn't fix all the data corruption issues the customer started observing after 6dbf7bb55598 ("fs: Don't invalidate page buffers in block_write_full_page()") This time I have tracked them down to two bugs in ocfs2 truncation code. One bug (truncating page cache before clearing tail cluster and setting i_size) could cause data corruption even before 6dbf7bb55598, but before that commit it needed a race with page fault, after 6dbf7bb55598 it started to be pretty deterministic. Another bug (zeroing pages beyond old i_size) used to be harmless inefficiency before commit 6dbf7bb55598. But after commit 6dbf7bb55598 in combination with the first bug it resulted in deterministic data corruption. Although fixing only the first problem is needed to stop data corruption, I've fixed both issues to make the code more robust. This patch (of 2): ocfs2_truncate_file() did unmap invalidate page cache pages before zeroing partial tail cluster and setting i_size. Thus some pages could be left (and likely have left if the cluster zeroing happened) in the page cache beyond i_size after truncate finished letting user possibly see stale data once the file was extended again. Also the tail cluster zeroing was not guaranteed to finish before truncate finished causing possible stale data exposure. The problem started to be particularly easy to hit after commit 6dbf7bb55598 "fs: Don't invalidate page buffers in block_write_full_page()" stopped invalidation of pages beyond i_size from page writeback path. Fix these problems by unmapping and invalidating pages in the page cache after the i_size is reduced and tail cluster is zeroed out. Link: https://lkml.kernel.org/r/20211025150008.29002-1-jack@suse.cz Link: https://lkml.kernel.org/r/20211025151332.11301-1-jack@suse.cz Fixes: ccd979bdbce9 ("[PATCH] OCFS2: The Second Oracle Cluster Filesystem") Signed-off-by: Jan Kara Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/ocfs2/file.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 58d4546a208e6..c30747bdfb127 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -478,10 +478,11 @@ int ocfs2_truncate_file(struct inode *inode, * greater than page size, so we have to truncate them * anyway. */ - unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(inode->i_mapping, new_i_size); if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + unmap_mapping_range(inode->i_mapping, + new_i_size + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(inode->i_mapping, new_i_size); status = ocfs2_truncate_inline(inode, di_bh, new_i_size, i_size_read(inode), 1); if (status) @@ -500,6 +501,9 @@ int ocfs2_truncate_file(struct inode *inode, goto bail_unlock_sem; } + unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(inode->i_mapping, new_i_size); + status = ocfs2_commit_truncate(osb, inode, di_bh); if (status < 0) { mlog_errno(status); From 9188664c22a04f15008dc58fa06197a1302c5e46 Mon Sep 17 00:00:00 2001 From: Arun Easi Date: Wed, 8 Sep 2021 09:46:18 -0700 Subject: [PATCH 1345/5344] scsi: qla2xxx: Fix kernel crash when accessing port_speed sysfs file commit 3ef68d4f0c9e7cb589ae8b70f07d77f528105331 upstream. Kernel crashes when accessing port_speed sysfs file. The issue happens on a CNA when the local array was accessed beyond bounds. Fix this by changing the lookup. BUG: unable to handle kernel paging request at 0000000000004000 PGD 0 P4D 0 Oops: 0000 [#1] SMP PTI CPU: 15 PID: 455213 Comm: sosreport Kdump: loaded Not tainted 4.18.0-305.7.1.el8_4.x86_64 #1 RIP: 0010:string_nocheck+0x12/0x70 Code: 00 00 4c 89 e2 be 20 00 00 00 48 89 ef e8 86 9a 00 00 4c 01 e3 eb 81 90 49 89 f2 48 89 ce 48 89 f8 48 c1 fe 30 66 85 f6 74 4f <44> 0f b6 0a 45 84 c9 74 46 83 ee 01 41 b8 01 00 00 00 48 8d 7c 37 RSP: 0018:ffffb5141c1afcf0 EFLAGS: 00010286 RAX: ffff8bf4009f8000 RBX: ffff8bf4009f9000 RCX: ffff0a00ffffff04 RDX: 0000000000004000 RSI: ffffffffffffffff RDI: ffff8bf4009f8000 RBP: 0000000000004000 R08: 0000000000000001 R09: ffffb5141c1afb84 R10: ffff8bf4009f9000 R11: ffffb5141c1afce6 R12: ffff0a00ffffff04 R13: ffffffffc08e21aa R14: 0000000000001000 R15: ffffffffc08e21aa FS: 00007fc4ebfff700(0000) GS:ffff8c717f7c0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000004000 CR3: 000000edfdee6006 CR4: 00000000001706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: string+0x40/0x50 vsnprintf+0x33c/0x520 scnprintf+0x4d/0x90 qla2x00_port_speed_show+0xb5/0x100 [qla2xxx] dev_attr_show+0x1c/0x40 sysfs_kf_seq_show+0x9b/0x100 seq_read+0x153/0x410 vfs_read+0x91/0x140 ksys_read+0x4f/0xb0 do_syscall_64+0x5b/0x1a0 entry_SYSCALL_64_after_hwframe+0x65/0xca Link: https://lore.kernel.org/r/20210908164622.19240-7-njavali@marvell.com Fixes: 4910b524ac9e ("scsi: qla2xxx: Add support for setting port speed") Cc: stable@vger.kernel.org Reviewed-by: Himanshu Madhani Signed-off-by: Arun Easi Signed-off-by: Nilesh Javali Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/qla2xxx/qla_attr.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_attr.c b/drivers/scsi/qla2xxx/qla_attr.c index 580d30cd5c35c..8f85ca0112961 100644 --- a/drivers/scsi/qla2xxx/qla_attr.c +++ b/drivers/scsi/qla2xxx/qla_attr.c @@ -1759,6 +1759,18 @@ qla2x00_port_speed_store(struct device *dev, struct device_attribute *attr, return strlen(buf); } +static const struct { + u16 rate; + char *str; +} port_speed_str[] = { + { PORT_SPEED_4GB, "4" }, + { PORT_SPEED_8GB, "8" }, + { PORT_SPEED_16GB, "16" }, + { PORT_SPEED_32GB, "32" }, + { PORT_SPEED_64GB, "64" }, + { PORT_SPEED_10GB, "10" }, +}; + static ssize_t qla2x00_port_speed_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -1766,7 +1778,8 @@ qla2x00_port_speed_show(struct device *dev, struct device_attribute *attr, struct scsi_qla_host *vha = shost_priv(dev_to_shost(dev)); struct qla_hw_data *ha = vha->hw; ssize_t rval; - char *spd[7] = {"0", "0", "0", "4", "8", "16", "32"}; + u16 i; + char *speed = "Unknown"; rval = qla2x00_get_data_rate(vha); if (rval != QLA_SUCCESS) { @@ -1775,7 +1788,14 @@ qla2x00_port_speed_show(struct device *dev, struct device_attribute *attr, return -EINVAL; } - return scnprintf(buf, PAGE_SIZE, "%s\n", spd[ha->link_data_rate]); + for (i = 0; i < ARRAY_SIZE(port_speed_str); i++) { + if (port_speed_str[i].rate != ha->link_data_rate) + continue; + speed = port_speed_str[i].str; + break; + } + + return scnprintf(buf, PAGE_SIZE, "%s\n", speed); } /* ----- */ From 85ac9961efeb7fd5e0953cc409bd4c0a907347d6 Mon Sep 17 00:00:00 2001 From: Quinn Tran Date: Wed, 8 Sep 2021 09:46:21 -0700 Subject: [PATCH 1346/5344] scsi: qla2xxx: Fix use after free in eh_abort path commit 3d33b303d4f3b74a71bede5639ebba3cfd2a2b4d upstream. In eh_abort path driver prematurely exits the call to upper layer. Check whether command is aborted / completed by firmware before exiting the call. 9 [ffff8b1ebf803c00] page_fault at ffffffffb0389778 [exception RIP: qla2x00_status_entry+0x48d] RIP: ffffffffc04fa62d RSP: ffff8b1ebf803cb0 RFLAGS: 00010082 RAX: 00000000ffffffff RBX: 00000000000e0000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 00000000000013d8 RDI: fffff3253db78440 RBP: ffff8b1ebf803dd0 R8: ffff8b1ebcd9b0c0 R9: 0000000000000000 R10: ffff8b1e38a30808 R11: 0000000000001000 R12: 00000000000003e9 R13: 0000000000000000 R14: ffff8b1ebcd9d740 R15: 0000000000000028 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 10 [ffff8b1ebf803cb0] enqueue_entity at ffffffffafce708f 11 [ffff8b1ebf803d00] enqueue_task_fair at ffffffffafce7b88 12 [ffff8b1ebf803dd8] qla24xx_process_response_queue at ffffffffc04fc9a6 [qla2xxx] 13 [ffff8b1ebf803e78] qla24xx_msix_rsp_q at ffffffffc04ff01b [qla2xxx] 14 [ffff8b1ebf803eb0] __handle_irq_event_percpu at ffffffffafd50714 Link: https://lore.kernel.org/r/20210908164622.19240-10-njavali@marvell.com Fixes: f45bca8c5052 ("scsi: qla2xxx: Fix double scsi_done for abort path") Cc: stable@vger.kernel.org Reviewed-by: Himanshu Madhani Co-developed-by: David Jeffery Signed-off-by: David Jeffery Co-developed-by: Laurence Oberman Signed-off-by: Laurence Oberman Signed-off-by: Quinn Tran Signed-off-by: Nilesh Javali Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/qla2xxx/qla_os.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 28cbefe715e59..8b68879058132 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -1229,6 +1229,7 @@ qla2xxx_eh_abort(struct scsi_cmnd *cmd) uint32_t ratov_j; struct qla_qpair *qpair; unsigned long flags; + int fast_fail_status = SUCCESS; if (qla2x00_isp_reg_stat(ha)) { ql_log(ql_log_info, vha, 0x8042, @@ -1236,15 +1237,16 @@ qla2xxx_eh_abort(struct scsi_cmnd *cmd) return FAILED; } + /* Save any FAST_IO_FAIL value to return later if abort succeeds */ ret = fc_block_scsi_eh(cmd); if (ret != 0) - return ret; + fast_fail_status = ret; sp = scsi_cmd_priv(cmd); qpair = sp->qpair; if ((sp->fcport && sp->fcport->deleted) || !qpair) - return SUCCESS; + return fast_fail_status != SUCCESS ? fast_fail_status : FAILED; spin_lock_irqsave(qpair->qp_lock_ptr, flags); if (sp->completed) { @@ -1290,7 +1292,7 @@ qla2xxx_eh_abort(struct scsi_cmnd *cmd) __func__, ha->r_a_tov/10); ret = FAILED; } else { - ret = SUCCESS; + ret = fast_fail_status; } break; default: From d737e2fb58cdb6b119cb59c2a7449e917b947a15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20L=C3=B6hle?= Date: Thu, 16 Sep 2021 05:59:19 +0000 Subject: [PATCH 1347/5344] mmc: dw_mmc: Dont wait for DRTO on Write RSP error commit 43592c8736e84025d7a45e61a46c3fa40536a364 upstream. Only wait for DRTO on reads, otherwise the driver hangs. The driver prevents sending CMD12 on response errors like CRCs. According to the comment this is because some cards have problems with this during the UHS tuning sequence. Unfortunately this workaround currently also applies for any command with data. On reads this will set the drto timer, which then triggers after a while. On writes this will not set any timer and the tasklet will not be scheduled again. I cannot test for the UHS workarounds need, but even if so, it should at most apply to reads. I have observed many hangs when CMD25 response contained a CRC error. This patch fixes this without touching the actual UHS tuning workaround. Signed-off-by: Christian Loehle Reviewed-by: Jaehoon Chung Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/af8f8b8674ba4fcc9a781019e4aeb72c@hyperstone.com Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/host/dw_mmc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c index 7b280cb363271..ba37be8e423c2 100644 --- a/drivers/mmc/host/dw_mmc.c +++ b/drivers/mmc/host/dw_mmc.c @@ -2013,7 +2013,8 @@ static void dw_mci_tasklet_func(unsigned long priv) * delayed. Allowing the transfer to take place * avoids races and keeps things simple. */ - if (err != -ETIMEDOUT) { + if (err != -ETIMEDOUT && + host->dir_status == DW_MCI_RECV_STATUS) { state = STATE_SENDING_DATA; continue; } From 7b69eae0d5d1cde4e1dcb21d928b6a627450a8fc Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Tue, 5 Oct 2021 00:27:49 +0200 Subject: [PATCH 1348/5344] parisc: Fix ptrace check on syscall return commit 8779e05ba8aaffec1829872ef9774a71f44f6580 upstream. The TIF_XXX flags are stored in the flags field in the thread_info struct (TI_FLAGS), not in the flags field of the task_struct structure (TASK_FLAGS). It seems this bug didn't generate any important side-effects, otherwise it wouldn't have went unnoticed for 12 years (since v2.6.32). Signed-off-by: Helge Deller Fixes: ecd3d4bc06e48 ("parisc: stop using task->ptrace for {single,block}step flags") Cc: Kyle McMartin Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/parisc/kernel/entry.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S index 873bf3434da94..7d14e9ae18fb1 100644 --- a/arch/parisc/kernel/entry.S +++ b/arch/parisc/kernel/entry.S @@ -1841,7 +1841,7 @@ syscall_restore: LDREG TI_TASK-THREAD_SZ_ALGN-FRAME_SIZE(%r30),%r1 /* Are we being ptraced? */ - ldw TASK_FLAGS(%r1),%r19 + LDREG TI_FLAGS-THREAD_SZ_ALGN-FRAME_SIZE(%r30),%r19 ldi _TIF_SYSCALL_TRACE_MASK,%r2 and,COND(=) %r19,%r2,%r0 b,n syscall_restore_rfi From 6348294c47b07a5c7c0f3cef3b0a6dfea6ec0f80 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 8 Sep 2021 08:33:57 +0300 Subject: [PATCH 1349/5344] tpm: Check for integer overflow in tpm2_map_response_body() commit a0bcce2b2a169e10eb265c8f0ebdd5ae4c875670 upstream. The "4 * be32_to_cpu(data->count)" multiplication can potentially overflow which would lead to memory corruption. Add a check for that. Cc: stable@vger.kernel.org Fixes: 745b361e989a ("tpm: infrastructure for TPM spaces") Signed-off-by: Dan Carpenter Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen Signed-off-by: Greg Kroah-Hartman --- drivers/char/tpm/tpm2-space.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/char/tpm/tpm2-space.c b/drivers/char/tpm/tpm2-space.c index 784b8b3cb903f..97e916856cf3e 100644 --- a/drivers/char/tpm/tpm2-space.c +++ b/drivers/char/tpm/tpm2-space.c @@ -455,6 +455,9 @@ static int tpm2_map_response_body(struct tpm_chip *chip, u32 cc, u8 *rsp, if (be32_to_cpu(data->capability) != TPM2_CAP_HANDLES) return 0; + if (be32_to_cpu(data->count) > (UINT_MAX - TPM_HEADER_SIZE - 9) / 4) + return -EFAULT; + if (len != TPM_HEADER_SIZE + 9 + 4 * be32_to_cpu(data->count)) return -EFAULT; From 823aca66455b47782248b9340abfbf558950dd38 Mon Sep 17 00:00:00 2001 From: jing yangyang Date: Thu, 19 Aug 2021 19:30:16 -0700 Subject: [PATCH 1350/5344] firmware/psci: fix application of sizeof to pointer commit 2ac5fb35cd520ab1851c9a4816c523b65276052f upstream. sizeof when applied to a pointer typed expression gives the size of the pointer. ./drivers/firmware/psci/psci_checker.c:158:41-47: ERROR application of sizeof to pointer This issue was detected with the help of Coccinelle. Fixes: 7401056de5f8 ("drivers/firmware: psci_checker: stash and use topology_core_cpumask for hotplug tests") Cc: stable@vger.kernel.org Reported-by: Zeal Robot Acked-by: Mark Rutland Reviewed-by: Gustavo A. R. Silva Signed-off-by: jing yangyang Signed-off-by: Gustavo A. R. Silva Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/psci/psci_checker.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/psci/psci_checker.c b/drivers/firmware/psci/psci_checker.c index 03eb798ad3ed9..250b7232f9816 100644 --- a/drivers/firmware/psci/psci_checker.c +++ b/drivers/firmware/psci/psci_checker.c @@ -155,7 +155,7 @@ static int alloc_init_cpu_groups(cpumask_var_t **pcpu_groups) if (!alloc_cpumask_var(&tmp, GFP_KERNEL)) return -ENOMEM; - cpu_groups = kcalloc(nb_available_cpus, sizeof(cpu_groups), + cpu_groups = kcalloc(nb_available_cpus, sizeof(*cpu_groups), GFP_KERNEL); if (!cpu_groups) { free_cpumask_var(tmp); From eff4c3f86ffbeb673e5dbe26e9910a13a82f8c27 Mon Sep 17 00:00:00 2001 From: Tang Bin Date: Thu, 21 Oct 2021 09:34:22 +0800 Subject: [PATCH 1351/5344] crypto: s5p-sss - Add error handling in s5p_aes_probe() commit a472cc0dde3eb057db71c80f102556eeced03805 upstream. The function s5p_aes_probe() does not perform sufficient error checking after executing platform_get_resource(), thus fix it. Fixes: c2afad6c6105 ("crypto: s5p-sss - Add HASH support for Exynos") Cc: Signed-off-by: Tang Bin Reviewed-by: Krzysztof Kozlowski Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- drivers/crypto/s5p-sss.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/crypto/s5p-sss.c b/drivers/crypto/s5p-sss.c index 010f1bb20dada..86a13b738c2de 100644 --- a/drivers/crypto/s5p-sss.c +++ b/drivers/crypto/s5p-sss.c @@ -2208,6 +2208,8 @@ static int s5p_aes_probe(struct platform_device *pdev) variant = find_s5p_sss_version(pdev); res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) + return -EINVAL; /* * Note: HASH and PRNG uses the same registers in secss, avoid From 41b35bd02a6eb239cfbeeda3e1a4900c62f870cf Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sun, 17 Oct 2021 13:01:15 +0100 Subject: [PATCH 1352/5344] media: ite-cir: IR receiver stop working after receive overflow commit fdc881783099c6343921ff017450831c8766d12a upstream. On an Intel NUC6iSYK, no IR is reported after a receive overflow. When a receiver overflow occurs, this condition is only cleared by reading the fifo. Make sure we read anything in the fifo. Fixes: 28c7afb07ccf ("media: ite-cir: check for receive overflow") Suggested-by: Bryan Pass Tested-by: Bryan Pass Cc: stable@vger.kernel.org> Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/rc/ite-cir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/rc/ite-cir.c b/drivers/media/rc/ite-cir.c index 4b8aee390518d..742b7f41b75f8 100644 --- a/drivers/media/rc/ite-cir.c +++ b/drivers/media/rc/ite-cir.c @@ -283,7 +283,7 @@ static irqreturn_t ite_cir_isr(int irq, void *data) } /* check for the receive interrupt */ - if (iflags & ITE_IRQ_RX_FIFO) { + if (iflags & (ITE_IRQ_RX_FIFO | ITE_IRQ_RX_FIFO_OVERRUN)) { /* read the FIFO bytes */ rx_bytes = dev->params.get_rx_bytes(dev, rx_buf, From 3dc078f49972bd5588beba02fd1ee1b6f07102b4 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Wed, 15 Sep 2021 18:14:07 +0200 Subject: [PATCH 1353/5344] media: ir-kbd-i2c: improve responsiveness of hauppauge zilog receivers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit c73ba202a851c0b611ef2c25e568fadeff5e667f upstream. The IR receiver has two issues: - Sometimes there is no response to a button press - Sometimes a button press is repeated when it should not have been Hanging the polling interval fixes this behaviour. Link: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=994050 Cc: stable@vger.kernel.org Suggested-by: Joaquín Alberto Calderón Pozo Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/i2c/ir-kbd-i2c.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/media/i2c/ir-kbd-i2c.c b/drivers/media/i2c/ir-kbd-i2c.c index 92376592455ee..56674173524fd 100644 --- a/drivers/media/i2c/ir-kbd-i2c.c +++ b/drivers/media/i2c/ir-kbd-i2c.c @@ -791,6 +791,7 @@ static int ir_probe(struct i2c_client *client, const struct i2c_device_id *id) rc_proto = RC_PROTO_BIT_RC5 | RC_PROTO_BIT_RC6_MCE | RC_PROTO_BIT_RC6_6A_32; ir_codes = RC_MAP_HAUPPAUGE; + ir->polling_interval = 125; probe_tx = true; break; } From dbfdbeb34c95499843d37de56f5d64e55e145e7a Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Date: Fri, 18 Jun 2021 14:29:03 +0200 Subject: [PATCH 1354/5344] media: v4l2-ioctl: Fix check_ext_ctrls commit 861f92cb9160b14beef0ada047384c2340701ee2 upstream. Drivers that do not use the ctrl-framework use this function instead. Fix the following issues: - Do not check for multiple classes when getting the DEF_VAL. - Return -EINVAL for request_api calls - Default value cannot be changed, return EINVAL as soon as possible. - Return the right error_idx [If an error is found when validating the list of controls passed with VIDIOC_G_EXT_CTRLS, then error_idx shall be set to ctrls->count to indicate to userspace that no actual hardware was touched. It would have been much nicer of course if error_idx could point to the control index that failed the validation, but sadly that's not how the API was designed.] Fixes v4l2-compliance: Control ioctls (Input 0): warn: v4l2-test-controls.cpp(834): error_idx should be equal to count warn: v4l2-test-controls.cpp(855): error_idx should be equal to count fail: v4l2-test-controls.cpp(813): doioctl(node, VIDIOC_G_EXT_CTRLS, &ctrls) test VIDIOC_G/S/TRY_EXT_CTRLS: FAIL Buffer ioctls (Input 0): fail: v4l2-test-buffers.cpp(1994): ret != EINVAL && ret != EBADR && ret != ENOTTY test Requests: FAIL Cc: stable@vger.kernel.org Fixes: 6fa6f831f095 ("media: v4l2-ctrls: add core request support") Suggested-by: Hans Verkuil Reviewed-by: Hans Verkuil Signed-off-by: Ricardo Ribalda Signed-off-by: Laurent Pinchart Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/v4l2-core/v4l2-ioctl.c | 60 ++++++++++++++++++---------- 1 file changed, 39 insertions(+), 21 deletions(-) diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c index 24db33f803c06..98633fa5d46fd 100644 --- a/drivers/media/v4l2-core/v4l2-ioctl.c +++ b/drivers/media/v4l2-core/v4l2-ioctl.c @@ -902,7 +902,7 @@ static void v4l_print_default(const void *arg, bool write_only) pr_cont("driver-specific ioctl\n"); } -static int check_ext_ctrls(struct v4l2_ext_controls *c, int allow_priv) +static bool check_ext_ctrls(struct v4l2_ext_controls *c, unsigned long ioctl) { __u32 i; @@ -911,23 +911,41 @@ static int check_ext_ctrls(struct v4l2_ext_controls *c, int allow_priv) for (i = 0; i < c->count; i++) c->controls[i].reserved2[0] = 0; - /* V4L2_CID_PRIVATE_BASE cannot be used as control class - when using extended controls. - Only when passed in through VIDIOC_G_CTRL and VIDIOC_S_CTRL - is it allowed for backwards compatibility. - */ - if (!allow_priv && c->which == V4L2_CID_PRIVATE_BASE) - return 0; - if (!c->which) - return 1; + switch (c->which) { + case V4L2_CID_PRIVATE_BASE: + /* + * V4L2_CID_PRIVATE_BASE cannot be used as control class + * when using extended controls. + * Only when passed in through VIDIOC_G_CTRL and VIDIOC_S_CTRL + * is it allowed for backwards compatibility. + */ + if (ioctl == VIDIOC_G_CTRL || ioctl == VIDIOC_S_CTRL) + return false; + break; + case V4L2_CTRL_WHICH_DEF_VAL: + /* Default value cannot be changed */ + if (ioctl == VIDIOC_S_EXT_CTRLS || + ioctl == VIDIOC_TRY_EXT_CTRLS) { + c->error_idx = c->count; + return false; + } + return true; + case V4L2_CTRL_WHICH_CUR_VAL: + return true; + case V4L2_CTRL_WHICH_REQUEST_VAL: + c->error_idx = c->count; + return false; + } + /* Check that all controls are from the same control class. */ for (i = 0; i < c->count; i++) { if (V4L2_CTRL_ID2WHICH(c->controls[i].id) != c->which) { - c->error_idx = i; - return 0; + c->error_idx = ioctl == VIDIOC_TRY_EXT_CTRLS ? i : + c->count; + return false; } } - return 1; + return true; } static int check_fmt(struct file *file, enum v4l2_buf_type type) @@ -2145,7 +2163,7 @@ static int v4l_g_ctrl(const struct v4l2_ioctl_ops *ops, ctrls.controls = &ctrl; ctrl.id = p->id; ctrl.value = p->value; - if (check_ext_ctrls(&ctrls, 1)) { + if (check_ext_ctrls(&ctrls, VIDIOC_G_CTRL)) { int ret = ops->vidioc_g_ext_ctrls(file, fh, &ctrls); if (ret == 0) @@ -2179,7 +2197,7 @@ static int v4l_s_ctrl(const struct v4l2_ioctl_ops *ops, ctrls.controls = &ctrl; ctrl.id = p->id; ctrl.value = p->value; - if (check_ext_ctrls(&ctrls, 1)) + if (check_ext_ctrls(&ctrls, VIDIOC_S_CTRL)) return ops->vidioc_s_ext_ctrls(file, fh, &ctrls); return -EINVAL; } @@ -2201,8 +2219,8 @@ static int v4l_g_ext_ctrls(const struct v4l2_ioctl_ops *ops, vfd, vfd->v4l2_dev->mdev, p); if (ops->vidioc_g_ext_ctrls == NULL) return -ENOTTY; - return check_ext_ctrls(p, 0) ? ops->vidioc_g_ext_ctrls(file, fh, p) : - -EINVAL; + return check_ext_ctrls(p, VIDIOC_G_EXT_CTRLS) ? + ops->vidioc_g_ext_ctrls(file, fh, p) : -EINVAL; } static int v4l_s_ext_ctrls(const struct v4l2_ioctl_ops *ops, @@ -2222,8 +2240,8 @@ static int v4l_s_ext_ctrls(const struct v4l2_ioctl_ops *ops, vfd, vfd->v4l2_dev->mdev, p); if (ops->vidioc_s_ext_ctrls == NULL) return -ENOTTY; - return check_ext_ctrls(p, 0) ? ops->vidioc_s_ext_ctrls(file, fh, p) : - -EINVAL; + return check_ext_ctrls(p, VIDIOC_S_EXT_CTRLS) ? + ops->vidioc_s_ext_ctrls(file, fh, p) : -EINVAL; } static int v4l_try_ext_ctrls(const struct v4l2_ioctl_ops *ops, @@ -2243,8 +2261,8 @@ static int v4l_try_ext_ctrls(const struct v4l2_ioctl_ops *ops, vfd, vfd->v4l2_dev->mdev, p); if (ops->vidioc_try_ext_ctrls == NULL) return -ENOTTY; - return check_ext_ctrls(p, 0) ? ops->vidioc_try_ext_ctrls(file, fh, p) : - -EINVAL; + return check_ext_ctrls(p, VIDIOC_TRY_EXT_CTRLS) ? + ops->vidioc_try_ext_ctrls(file, fh, p) : -EINVAL; } /* From 2412cdebf1e1ea04897c38dd23d866e9e60dcc00 Mon Sep 17 00:00:00 2001 From: Tim Crawford Date: Mon, 1 Nov 2021 10:21:34 -0600 Subject: [PATCH 1355/5344] ALSA: hda/realtek: Add quirk for Clevo PC70HS commit dbfe83507cf4ea66ce4efee2ac14c5ad420e31d3 upstream. Apply the PB51ED PCI quirk to the Clevo PC70HS. Fixes audio output from the internal speakers. Signed-off-by: Tim Crawford Cc: Link: https://lore.kernel.org/r/20211101162134.5336-1-tcrawford@system76.com Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 49aefb7875e72..8e986b028ecea 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -2541,6 +2541,7 @@ static const struct snd_pci_quirk alc882_fixup_tbl[] = { SND_PCI_QUIRK(0x1558, 0x67d1, "Clevo PB71[ER][CDF]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x67e1, "Clevo PB71[DE][CDF]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x67e5, "Clevo PC70D[PRS](?:-D|-G)?", ALC1220_FIXUP_CLEVO_PB51ED_PINS), + SND_PCI_QUIRK(0x1558, 0x67f1, "Clevo PC70H[PRS]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x70d1, "Clevo PC70[ER][CDF]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x7714, "Clevo X170SM", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x7715, "Clevo X170KM-G", ALC1220_FIXUP_CLEVO_PB51ED), From d78d626739e44b0c326c850aa71e8af5865bc8c7 Mon Sep 17 00:00:00 2001 From: Jaroslav Kysela Date: Thu, 4 Nov 2021 16:57:26 +0100 Subject: [PATCH 1356/5344] ALSA: hda/realtek: Add a quirk for Acer Spin SP513-54N commit 2a5bb694488bb6593066d46881bfd9d07edd1628 upstream. Another model requires ALC255_FIXUP_ACER_MIC_NO_PRESENCE fixup. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=211853 Signed-off-by: Jaroslav Kysela Cc: Link: https://lore.kernel.org/r/20211104155726.2090997-1-perex@perex.cz Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 8e986b028ecea..527c62e5566c7 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -7977,6 +7977,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1025, 0x1308, "Acer Aspire Z24-890", ALC286_FIXUP_ACER_AIO_HEADSET_MIC), SND_PCI_QUIRK(0x1025, 0x132a, "Acer TravelMate B114-21", ALC233_FIXUP_ACER_HEADSET_MIC), SND_PCI_QUIRK(0x1025, 0x1330, "Acer TravelMate X514-51T", ALC255_FIXUP_ACER_HEADSET_MIC), + SND_PCI_QUIRK(0x1025, 0x141f, "Acer Spin SP513-54N", ALC255_FIXUP_ACER_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1025, 0x142b, "Acer Swift SF314-42", ALC255_FIXUP_ACER_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1025, 0x1430, "Acer TravelMate B311R-31", ALC256_FIXUP_ACER_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x0470, "Dell M101z", ALC269_FIXUP_DELL_M101Z), From 77c7ded0d7c3345bbcefc4e88ef65629d0014d24 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sun, 7 Nov 2021 09:33:39 +0100 Subject: [PATCH 1357/5344] ALSA: hda/realtek: Add quirk for ASUS UX550VE commit 4fad4fb9871b43389e4f4bead18ec693064697bb upstream. ASUS UX550VE (SSID 1043:1970) requires a similar workaround for managing the routing of the 4 speakers like some other ASUS models. Add a corresponding quirk entry for fixing it. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=212641 Cc: Link: https://lore.kernel.org/r/20211107083339.18013-1-tiwai@suse.de Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 527c62e5566c7..e672bb452ebaf 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -8134,6 +8134,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x18b1, "Asus MJ401TA", ALC256_FIXUP_ASUS_HEADSET_MIC), SND_PCI_QUIRK(0x1043, 0x18f1, "Asus FX505DT", ALC256_FIXUP_ASUS_HEADSET_MIC), SND_PCI_QUIRK(0x1043, 0x194e, "ASUS UX563FD", ALC294_FIXUP_ASUS_HPE), + SND_PCI_QUIRK(0x1043, 0x1970, "ASUS UX550VE", ALC289_FIXUP_ASUS_GA401), SND_PCI_QUIRK(0x1043, 0x1982, "ASUS B1400CEPE", ALC256_FIXUP_ASUS_HPE), SND_PCI_QUIRK(0x1043, 0x19ce, "ASUS B9450FA", ALC294_FIXUP_ASUS_HPE), SND_PCI_QUIRK(0x1043, 0x19e1, "ASUS UX581LV", ALC295_FIXUP_ASUS_MIC_NO_PRESENCE), From 7075d29e03e70904793fe782bddb23a7e7459afb Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Wed, 10 Nov 2021 22:40:32 +0800 Subject: [PATCH 1358/5344] ALSA: hda/realtek: Add quirk for HP EliteBook 840 G7 mute LED commit c058493df7edcef8f48c1494d9a84218519f966b upstream. The mute and micmute LEDs don't work on HP EliteBook 840 G7. The same quirk for other HP laptops can let LEDs work, so apply it. Signed-off-by: Kai-Heng Feng Cc: Link: https://lore.kernel.org/r/20211110144033.118451-1-kai.heng.feng@canonical.com Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index e672bb452ebaf..94fc17b28e9c7 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -8104,6 +8104,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x861f, "HP Elite Dragonfly G1", ALC285_FIXUP_HP_GPIO_AMP_INIT), SND_PCI_QUIRK(0x103c, 0x869d, "HP", ALC236_FIXUP_HP_MUTE_LED), SND_PCI_QUIRK(0x103c, 0x8724, "HP EliteBook 850 G7", ALC285_FIXUP_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8728, "HP EliteBook 840 G7", ALC285_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8729, "HP", ALC285_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8736, "HP", ALC285_FIXUP_HP_GPIO_AMP_INIT), SND_PCI_QUIRK(0x103c, 0x8760, "HP", ALC285_FIXUP_HP_MUTE_LED), From bb42dc403d3ea0f4666f01a03978f9fe58f19711 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 26 Oct 2021 11:54:01 +0200 Subject: [PATCH 1359/5344] ALSA: ua101: fix division by zero at probe commit 55f261b73a7e1cb254577c3536cef8f415de220a upstream. Add the missing endpoint max-packet sanity check to probe() to avoid division by zero in alloc_stream_buffers() in case a malicious device has broken descriptors (or when doing descriptor fuzz testing). Note that USB core will reject URBs submitted for endpoints with zero wMaxPacketSize but that drivers doing packet-size calculations still need to handle this (cf. commit 2548288b4fb0 ("USB: Fix: Don't skip endpoint descriptors with maxpacket=0")). Fixes: 63978ab3e3e9 ("sound: add Edirol UA-101 support") Cc: stable@vger.kernel.org # 2.6.34 Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20211026095401.26522-1-johan@kernel.org Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/misc/ua101.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/usb/misc/ua101.c b/sound/usb/misc/ua101.c index 307b72d5fffa9..77304a29a61d2 100644 --- a/sound/usb/misc/ua101.c +++ b/sound/usb/misc/ua101.c @@ -1020,7 +1020,7 @@ static int detect_usb_format(struct ua101 *ua) fmt_playback->bSubframeSize * ua->playback.channels; epd = &ua->intf[INTF_CAPTURE]->altsetting[1].endpoint[0].desc; - if (!usb_endpoint_is_isoc_in(epd)) { + if (!usb_endpoint_is_isoc_in(epd) || usb_endpoint_maxp(epd) == 0) { dev_err(&ua->dev->dev, "invalid capture endpoint\n"); return -ENXIO; } @@ -1028,7 +1028,7 @@ static int detect_usb_format(struct ua101 *ua) ua->capture.max_packet_bytes = usb_endpoint_maxp(epd); epd = &ua->intf[INTF_PLAYBACK]->altsetting[1].endpoint[0].desc; - if (!usb_endpoint_is_isoc_out(epd)) { + if (!usb_endpoint_is_isoc_out(epd) || usb_endpoint_maxp(epd) == 0) { dev_err(&ua->dev->dev, "invalid playback endpoint\n"); return -ENXIO; } From 56a7a2f060db7809719a4d73592ebe5efafe3e82 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 25 Oct 2021 14:11:41 +0200 Subject: [PATCH 1360/5344] ALSA: 6fire: fix control and bulk message timeouts commit 9b371c6cc37f954360989eec41c2ddc5a6b83917 upstream. USB control and bulk message timeouts are specified in milliseconds and should specifically not vary with CONFIG_HZ. Fixes: c6d43ba816d1 ("ALSA: usb/6fire - Driver for TerraTec DMX 6Fire USB") Cc: stable@vger.kernel.org # 2.6.39 Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20211025121142.6531-2-johan@kernel.org Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/6fire/comm.c | 2 +- sound/usb/6fire/firmware.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sound/usb/6fire/comm.c b/sound/usb/6fire/comm.c index 43a2a62d66f7e..49629d4bb327a 100644 --- a/sound/usb/6fire/comm.c +++ b/sound/usb/6fire/comm.c @@ -95,7 +95,7 @@ static int usb6fire_comm_send_buffer(u8 *buffer, struct usb_device *dev) int actual_len; ret = usb_interrupt_msg(dev, usb_sndintpipe(dev, COMM_EP), - buffer, buffer[1] + 2, &actual_len, HZ); + buffer, buffer[1] + 2, &actual_len, 1000); if (ret < 0) return ret; else if (actual_len != buffer[1] + 2) diff --git a/sound/usb/6fire/firmware.c b/sound/usb/6fire/firmware.c index 69137c14d0dcf..2333e8ff3411a 100644 --- a/sound/usb/6fire/firmware.c +++ b/sound/usb/6fire/firmware.c @@ -162,7 +162,7 @@ static int usb6fire_fw_ezusb_write(struct usb_device *device, ret = usb_control_msg(device, usb_sndctrlpipe(device, 0), type, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, - value, 0, data, len, HZ); + value, 0, data, len, 1000); if (ret < 0) return ret; else if (ret != len) @@ -175,7 +175,7 @@ static int usb6fire_fw_ezusb_read(struct usb_device *device, { int ret = usb_control_msg(device, usb_rcvctrlpipe(device, 0), type, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, value, - 0, data, len, HZ); + 0, data, len, 1000); if (ret < 0) return ret; else if (ret != len) @@ -190,7 +190,7 @@ static int usb6fire_fw_fpga_write(struct usb_device *device, int ret; ret = usb_bulk_msg(device, usb_sndbulkpipe(device, FPGA_EP), data, len, - &actual_len, HZ); + &actual_len, 1000); if (ret < 0) return ret; else if (actual_len != len) From c902e651556019dcaad2725f78c0864a8a706d87 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 25 Oct 2021 14:11:42 +0200 Subject: [PATCH 1361/5344] ALSA: line6: fix control and interrupt message timeouts commit f4000b58b64344871d7b27c05e73932f137cfef6 upstream. USB control and interrupt message timeouts are specified in milliseconds and should specifically not vary with CONFIG_HZ. Fixes: 705ececd1c60 ("Staging: add line6 usb driver") Cc: stable@vger.kernel.org # 2.6.30 Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20211025121142.6531-3-johan@kernel.org Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/line6/driver.c | 14 +++++++------- sound/usb/line6/driver.h | 2 +- sound/usb/line6/podhd.c | 6 +++--- sound/usb/line6/toneport.c | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/sound/usb/line6/driver.c b/sound/usb/line6/driver.c index 1e38cdda2af69..8ca56ba600cf4 100644 --- a/sound/usb/line6/driver.c +++ b/sound/usb/line6/driver.c @@ -113,12 +113,12 @@ static int line6_send_raw_message(struct usb_line6 *line6, const char *buffer, retval = usb_interrupt_msg(line6->usbdev, usb_sndintpipe(line6->usbdev, properties->ep_ctrl_w), (char *)frag_buf, frag_size, - &partial, LINE6_TIMEOUT * HZ); + &partial, LINE6_TIMEOUT); } else { retval = usb_bulk_msg(line6->usbdev, usb_sndbulkpipe(line6->usbdev, properties->ep_ctrl_w), (char *)frag_buf, frag_size, - &partial, LINE6_TIMEOUT * HZ); + &partial, LINE6_TIMEOUT); } if (retval) { @@ -350,7 +350,7 @@ int line6_read_data(struct usb_line6 *line6, unsigned address, void *data, ret = usb_control_msg(usbdev, usb_sndctrlpipe(usbdev, 0), 0x67, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_OUT, (datalen << 8) | 0x21, address, - NULL, 0, LINE6_TIMEOUT * HZ); + NULL, 0, LINE6_TIMEOUT); if (ret < 0) { dev_err(line6->ifcdev, "read request failed (error %d)\n", ret); @@ -365,7 +365,7 @@ int line6_read_data(struct usb_line6 *line6, unsigned address, void *data, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, 0x0012, 0x0000, len, 1, - LINE6_TIMEOUT * HZ); + LINE6_TIMEOUT); if (ret < 0) { dev_err(line6->ifcdev, "receive length failed (error %d)\n", ret); @@ -393,7 +393,7 @@ int line6_read_data(struct usb_line6 *line6, unsigned address, void *data, ret = usb_control_msg(usbdev, usb_rcvctrlpipe(usbdev, 0), 0x67, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, 0x0013, 0x0000, data, datalen, - LINE6_TIMEOUT * HZ); + LINE6_TIMEOUT); if (ret < 0) dev_err(line6->ifcdev, "read failed (error %d)\n", ret); @@ -425,7 +425,7 @@ int line6_write_data(struct usb_line6 *line6, unsigned address, void *data, ret = usb_control_msg(usbdev, usb_sndctrlpipe(usbdev, 0), 0x67, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_OUT, 0x0022, address, data, datalen, - LINE6_TIMEOUT * HZ); + LINE6_TIMEOUT); if (ret < 0) { dev_err(line6->ifcdev, @@ -441,7 +441,7 @@ int line6_write_data(struct usb_line6 *line6, unsigned address, void *data, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, 0x0012, 0x0000, - status, 1, LINE6_TIMEOUT * HZ); + status, 1, LINE6_TIMEOUT); if (ret < 0) { dev_err(line6->ifcdev, diff --git a/sound/usb/line6/driver.h b/sound/usb/line6/driver.h index e5e572ed5f304..890c239e3fc01 100644 --- a/sound/usb/line6/driver.h +++ b/sound/usb/line6/driver.h @@ -27,7 +27,7 @@ #define LINE6_FALLBACK_INTERVAL 10 #define LINE6_FALLBACK_MAXPACKETSIZE 16 -#define LINE6_TIMEOUT 1 +#define LINE6_TIMEOUT 1000 #define LINE6_BUFSIZE_LISTEN 64 #define LINE6_MIDI_MESSAGE_MAXLEN 256 diff --git a/sound/usb/line6/podhd.c b/sound/usb/line6/podhd.c index 5d9954a2d05e6..8b1610bdb8d5c 100644 --- a/sound/usb/line6/podhd.c +++ b/sound/usb/line6/podhd.c @@ -190,7 +190,7 @@ static int podhd_dev_start(struct usb_line6_podhd *pod) ret = usb_control_msg(usbdev, usb_sndctrlpipe(usbdev, 0), 0x67, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_OUT, 0x11, 0, - NULL, 0, LINE6_TIMEOUT * HZ); + NULL, 0, LINE6_TIMEOUT); if (ret < 0) { dev_err(pod->line6.ifcdev, "read request failed (error %d)\n", ret); goto exit; @@ -200,7 +200,7 @@ static int podhd_dev_start(struct usb_line6_podhd *pod) ret = usb_control_msg(usbdev, usb_rcvctrlpipe(usbdev, 0), 0x67, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, 0x11, 0x0, - init_bytes, 3, LINE6_TIMEOUT * HZ); + init_bytes, 3, LINE6_TIMEOUT); if (ret < 0) { dev_err(pod->line6.ifcdev, "receive length failed (error %d)\n", ret); @@ -220,7 +220,7 @@ static int podhd_dev_start(struct usb_line6_podhd *pod) USB_REQ_SET_FEATURE, USB_TYPE_STANDARD | USB_RECIP_DEVICE | USB_DIR_OUT, 1, 0, - NULL, 0, LINE6_TIMEOUT * HZ); + NULL, 0, LINE6_TIMEOUT); exit: kfree(init_bytes); return ret; diff --git a/sound/usb/line6/toneport.c b/sound/usb/line6/toneport.c index d0a555dbe324f..21f86c71dad7f 100644 --- a/sound/usb/line6/toneport.c +++ b/sound/usb/line6/toneport.c @@ -128,7 +128,7 @@ static int toneport_send_cmd(struct usb_device *usbdev, int cmd1, int cmd2) ret = usb_control_msg(usbdev, usb_sndctrlpipe(usbdev, 0), 0x67, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_OUT, - cmd1, cmd2, NULL, 0, LINE6_TIMEOUT * HZ); + cmd1, cmd2, NULL, 0, LINE6_TIMEOUT); if (ret < 0) { dev_err(&usbdev->dev, "send failed (error %d)\n", ret); From 3eaa08d8f259183d5f5e26568cd2de4f87abbe26 Mon Sep 17 00:00:00 2001 From: Alexander Tsoy Date: Sat, 30 Oct 2021 20:43:08 +0300 Subject: [PATCH 1362/5344] ALSA: usb-audio: Add registration quirk for JBL Quantum 400 commit 763d92ed5dece7d439fc28a88b2d2728d525ffd9 upstream. Add another device ID for JBL Quantum 400. It requires the same quirk as other JBL Quantum devices. Signed-off-by: Alexander Tsoy Cc: Link: https://lore.kernel.org/r/20211030174308.1011825-1-alexander@tsoy.me Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/quirks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index d5d828817c5e8..72223545abfd8 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1842,6 +1842,7 @@ static const struct registration_quirk registration_quirks[] = { REG_QUIRK_ENTRY(0x0951, 0x16ea, 2), /* Kingston HyperX Cloud Flight S */ REG_QUIRK_ENTRY(0x0ecb, 0x1f46, 2), /* JBL Quantum 600 */ REG_QUIRK_ENTRY(0x0ecb, 0x1f47, 2), /* JBL Quantum 800 */ + REG_QUIRK_ENTRY(0x0ecb, 0x1f4c, 2), /* JBL Quantum 400 */ REG_QUIRK_ENTRY(0x0ecb, 0x2039, 2), /* JBL Quantum 400 */ REG_QUIRK_ENTRY(0x0ecb, 0x203c, 2), /* JBL Quantum 600 */ REG_QUIRK_ENTRY(0x0ecb, 0x203e, 2), /* JBL Quantum 800 */ From 20b519995a9ff5a9b4624a7e460a73c8f074d9a2 Mon Sep 17 00:00:00 2001 From: Austin Kim Date: Tue, 9 Nov 2021 00:37:42 +0000 Subject: [PATCH 1363/5344] ALSA: synth: missing check for possible NULL after the call to kstrdup commit d159037abbe3412285c271bdfb9cdf19e62678ff upstream. If kcalloc() return NULL due to memory starvation, it is possible for kstrdup() to return NULL in similar case. So add null check after the call to kstrdup() is made. [ minor coding-style fix by tiwai ] Signed-off-by: Austin Kim Cc: Link: https://lore.kernel.org/r/20211109003742.GA5423@raspberrypi Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/synth/emux/emux.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/synth/emux/emux.c b/sound/synth/emux/emux.c index f65e6c7b139f5..6695530bba9b3 100644 --- a/sound/synth/emux/emux.c +++ b/sound/synth/emux/emux.c @@ -88,7 +88,7 @@ int snd_emux_register(struct snd_emux *emu, struct snd_card *card, int index, ch emu->name = kstrdup(name, GFP_KERNEL); emu->voices = kcalloc(emu->max_voices, sizeof(struct snd_emux_voice), GFP_KERNEL); - if (emu->voices == NULL) + if (emu->name == NULL || emu->voices == NULL) return -ENOMEM; /* create soundfont list */ From 11ba6ba99ce328bebb42c22fd7e7b9aa9f1ab9e4 Mon Sep 17 00:00:00 2001 From: Wang Wensheng Date: Wed, 3 Nov 2021 03:35:17 +0000 Subject: [PATCH 1364/5344] ALSA: timer: Fix use-after-free problem commit c0317c0e87094f5b5782b6fdef5ae0a4b150496c upstream. When the timer instance was add into ack_list but was not currently in process, the user could stop it via snd_timer_stop1() without delete it from the ack_list. Then the user could free the timer instance and when it was actually processed UAF occurred. This issue could be reproduced via testcase snd_timer01 in ltp - running several instances of that testcase at the same time. What I actually met was that the ack_list of the timer broken and the kernel went into deadloop with irqoff. That could be detected by hardlockup detector on board or when we run it on qemu, we could use gdb to dump the ack_list when the console has no response. To fix this issue, we delete the timer instance from ack_list and active_list unconditionally in snd_timer_stop1(). Signed-off-by: Wang Wensheng Suggested-by: Takashi Iwai Cc: Link: https://lore.kernel.org/r/20211103033517.80531-1-wangwensheng4@huawei.com Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/timer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/core/timer.c b/sound/core/timer.c index b5a0ba79bf746..6b7cba0d23e0c 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -595,13 +595,13 @@ static int snd_timer_stop1(struct snd_timer_instance *timeri, bool stop) if (!timer) return -EINVAL; spin_lock_irqsave(&timer->lock, flags); + list_del_init(&timeri->ack_list); + list_del_init(&timeri->active_list); if (!(timeri->flags & (SNDRV_TIMER_IFLG_RUNNING | SNDRV_TIMER_IFLG_START))) { result = -EBUSY; goto unlock; } - list_del_init(&timeri->ack_list); - list_del_init(&timeri->active_list); if (timer->card && timer->card->shutdown) goto unlock; if (stop) { From 827fc80e872cb7342246abeb7b071700e5233018 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 5 Nov 2021 10:15:17 +0100 Subject: [PATCH 1365/5344] ALSA: timer: Unconditionally unlink slave instances, too commit ffdd98277f0a1d15a67a74ae09bee713df4c0dbc upstream. Like the previous fix (commit c0317c0e8709 "ALSA: timer: Fix use-after-free problem"), we have to unlink slave timer instances immediately at snd_timer_stop(), too. Otherwise it may leave a stale entry in the list if the slave instance is freed before actually running. Cc: Link: https://lore.kernel.org/r/20211105091517.21733-1-tiwai@suse.de Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/timer.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/sound/core/timer.c b/sound/core/timer.c index 6b7cba0d23e0c..d684aa4150aad 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -636,23 +636,22 @@ static int snd_timer_stop1(struct snd_timer_instance *timeri, bool stop) static int snd_timer_stop_slave(struct snd_timer_instance *timeri, bool stop) { unsigned long flags; + bool running; spin_lock_irqsave(&slave_active_lock, flags); - if (!(timeri->flags & SNDRV_TIMER_IFLG_RUNNING)) { - spin_unlock_irqrestore(&slave_active_lock, flags); - return -EBUSY; - } + running = timeri->flags & SNDRV_TIMER_IFLG_RUNNING; timeri->flags &= ~SNDRV_TIMER_IFLG_RUNNING; if (timeri->timer) { spin_lock(&timeri->timer->lock); list_del_init(&timeri->ack_list); list_del_init(&timeri->active_list); - snd_timer_notify1(timeri, stop ? SNDRV_TIMER_EVENT_STOP : - SNDRV_TIMER_EVENT_PAUSE); + if (running) + snd_timer_notify1(timeri, stop ? SNDRV_TIMER_EVENT_STOP : + SNDRV_TIMER_EVENT_PAUSE); spin_unlock(&timeri->timer->lock); } spin_unlock_irqrestore(&slave_active_lock, flags); - return 0; + return running ? 0 : -EBUSY; } /* From d3f647aa5abc4fdb09cba5e795667ec773565518 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 2 Nov 2021 11:10:37 +0100 Subject: [PATCH 1366/5344] fuse: fix page stealing commit 712a951025c0667ff00b25afc360f74e639dfabe upstream. It is possible to trigger a crash by splicing anon pipe bufs to the fuse device. The reason for this is that anon_pipe_buf_release() will reuse buf->page if the refcount is 1, but that page might have already been stolen and its flags modified (e.g. PG_lru added). This happens in the unlikely case of fuse_dev_splice_write() getting around to calling pipe_buf_release() after a page has been stolen, added to the page cache and removed from the page cache. Fix by calling pipe_buf_release() right after the page was inserted into the page cache. In this case the page has an elevated refcount so any release function will know that the page isn't reusable. Reported-by: Frank Dinoff Link: https://lore.kernel.org/r/CAAmZXrsGg2xsP1CK+cbuEMumtrqdvD-NKnWzhNcvn71RV3c1yw@mail.gmail.com/ Fixes: dd3bb14f44a6 ("fuse: support splice() writing to fuse device") Cc: # v2.6.35 Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/fuse/dev.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 65683087aba36..1861d85b3abcf 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -839,6 +839,12 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) goto out_put_old; } + /* + * Release while we have extra ref on stolen page. Otherwise + * anon_pipe_buf_release() might think the page can be reused. + */ + pipe_buf_release(cs->pipe, buf); + get_page(newpage); if (!(buf->flags & PIPE_BUF_FLAG_LRU)) @@ -2027,8 +2033,12 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, pipe_lock(pipe); out_free: - for (idx = 0; idx < nbuf; idx++) - pipe_buf_release(pipe, &bufs[idx]); + for (idx = 0; idx < nbuf; idx++) { + struct pipe_buffer *buf = &bufs[idx]; + + if (buf->ops) + pipe_buf_release(pipe, buf); + } pipe_unlock(pipe); kvfree(bufs); From bd1072dad68c5a8f8c671d8cabe092df85216977 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Fri, 15 Oct 2021 12:24:16 -0500 Subject: [PATCH 1367/5344] x86/sme: Use #define USE_EARLY_PGTABLE_L5 in mem_encrypt_identity.c commit e7d445ab26db833d6640d4c9a08bee176777cc82 upstream. When runtime support for converting between 4-level and 5-level pagetables was added to the kernel, the SME code that built pagetables was updated to use the pagetable functions, e.g. p4d_offset(), etc., in order to simplify the code. However, the use of the pagetable functions in early boot code requires the use of the USE_EARLY_PGTABLE_L5 #define in order to ensure that the proper definition of pgtable_l5_enabled() is used. Without the #define, pgtable_l5_enabled() is #defined as cpu_feature_enabled(X86_FEATURE_LA57). In early boot, the CPU features have not yet been discovered and populated, so pgtable_l5_enabled() will return false even when 5-level paging is enabled. This causes the SME code to always build 4-level pagetables to perform the in-place encryption. If 5-level paging is enabled, switching to the SME pagetables results in a page-fault that kills the boot. Adding the #define results in pgtable_l5_enabled() using the __pgtable_l5_enabled variable set in early boot and the SME code building pagetables for the proper paging level. Fixes: aad983913d77 ("x86/mm/encrypt: Simplify sme_populate_pgd() and sme_populate_pgd_large()") Signed-off-by: Tom Lendacky Signed-off-by: Borislav Petkov Acked-by: Kirill A. Shutemov Cc: # 4.18.x Link: https://lkml.kernel.org/r/2cb8329655f5c753905812d951e212022a480475.1634318656.git.thomas.lendacky@amd.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/mem_encrypt_identity.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index 84cda5dc03870..ebc8e3af1c675 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -27,6 +27,15 @@ #undef CONFIG_PARAVIRT_XXL #undef CONFIG_PARAVIRT_SPINLOCKS +/* + * This code runs before CPU feature bits are set. By default, the + * pgtable_l5_enabled() function uses bit X86_FEATURE_LA57 to determine if + * 5-level paging is active, so that won't work here. USE_EARLY_PGTABLE_L5 + * is provided to handle this situation and, instead, use a variable that + * has been set by the early boot code. + */ +#define USE_EARLY_PGTABLE_L5 + #include #include #include From b73acc1eead5f0b8bf247024a96ad88809be770b Mon Sep 17 00:00:00 2001 From: Jane Malalane Date: Thu, 21 Oct 2021 11:47:44 +0100 Subject: [PATCH 1368/5344] x86/cpu: Fix migration safety with X86_BUG_NULL_SEL commit 415de44076640483648d6c0f6d645a9ee61328ad upstream. Currently, Linux probes for X86_BUG_NULL_SEL unconditionally which makes it unsafe to migrate in a virtualised environment as the properties across the migration pool might differ. To be specific, the case which goes wrong is: 1. Zen1 (or earlier) and Zen2 (or later) in a migration pool 2. Linux boots on Zen2, probes and finds the absence of X86_BUG_NULL_SEL 3. Linux is then migrated to Zen1 Linux is now running on a X86_BUG_NULL_SEL-impacted CPU while believing that the bug is fixed. The only way to address the problem is to fully trust the "no longer affected" CPUID bit when virtualised, because in the above case it would be clear deliberately to indicate the fact "you might migrate to somewhere which has this behaviour". Zen3 adds the NullSelectorClearsBase CPUID bit to indicate that loading a NULL segment selector zeroes the base and limit fields, as well as just attributes. Zen2 also has this behaviour but doesn't have the NSCB bit. [ bp: Minor touchups. ] Signed-off-by: Jane Malalane Signed-off-by: Borislav Petkov CC: Link: https://lkml.kernel.org/r/20211021104744.24126-1-jane.malalane@citrix.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/amd.c | 2 ++ arch/x86/kernel/cpu/common.c | 44 ++++++++++++++++++++++++++++++------ arch/x86/kernel/cpu/cpu.h | 1 + arch/x86/kernel/cpu/hygon.c | 2 ++ 4 files changed, 42 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 8d84494b57c4e..12c59eed769cc 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1012,6 +1012,8 @@ static void init_amd(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_IRPERF) && !cpu_has_amd_erratum(c, amd_erratum_1054)) msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT); + + check_null_seg_clears_base(c); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 1c055b22b432d..29c0093293d96 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1342,9 +1342,8 @@ void __init early_cpu_init(void) early_identify_cpu(&boot_cpu_data); } -static void detect_null_seg_behavior(struct cpuinfo_x86 *c) +static bool detect_null_seg_behavior(void) { -#ifdef CONFIG_X86_64 /* * Empirically, writing zero to a segment selector on AMD does * not clear the base, whereas writing zero to a segment @@ -1365,10 +1364,43 @@ static void detect_null_seg_behavior(struct cpuinfo_x86 *c) wrmsrl(MSR_FS_BASE, 1); loadsegment(fs, 0); rdmsrl(MSR_FS_BASE, tmp); - if (tmp != 0) - set_cpu_bug(c, X86_BUG_NULL_SEG); wrmsrl(MSR_FS_BASE, old_base); -#endif + return tmp == 0; +} + +void check_null_seg_clears_base(struct cpuinfo_x86 *c) +{ + /* BUG_NULL_SEG is only relevant with 64bit userspace */ + if (!IS_ENABLED(CONFIG_X86_64)) + return; + + /* Zen3 CPUs advertise Null Selector Clears Base in CPUID. */ + if (c->extended_cpuid_level >= 0x80000021 && + cpuid_eax(0x80000021) & BIT(6)) + return; + + /* + * CPUID bit above wasn't set. If this kernel is still running + * as a HV guest, then the HV has decided not to advertize + * that CPUID bit for whatever reason. For example, one + * member of the migration pool might be vulnerable. Which + * means, the bug is present: set the BUG flag and return. + */ + if (cpu_has(c, X86_FEATURE_HYPERVISOR)) { + set_cpu_bug(c, X86_BUG_NULL_SEG); + return; + } + + /* + * Zen2 CPUs also have this behaviour, but no CPUID bit. + * 0x18 is the respective family for Hygon. + */ + if ((c->x86 == 0x17 || c->x86 == 0x18) && + detect_null_seg_behavior()) + return; + + /* All the remaining ones are affected */ + set_cpu_bug(c, X86_BUG_NULL_SEG); } static void generic_identify(struct cpuinfo_x86 *c) @@ -1404,8 +1436,6 @@ static void generic_identify(struct cpuinfo_x86 *c) get_model_name(c); /* Default name */ - detect_null_seg_behavior(c); - /* * ESPFIX is a strange bug. All real CPUs have it. Paravirt * systems that run Linux at CPL > 0 may or may not have the diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 9d033693519aa..4d04c127c4a79 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -73,6 +73,7 @@ extern int detect_extended_topology_early(struct cpuinfo_x86 *c); extern int detect_extended_topology(struct cpuinfo_x86 *c); extern int detect_ht_early(struct cpuinfo_x86 *c); extern void detect_ht(struct cpuinfo_x86 *c); +extern void check_null_seg_clears_base(struct cpuinfo_x86 *c); unsigned int aperfmperf_get_khz(int cpu); diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index 62e9a982adaf9..b232bd0be78d6 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -350,6 +350,8 @@ static void init_hygon(struct cpuinfo_x86 *c) /* Hygon CPUs don't reset SS attributes on SYSRET, Xen does. */ if (!cpu_has(c, X86_FEATURE_XENPV)) set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); + + check_null_seg_clears_base(c); } static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c) From fed4b860d9dc260dbccd09f5941e8ab74f282bab Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 Oct 2021 17:11:04 -0700 Subject: [PATCH 1369/5344] x86/irq: Ensure PI wakeup handler is unregistered before module unload commit 6ff53f6a438f72998f56e82e76694a1df9d1ea2c upstream. Add a synchronize_rcu() after clearing the posted interrupt wakeup handler to ensure all readers, i.e. in-flight IRQ handlers, see the new handler before returning to the caller. If the caller is an exiting module and is unregistering its handler, failure to wait could result in the IRQ handler jumping into an unloaded module. The registration path doesn't require synchronization, as it's the caller's responsibility to not generate interrupts it cares about until after its handler is registered. Fixes: f6b3c72c2366 ("x86/irq: Define a global vector for VT-d Posted-Interrupts") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20211009001107.3936588-2-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/irq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 21efee32e2b12..7dfd0185767cc 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -295,8 +295,10 @@ void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)) { if (handler) kvm_posted_intr_wakeup_handler = handler; - else + else { kvm_posted_intr_wakeup_handler = dummy_handler; + synchronize_rcu(); + } } EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler); From 1a44586b790dd76665e29899d31075c5abed9f9b Mon Sep 17 00:00:00 2001 From: Zheyu Ma Date: Mon, 18 Oct 2021 02:16:22 +0000 Subject: [PATCH 1370/5344] cavium: Return negative value when pci_alloc_irq_vectors() fails [ Upstream commit b2cddb44bddc1a9c5949a978bb454bba863264db ] During the process of driver probing, the probe function should return < 0 for failure, otherwise, the kernel will treat value > 0 as success. Signed-off-by: Zheyu Ma Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/cavium/thunder/nic_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cavium/thunder/nic_main.c b/drivers/net/ethernet/cavium/thunder/nic_main.c index 9361f964bb9b2..816453a4f8d6c 100644 --- a/drivers/net/ethernet/cavium/thunder/nic_main.c +++ b/drivers/net/ethernet/cavium/thunder/nic_main.c @@ -1193,7 +1193,7 @@ static int nic_register_interrupts(struct nicpf *nic) dev_err(&nic->pdev->dev, "Request for #%d msix vectors failed, returned %d\n", nic->num_vec, ret); - return 1; + return ret; } /* Register mailbox interrupt handler */ From 2e814dfd7d9f8f6dd1dcf37a7e0388c3b7640fe5 Mon Sep 17 00:00:00 2001 From: Zheyu Ma Date: Mon, 18 Oct 2021 01:56:21 +0000 Subject: [PATCH 1371/5344] scsi: qla2xxx: Return -ENOMEM if kzalloc() fails [ Upstream commit 06634d5b6e923ed0d4772aba8def5a618f44c7fe ] The driver probing function should return < 0 for failure, otherwise kernel will treat value > 0 as success. Link: https://lore.kernel.org/r/1634522181-31166-1-git-send-email-zheyuma97@gmail.com Reviewed-by: Himanshu Madhani Signed-off-by: Zheyu Ma Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/qla2xxx/qla_os.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 8b68879058132..049a68c59c137 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -4066,7 +4066,7 @@ qla2x00_mem_alloc(struct qla_hw_data *ha, uint16_t req_len, uint16_t rsp_len, ql_dbg_pci(ql_dbg_init, ha->pdev, 0xe0ee, "%s: failed alloc dsd\n", __func__); - return 1; + return -ENOMEM; } ha->dif_bundle_kallocs++; From 43d2a997d56b2d575d14119c4b0fabd506854f11 Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Mon, 18 Oct 2021 15:26:50 +0300 Subject: [PATCH 1372/5344] scsi: qla2xxx: Fix unmap of already freed sgl [ Upstream commit 4a8f71014b4d56c4fb287607e844c0a9f68f46d9 ] The sgl is freed in the target stack in target_release_cmd_kref() before calling qlt_free_cmd() but there is an unmap of sgl in qlt_free_cmd() that causes a panic if sgl is not yet DMA unmapped: NIP dma_direct_unmap_sg+0xdc/0x180 LR dma_direct_unmap_sg+0xc8/0x180 Call Trace: ql_dbg_prefix+0x68/0xc0 [qla2xxx] (unreliable) dma_unmap_sg_attrs+0x54/0xf0 qlt_unmap_sg.part.19+0x54/0x1c0 [qla2xxx] qlt_free_cmd+0x124/0x1d0 [qla2xxx] tcm_qla2xxx_release_cmd+0x4c/0xa0 [tcm_qla2xxx] target_put_sess_cmd+0x198/0x370 [target_core_mod] transport_generic_free_cmd+0x6c/0x1b0 [target_core_mod] tcm_qla2xxx_complete_free+0x6c/0x90 [tcm_qla2xxx] The sgl may be left unmapped in error cases of response sending. For instance, qlt_rdy_to_xfer() maps sgl and exits when session is being deleted keeping the sgl mapped. This patch removes use-after-free of the sgl and ensures that the sgl is unmapped for any command that was not sent to firmware. Link: https://lore.kernel.org/r/20211018122650.11846-1-d.bogdanov@yadro.com Reviewed-by: Himanshu Madhani Signed-off-by: Dmitry Bogdanov Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/qla2xxx/qla_target.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c index 57068e2faef54..0111c543f0e64 100644 --- a/drivers/scsi/qla2xxx/qla_target.c +++ b/drivers/scsi/qla2xxx/qla_target.c @@ -3251,8 +3251,7 @@ int qlt_xmit_response(struct qla_tgt_cmd *cmd, int xmit_type, "RESET-RSP online/active/old-count/new-count = %d/%d/%d/%d.\n", vha->flags.online, qla2x00_reset_active(vha), cmd->reset_count, qpair->chip_reset); - spin_unlock_irqrestore(qpair->qp_lock_ptr, flags); - return 0; + goto out_unmap_unlock; } /* Does F/W have an IOCBs for this request */ @@ -3375,10 +3374,6 @@ int qlt_rdy_to_xfer(struct qla_tgt_cmd *cmd) prm.sg = NULL; prm.req_cnt = 1; - /* Calculate number of entries and segments required */ - if (qlt_pci_map_calc_cnt(&prm) != 0) - return -EAGAIN; - if (!qpair->fw_started || (cmd->reset_count != qpair->chip_reset) || (cmd->sess && cmd->sess->deleted)) { /* @@ -3396,6 +3391,10 @@ int qlt_rdy_to_xfer(struct qla_tgt_cmd *cmd) return 0; } + /* Calculate number of entries and segments required */ + if (qlt_pci_map_calc_cnt(&prm) != 0) + return -EAGAIN; + spin_lock_irqsave(qpair->qp_lock_ptr, flags); /* Does F/W have an IOCBs for this request */ res = qlt_check_reserve_free_req(qpair, prm.req_cnt); @@ -3800,9 +3799,6 @@ void qlt_free_cmd(struct qla_tgt_cmd *cmd) BUG_ON(cmd->cmd_in_wq); - if (cmd->sg_mapped) - qlt_unmap_sg(cmd->vha, cmd); - if (!cmd->q_full) qlt_decr_num_pend_cmds(cmd->vha); From fac30d753275621c3b03a8dc084df879b71eee75 Mon Sep 17 00:00:00 2001 From: Zheyu Ma Date: Mon, 18 Oct 2021 14:32:57 +0000 Subject: [PATCH 1373/5344] cavium: Fix return values of the probe function [ Upstream commit c69b2f46876825c726bd8a97c7fa852d8932bc32 ] During the process of driver probing, the probe function should return < 0 for failure, otherwise, the kernel will treat value > 0 as success. Signed-off-by: Zheyu Ma Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/cavium/thunder/nicvf_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index 5c45c0c6dd234..27ea528ef4484 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -1226,7 +1226,7 @@ static int nicvf_register_misc_interrupt(struct nicvf *nic) if (ret < 0) { netdev_err(nic->netdev, "Req for #%d msix vectors failed\n", nic->num_vec); - return 1; + return ret; } sprintf(nic->irq_name[irq], "%s Mbox", "NICVF"); @@ -1245,7 +1245,7 @@ static int nicvf_register_misc_interrupt(struct nicvf *nic) if (!nicvf_check_pf_ready(nic)) { nicvf_disable_intr(nic, NICVF_INTR_MBOX, 0); nicvf_unregister_interrupts(nic); - return 1; + return -EIO; } return 0; From 9f45c766450dcbc381a65340ec9970167d9661e6 Mon Sep 17 00:00:00 2001 From: Erik Ekman Date: Wed, 20 Oct 2021 00:40:16 +0200 Subject: [PATCH 1374/5344] sfc: Don't use netif_info before net_device setup [ Upstream commit bf6abf345dfa77786aca554bc58c64bd428ecb1d ] Use pci_info instead to avoid unnamed/uninitialized noise: [197088.688729] sfc 0000:01:00.0: Solarflare NIC detected [197088.690333] sfc 0000:01:00.0: Part Number : SFN5122F [197088.729061] sfc 0000:01:00.0 (unnamed net_device) (uninitialized): no SR-IOV VFs probed [197088.729071] sfc 0000:01:00.0 (unnamed net_device) (uninitialized): no PTP support Inspired by fa44821a4ddd ("sfc: don't use netif_info et al before net_device is registered") from Heiner Kallweit. Signed-off-by: Erik Ekman Acked-by: Martin Habets Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/sfc/ptp.c | 4 ++-- drivers/net/ethernet/sfc/siena_sriov.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/sfc/ptp.c b/drivers/net/ethernet/sfc/ptp.c index 59b4f16896a81..1fa1b71dbfa11 100644 --- a/drivers/net/ethernet/sfc/ptp.c +++ b/drivers/net/ethernet/sfc/ptp.c @@ -648,7 +648,7 @@ static int efx_ptp_get_attributes(struct efx_nic *efx) } else if (rc == -EINVAL) { fmt = MC_CMD_PTP_OUT_GET_ATTRIBUTES_SECONDS_NANOSECONDS; } else if (rc == -EPERM) { - netif_info(efx, probe, efx->net_dev, "no PTP support\n"); + pci_info(efx->pci_dev, "no PTP support\n"); return rc; } else { efx_mcdi_display_error(efx, MC_CMD_PTP, sizeof(inbuf), @@ -824,7 +824,7 @@ static int efx_ptp_disable(struct efx_nic *efx) * should only have been called during probe. */ if (rc == -ENOSYS || rc == -EPERM) - netif_info(efx, probe, efx->net_dev, "no PTP support\n"); + pci_info(efx->pci_dev, "no PTP support\n"); else if (rc) efx_mcdi_display_error(efx, MC_CMD_PTP, MC_CMD_PTP_IN_DISABLE_LEN, diff --git a/drivers/net/ethernet/sfc/siena_sriov.c b/drivers/net/ethernet/sfc/siena_sriov.c index dfbdf05dcf794..68f092881d137 100644 --- a/drivers/net/ethernet/sfc/siena_sriov.c +++ b/drivers/net/ethernet/sfc/siena_sriov.c @@ -1056,7 +1056,7 @@ void efx_siena_sriov_probe(struct efx_nic *efx) return; if (efx_siena_sriov_cmd(efx, false, &efx->vi_scale, &count)) { - netif_info(efx, probe, efx->net_dev, "no SR-IOV VFs probed\n"); + pci_info(efx->pci_dev, "no SR-IOV VFs probed\n"); return; } if (count > 0 && count > max_vfs) From 9c9e047c4d96ab2a7f390da915c2aa88e40a51a2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 18 Oct 2021 15:19:08 +0200 Subject: [PATCH 1375/5344] hyperv/vmbus: include linux/bitops.h [ Upstream commit 8017c99680fa65e1e8d999df1583de476a187830 ] On arm64 randconfig builds, hyperv sometimes fails with this error: In file included from drivers/hv/hv_trace.c:3: In file included from drivers/hv/hyperv_vmbus.h:16: In file included from arch/arm64/include/asm/sync_bitops.h:5: arch/arm64/include/asm/bitops.h:11:2: error: only can be included directly In file included from include/asm-generic/bitops/hweight.h:5: include/asm-generic/bitops/arch_hweight.h:9:9: error: implicit declaration of function '__sw_hweight32' [-Werror,-Wimplicit-function-declaration] include/asm-generic/bitops/atomic.h:17:7: error: implicit declaration of function 'BIT_WORD' [-Werror,-Wimplicit-function-declaration] Include the correct header first. Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20211018131929.2260087-1-arnd@kernel.org Signed-off-by: Wei Liu Signed-off-by: Sasha Levin --- drivers/hv/hyperv_vmbus.h | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index cabcb66e7c5ef..356382a340b2c 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -13,6 +13,7 @@ #define _HYPERV_VMBUS_H #include +#include #include #include #include From 0efd038b073756b872680909cd35907cfd8bc263 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bastien=20Roucari=C3=A8s?= Date: Thu, 16 Sep 2021 08:17:21 +0000 Subject: [PATCH 1376/5344] ARM: dts: sun7i: A20-olinuxino-lime2: Fix ethernet phy-mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 55dd7e059098ce4bd0a55c251cb78e74604abb57 ] Commit bbc4d71d6354 ("net: phy: realtek: fix rtl8211e rx/tx delay config") sets the RX/TX delay according to the phy-mode property in the device tree. For the A20-olinuxino-lime2 board this is "rgmii", which is the wrong setting. Following the example of a900cac3750b ("ARM: dts: sun7i: a20: bananapro: Fix ethernet phy-mode") the phy-mode is changed to "rgmii-id" which gets the Ethernet working again on this board. Signed-off-by: Bastien Roucariès Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20210916081721.237137-1-rouca@debian.org Signed-off-by: Sasha Levin --- arch/arm/boot/dts/sun7i-a20-olinuxino-lime2.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/sun7i-a20-olinuxino-lime2.dts b/arch/arm/boot/dts/sun7i-a20-olinuxino-lime2.dts index 9ba62774e89a1..488933b87ad5a 100644 --- a/arch/arm/boot/dts/sun7i-a20-olinuxino-lime2.dts +++ b/arch/arm/boot/dts/sun7i-a20-olinuxino-lime2.dts @@ -112,7 +112,7 @@ pinctrl-names = "default"; pinctrl-0 = <&gmac_rgmii_pins>; phy-handle = <&phy1>; - phy-mode = "rgmii"; + phy-mode = "rgmii-id"; status = "okay"; }; From 642664431badfbb6302854fcb80149951aef470b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Anikiel?= Date: Mon, 20 Sep 2021 14:41:41 +0200 Subject: [PATCH 1377/5344] reset: socfpga: add empty driver allowing consumers to probe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 3ad60b4b3570937f3278509fe6797a5093ce53f8 ] The early reset driver doesn't ever probe, which causes consuming devices to be unable to probe. Add an empty driver to set this device as available, allowing consumers to probe. Signed-off-by: Paweł Anikiel Link: https://lore.kernel.org/r/20210920124141.1166544-4-pan@semihalf.com Signed-off-by: Philipp Zabel Signed-off-by: Sasha Levin --- drivers/reset/reset-socfpga.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/drivers/reset/reset-socfpga.c b/drivers/reset/reset-socfpga.c index 96953992c2bb5..1c5236a69dc49 100644 --- a/drivers/reset/reset-socfpga.c +++ b/drivers/reset/reset-socfpga.c @@ -86,3 +86,29 @@ void __init socfpga_reset_init(void) for_each_matching_node(np, socfpga_early_reset_dt_ids) a10_reset_init(np); } + +/* + * The early driver is problematic, because it doesn't register + * itself as a driver. This causes certain device links to prevent + * consumer devices from probing. The hacky solution is to register + * an empty driver, whose only job is to attach itself to the reset + * manager and call probe. + */ +static const struct of_device_id socfpga_reset_dt_ids[] = { + { .compatible = "altr,rst-mgr", }, + { /* sentinel */ }, +}; + +static int reset_simple_probe(struct platform_device *pdev) +{ + return 0; +} + +static struct platform_driver reset_socfpga_driver = { + .probe = reset_simple_probe, + .driver = { + .name = "socfpga-reset", + .of_match_table = socfpga_reset_dt_ids, + }, +}; +builtin_platform_driver(reset_socfpga_driver); From 7f295314bb81f1a905fd67778aa8c69bcc6468af Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 17 Oct 2021 10:59:49 -0700 Subject: [PATCH 1378/5344] mmc: winbond: don't build on M68K [ Upstream commit 162079f2dccd02cb4b6654defd32ca387dd6d4d4 ] The Winbond MMC driver fails to build on ARCH=m68k so prevent that build config. Silences these build errors: ../drivers/mmc/host/wbsd.c: In function 'wbsd_request_end': ../drivers/mmc/host/wbsd.c:212:28: error: implicit declaration of function 'claim_dma_lock' [-Werror=implicit-function-declaration] 212 | dmaflags = claim_dma_lock(); ../drivers/mmc/host/wbsd.c:215:17: error: implicit declaration of function 'release_dma_lock'; did you mean 'release_task'? [-Werror=implicit-function-declaration] 215 | release_dma_lock(dmaflags); Signed-off-by: Randy Dunlap Cc: Pierre Ossman Cc: Geert Uytterhoeven Cc: Arnd Bergmann Link: https://lore.kernel.org/r/20211017175949.23838-1-rdunlap@infradead.org Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin --- drivers/mmc/host/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index 49ea02c467bf1..1b4a40d910cfb 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -449,7 +449,7 @@ config MMC_OMAP_HS config MMC_WBSD tristate "Winbond W83L51xD SD/MMC Card Interface support" - depends on ISA_DMA_API + depends on ISA_DMA_API && !M68K help This selects the Winbond(R) W83L51xD Secure digital and Multimedia card Interface. From 310cd6fe1d6e4f06dd54ce8ea6aec7e373fa0c30 Mon Sep 17 00:00:00 2001 From: Bryant Mairs Date: Tue, 19 Oct 2021 09:24:33 -0500 Subject: [PATCH 1379/5344] drm: panel-orientation-quirks: Add quirk for Aya Neo 2021 [ Upstream commit def0c3697287f6e85d5ac68b21302966c95474f9 ] Fixes screen orientation for the Aya Neo 2021 handheld gaming console. Signed-off-by: Bryant Mairs Signed-off-by: Sam Ravnborg Link: https://patchwork.freedesktop.org/patch/msgid/20211019142433.4295-1-bryant@mai.rs Signed-off-by: Sasha Levin --- drivers/gpu/drm/drm_panel_orientation_quirks.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/drm_panel_orientation_quirks.c b/drivers/gpu/drm/drm_panel_orientation_quirks.c index f6bdec7fa9253..30c17a76f49ae 100644 --- a/drivers/gpu/drm/drm_panel_orientation_quirks.c +++ b/drivers/gpu/drm/drm_panel_orientation_quirks.c @@ -134,6 +134,12 @@ static const struct dmi_system_id orientation_data[] = { DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "T103HAF"), }, .driver_data = (void *)&lcd800x1280_rightside_up, + }, { /* AYA NEO 2021 */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "AYADEVICE"), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "AYA NEO 2021"), + }, + .driver_data = (void *)&lcd800x1280_rightside_up, }, { /* GPD MicroPC (generic strings, also match on bios date) */ .matches = { DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Default string"), From 37328c7a4456f70879498faf396e6a0be0164ab8 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Thu, 14 Oct 2021 15:25:52 +0100 Subject: [PATCH 1380/5344] bpf: Define bpf_jit_alloc_exec_limit for arm64 JIT [ Upstream commit 5d63ae908242f028bd10860cba98450d11c079b8 ] Expose the maximum amount of useable memory from the arm64 JIT. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211014142554.53120-3-lmb@cloudflare.com Signed-off-by: Sasha Levin --- arch/arm64/net/bpf_jit_comp.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index afc7d41347f73..a343677837c77 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -996,6 +996,11 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) return prog; } +u64 bpf_jit_alloc_exec_limit(void) +{ + return BPF_JIT_REGION_SIZE; +} + void *bpf_jit_alloc_exec(unsigned long size) { return __vmalloc_node_range(size, PAGE_SIZE, BPF_JIT_REGION_START, From aeca111d30bfe77d432eea699f2aee7371bcef70 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Thu, 14 Oct 2021 15:25:53 +0100 Subject: [PATCH 1381/5344] bpf: Prevent increasing bpf_jit_limit above max [ Upstream commit fadb7ff1a6c2c565af56b4aacdd086b067eed440 ] Restrict bpf_jit_limit to the maximum supported by the arch's JIT. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211014142554.53120-4-lmb@cloudflare.com Signed-off-by: Sasha Levin --- include/linux/filter.h | 1 + kernel/bpf/core.c | 4 +++- net/core/sysctl_net_core.c | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index c5473bf40f619..f7109bff712c0 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -961,6 +961,7 @@ extern int bpf_jit_enable; extern int bpf_jit_harden; extern int bpf_jit_kallsyms; extern long bpf_jit_limit; +extern long bpf_jit_limit_max; typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d9a3d995bd966..1238ef9c569df 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -523,6 +523,7 @@ int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); int bpf_jit_harden __read_mostly; int bpf_jit_kallsyms __read_mostly; long bpf_jit_limit __read_mostly; +long bpf_jit_limit_max __read_mostly; static __always_inline void bpf_get_prog_addr_region(const struct bpf_prog *prog, @@ -759,7 +760,8 @@ u64 __weak bpf_jit_alloc_exec_limit(void) static int __init bpf_jit_charge_init(void) { /* Only used as heuristic here to derive limit. */ - bpf_jit_limit = min_t(u64, round_up(bpf_jit_alloc_exec_limit() >> 2, + bpf_jit_limit_max = bpf_jit_alloc_exec_limit(); + bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 2, PAGE_SIZE), LONG_MAX); return 0; } diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 669cbe1609d9e..48041f50ecfb4 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -424,7 +424,7 @@ static struct ctl_table net_core_table[] = { .mode = 0600, .proc_handler = proc_dolongvec_minmax_bpf_restricted, .extra1 = &long_one, - .extra2 = &long_max, + .extra2 = &bpf_jit_limit_max, }, #endif { From 16fe57cd71ce895fd1d7e5b60e6d0edd76718363 Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Fri, 22 Oct 2021 16:31:39 -0700 Subject: [PATCH 1382/5344] xen/netfront: stop tx queues during live migration [ Upstream commit 042b2046d0f05cf8124c26ff65dbb6148a4404fb ] The tx queues are not stopped during the live migration. As a result, the ndo_start_xmit() may access netfront_info->queues which is freed by talk_to_netback()->xennet_destroy_queues(). This patch is to netif_device_detach() at the beginning of xen-netfront resuming, and netif_device_attach() at the end of resuming. CPU A CPU B talk_to_netback() -> if (info->queues) xennet_destroy_queues(info); to free netfront_info->queues xennet_start_xmit() to access netfront_info->queues -> err = xennet_create_queues(info, &num_queues); The idea is borrowed from virtio-net. Cc: Joe Jin Signed-off-by: Dongli Zhang Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/xen-netfront.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 88280057e0321..7d389c2cc9026 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -1439,6 +1439,10 @@ static int netfront_resume(struct xenbus_device *dev) dev_dbg(&dev->dev, "%s\n", dev->nodename); + netif_tx_lock_bh(info->netdev); + netif_device_detach(info->netdev); + netif_tx_unlock_bh(info->netdev); + xennet_disconnect_backend(info); return 0; } @@ -1987,6 +1991,10 @@ static int xennet_connect(struct net_device *dev) * domain a kick because we've probably just requeued some * packets. */ + netif_tx_lock_bh(np->netdev); + netif_device_attach(np->netdev); + netif_tx_unlock_bh(np->netdev); + netif_carrier_on(np->netdev); for (j = 0; j < num_queues; ++j) { queue = &np->queues[j]; From c50890ef04a5cdfb4460fa6b1e2df64323fbf0b5 Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Fri, 15 Oct 2021 10:26:34 +0200 Subject: [PATCH 1383/5344] nvmet-tcp: fix a memory leak when releasing a queue [ Upstream commit 926245c7d22271307606c88b1fbb2539a8550e94 ] page_frag_free() won't completely release the memory allocated for the commands, the cache page must be explicitly freed by calling __page_frag_cache_drain(). This bug can be easily reproduced by repeatedly executing the following command on the initiator: $echo 1 > /sys/devices/virtual/nvme-fabrics/ctl/nvme0/reset_controller Signed-off-by: Maurizio Lombardi Reviewed-by: Sagi Grimberg Reviewed-by: John Meneghini Signed-off-by: Christoph Hellwig Signed-off-by: Sasha Levin --- drivers/nvme/target/tcp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 522cb908e1c30..2874df083a0ef 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1341,6 +1341,7 @@ static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue) static void nvmet_tcp_release_queue_work(struct work_struct *w) { + struct page *page; struct nvmet_tcp_queue *queue = container_of(w, struct nvmet_tcp_queue, release_work); @@ -1360,6 +1361,8 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w) nvmet_tcp_free_crypto(queue); ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx); + page = virt_to_head_page(queue->pf_cache.va); + __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias); kfree(queue); } From 01e17be95853058544f86cdee73aa088128d28ef Mon Sep 17 00:00:00 2001 From: Thomas Perrot Date: Fri, 22 Oct 2021 16:21:04 +0200 Subject: [PATCH 1384/5344] spi: spl022: fix Microwire full duplex mode [ Upstream commit d81d0e41ed5fe7229a2c9a29d13bad288c7cf2d2 ] There are missing braces in the function that verify controller parameters, then an error is always returned when the parameter to select Microwire frames operation is used on devices allowing it. Signed-off-by: Thomas Perrot Link: https://lore.kernel.org/r/20211022142104.1386379-1-thomas.perrot@bootlin.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- drivers/spi/spi-pl022.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-pl022.c b/drivers/spi/spi-pl022.c index 7fedea67159c5..8523bb4f6a62e 100644 --- a/drivers/spi/spi-pl022.c +++ b/drivers/spi/spi-pl022.c @@ -1720,12 +1720,13 @@ static int verify_controller_parameters(struct pl022 *pl022, return -EINVAL; } } else { - if (chip_info->duplex != SSP_MICROWIRE_CHANNEL_FULL_DUPLEX) + if (chip_info->duplex != SSP_MICROWIRE_CHANNEL_FULL_DUPLEX) { dev_err(&pl022->adev->dev, "Microwire half duplex mode requested," " but this is only available in the" " ST version of PL022\n"); - return -EINVAL; + return -EINVAL; + } } } return 0; From ffbc0da9d421145b83989dd54b1b08125e6ebf4c Mon Sep 17 00:00:00 2001 From: Cyril Strejc Date: Sun, 24 Oct 2021 22:14:25 +0200 Subject: [PATCH 1385/5344] net: multicast: calculate csum of looped-back and forwarded packets [ Upstream commit 9122a70a6333705c0c35614ddc51c274ed1d3637 ] During a testing of an user-space application which transmits UDP multicast datagrams and utilizes multicast routing to send the UDP datagrams out of defined network interfaces, I've found a multicast router does not fill-in UDP checksum into locally produced, looped-back and forwarded UDP datagrams, if an original output NIC the datagrams are sent to has UDP TX checksum offload enabled. The datagrams are sent malformed out of the NIC the datagrams have been forwarded to. It is because: 1. If TX checksum offload is enabled on the output NIC, UDP checksum is not calculated by kernel and is not filled into skb data. 2. dev_loopback_xmit(), which is called solely by ip_mc_finish_output(), sets skb->ip_summed = CHECKSUM_UNNECESSARY unconditionally. 3. Since 35fc92a9 ("[NET]: Allow forwarding of ip_summed except CHECKSUM_COMPLETE"), the ip_summed value is preserved during forwarding. 4. If ip_summed != CHECKSUM_PARTIAL, checksum is not calculated during a packet egress. The minimum fix in dev_loopback_xmit(): 1. Preserves skb->ip_summed CHECKSUM_PARTIAL. This is the case when the original output NIC has TX checksum offload enabled. The effects are: a) If the forwarding destination interface supports TX checksum offloading, the NIC driver is responsible to fill-in the checksum. b) If the forwarding destination interface does NOT support TX checksum offloading, checksums are filled-in by kernel before skb is submitted to the NIC driver. c) For local delivery, checksum validation is skipped as in the case of CHECKSUM_UNNECESSARY, thanks to skb_csum_unnecessary(). 2. Translates ip_summed CHECKSUM_NONE to CHECKSUM_UNNECESSARY. It means, for CHECKSUM_NONE, the behavior is unmodified and is there to skip a looped-back packet local delivery checksum validation. Signed-off-by: Cyril Strejc Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/net/udp.h | 5 +++-- net/core/dev.c | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/include/net/udp.h b/include/net/udp.h index fabf507bce5d1..d9d39cc20a847 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -480,8 +480,9 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk, * CHECKSUM_NONE in __udp_gso_segment. UDP GRO indeed builds partial * packets in udp_gro_complete_segment. As does UDP GSO, verified by * udp_send_skb. But when those packets are looped in dev_loopback_xmit - * their ip_summed is set to CHECKSUM_UNNECESSARY. Reset in this - * specific case, where PARTIAL is both correct and required. + * their ip_summed CHECKSUM_NONE is changed to CHECKSUM_UNNECESSARY. + * Reset in this specific case, where PARTIAL is both correct and + * required. */ if (skb->pkt_type == PACKET_LOOPBACK) skb->ip_summed = CHECKSUM_PARTIAL; diff --git a/net/core/dev.c b/net/core/dev.c index 54e7e1490b8d1..a0e7752e27d59 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3487,7 +3487,8 @@ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) skb_reset_mac_header(skb); __skb_pull(skb, skb_network_offset(skb)); skb->pkt_type = PACKET_LOOPBACK; - skb->ip_summed = CHECKSUM_UNNECESSARY; + if (skb->ip_summed == CHECKSUM_NONE) + skb->ip_summed = CHECKSUM_UNNECESSARY; WARN_ON(!skb_dst(skb)); skb_dst_force(skb); netif_rx_ni(skb); From 10229a8d53799815c3162a1cf8eaa0bf58a7bc1a Mon Sep 17 00:00:00 2001 From: Walter Stoll Date: Thu, 14 Oct 2021 12:22:29 +0200 Subject: [PATCH 1386/5344] watchdog: Fix OMAP watchdog early handling [ Upstream commit cd004d8299f1dc6cfa6a4eea8f94cb45eaedf070 ] TI's implementation does not service the watchdog even if the kernel command line parameter omap_wdt.early_enable is set to 1. This patch fixes the issue. Signed-off-by: Walter Stoll Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/88a8fe5229cd68fa0f1fd22f5d66666c1b7057a0.camel@duagon.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck Signed-off-by: Sasha Levin --- drivers/watchdog/omap_wdt.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/watchdog/omap_wdt.c b/drivers/watchdog/omap_wdt.c index 9b91882fe3c41..6d7ccbc0b666c 100644 --- a/drivers/watchdog/omap_wdt.c +++ b/drivers/watchdog/omap_wdt.c @@ -268,8 +268,12 @@ static int omap_wdt_probe(struct platform_device *pdev) wdev->wdog.bootstatus = WDIOF_CARDRESET; } - if (!early_enable) + if (early_enable) { + omap_wdt_start(&wdev->wdog); + set_bit(WDOG_HW_RUNNING, &wdev->wdog.status); + } else { omap_wdt_disable(wdev); + } ret = watchdog_register_device(&wdev->wdog); if (ret) { From 03243dc4a1eb742238adbf4e3e1815bc276e66b0 Mon Sep 17 00:00:00 2001 From: Mario Date: Tue, 26 Oct 2021 13:27:37 +0200 Subject: [PATCH 1387/5344] drm: panel-orientation-quirks: Add quirk for GPD Win3 [ Upstream commit 61b1d445f3bfe4c3ba4335ceeb7e8ba688fd31e2 ] Fixes screen orientation for GPD Win 3 handheld gaming console. Signed-off-by: Mario Risoldi Signed-off-by: Sam Ravnborg Link: https://patchwork.freedesktop.org/patch/msgid/20211026112737.9181-1-awxkrnl@gmail.com Signed-off-by: Sasha Levin --- drivers/gpu/drm/drm_panel_orientation_quirks.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/drm_panel_orientation_quirks.c b/drivers/gpu/drm/drm_panel_orientation_quirks.c index 30c17a76f49ae..e1b2ce4921ae7 100644 --- a/drivers/gpu/drm/drm_panel_orientation_quirks.c +++ b/drivers/gpu/drm/drm_panel_orientation_quirks.c @@ -191,6 +191,12 @@ static const struct dmi_system_id orientation_data[] = { DMI_EXACT_MATCH(DMI_BOARD_NAME, "Default string"), }, .driver_data = (void *)&gpd_win2, + }, { /* GPD Win 3 */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "GPD"), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "G1618-03") + }, + .driver_data = (void *)&lcd720x1280_rightside_up, }, { /* I.T.Works TW891 */ .matches = { DMI_EXACT_MATCH(DMI_SYS_VENDOR, "To be filled by O.E.M."), From 28d50f7218581fdf1c82083ea4760043544349eb Mon Sep 17 00:00:00 2001 From: Amit Engel Date: Wed, 27 Oct 2021 09:49:27 +0300 Subject: [PATCH 1388/5344] nvmet-tcp: fix header digest verification [ Upstream commit 86aeda32b887cdaeb0f4b7bfc9971e36377181c7 ] Pass the correct length to nvmet_tcp_verify_hdgst, which is the pdu header length. This fixes a wrong behaviour where header digest verification passes although the digest is wrong. Signed-off-by: Amit Engel Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Sasha Levin --- drivers/nvme/target/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 2874df083a0ef..3940be28d31cd 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1018,7 +1018,7 @@ static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue) } if (queue->hdr_digest && - nvmet_tcp_verify_hdgst(queue, &queue->pdu, queue->offset)) { + nvmet_tcp_verify_hdgst(queue, &queue->pdu, hdr->hlen)) { nvmet_tcp_fatal_error(queue); /* fatal */ return -EPROTO; } From 837dde043681f185fc62596f4900911bb1b3c4bc Mon Sep 17 00:00:00 2001 From: Janghyub Seo Date: Tue, 26 Oct 2021 07:12:42 +0000 Subject: [PATCH 1389/5344] r8169: Add device 10ec:8162 to driver r8169 [ Upstream commit 72f898ca0ab85fde6facf78b14d9f67a4a7b32d1 ] This patch makes the driver r8169 pick up device Realtek Semiconductor Co. , Ltd. Device [10ec:8162]. Signed-off-by: Janghyub Seo Suggested-by: Rushab Shah Link: https://lore.kernel.org/r/1635231849296.1489250046.441294000@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/realtek/r8169_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 623d86e5e970f..8d2b184ff5758 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -212,6 +212,7 @@ static const struct pci_device_id rtl8169_pci_tbl[] = { { PCI_VDEVICE(REALTEK, 0x8129) }, { PCI_VDEVICE(REALTEK, 0x8136), RTL_CFG_NO_GBIT }, { PCI_VDEVICE(REALTEK, 0x8161) }, + { PCI_VDEVICE(REALTEK, 0x8162) }, { PCI_VDEVICE(REALTEK, 0x8167) }, { PCI_VDEVICE(REALTEK, 0x8168) }, { PCI_VDEVICE(NCUBE, 0x8168) }, From 88fc3ed2bc729b9daabab62008c13a93927f9d5e Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Tue, 26 Oct 2021 14:50:31 -0700 Subject: [PATCH 1390/5344] vmxnet3: do not stop tx queues after netif_device_detach() [ Upstream commit 9159f102402a64ac85e676b75cc1f9c62c5b4b73 ] The netif_device_detach() conditionally stops all tx queues if the queues are running. There is no need to call netif_tx_stop_all_queues() again. Signed-off-by: Dongli Zhang Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/vmxnet3/vmxnet3_drv.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c index a06e6ab453f50..cf090f88dac03 100644 --- a/drivers/net/vmxnet3/vmxnet3_drv.c +++ b/drivers/net/vmxnet3/vmxnet3_drv.c @@ -3634,7 +3634,6 @@ vmxnet3_suspend(struct device *device) vmxnet3_free_intr_resources(adapter); netif_device_detach(netdev); - netif_tx_stop_all_queues(netdev); /* Create wake-up filters. */ pmConf = adapter->pm_conf; From 0610a3ced097ba895cc04b22e525c9c88bd2e397 Mon Sep 17 00:00:00 2001 From: Yu Xiao Date: Thu, 28 Oct 2021 12:00:36 +0200 Subject: [PATCH 1391/5344] nfp: bpf: relax prog rejection for mtu check through max_pkt_offset [ Upstream commit 90a881fc352a953f1c8beb61977a2db0818157d4 ] MTU change is refused whenever the value of new MTU is bigger than the max packet bytes that fits in NFP Cluster Target Memory (CTM). However, an eBPF program doesn't always need to access the whole packet data. The maximum direct packet access (DPA) offset has always been caculated by verifier and stored in the max_pkt_offset field of prog aux data. Signed-off-by: Yu Xiao Reviewed-by: Yinjun Zhang Reviewed-by: Niklas Soderlund Signed-off-by: Simon Horman Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/netronome/nfp/bpf/main.c | 16 +++++++++++----- drivers/net/ethernet/netronome/nfp/bpf/main.h | 2 ++ .../net/ethernet/netronome/nfp/bpf/offload.c | 17 +++++++++++++---- 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c index 8f732771d3fad..bd2966eb6b7d3 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c @@ -182,15 +182,21 @@ static int nfp_bpf_check_mtu(struct nfp_app *app, struct net_device *netdev, int new_mtu) { struct nfp_net *nn = netdev_priv(netdev); - unsigned int max_mtu; + struct nfp_bpf_vnic *bv; + struct bpf_prog *prog; if (~nn->dp.ctrl & NFP_NET_CFG_CTRL_BPF) return 0; - max_mtu = nn_readb(nn, NFP_NET_CFG_BPF_INL_MTU) * 64 - 32; - if (new_mtu > max_mtu) { - nn_info(nn, "BPF offload active, MTU over %u not supported\n", - max_mtu); + if (nn->xdp_hw.prog) { + prog = nn->xdp_hw.prog; + } else { + bv = nn->app_priv; + prog = bv->tc_prog; + } + + if (nfp_bpf_offload_check_mtu(nn, prog, new_mtu)) { + nn_info(nn, "BPF offload active, potential packet access beyond hardware packet boundary"); return -EBUSY; } return 0; diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h index fac9c6f9e197b..c74620fcc539c 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h @@ -560,6 +560,8 @@ bool nfp_is_subprog_start(struct nfp_insn_meta *meta); void nfp_bpf_jit_prepare(struct nfp_prog *nfp_prog); int nfp_bpf_jit(struct nfp_prog *prog); bool nfp_bpf_supported_opcode(u8 code); +bool nfp_bpf_offload_check_mtu(struct nfp_net *nn, struct bpf_prog *prog, + unsigned int mtu); int nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx); diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index 88fab6a82acff..7ff388ecc7e3a 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -477,19 +477,28 @@ int nfp_bpf_event_output(struct nfp_app_bpf *bpf, const void *data, return 0; } +bool nfp_bpf_offload_check_mtu(struct nfp_net *nn, struct bpf_prog *prog, + unsigned int mtu) +{ + unsigned int fw_mtu, pkt_off; + + fw_mtu = nn_readb(nn, NFP_NET_CFG_BPF_INL_MTU) * 64 - 32; + pkt_off = min(prog->aux->max_pkt_offset, mtu); + + return fw_mtu < pkt_off; +} + static int nfp_net_bpf_load(struct nfp_net *nn, struct bpf_prog *prog, struct netlink_ext_ack *extack) { struct nfp_prog *nfp_prog = prog->aux->offload->dev_priv; - unsigned int fw_mtu, pkt_off, max_stack, max_prog_len; + unsigned int max_stack, max_prog_len; dma_addr_t dma_addr; void *img; int err; - fw_mtu = nn_readb(nn, NFP_NET_CFG_BPF_INL_MTU) * 64 - 32; - pkt_off = min(prog->aux->max_pkt_offset, nn->dp.netdev->mtu); - if (fw_mtu < pkt_off) { + if (nfp_bpf_offload_check_mtu(nn, prog, nn->dp.netdev->mtu)) { NL_SET_ERR_MSG_MOD(extack, "BPF offload not supported with potential packet access beyond HW packet split boundary"); return -EOPNOTSUPP; } From 7c2dac7e65e26040fd880e79d8aa9d05037b6ac3 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 28 Oct 2021 15:13:47 +0800 Subject: [PATCH 1392/5344] net/smc: Correct spelling mistake to TCPF_SYN_RECV [ Upstream commit f3a3a0fe0b644582fa5d83dd94b398f99fc57914 ] There should use TCPF_SYN_RECV instead of TCP_SYN_RECV. Signed-off-by: Wen Gu Reviewed-by: Tony Lu Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/smc/af_smc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index d2886a04ed7fc..6b0f09c5b195f 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -787,7 +787,7 @@ static void smc_connect_work(struct work_struct *work) if (smc->clcsock->sk->sk_err) { smc->sk.sk_err = smc->clcsock->sk->sk_err; } else if ((1 << smc->clcsock->sk->sk_state) & - (TCPF_SYN_SENT | TCP_SYN_RECV)) { + (TCPF_SYN_SENT | TCPF_SYN_RECV)) { rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo); if ((rc == -EPIPE) && ((1 << smc->clcsock->sk->sk_state) & From 73e4060490127e02cc64caea853c1f7477174404 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Tue, 5 Oct 2021 01:15:33 +0800 Subject: [PATCH 1393/5344] btrfs: clear MISSING device status bit in btrfs_close_one_device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 5d03dbebba2594d2e6fbf3b5dd9060c5a835de3b upstream. Reported bug: https://github.com/kdave/btrfs-progs/issues/389 There's a problem with scrub reporting aborted status but returning error code 0, on a filesystem with missing and readded device. Roughly these steps: - mkfs -d raid1 dev1 dev2 - fill with data - unmount - make dev1 disappear - mount -o degraded - copy more data - make dev1 appear again Running scrub afterwards reports that the command was aborted, but the system log message says the exit code was 0. It seems that the cause of the error is decrementing fs_devices->missing_devices but not clearing device->dev_state. Every time we umount filesystem, it would call close_ctree, And it would eventually involve btrfs_close_one_device to close the device, but it only decrements fs_devices->missing_devices but does not clear the device BTRFS_DEV_STATE_MISSING bit. Worse, this bug will cause Integer Overflow, because every time umount, fs_devices->missing_devices will decrease. If fs_devices->missing_devices value hit 0, it would overflow. With added debugging: loop1: detected capacity change from 0 to 20971520 BTRFS: device fsid 56ad51f1-5523-463b-8547-c19486c51ebb devid 1 transid 21 /dev/loop1 scanned by systemd-udevd (2311) loop2: detected capacity change from 0 to 20971520 BTRFS: device fsid 56ad51f1-5523-463b-8547-c19486c51ebb devid 2 transid 17 /dev/loop2 scanned by systemd-udevd (2313) BTRFS info (device loop1): flagging fs with big metadata feature BTRFS info (device loop1): allowing degraded mounts BTRFS info (device loop1): using free space tree BTRFS info (device loop1): has skinny extents BTRFS info (device loop1): before clear_missing.00000000f706684d /dev/loop1 0 BTRFS warning (device loop1): devid 2 uuid 6635ac31-56dd-4852-873b-c60f5e2d53d2 is missing BTRFS info (device loop1): before clear_missing.0000000000000000 /dev/loop2 1 BTRFS info (device loop1): flagging fs with big metadata feature BTRFS info (device loop1): allowing degraded mounts BTRFS info (device loop1): using free space tree BTRFS info (device loop1): has skinny extents BTRFS info (device loop1): before clear_missing.00000000f706684d /dev/loop1 0 BTRFS warning (device loop1): devid 2 uuid 6635ac31-56dd-4852-873b-c60f5e2d53d2 is missing BTRFS info (device loop1): before clear_missing.0000000000000000 /dev/loop2 0 BTRFS info (device loop1): flagging fs with big metadata feature BTRFS info (device loop1): allowing degraded mounts BTRFS info (device loop1): using free space tree BTRFS info (device loop1): has skinny extents BTRFS info (device loop1): before clear_missing.00000000f706684d /dev/loop1 18446744073709551615 BTRFS warning (device loop1): devid 2 uuid 6635ac31-56dd-4852-873b-c60f5e2d53d2 is missing BTRFS info (device loop1): before clear_missing.0000000000000000 /dev/loop2 18446744073709551615 If fs_devices->missing_devices is 0, next time it would be 18446744073709551615 After apply this patch, the fs_devices->missing_devices seems to be right: $ truncate -s 10g test1 $ truncate -s 10g test2 $ losetup /dev/loop1 test1 $ losetup /dev/loop2 test2 $ mkfs.btrfs -draid1 -mraid1 /dev/loop1 /dev/loop2 -f $ losetup -d /dev/loop2 $ mount -o degraded /dev/loop1 /mnt/1 $ umount /mnt/1 $ mount -o degraded /dev/loop1 /mnt/1 $ umount /mnt/1 $ mount -o degraded /dev/loop1 /mnt/1 $ umount /mnt/1 $ dmesg loop1: detected capacity change from 0 to 20971520 loop2: detected capacity change from 0 to 20971520 BTRFS: device fsid 15aa1203-98d3-4a66-bcae-ca82f629c2cd devid 1 transid 5 /dev/loop1 scanned by mkfs.btrfs (1863) BTRFS: device fsid 15aa1203-98d3-4a66-bcae-ca82f629c2cd devid 2 transid 5 /dev/loop2 scanned by mkfs.btrfs (1863) BTRFS info (device loop1): flagging fs with big metadata feature BTRFS info (device loop1): allowing degraded mounts BTRFS info (device loop1): disk space caching is enabled BTRFS info (device loop1): has skinny extents BTRFS info (device loop1): before clear_missing.00000000975bd577 /dev/loop1 0 BTRFS warning (device loop1): devid 2 uuid 8b333791-0b3f-4f57-b449-1c1ab6b51f38 is missing BTRFS info (device loop1): before clear_missing.0000000000000000 /dev/loop2 1 BTRFS info (device loop1): checking UUID tree BTRFS info (device loop1): flagging fs with big metadata feature BTRFS info (device loop1): allowing degraded mounts BTRFS info (device loop1): disk space caching is enabled BTRFS info (device loop1): has skinny extents BTRFS info (device loop1): before clear_missing.00000000975bd577 /dev/loop1 0 BTRFS warning (device loop1): devid 2 uuid 8b333791-0b3f-4f57-b449-1c1ab6b51f38 is missing BTRFS info (device loop1): before clear_missing.0000000000000000 /dev/loop2 1 BTRFS info (device loop1): flagging fs with big metadata feature BTRFS info (device loop1): allowing degraded mounts BTRFS info (device loop1): disk space caching is enabled BTRFS info (device loop1): has skinny extents BTRFS info (device loop1): before clear_missing.00000000975bd577 /dev/loop1 0 BTRFS warning (device loop1): devid 2 uuid 8b333791-0b3f-4f57-b449-1c1ab6b51f38 is missing BTRFS info (device loop1): before clear_missing.0000000000000000 /dev/loop2 1 CC: stable@vger.kernel.org # 4.19+ Signed-off-by: Li Zhang Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/volumes.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f302bbb93f32c..8af92e44deae4 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1315,8 +1315,10 @@ static void btrfs_close_one_device(struct btrfs_device *device) if (device->devid == BTRFS_DEV_REPLACE_DEVID) clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); - if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { + clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); fs_devices->missing_devices--; + } btrfs_close_bdev(device); From b9c68a55b3520cce45d9afa9cd16ae410f019eca Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 14 Oct 2021 17:26:04 +0100 Subject: [PATCH 1394/5344] btrfs: fix lost error handling when replaying directory deletes commit 10adb1152d957a4d570ad630f93a88bb961616c1 upstream. At replay_dir_deletes(), if find_dir_range() returns an error we break out of the main while loop and then assign a value of 0 (success) to the 'ret' variable, resulting in completely ignoring that an error happened. Fix that by jumping to the 'out' label when find_dir_range() returns an error (negative value). CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/tree-log.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f342055699870..7bc4477d7ee7d 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2490,7 +2490,9 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, else { ret = find_dir_range(log, path, dirid, key_type, &range_start, &range_end); - if (ret != 0) + if (ret < 0) + goto out; + else if (ret > 0) break; } From 5ddd564efd4fe2949c9a39a93a99940387468f65 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 19 Oct 2021 18:43:38 +0800 Subject: [PATCH 1395/5344] btrfs: call btrfs_check_rw_degradable only if there is a missing device commit 5c78a5e7aa835c4f08a7c90fe02d19f95a776f29 upstream. In open_ctree() in btrfs_check_rw_degradable() [1], we check each block group individually if at least the minimum number of devices is available for that profile. If all the devices are available, then we don't have to check degradable. [1] open_ctree() :: 3559 if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) { Also before calling btrfs_check_rw_degradable() in open_ctee() at the line number shown below [2] we call btrfs_read_chunk_tree() and down to add_missing_dev() to record number of missing devices. [2] open_ctree() :: 3454 ret = btrfs_read_chunk_tree(fs_info); btrfs_read_chunk_tree() read_one_chunk() / read_one_dev() add_missing_dev() So, check if there is any missing device before btrfs_check_rw_degradable() in open_ctree(). Also, with this the mount command could save ~16ms.[3] in the most common case, that is no device is missing. [3] 1) * 16934.96 us | btrfs_check_rw_degradable [btrfs](); CC: stable@vger.kernel.org # 4.19+ Reviewed-by: Josef Bacik Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/disk-io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 946ae198b3449..1499531bc1511 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3145,7 +3145,8 @@ int open_ctree(struct super_block *sb, goto fail_sysfs; } - if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) { + if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices && + !btrfs_check_rw_degradable(fs_info, NULL)) { btrfs_warn(fs_info, "writable mount is not allowed due to too many missing devices"); goto fail_sysfs; From c657a4db89f0524f4a111e85d3d5614f553c3460 Mon Sep 17 00:00:00 2001 From: Zev Weiss Date: Tue, 28 Sep 2021 02:22:35 -0700 Subject: [PATCH 1396/5344] hwmon: (pmbus/lm25066) Add offset coefficients commit ae59dc455a78fb73034dd1fbb337d7e59c27cbd8 upstream. With the exception of the lm5066i, all the devices handled by this driver had been missing their offset ('b') coefficients for direct format readings. Cc: stable@vger.kernel.org Fixes: 58615a94f6a1 ("hwmon: (pmbus/lm25066) Add support for LM25056") Fixes: e53e6497fc9f ("hwmon: (pmbus/lm25066) Refactor device specific coefficients") Signed-off-by: Zev Weiss Link: https://lore.kernel.org/r/20210928092242.30036-2-zev@bewilderbeest.net Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- drivers/hwmon/pmbus/lm25066.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/drivers/hwmon/pmbus/lm25066.c b/drivers/hwmon/pmbus/lm25066.c index 05fce86f1f817..eba043c59fc73 100644 --- a/drivers/hwmon/pmbus/lm25066.c +++ b/drivers/hwmon/pmbus/lm25066.c @@ -55,22 +55,27 @@ static struct __coeff lm25066_coeff[6][PSC_NUM_CLASSES + 2] = { [lm25056] = { [PSC_VOLTAGE_IN] = { .m = 16296, + .b = 1343, .R = -2, }, [PSC_CURRENT_IN] = { .m = 13797, + .b = -1833, .R = -2, }, [PSC_CURRENT_IN_L] = { .m = 6726, + .b = -537, .R = -2, }, [PSC_POWER] = { .m = 5501, + .b = -2908, .R = -3, }, [PSC_POWER_L] = { .m = 26882, + .b = -5646, .R = -4, }, [PSC_TEMPERATURE] = { @@ -82,26 +87,32 @@ static struct __coeff lm25066_coeff[6][PSC_NUM_CLASSES + 2] = { [lm25066] = { [PSC_VOLTAGE_IN] = { .m = 22070, + .b = -1800, .R = -2, }, [PSC_VOLTAGE_OUT] = { .m = 22070, + .b = -1800, .R = -2, }, [PSC_CURRENT_IN] = { .m = 13661, + .b = -5200, .R = -2, }, [PSC_CURRENT_IN_L] = { .m = 6852, + .b = -3100, .R = -2, }, [PSC_POWER] = { .m = 736, + .b = -3300, .R = -2, }, [PSC_POWER_L] = { .m = 369, + .b = -1900, .R = -2, }, [PSC_TEMPERATURE] = { @@ -111,26 +122,32 @@ static struct __coeff lm25066_coeff[6][PSC_NUM_CLASSES + 2] = { [lm5064] = { [PSC_VOLTAGE_IN] = { .m = 4611, + .b = -642, .R = -2, }, [PSC_VOLTAGE_OUT] = { .m = 4621, + .b = 423, .R = -2, }, [PSC_CURRENT_IN] = { .m = 10742, + .b = 1552, .R = -2, }, [PSC_CURRENT_IN_L] = { .m = 5456, + .b = 2118, .R = -2, }, [PSC_POWER] = { .m = 1204, + .b = 8524, .R = -3, }, [PSC_POWER_L] = { .m = 612, + .b = 11202, .R = -3, }, [PSC_TEMPERATURE] = { @@ -140,26 +157,32 @@ static struct __coeff lm25066_coeff[6][PSC_NUM_CLASSES + 2] = { [lm5066] = { [PSC_VOLTAGE_IN] = { .m = 4587, + .b = -1200, .R = -2, }, [PSC_VOLTAGE_OUT] = { .m = 4587, + .b = -2400, .R = -2, }, [PSC_CURRENT_IN] = { .m = 10753, + .b = -1200, .R = -2, }, [PSC_CURRENT_IN_L] = { .m = 5405, + .b = -600, .R = -2, }, [PSC_POWER] = { .m = 1204, + .b = -6000, .R = -3, }, [PSC_POWER_L] = { .m = 605, + .b = -8000, .R = -3, }, [PSC_TEMPERATURE] = { From 967510a0d75e4dfc0c2f49d043efffb92dbb91e0 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 8 Oct 2021 13:37:13 +0200 Subject: [PATCH 1397/5344] regulator: s5m8767: do not use reset value as DVS voltage if GPIO DVS is disabled commit b16bef60a9112b1e6daf3afd16484eb06e7ce792 upstream. The driver and its bindings, before commit 04f9f068a619 ("regulator: s5m8767: Modify parsing method of the voltage table of buck2/3/4") were requiring to provide at least one safe/default voltage for DVS registers if DVS GPIO is not being enabled. IOW, if s5m8767,pmic-buck2-uses-gpio-dvs is missing, the s5m8767,pmic-buck2-dvs-voltage should still be present and contain one voltage. This requirement was coming from driver behavior matching this condition (none of DVS GPIO is enabled): it was always initializing the DVS selector pins to 0 and keeping the DVS enable setting at reset value (enabled). Therefore if none of DVS GPIO is enabled in devicetree, driver was configuring the first DVS voltage for buck[234]. Mentioned commit 04f9f068a619 ("regulator: s5m8767: Modify parsing method of the voltage table of buck2/3/4") broke it because DVS voltage won't be parsed from devicetree if DVS GPIO is not enabled. After the change, driver will configure bucks to use the register reset value as voltage which might have unpleasant effects. Fix this by relaxing the bindings constrain: if DVS GPIO is not enabled in devicetree (therefore DVS voltage is also not parsed), explicitly disable it. Cc: Fixes: 04f9f068a619 ("regulator: s5m8767: Modify parsing method of the voltage table of buck2/3/4") Signed-off-by: Krzysztof Kozlowski Acked-by: Rob Herring Message-Id: <20211008113723.134648-2-krzysztof.kozlowski@canonical.com> Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- .../bindings/regulator/samsung,s5m8767.txt | 21 +++++++------------ drivers/regulator/s5m8767.c | 21 ++++++++----------- 2 files changed, 17 insertions(+), 25 deletions(-) diff --git a/Documentation/devicetree/bindings/regulator/samsung,s5m8767.txt b/Documentation/devicetree/bindings/regulator/samsung,s5m8767.txt index 093edda0c8dfc..d9cff1614f7ac 100644 --- a/Documentation/devicetree/bindings/regulator/samsung,s5m8767.txt +++ b/Documentation/devicetree/bindings/regulator/samsung,s5m8767.txt @@ -13,6 +13,14 @@ common regulator binding documented in: Required properties of the main device node (the parent!): + - s5m8767,pmic-buck-ds-gpios: GPIO specifiers for three host gpio's used + for selecting GPIO DVS lines. It is one-to-one mapped to dvs gpio lines. + + [1] If either of the 's5m8767,pmic-buck[2/3/4]-uses-gpio-dvs' optional + property is specified, then all the eight voltage values for the + 's5m8767,pmic-buck[2/3/4]-dvs-voltage' should be specified. + +Optional properties of the main device node (the parent!): - s5m8767,pmic-buck2-dvs-voltage: A set of 8 voltage values in micro-volt (uV) units for buck2 when changing voltage using gpio dvs. Refer to [1] below for additional information. @@ -25,19 +33,6 @@ Required properties of the main device node (the parent!): units for buck4 when changing voltage using gpio dvs. Refer to [1] below for additional information. - - s5m8767,pmic-buck-ds-gpios: GPIO specifiers for three host gpio's used - for selecting GPIO DVS lines. It is one-to-one mapped to dvs gpio lines. - - [1] If none of the 's5m8767,pmic-buck[2/3/4]-uses-gpio-dvs' optional - property is specified, the 's5m8767,pmic-buck[2/3/4]-dvs-voltage' - property should specify atleast one voltage level (which would be a - safe operating voltage). - - If either of the 's5m8767,pmic-buck[2/3/4]-uses-gpio-dvs' optional - property is specified, then all the eight voltage values for the - 's5m8767,pmic-buck[2/3/4]-dvs-voltage' should be specified. - -Optional properties of the main device node (the parent!): - s5m8767,pmic-buck2-uses-gpio-dvs: 'buck2' can be controlled by gpio dvs. - s5m8767,pmic-buck3-uses-gpio-dvs: 'buck3' can be controlled by gpio dvs. - s5m8767,pmic-buck4-uses-gpio-dvs: 'buck4' can be controlled by gpio dvs. diff --git a/drivers/regulator/s5m8767.c b/drivers/regulator/s5m8767.c index 5276f8442f3c6..1e9f03a2ea1cc 100644 --- a/drivers/regulator/s5m8767.c +++ b/drivers/regulator/s5m8767.c @@ -851,18 +851,15 @@ static int s5m8767_pmic_probe(struct platform_device *pdev) /* DS4 GPIO */ gpio_direction_output(pdata->buck_ds[2], 0x0); - if (pdata->buck2_gpiodvs || pdata->buck3_gpiodvs || - pdata->buck4_gpiodvs) { - regmap_update_bits(s5m8767->iodev->regmap_pmic, - S5M8767_REG_BUCK2CTRL, 1 << 1, - (pdata->buck2_gpiodvs) ? (1 << 1) : (0 << 1)); - regmap_update_bits(s5m8767->iodev->regmap_pmic, - S5M8767_REG_BUCK3CTRL, 1 << 1, - (pdata->buck3_gpiodvs) ? (1 << 1) : (0 << 1)); - regmap_update_bits(s5m8767->iodev->regmap_pmic, - S5M8767_REG_BUCK4CTRL, 1 << 1, - (pdata->buck4_gpiodvs) ? (1 << 1) : (0 << 1)); - } + regmap_update_bits(s5m8767->iodev->regmap_pmic, + S5M8767_REG_BUCK2CTRL, 1 << 1, + (pdata->buck2_gpiodvs) ? (1 << 1) : (0 << 1)); + regmap_update_bits(s5m8767->iodev->regmap_pmic, + S5M8767_REG_BUCK3CTRL, 1 << 1, + (pdata->buck3_gpiodvs) ? (1 << 1) : (0 << 1)); + regmap_update_bits(s5m8767->iodev->regmap_pmic, + S5M8767_REG_BUCK4CTRL, 1 << 1, + (pdata->buck4_gpiodvs) ? (1 << 1) : (0 << 1)); /* Initialize GPIO DVS registers */ for (i = 0; i < 8; i++) { From 5fb29396279ae5884a94a7e6382e4cec8a24bdde Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 8 Oct 2021 13:37:14 +0200 Subject: [PATCH 1398/5344] regulator: dt-bindings: samsung,s5m8767: correct s5m8767,pmic-buck-default-dvs-idx property commit a7fda04bc9b6ad9da8e19c9e6e3b1dab773d068a upstream. The driver was always parsing "s5m8767,pmic-buck-default-dvs-idx", not "s5m8767,pmic-buck234-default-dvs-idx". Cc: Fixes: 26aec009f6b6 ("regulator: add device tree support for s5m8767") Signed-off-by: Krzysztof Kozlowski Acked-by: Rob Herring Message-Id: <20211008113723.134648-3-krzysztof.kozlowski@canonical.com> Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- Documentation/devicetree/bindings/regulator/samsung,s5m8767.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/regulator/samsung,s5m8767.txt b/Documentation/devicetree/bindings/regulator/samsung,s5m8767.txt index d9cff1614f7ac..6cd83d920155f 100644 --- a/Documentation/devicetree/bindings/regulator/samsung,s5m8767.txt +++ b/Documentation/devicetree/bindings/regulator/samsung,s5m8767.txt @@ -39,7 +39,7 @@ Optional properties of the main device node (the parent!): Additional properties required if either of the optional properties are used: - - s5m8767,pmic-buck234-default-dvs-idx: Default voltage setting selected from + - s5m8767,pmic-buck-default-dvs-idx: Default voltage setting selected from the possible 8 options selectable by the dvs gpios. The value of this property should be between 0 and 7. If not specified or if out of range, the default value of this property is set to 0. From 8a556b7ea53e295d236a8f92e112eca6a9b39ed2 Mon Sep 17 00:00:00 2001 From: Eric Badger Date: Sun, 10 Oct 2021 10:06:56 -0700 Subject: [PATCH 1399/5344] EDAC/sb_edac: Fix top-of-high-memory value for Broadwell/Haswell commit 537bddd069c743759addf422d0b8f028ff0f8dbc upstream. The computation of TOHM is off by one bit. This missed bit results in too low a value for TOHM, which can cause errors in regular memory to incorrectly report: EDAC MC0: 1 CE Error at MMIOH area, on addr 0x000000207fffa680 on any memory Fixes: 50d1bb93672f ("sb_edac: add support for Haswell based systems") Cc: stable@vger.kernel.org Reported-by: Meeta Saggi Signed-off-by: Eric Badger Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20211010170127.848113-1-ebadger@purestorage.com Signed-off-by: Greg Kroah-Hartman --- drivers/edac/sb_edac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index 756bf4fcc8d12..58a772ef437be 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -1055,7 +1055,7 @@ static u64 haswell_get_tohm(struct sbridge_pvt *pvt) pci_read_config_dword(pvt->info.pci_vtd, HASWELL_TOHM_1, ®); rc = ((reg << 6) | rc) << 26; - return rc | 0x1ffffff; + return rc | 0x3ffffff; } static u64 knl_get_tolm(struct sbridge_pvt *pvt) From 0636e89fe81f2fd855abb7f8364431a31542d663 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 27 Oct 2021 10:08:19 +0200 Subject: [PATCH 1400/5344] mwifiex: fix division by zero in fw download path commit 89f8765a11d8df49296d92c404067f9b5c58ee26 upstream. Add the missing endpoint sanity checks to probe() to avoid division by zero in mwifiex_write_data_sync() in case a malicious device has broken descriptors (or when doing descriptor fuzz testing). Only add checks for the firmware-download boot stage, which require both command endpoints, for now. The driver looks like it will handle a missing endpoint during normal operation without oopsing, albeit not very gracefully as it will try to submit URBs to the default pipe and fail. Note that USB core will reject URBs submitted for endpoints with zero wMaxPacketSize but that drivers doing packet-size calculations still need to handle this (cf. commit 2548288b4fb0 ("USB: Fix: Don't skip endpoint descriptors with maxpacket=0")). Fixes: 4daffe354366 ("mwifiex: add support for Marvell USB8797 chipset") Cc: stable@vger.kernel.org # 3.5 Cc: Amitkumar Karwar Signed-off-by: Johan Hovold Reviewed-by: Brian Norris Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20211027080819.6675-4-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/marvell/mwifiex/usb.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/net/wireless/marvell/mwifiex/usb.c b/drivers/net/wireless/marvell/mwifiex/usb.c index 528107d70c1cb..cb8a9ad40cfe9 100644 --- a/drivers/net/wireless/marvell/mwifiex/usb.c +++ b/drivers/net/wireless/marvell/mwifiex/usb.c @@ -505,6 +505,22 @@ static int mwifiex_usb_probe(struct usb_interface *intf, } } + switch (card->usb_boot_state) { + case USB8XXX_FW_DNLD: + /* Reject broken descriptors. */ + if (!card->rx_cmd_ep || !card->tx_cmd_ep) + return -ENODEV; + if (card->bulk_out_maxpktsize == 0) + return -ENODEV; + break; + case USB8XXX_FW_READY: + /* Assume the driver can handle missing endpoints for now. */ + break; + default: + WARN_ON(1); + return -ENODEV; + } + usb_set_intfdata(intf, card); ret = mwifiex_add_card(card, &card->fw_done, &usb_ops, From 18acf67ec338ba3a25161e46bb5650f97b869005 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 27 Oct 2021 10:08:18 +0200 Subject: [PATCH 1401/5344] ath6kl: fix division by zero in send path commit c1b9ca365deae667192be9fe24db244919971234 upstream. Add the missing endpoint max-packet sanity check to probe() to avoid division by zero in ath10k_usb_hif_tx_sg() in case a malicious device has broken descriptors (or when doing descriptor fuzz testing). Note that USB core will reject URBs submitted for endpoints with zero wMaxPacketSize but that drivers doing packet-size calculations still need to handle this (cf. commit 2548288b4fb0 ("USB: Fix: Don't skip endpoint descriptors with maxpacket=0")). Fixes: 9cbee358687e ("ath6kl: add full USB support") Cc: stable@vger.kernel.org # 3.5 Signed-off-by: Johan Hovold Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20211027080819.6675-3-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/ath/ath6kl/usb.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/wireless/ath/ath6kl/usb.c b/drivers/net/wireless/ath/ath6kl/usb.c index 53b66e9434c99..760d0975b5ccf 100644 --- a/drivers/net/wireless/ath/ath6kl/usb.c +++ b/drivers/net/wireless/ath/ath6kl/usb.c @@ -340,6 +340,11 @@ static int ath6kl_usb_setup_pipe_resources(struct ath6kl_usb *ar_usb) le16_to_cpu(endpoint->wMaxPacketSize), endpoint->bInterval); } + + /* Ignore broken descriptors. */ + if (usb_endpoint_maxp(endpoint) == 0) + continue; + urbcount = 0; pipe_num = From cf71bf3c2e6fc7397865b8eeafb35ba74f1cb096 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 25 Oct 2021 14:05:20 +0200 Subject: [PATCH 1402/5344] ath6kl: fix control-message timeout commit a066d28a7e729f808a3e6eff22e70c003091544e upstream. USB control-message timeouts are specified in milliseconds and should specifically not vary with CONFIG_HZ. Fixes: 241b128b6b69 ("ath6kl: add back beginnings of USB support") Cc: stable@vger.kernel.org # 3.4 Signed-off-by: Johan Hovold Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20211025120522.6045-3-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/ath/ath6kl/usb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath6kl/usb.c b/drivers/net/wireless/ath/ath6kl/usb.c index 760d0975b5ccf..8b24964304b1f 100644 --- a/drivers/net/wireless/ath/ath6kl/usb.c +++ b/drivers/net/wireless/ath/ath6kl/usb.c @@ -912,7 +912,7 @@ static int ath6kl_usb_submit_ctrl_in(struct ath6kl_usb *ar_usb, req, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, value, index, buf, - size, 2 * HZ); + size, 2000); if (ret < 0) { ath6kl_warn("Failed to read usb control message: %d\n", ret); From a58ffe4c2022f5006c82304e43a807466203fb50 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 25 Oct 2021 14:05:19 +0200 Subject: [PATCH 1403/5344] ath10k: fix control-message timeout commit 5286132324230168d3fab6ffc16bfd7de85bdfb4 upstream. USB control-message timeouts are specified in milliseconds and should specifically not vary with CONFIG_HZ. Fixes: 4db66499df91 ("ath10k: add initial USB support") Cc: stable@vger.kernel.org # 4.14 Cc: Erik Stromdahl Signed-off-by: Johan Hovold Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20211025120522.6045-2-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/ath/ath10k/usb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath10k/usb.c b/drivers/net/wireless/ath/ath10k/usb.c index 05c0d5e92475e..ee635e4eb4c42 100644 --- a/drivers/net/wireless/ath/ath10k/usb.c +++ b/drivers/net/wireless/ath/ath10k/usb.c @@ -525,7 +525,7 @@ static int ath10k_usb_submit_ctrl_in(struct ath10k *ar, req, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, value, index, buf, - size, 2 * HZ); + size, 2000); if (ret < 0) { ath10k_warn(ar, "Failed to read usb control message: %d\n", From e0e9678ff5d63942e8533e59e810f30dbc29025a Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 27 Oct 2021 10:08:17 +0200 Subject: [PATCH 1404/5344] ath10k: fix division by zero in send path commit a006acb931317aad3a8dd41333ebb0453caf49b8 upstream. Add the missing endpoint max-packet sanity check to probe() to avoid division by zero in ath10k_usb_hif_tx_sg() in case a malicious device has broken descriptors (or when doing descriptor fuzz testing). Note that USB core will reject URBs submitted for endpoints with zero wMaxPacketSize but that drivers doing packet-size calculations still need to handle this (cf. commit 2548288b4fb0 ("USB: Fix: Don't skip endpoint descriptors with maxpacket=0")). Fixes: 4db66499df91 ("ath10k: add initial USB support") Cc: stable@vger.kernel.org # 4.14 Cc: Erik Stromdahl Signed-off-by: Johan Hovold Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20211027080819.6675-2-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/ath/ath10k/usb.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/wireless/ath/ath10k/usb.c b/drivers/net/wireless/ath/ath10k/usb.c index ee635e4eb4c42..326e1dc4c6734 100644 --- a/drivers/net/wireless/ath/ath10k/usb.c +++ b/drivers/net/wireless/ath/ath10k/usb.c @@ -865,6 +865,11 @@ static int ath10k_usb_setup_pipe_resources(struct ath10k *ar, le16_to_cpu(endpoint->wMaxPacketSize), endpoint->bInterval); } + + /* Ignore broken descriptors. */ + if (usb_endpoint_maxp(endpoint) == 0) + continue; + urbcount = 0; pipe_num = From 3acfd19bfd8fded6056491e48aa1b642828d9390 Mon Sep 17 00:00:00 2001 From: Ingmar Klein Date: Fri, 9 Apr 2021 11:26:33 +0200 Subject: [PATCH 1405/5344] PCI: Mark Atheros QCA6174 to avoid bus reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit e3f4bd3462f6f796594ecc0dda7144ed2d1e5a26 upstream. When passing the Atheros QCA6174 through to a virtual machine, the VM hangs at the point where the ath10k driver loads. Add a quirk to avoid bus resets on this device, which avoids the hang. [bhelgaas: commit log] Link: https://lore.kernel.org/r/08982e05-b6e8-5a8d-24ab-da1488ee50a8@web.de Signed-off-by: Ingmar Klein Signed-off-by: Bjorn Helgaas Reviewed-by: Pali Rohár Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/pci/quirks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 686298c0f6cda..fb7c5518447da 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3584,6 +3584,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x0032, quirk_no_bus_reset); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x003c, quirk_no_bus_reset); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x0033, quirk_no_bus_reset); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x0034, quirk_no_bus_reset); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x003e, quirk_no_bus_reset); /* * Root port on some Cavium CN8xxx chips do not successfully complete a bus From eb7be2757892c874ba3b826081f51f5b5bff6145 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 25 Oct 2021 14:05:21 +0200 Subject: [PATCH 1406/5344] rtl8187: fix control-message timeouts commit 2e9be536a213e838daed6ba42024dd68954ac061 upstream. USB control-message timeouts are specified in milliseconds and should specifically not vary with CONFIG_HZ. Fixes: 605bebe23bf6 ("[PATCH] Add rtl8187 wireless driver") Cc: stable@vger.kernel.org # 2.6.23 Signed-off-by: Johan Hovold Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20211025120522.6045-4-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- .../net/wireless/realtek/rtl818x/rtl8187/rtl8225.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/wireless/realtek/rtl818x/rtl8187/rtl8225.c b/drivers/net/wireless/realtek/rtl818x/rtl8187/rtl8225.c index b2616d61b66d8..843e3f41a2bc3 100644 --- a/drivers/net/wireless/realtek/rtl818x/rtl8187/rtl8225.c +++ b/drivers/net/wireless/realtek/rtl818x/rtl8187/rtl8225.c @@ -28,7 +28,7 @@ u8 rtl818x_ioread8_idx(struct rtl8187_priv *priv, usb_control_msg(priv->udev, usb_rcvctrlpipe(priv->udev, 0), RTL8187_REQ_GET_REG, RTL8187_REQT_READ, (unsigned long)addr, idx & 0x03, - &priv->io_dmabuf->bits8, sizeof(val), HZ / 2); + &priv->io_dmabuf->bits8, sizeof(val), 500); val = priv->io_dmabuf->bits8; mutex_unlock(&priv->io_mutex); @@ -45,7 +45,7 @@ u16 rtl818x_ioread16_idx(struct rtl8187_priv *priv, usb_control_msg(priv->udev, usb_rcvctrlpipe(priv->udev, 0), RTL8187_REQ_GET_REG, RTL8187_REQT_READ, (unsigned long)addr, idx & 0x03, - &priv->io_dmabuf->bits16, sizeof(val), HZ / 2); + &priv->io_dmabuf->bits16, sizeof(val), 500); val = priv->io_dmabuf->bits16; mutex_unlock(&priv->io_mutex); @@ -62,7 +62,7 @@ u32 rtl818x_ioread32_idx(struct rtl8187_priv *priv, usb_control_msg(priv->udev, usb_rcvctrlpipe(priv->udev, 0), RTL8187_REQ_GET_REG, RTL8187_REQT_READ, (unsigned long)addr, idx & 0x03, - &priv->io_dmabuf->bits32, sizeof(val), HZ / 2); + &priv->io_dmabuf->bits32, sizeof(val), 500); val = priv->io_dmabuf->bits32; mutex_unlock(&priv->io_mutex); @@ -79,7 +79,7 @@ void rtl818x_iowrite8_idx(struct rtl8187_priv *priv, usb_control_msg(priv->udev, usb_sndctrlpipe(priv->udev, 0), RTL8187_REQ_SET_REG, RTL8187_REQT_WRITE, (unsigned long)addr, idx & 0x03, - &priv->io_dmabuf->bits8, sizeof(val), HZ / 2); + &priv->io_dmabuf->bits8, sizeof(val), 500); mutex_unlock(&priv->io_mutex); } @@ -93,7 +93,7 @@ void rtl818x_iowrite16_idx(struct rtl8187_priv *priv, usb_control_msg(priv->udev, usb_sndctrlpipe(priv->udev, 0), RTL8187_REQ_SET_REG, RTL8187_REQT_WRITE, (unsigned long)addr, idx & 0x03, - &priv->io_dmabuf->bits16, sizeof(val), HZ / 2); + &priv->io_dmabuf->bits16, sizeof(val), 500); mutex_unlock(&priv->io_mutex); } @@ -107,7 +107,7 @@ void rtl818x_iowrite32_idx(struct rtl8187_priv *priv, usb_control_msg(priv->udev, usb_sndctrlpipe(priv->udev, 0), RTL8187_REQ_SET_REG, RTL8187_REQT_WRITE, (unsigned long)addr, idx & 0x03, - &priv->io_dmabuf->bits32, sizeof(val), HZ / 2); + &priv->io_dmabuf->bits32, sizeof(val), 500); mutex_unlock(&priv->io_mutex); } @@ -183,7 +183,7 @@ static void rtl8225_write_8051(struct ieee80211_hw *dev, u8 addr, __le16 data) usb_control_msg(priv->udev, usb_sndctrlpipe(priv->udev, 0), RTL8187_REQ_SET_REG, RTL8187_REQT_WRITE, addr, 0x8225, &priv->io_dmabuf->bits16, sizeof(data), - HZ / 2); + 500); mutex_unlock(&priv->io_mutex); From 679daecc2354c5f41f5f4bca9bd5f7207c08e7b6 Mon Sep 17 00:00:00 2001 From: Austin Kim Date: Thu, 28 Oct 2021 12:26:42 +0100 Subject: [PATCH 1407/5344] evm: mark evm_fixmode as __ro_after_init commit 32ba540f3c2a7ef61ed5a577ce25069a3d714fc9 upstream. The evm_fixmode is only configurable by command-line option and it is never modified outside initcalls, so declaring it with __ro_after_init is better. Signed-off-by: Austin Kim Cc: stable@vger.kernel.org Signed-off-by: Mimi Zohar Signed-off-by: Greg Kroah-Hartman --- security/integrity/evm/evm_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c index 81e3245aec86c..b82291d10e730 100644 --- a/security/integrity/evm/evm_main.c +++ b/security/integrity/evm/evm_main.c @@ -56,7 +56,7 @@ static struct xattr_list evm_config_default_xattrnames[] = { LIST_HEAD(evm_config_xattrnames); -static int evm_fixmode; +static int evm_fixmode __ro_after_init; static int __init evm_set_fixmode(char *str) { if (strncmp(str, "fix", 3) == 0) From 3c0cf38ed2cb7e0c92ec69dfc8a8315106edcc40 Mon Sep 17 00:00:00 2001 From: Loic Poulain Date: Wed, 20 Oct 2021 15:38:53 +0200 Subject: [PATCH 1408/5344] wcn36xx: Fix HT40 capability for 2Ghz band commit 960ae77f25631bbe4e3aafefe209b52e044baf31 upstream. All wcn36xx controllers are supposed to support HT40 (and SGI40), This doubles the maximum bitrate/throughput with compatible APs. Tested with wcn3620 & wcn3680B. Cc: stable@vger.kernel.org Fixes: 8e84c2582169 ("wcn36xx: mac80211 driver for Qualcomm WCN3660/WCN3680 hardware") Signed-off-by: Loic Poulain Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1634737133-22336-1-git-send-email-loic.poulain@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/ath/wcn36xx/main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/wcn36xx/main.c b/drivers/net/wireless/ath/wcn36xx/main.c index 556ba3c6c5d8e..eb5d08bf25032 100644 --- a/drivers/net/wireless/ath/wcn36xx/main.c +++ b/drivers/net/wireless/ath/wcn36xx/main.c @@ -134,7 +134,9 @@ static struct ieee80211_supported_band wcn_band_2ghz = { .cap = IEEE80211_HT_CAP_GRN_FLD | IEEE80211_HT_CAP_SGI_20 | IEEE80211_HT_CAP_DSSSCCK40 | - IEEE80211_HT_CAP_LSIG_TXOP_PROT, + IEEE80211_HT_CAP_LSIG_TXOP_PROT | + IEEE80211_HT_CAP_SGI_40 | + IEEE80211_HT_CAP_SUP_WIDTH_20_40, .ht_supported = true, .ampdu_factor = IEEE80211_HT_MAX_AMPDU_64K, .ampdu_density = IEEE80211_HT_MPDU_DENSITY_16, From 64b621758e7fdca6113a0ec32772b534c60f5c2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonas=20Dre=C3=9Fler?= Date: Mon, 11 Oct 2021 15:32:23 +0200 Subject: [PATCH 1409/5344] mwifiex: Read a PCI register after writing the TX ring write pointer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit e5f4eb8223aa740237cd463246a7debcddf4eda1 upstream. On the 88W8897 PCIe+USB card the firmware randomly crashes after setting the TX ring write pointer. The issue is present in the latest firmware version 15.68.19.p21 of the PCIe+USB card. Those firmware crashes can be worked around by reading any PCI register of the card after setting that register, so read the PCI_VENDOR_ID register here. The reason this works is probably because we keep the bus from entering an ASPM state for a bit longer, because that's what causes the cards firmware to crash. This fixes a bug where during RX/TX traffic and with ASPM L1 substates enabled (the specific substates where the issue happens appear to be platform dependent), the firmware crashes and eventually a command timeout appears in the logs. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=109681 Cc: stable@vger.kernel.org Signed-off-by: Jonas Dreßler Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20211011133224.15561-2-verdre@v0yd.nl Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/marvell/mwifiex/pcie.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/wireless/marvell/mwifiex/pcie.c b/drivers/net/wireless/marvell/mwifiex/pcie.c index bc46a0aa06eb7..b316e34917958 100644 --- a/drivers/net/wireless/marvell/mwifiex/pcie.c +++ b/drivers/net/wireless/marvell/mwifiex/pcie.c @@ -1326,6 +1326,14 @@ mwifiex_pcie_send_data(struct mwifiex_adapter *adapter, struct sk_buff *skb, ret = -1; goto done_unmap; } + + /* The firmware (latest version 15.68.19.p21) of the 88W8897 PCIe+USB card + * seems to crash randomly after setting the TX ring write pointer when + * ASPM powersaving is enabled. A workaround seems to be keeping the bus + * busy by reading a random register afterwards. + */ + mwifiex_read_reg(adapter, PCI_VENDOR_ID, &rx_val); + if ((mwifiex_pcie_txbd_not_full(card)) && tx_param->next_pkt_len) { /* have more packets and TxBD still can hold more */ From 74bc0020829e32de00b89b989224942271836c77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Reimar=20D=C3=B6ffinger?= Date: Tue, 12 Oct 2021 08:27:44 +0200 Subject: [PATCH 1410/5344] libata: fix checking of DMA state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit f971a85439bd25dc7b4d597cf5e4e8dc7ffc884b upstream. Checking if DMA is enabled should be done via the ata_dma_enabled helper function, since the init state 0xff indicates disabled. This meant that ATA_CMD_READ_LOG_DMA_EXT was used and probed for before DMA was enabled, which caused hangs for some combinations of controllers and devices. It might also have caused it to be incorrectly disabled as broken, but there have been no reports of that. Cc: stable@vger.kernel.org BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=195895 Signed-off-by: Reimar Döffinger Tested-by: Paul Menzel Signed-off-by: Damien Le Moal Signed-off-by: Greg Kroah-Hartman --- drivers/ata/libata-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 7c6c05fd5dfc3..bed433fd9c700 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2057,7 +2057,7 @@ unsigned int ata_read_log_page(struct ata_device *dev, u8 log, retry: ata_tf_init(dev, &tf); - if (dev->dma_mode && ata_id_has_read_log_dma_ext(dev->id) && + if (ata_dma_enabled(dev) && ata_id_has_read_log_dma_ext(dev->id) && !(dev->horkage & ATA_HORKAGE_NO_DMA_LOG)) { tf.command = ATA_CMD_READ_LOG_DMA_EXT; tf.protocol = ATA_PROT_DMA; From dfc5cd9b5673f14bdc413f9dcf8b50c56c1a6a87 Mon Sep 17 00:00:00 2001 From: Benjamin Li Date: Wed, 1 Sep 2021 11:06:05 -0700 Subject: [PATCH 1411/5344] wcn36xx: handle connection loss indication commit d6dbce453b19c64b96f3e927b10230f9a704b504 upstream. Firmware sends delete_sta_context_ind when it detects the AP has gone away in STA mode. Right now the handler for that indication only handles AP mode; fix it to also handle STA mode. Cc: stable@vger.kernel.org Signed-off-by: Benjamin Li Reviewed-by: Bryan O'Donoghue Reviewed-by: Loic Poulain Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210901180606.11686-1-benl@squareup.com Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/ath/wcn36xx/smd.c | 44 +++++++++++++++++++------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/drivers/net/wireless/ath/wcn36xx/smd.c b/drivers/net/wireless/ath/wcn36xx/smd.c index 523550f94a3f0..a7532028bf9db 100644 --- a/drivers/net/wireless/ath/wcn36xx/smd.c +++ b/drivers/net/wireless/ath/wcn36xx/smd.c @@ -2340,30 +2340,52 @@ static int wcn36xx_smd_delete_sta_context_ind(struct wcn36xx *wcn, size_t len) { struct wcn36xx_hal_delete_sta_context_ind_msg *rsp = buf; - struct wcn36xx_vif *tmp; + struct wcn36xx_vif *vif_priv; + struct ieee80211_vif *vif; + struct ieee80211_bss_conf *bss_conf; struct ieee80211_sta *sta; + bool found = false; if (len != sizeof(*rsp)) { wcn36xx_warn("Corrupted delete sta indication\n"); return -EIO; } - wcn36xx_dbg(WCN36XX_DBG_HAL, "delete station indication %pM index %d\n", - rsp->addr2, rsp->sta_id); + wcn36xx_dbg(WCN36XX_DBG_HAL, + "delete station indication %pM index %d reason %d\n", + rsp->addr2, rsp->sta_id, rsp->reason_code); - list_for_each_entry(tmp, &wcn->vif_list, list) { + list_for_each_entry(vif_priv, &wcn->vif_list, list) { rcu_read_lock(); - sta = ieee80211_find_sta(wcn36xx_priv_to_vif(tmp), rsp->addr2); - if (sta) - ieee80211_report_low_ack(sta, 0); + vif = wcn36xx_priv_to_vif(vif_priv); + + if (vif->type == NL80211_IFTYPE_STATION) { + /* We could call ieee80211_find_sta too, but checking + * bss_conf is clearer. + */ + bss_conf = &vif->bss_conf; + if (vif_priv->sta_assoc && + !memcmp(bss_conf->bssid, rsp->addr2, ETH_ALEN)) { + found = true; + wcn36xx_dbg(WCN36XX_DBG_HAL, + "connection loss bss_index %d\n", + vif_priv->bss_index); + ieee80211_connection_loss(vif); + } + } else { + sta = ieee80211_find_sta(vif, rsp->addr2); + if (sta) { + found = true; + ieee80211_report_low_ack(sta, 0); + } + } + rcu_read_unlock(); - if (sta) + if (found) return 0; } - wcn36xx_warn("STA with addr %pM and index %d not found\n", - rsp->addr2, - rsp->sta_id); + wcn36xx_warn("BSS or STA with addr %pM not found\n", rsp->addr2); return -ENOENT; } From 9684721e44e9bfbaefecff8b4c0226e841db1779 Mon Sep 17 00:00:00 2001 From: Martin Fuzzey Date: Mon, 30 Aug 2021 17:26:44 +0200 Subject: [PATCH 1412/5344] rsi: fix occasional initialisation failure with BT coex commit 9b14ed6e11b72dd4806535449ca6c6962cb2369d upstream. When BT coexistence is enabled (eg oper mode 13, which is the default) the initialisation on startup sometimes silently fails. In a normal initialisation we see usb 1-1.3: Product: Wireless USB Network Module usb 1-1.3: Manufacturer: Redpine Signals, Inc. usb 1-1.3: SerialNumber: 000000000001 rsi_91x: rsi_probe: Initialized os intf ops rsi_91x: rsi_load_9116_firmware: Loading chunk 0 rsi_91x: rsi_load_9116_firmware: Loading chunk 1 rsi_91x: rsi_load_9116_firmware: Loading chunk 2 rsi_91x: Max Stations Allowed = 1 But sometimes the last log is missing and the wlan net device is not created. Running a userspace loop that resets the hardware via a GPIO shows the problem occurring ~5/100 resets. The problem does not occur in oper mode 1 (wifi only). Adding logs shows that the initialisation state machine requests a MAC reset via rsi_send_reset_mac() but the firmware does not reply, leading to the initialisation sequence being incomplete. Fix this by delaying attaching the BT adapter until the wifi initialisation has completed. With this applied I have done > 300 reset loops with no errors. Fixes: 716b840c7641 ("rsi: handle BT traffic in driver") Signed-off-by: Martin Fuzzey CC: stable@vger.kernel.org Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1630337206-12410-2-git-send-email-martin.fuzzey@flowbird.group Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/rsi/rsi_91x_main.c | 16 +++++++++++++--- drivers/net/wireless/rsi/rsi_91x_mgmt.c | 3 +++ drivers/net/wireless/rsi/rsi_main.h | 2 ++ 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/rsi/rsi_91x_main.c b/drivers/net/wireless/rsi/rsi_91x_main.c index 29d83049c5f56..aece1d3a6b055 100644 --- a/drivers/net/wireless/rsi/rsi_91x_main.c +++ b/drivers/net/wireless/rsi/rsi_91x_main.c @@ -210,9 +210,10 @@ int rsi_read_pkt(struct rsi_common *common, u8 *rx_pkt, s32 rcv_pkt_len) bt_pkt_type = frame_desc[offset + BT_RX_PKT_TYPE_OFST]; if (bt_pkt_type == BT_CARD_READY_IND) { rsi_dbg(INFO_ZONE, "BT Card ready recvd\n"); - if (rsi_bt_ops.attach(common, &g_proto_ops)) - rsi_dbg(ERR_ZONE, - "Failed to attach BT module\n"); + if (common->fsm_state == FSM_MAC_INIT_DONE) + rsi_attach_bt(common); + else + common->bt_defer_attach = true; } else { if (common->bt_adapter) rsi_bt_ops.recv_pkt(common->bt_adapter, @@ -277,6 +278,15 @@ void rsi_set_bt_context(void *priv, void *bt_context) } #endif +void rsi_attach_bt(struct rsi_common *common) +{ +#ifdef CONFIG_RSI_COEX + if (rsi_bt_ops.attach(common, &g_proto_ops)) + rsi_dbg(ERR_ZONE, + "Failed to attach BT module\n"); +#endif +} + /** * rsi_91x_init() - This function initializes os interface operations. * @void: Void. diff --git a/drivers/net/wireless/rsi/rsi_91x_mgmt.c b/drivers/net/wireless/rsi/rsi_91x_mgmt.c index ed67f65986775..50c4b9b5b1a4a 100644 --- a/drivers/net/wireless/rsi/rsi_91x_mgmt.c +++ b/drivers/net/wireless/rsi/rsi_91x_mgmt.c @@ -2056,6 +2056,9 @@ static int rsi_handle_ta_confirm_type(struct rsi_common *common, if (common->reinit_hw) { complete(&common->wlan_init_completion); } else { + if (common->bt_defer_attach) + rsi_attach_bt(common); + return rsi_mac80211_attach(common); } } diff --git a/drivers/net/wireless/rsi/rsi_main.h b/drivers/net/wireless/rsi/rsi_main.h index 1820015dc74d9..526e60e02125c 100644 --- a/drivers/net/wireless/rsi/rsi_main.h +++ b/drivers/net/wireless/rsi/rsi_main.h @@ -321,6 +321,7 @@ struct rsi_common { struct ieee80211_vif *roc_vif; bool eapol4_confirm; + bool bt_defer_attach; void *bt_adapter; struct cfg80211_scan_request *hwscan; @@ -402,5 +403,6 @@ struct rsi_host_intf_ops { enum rsi_host_intf rsi_get_host_intf(void *priv); void rsi_set_bt_context(void *priv, void *bt_context); +void rsi_attach_bt(struct rsi_common *common); #endif From fd494ee67a279a1552e9df6978cdbf243c4a0ebf Mon Sep 17 00:00:00 2001 From: Martin Fuzzey Date: Mon, 30 Aug 2021 17:26:46 +0200 Subject: [PATCH 1413/5344] rsi: fix rate mask set leading to P2P failure commit b515d097053a71d624e0c5840b42cd4caa653941 upstream. P2P client mode was only working the first time. On subsequent connection attempts the group was successfully created but no data was sent (no transmitted data packets were seen with a sniffer). The reason for this was that the hardware was being configured in fixed rate mode with rate RSI_RATE_1 (1Mbps) which is not valid in the 5GHz band. In P2P mode wpa_supplicant uses NL80211_CMD_SET_TX_BITRATE_MASK to disallow the 11b rates in the 2.4GHz band which updated common->fixedrate_mask. rsi_set_min_rate() then used the fixedrate_mask to calculate the minimum allowed rate, or 0xffff = auto if none was found. However that calculation did not account for the different rate sets allowed in the different bands leading to the error. Fixing set_min_rate() would result in 6Mb/s being used all the time which is not what we want either. The reason the problem did not occur on the first connection is that rsi_mac80211_set_rate_mask() only updated the fixedrate_mask for the *current* band. When it was called that was still 2.4GHz as the switch is done later. So the when set_min_rate() was subsequently called after the switch to 5GHz it still had a mask of zero, leading to defaulting to auto mode. Fix this by differentiating the case of a single rate being requested, in which case the hardware will be used in fixed rate mode with just that rate, and multiple rates being requested, in which case we remain in auto mode but the firmware rate selection algorithm is configured with a restricted set of rates. Fixes: dad0d04fa7ba ("rsi: Add RS9113 wireless driver") Signed-off-by: Martin Fuzzey CC: stable@vger.kernel.org Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1630337206-12410-4-git-send-email-martin.fuzzey@flowbird.group Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/rsi/rsi_91x_hal.c | 8 ++- drivers/net/wireless/rsi/rsi_91x_mac80211.c | 74 ++++++--------------- drivers/net/wireless/rsi/rsi_91x_mgmt.c | 21 ++++-- drivers/net/wireless/rsi/rsi_main.h | 12 +++- 4 files changed, 50 insertions(+), 65 deletions(-) diff --git a/drivers/net/wireless/rsi/rsi_91x_hal.c b/drivers/net/wireless/rsi/rsi_91x_hal.c index a82412b9f409e..7d0b44fd56901 100644 --- a/drivers/net/wireless/rsi/rsi_91x_hal.c +++ b/drivers/net/wireless/rsi/rsi_91x_hal.c @@ -214,15 +214,17 @@ int rsi_prepare_data_desc(struct rsi_common *common, struct sk_buff *skb) RSI_WIFI_DATA_Q); data_desc->header_len = ieee80211_size; - if (common->min_rate != RSI_RATE_AUTO) { + if (common->rate_config[common->band].fixed_enabled) { /* Send fixed rate */ + u16 fixed_rate = common->rate_config[common->band].fixed_hw_rate; + data_desc->frame_info = cpu_to_le16(RATE_INFO_ENABLE); - data_desc->rate_info = cpu_to_le16(common->min_rate); + data_desc->rate_info = cpu_to_le16(fixed_rate); if (conf_is_ht40(&common->priv->hw->conf)) data_desc->bbp_info = cpu_to_le16(FULL40M_ENABLE); - if ((common->vif_info[0].sgi) && (common->min_rate & 0x100)) { + if (common->vif_info[0].sgi && (fixed_rate & 0x100)) { /* Only MCS rates */ data_desc->rate_info |= cpu_to_le16(ENABLE_SHORTGI_RATE); diff --git a/drivers/net/wireless/rsi/rsi_91x_mac80211.c b/drivers/net/wireless/rsi/rsi_91x_mac80211.c index ca1e609f637e4..1279339a4b59b 100644 --- a/drivers/net/wireless/rsi/rsi_91x_mac80211.c +++ b/drivers/net/wireless/rsi/rsi_91x_mac80211.c @@ -510,7 +510,6 @@ static int rsi_mac80211_add_interface(struct ieee80211_hw *hw, if ((vif->type == NL80211_IFTYPE_AP) || (vif->type == NL80211_IFTYPE_P2P_GO)) { rsi_send_rx_filter_frame(common, DISALLOW_BEACONS); - common->min_rate = RSI_RATE_AUTO; for (i = 0; i < common->max_stations; i++) common->stations[i].sta = NULL; } @@ -1211,20 +1210,32 @@ static int rsi_mac80211_set_rate_mask(struct ieee80211_hw *hw, struct ieee80211_vif *vif, const struct cfg80211_bitrate_mask *mask) { + const unsigned int mcs_offset = ARRAY_SIZE(rsi_rates); struct rsi_hw *adapter = hw->priv; struct rsi_common *common = adapter->priv; - enum nl80211_band band = hw->conf.chandef.chan->band; + int i; mutex_lock(&common->mutex); - common->fixedrate_mask[band] = 0; - if (mask->control[band].legacy == 0xfff) { - common->fixedrate_mask[band] = - (mask->control[band].ht_mcs[0] << 12); - } else { - common->fixedrate_mask[band] = - mask->control[band].legacy; + for (i = 0; i < ARRAY_SIZE(common->rate_config); i++) { + struct rsi_rate_config *cfg = &common->rate_config[i]; + u32 bm; + + bm = mask->control[i].legacy | (mask->control[i].ht_mcs[0] << mcs_offset); + if (hweight32(bm) == 1) { /* single rate */ + int rate_index = ffs(bm) - 1; + + if (rate_index < mcs_offset) + cfg->fixed_hw_rate = rsi_rates[rate_index].hw_value; + else + cfg->fixed_hw_rate = rsi_mcsrates[rate_index - mcs_offset]; + cfg->fixed_enabled = true; + } else { + cfg->configured_mask = bm; + cfg->fixed_enabled = false; + } } + mutex_unlock(&common->mutex); return 0; @@ -1360,46 +1371,6 @@ void rsi_indicate_pkt_to_os(struct rsi_common *common, ieee80211_rx_irqsafe(hw, skb); } -static void rsi_set_min_rate(struct ieee80211_hw *hw, - struct ieee80211_sta *sta, - struct rsi_common *common) -{ - u8 band = hw->conf.chandef.chan->band; - u8 ii; - u32 rate_bitmap; - bool matched = false; - - common->bitrate_mask[band] = sta->supp_rates[band]; - - rate_bitmap = (common->fixedrate_mask[band] & sta->supp_rates[band]); - - if (rate_bitmap & 0xfff) { - /* Find out the min rate */ - for (ii = 0; ii < ARRAY_SIZE(rsi_rates); ii++) { - if (rate_bitmap & BIT(ii)) { - common->min_rate = rsi_rates[ii].hw_value; - matched = true; - break; - } - } - } - - common->vif_info[0].is_ht = sta->ht_cap.ht_supported; - - if ((common->vif_info[0].is_ht) && (rate_bitmap >> 12)) { - for (ii = 0; ii < ARRAY_SIZE(rsi_mcsrates); ii++) { - if ((rate_bitmap >> 12) & BIT(ii)) { - common->min_rate = rsi_mcsrates[ii]; - matched = true; - break; - } - } - } - - if (!matched) - common->min_rate = 0xffff; -} - /** * rsi_mac80211_sta_add() - This function notifies driver about a peer getting * connected. @@ -1498,9 +1469,9 @@ static int rsi_mac80211_sta_add(struct ieee80211_hw *hw, if ((vif->type == NL80211_IFTYPE_STATION) || (vif->type == NL80211_IFTYPE_P2P_CLIENT)) { - rsi_set_min_rate(hw, sta, common); + common->bitrate_mask[common->band] = sta->supp_rates[common->band]; + common->vif_info[0].is_ht = sta->ht_cap.ht_supported; if (sta->ht_cap.ht_supported) { - common->vif_info[0].is_ht = true; common->bitrate_mask[NL80211_BAND_2GHZ] = sta->supp_rates[NL80211_BAND_2GHZ]; if ((sta->ht_cap.cap & IEEE80211_HT_CAP_SGI_20) || @@ -1574,7 +1545,6 @@ static int rsi_mac80211_sta_remove(struct ieee80211_hw *hw, bss->qos = sta->wme; common->bitrate_mask[NL80211_BAND_2GHZ] = 0; common->bitrate_mask[NL80211_BAND_5GHZ] = 0; - common->min_rate = 0xffff; common->vif_info[0].is_ht = false; common->vif_info[0].sgi = false; common->vif_info[0].seq_start = 0; diff --git a/drivers/net/wireless/rsi/rsi_91x_mgmt.c b/drivers/net/wireless/rsi/rsi_91x_mgmt.c index 50c4b9b5b1a4a..0c0e4c08be8aa 100644 --- a/drivers/net/wireless/rsi/rsi_91x_mgmt.c +++ b/drivers/net/wireless/rsi/rsi_91x_mgmt.c @@ -276,7 +276,7 @@ static void rsi_set_default_parameters(struct rsi_common *common) common->channel_width = BW_20MHZ; common->rts_threshold = IEEE80211_MAX_RTS_THRESHOLD; common->channel = 1; - common->min_rate = 0xffff; + memset(&common->rate_config, 0, sizeof(common->rate_config)); common->fsm_state = FSM_CARD_NOT_READY; common->iface_down = true; common->endpoint = EP_2GHZ_20MHZ; @@ -1304,7 +1304,7 @@ static int rsi_send_auto_rate_request(struct rsi_common *common, u8 band = hw->conf.chandef.chan->band; u8 num_supported_rates = 0; u8 rate_table_offset, rate_offset = 0; - u32 rate_bitmap; + u32 rate_bitmap, configured_rates; u16 *selected_rates, min_rate; bool is_ht = false, is_sgi = false; u16 frame_len = sizeof(struct rsi_auto_rate); @@ -1354,6 +1354,10 @@ static int rsi_send_auto_rate_request(struct rsi_common *common, is_sgi = true; } + /* Limit to any rates administratively configured by cfg80211 */ + configured_rates = common->rate_config[band].configured_mask ?: 0xffffffff; + rate_bitmap &= configured_rates; + if (band == NL80211_BAND_2GHZ) { if ((rate_bitmap == 0) && (is_ht)) min_rate = RSI_RATE_MCS0; @@ -1379,10 +1383,13 @@ static int rsi_send_auto_rate_request(struct rsi_common *common, num_supported_rates = jj; if (is_ht) { - for (ii = 0; ii < ARRAY_SIZE(mcs); ii++) - selected_rates[jj++] = mcs[ii]; - num_supported_rates += ARRAY_SIZE(mcs); - rate_offset += ARRAY_SIZE(mcs); + for (ii = 0; ii < ARRAY_SIZE(mcs); ii++) { + if (configured_rates & BIT(ii + ARRAY_SIZE(rsi_rates))) { + selected_rates[jj++] = mcs[ii]; + num_supported_rates++; + rate_offset++; + } + } } sort(selected_rates, jj, sizeof(u16), &rsi_compare, NULL); @@ -1467,7 +1474,7 @@ void rsi_inform_bss_status(struct rsi_common *common, qos_enable, aid, sta_id, vif); - if (common->min_rate == 0xffff) + if (!common->rate_config[common->band].fixed_enabled) rsi_send_auto_rate_request(common, sta, sta_id, vif); if (opmode == RSI_OPMODE_STA && !(assoc_cap & WLAN_CAPABILITY_PRIVACY) && diff --git a/drivers/net/wireless/rsi/rsi_main.h b/drivers/net/wireless/rsi/rsi_main.h index 526e60e02125c..de595025c0197 100644 --- a/drivers/net/wireless/rsi/rsi_main.h +++ b/drivers/net/wireless/rsi/rsi_main.h @@ -61,6 +61,7 @@ enum RSI_FSM_STATES { extern u32 rsi_zone_enabled; extern __printf(2, 3) void rsi_dbg(u32 zone, const char *fmt, ...); +#define RSI_MAX_BANDS 2 #define RSI_MAX_VIFS 3 #define NUM_EDCA_QUEUES 4 #define IEEE80211_ADDR_LEN 6 @@ -230,6 +231,12 @@ struct rsi_9116_features { u32 ps_options; }; +struct rsi_rate_config { + u32 configured_mask; /* configured by mac80211 bits 0-11=legacy 12+ mcs */ + u16 fixed_hw_rate; + bool fixed_enabled; +}; + struct rsi_common { struct rsi_hw *priv; struct vif_priv vif_info[RSI_MAX_VIFS]; @@ -255,8 +262,8 @@ struct rsi_common { u8 channel_width; u16 rts_threshold; - u16 bitrate_mask[2]; - u32 fixedrate_mask[2]; + u32 bitrate_mask[RSI_MAX_BANDS]; + struct rsi_rate_config rate_config[RSI_MAX_BANDS]; u8 rf_reset; struct transmit_q_stats tx_stats; @@ -277,7 +284,6 @@ struct rsi_common { u8 mac_id; u8 radio_id; u16 rate_pwr[20]; - u16 min_rate; /* WMM algo related */ u8 selected_qnum; From cd5d6c70d28252c39e35bd74c4eb348019fd3e6e Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Thu, 16 Sep 2021 16:42:45 +0200 Subject: [PATCH 1414/5344] rsi: Fix module dev_oper_mode parameter description commit 31f97cf9f0c31143a2a6fcc89c4a1286ce20157e upstream. The module parameters are missing dev_oper_mode 12, BT classic alone, add it. Moreover, the parameters encode newlines, which ends up being printed malformed e.g. by modinfo, so fix that too. However, the module parameter string is duplicated in both USB and SDIO modules and the dev_oper_mode mode enumeration in those module parameters is a duplicate of macros used by the driver. Furthermore, the enumeration is confusing. So, deduplicate the module parameter string and use __stringify() to encode the correct mode enumeration values into the module parameter string. Finally, replace 'Wi-Fi' with 'Wi-Fi alone' and 'BT' with 'BT classic alone' to clarify what those modes really mean. Fixes: 898b255339310 ("rsi: add module parameter operating mode") Signed-off-by: Marek Vasut Cc: Amitkumar Karwar Cc: Angus Ainslie Cc: David S. Miller Cc: Jakub Kicinski Cc: Kalle Valo Cc: Karun Eagalapati Cc: Martin Fuzzey Cc: Martin Kepplinger Cc: Prameela Rani Garnepudi Cc: Sebastian Krzyszkowiak Cc: Siva Rebbagondla Cc: netdev@vger.kernel.org Cc: # 4.17+ Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210916144245.10181-1-marex@denx.de Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/rsi/rsi_91x_sdio.c | 5 +---- drivers/net/wireless/rsi/rsi_91x_usb.c | 5 +---- drivers/net/wireless/rsi/rsi_hal.h | 11 +++++++++++ 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/net/wireless/rsi/rsi_91x_sdio.c b/drivers/net/wireless/rsi/rsi_91x_sdio.c index d51ec7104fb7c..4fe837090cdae 100644 --- a/drivers/net/wireless/rsi/rsi_91x_sdio.c +++ b/drivers/net/wireless/rsi/rsi_91x_sdio.c @@ -24,10 +24,7 @@ /* Default operating mode is wlan STA + BT */ static u16 dev_oper_mode = DEV_OPMODE_STA_BT_DUAL; module_param(dev_oper_mode, ushort, 0444); -MODULE_PARM_DESC(dev_oper_mode, - "1[Wi-Fi], 4[BT], 8[BT LE], 5[Wi-Fi STA + BT classic]\n" - "9[Wi-Fi STA + BT LE], 13[Wi-Fi STA + BT classic + BT LE]\n" - "6[AP + BT classic], 14[AP + BT classic + BT LE]"); +MODULE_PARM_DESC(dev_oper_mode, DEV_OPMODE_PARAM_DESC); /** * rsi_sdio_set_cmd52_arg() - This function prepares cmd 52 read/write arg. diff --git a/drivers/net/wireless/rsi/rsi_91x_usb.c b/drivers/net/wireless/rsi/rsi_91x_usb.c index 1e5a2a0cc6700..68ce3d2bc5357 100644 --- a/drivers/net/wireless/rsi/rsi_91x_usb.c +++ b/drivers/net/wireless/rsi/rsi_91x_usb.c @@ -25,10 +25,7 @@ /* Default operating mode is wlan STA + BT */ static u16 dev_oper_mode = DEV_OPMODE_STA_BT_DUAL; module_param(dev_oper_mode, ushort, 0444); -MODULE_PARM_DESC(dev_oper_mode, - "1[Wi-Fi], 4[BT], 8[BT LE], 5[Wi-Fi STA + BT classic]\n" - "9[Wi-Fi STA + BT LE], 13[Wi-Fi STA + BT classic + BT LE]\n" - "6[AP + BT classic], 14[AP + BT classic + BT LE]"); +MODULE_PARM_DESC(dev_oper_mode, DEV_OPMODE_PARAM_DESC); static int rsi_rx_urb_submit(struct rsi_hw *adapter, u8 ep_num, gfp_t flags); diff --git a/drivers/net/wireless/rsi/rsi_hal.h b/drivers/net/wireless/rsi/rsi_hal.h index 46e36df9e8e3c..a2fbec1cec4c3 100644 --- a/drivers/net/wireless/rsi/rsi_hal.h +++ b/drivers/net/wireless/rsi/rsi_hal.h @@ -28,6 +28,17 @@ #define DEV_OPMODE_AP_BT 6 #define DEV_OPMODE_AP_BT_DUAL 14 +#define DEV_OPMODE_PARAM_DESC \ + __stringify(DEV_OPMODE_WIFI_ALONE) "[Wi-Fi alone], " \ + __stringify(DEV_OPMODE_BT_ALONE) "[BT classic alone], " \ + __stringify(DEV_OPMODE_BT_LE_ALONE) "[BT LE alone], " \ + __stringify(DEV_OPMODE_BT_DUAL) "[BT classic + BT LE alone], " \ + __stringify(DEV_OPMODE_STA_BT) "[Wi-Fi STA + BT classic], " \ + __stringify(DEV_OPMODE_STA_BT_LE) "[Wi-Fi STA + BT LE], " \ + __stringify(DEV_OPMODE_STA_BT_DUAL) "[Wi-Fi STA + BT classic + BT LE], " \ + __stringify(DEV_OPMODE_AP_BT) "[Wi-Fi AP + BT classic], " \ + __stringify(DEV_OPMODE_AP_BT_DUAL) "[Wi-Fi AP + BT classic + BT LE]" + #define FLASH_WRITE_CHUNK_SIZE (4 * 1024) #define FLASH_SECTOR_SIZE (4 * 1024) From 79ab4c3683a77cb4b7052db556d89b4f6bb2fdb4 Mon Sep 17 00:00:00 2001 From: Alok Prasad Date: Wed, 27 Oct 2021 18:43:29 +0000 Subject: [PATCH 1415/5344] RDMA/qedr: Fix NULL deref for query_qp on the GSI QP commit 4f960393a0ee9a39469ceb7c8077ae8db665cc12 upstream. This patch fixes a crash caused by querying the QP via netlink, and corrects the state of GSI qp. GSI qp's have a NULL qed_qp. The call trace is generated by: $ rdma res show BUG: kernel NULL pointer dereference, address: 0000000000000034 Hardware name: Dell Inc. PowerEdge R720/0M1GCR, BIOS 1.2.6 05/10/2012 RIP: 0010:qed_rdma_query_qp+0x33/0x1a0 [qed] RSP: 0018:ffffba560a08f580 EFLAGS: 00010206 RAX: 0000000200000000 RBX: ffffba560a08f5b8 RCX: 0000000000000000 RDX: ffffba560a08f5b8 RSI: 0000000000000000 RDI: ffff9807ee458090 RBP: ffffba560a08f5a0 R08: 0000000000000000 R09: ffff9807890e7048 R10: ffffba560a08f658 R11: 0000000000000000 R12: 0000000000000000 R13: ffff9807ee458090 R14: ffff9807f0afb000 R15: ffffba560a08f7ec FS: 00007fbbf8bfe740(0000) GS:ffff980aafa00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000034 CR3: 00000001720ba001 CR4: 00000000000606f0 Call Trace: qedr_query_qp+0x82/0x360 [qedr] ib_query_qp+0x34/0x40 [ib_core] ? ib_query_qp+0x34/0x40 [ib_core] fill_res_qp_entry_query.isra.26+0x47/0x1d0 [ib_core] ? __nla_put+0x20/0x30 ? nla_put+0x33/0x40 fill_res_qp_entry+0xe3/0x120 [ib_core] res_get_common_dumpit+0x3f8/0x5d0 [ib_core] ? fill_res_cm_id_entry+0x1f0/0x1f0 [ib_core] nldev_res_get_qp_dumpit+0x1a/0x20 [ib_core] netlink_dump+0x156/0x2f0 __netlink_dump_start+0x1ab/0x260 rdma_nl_rcv+0x1de/0x330 [ib_core] ? nldev_res_get_cm_id_dumpit+0x20/0x20 [ib_core] netlink_unicast+0x1b8/0x270 netlink_sendmsg+0x33e/0x470 sock_sendmsg+0x63/0x70 __sys_sendto+0x13f/0x180 ? setup_sgl.isra.12+0x70/0xc0 __x64_sys_sendto+0x28/0x30 do_syscall_64+0x3a/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae Cc: stable@vger.kernel.org Fixes: cecbcddf6461 ("qedr: Add support for QP verbs") Link: https://lore.kernel.org/r/20211027184329.18454-1-palok@marvell.com Signed-off-by: Ariel Elior Signed-off-by: Shai Malin Signed-off-by: Prabhakar Kushwaha Signed-off-by: Alok Prasad Signed-off-by: Jason Gunthorpe Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/hw/qedr/verbs.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index aaea511fd42e3..a15190fc887f1 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -2383,15 +2383,18 @@ int qedr_query_qp(struct ib_qp *ibqp, int rc = 0; memset(¶ms, 0, sizeof(params)); - - rc = dev->ops->rdma_query_qp(dev->rdma_ctx, qp->qed_qp, ¶ms); - if (rc) - goto err; - memset(qp_attr, 0, sizeof(*qp_attr)); memset(qp_init_attr, 0, sizeof(*qp_init_attr)); - qp_attr->qp_state = qedr_get_ibqp_state(params.state); + if (qp->qp_type != IB_QPT_GSI) { + rc = dev->ops->rdma_query_qp(dev->rdma_ctx, qp->qed_qp, ¶ms); + if (rc) + goto err; + qp_attr->qp_state = qedr_get_ibqp_state(params.state); + } else { + qp_attr->qp_state = qedr_get_ibqp_state(QED_ROCE_QP_STATE_RTS); + } + qp_attr->cur_qp_state = qedr_get_ibqp_state(params.state); qp_attr->path_mtu = ib_mtu_int_to_enum(params.mtu); qp_attr->path_mig_state = IB_MIG_MIGRATED; From 6ab45389826dccf4963d480f3555ab8094fb8f4d Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 1 Sep 2021 13:21:34 -0500 Subject: [PATCH 1416/5344] signal: Remove the bogus sigkill_pending in ptrace_stop commit 7d613f9f72ec8f90ddefcae038fdae5adb8404b3 upstream. The existence of sigkill_pending is a little silly as it is functionally a duplicate of fatal_signal_pending that is used in exactly one place. Checking for pending fatal signals and returning early in ptrace_stop is actively harmful. It casues the ptrace_stop called by ptrace_signal to return early before setting current->exit_code. Later when ptrace_signal reads the signal number from current->exit_code is undefined, making it unpredictable what will happen. Instead rely on the fact that schedule will not sleep if there is a pending signal that can awaken a task. Removing the explict sigkill_pending test fixes fixes ptrace_signal when ptrace_stop does not stop because current->exit_code is always set to to signr. Cc: stable@vger.kernel.org Fixes: 3d749b9e676b ("ptrace: simplify ptrace_stop()->sigkill_pending() path") Fixes: 1a669c2f16d4 ("Add arch_ptrace_stop") Link: https://lkml.kernel.org/r/87pmsyx29t.fsf@disp2133 Reviewed-by: Kees Cook Signed-off-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- kernel/signal.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/kernel/signal.c b/kernel/signal.c index 29fba42478771..3dbd45d6c2fb7 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2099,15 +2099,6 @@ static inline bool may_ptrace_stop(void) return true; } -/* - * Return non-zero if there is a SIGKILL that should be waking us up. - * Called with the siglock held. - */ -static bool sigkill_pending(struct task_struct *tsk) -{ - return sigismember(&tsk->pending.signal, SIGKILL) || - sigismember(&tsk->signal->shared_pending.signal, SIGKILL); -} /* * This must be called with current->sighand->siglock held. @@ -2134,17 +2125,16 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t * calling arch_ptrace_stop, so we must release it now. * To preserve proper semantics, we must do this before * any signal bookkeeping like checking group_stop_count. - * Meanwhile, a SIGKILL could come in before we retake the - * siglock. That must prevent us from sleeping in TASK_TRACED. - * So after regaining the lock, we must check for SIGKILL. */ spin_unlock_irq(¤t->sighand->siglock); arch_ptrace_stop(exit_code, info); spin_lock_irq(¤t->sighand->siglock); - if (sigkill_pending(current)) - return; } + /* + * schedule() will not sleep if there is a pending signal that + * can awaken the task. + */ set_special_state(TASK_TRACED); /* From 010014f7dedb35a5f3272be132c33aa8dc1b259d Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 20 Oct 2021 12:43:51 -0500 Subject: [PATCH 1417/5344] signal/mips: Update (_save|_restore)_fp_context to fail with -EFAULT commit 95bf9d646c3c3f95cb0be7e703b371db8da5be68 upstream. When an instruction to save or restore a register from the stack fails in _save_fp_context or _restore_fp_context return with -EFAULT. This change was made to r2300_fpu.S[1] but it looks like it got lost with the introduction of EX2[2]. This is also what the other implementation of _save_fp_context and _restore_fp_context in r4k_fpu.S does, and what is needed for the callers to be able to handle the error. Furthermore calling do_exit(SIGSEGV) from bad_stack is wrong because it does not terminate the entire process it just terminates a single thread. As the changed code was the only caller of arch/mips/kernel/syscall.c:bad_stack remove the problematic and now unused helper function. Cc: Thomas Bogendoerfer Cc: Maciej Rozycki Cc: linux-mips@vger.kernel.org [1] 35938a00ba86 ("MIPS: Fix ISA I FP sigcontext access violation handling") [2] f92722dc4545 ("MIPS: Correct MIPS I FP sigcontext layout") Cc: stable@vger.kernel.org Fixes: f92722dc4545 ("MIPS: Correct MIPS I FP sigcontext layout") Acked-by: Maciej W. Rozycki Acked-by: Thomas Bogendoerfer Link: https://lkml.kernel.org/r/20211020174406.17889-5-ebiederm@xmission.com Signed-off-by: Eric W. Biederman Signed-off-by: Greg Kroah-Hartman --- arch/mips/kernel/r2300_fpu.S | 4 ++-- arch/mips/kernel/syscall.c | 9 --------- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/arch/mips/kernel/r2300_fpu.S b/arch/mips/kernel/r2300_fpu.S index 12e58053544fc..cbf6db98cfb38 100644 --- a/arch/mips/kernel/r2300_fpu.S +++ b/arch/mips/kernel/r2300_fpu.S @@ -29,8 +29,8 @@ #define EX2(a,b) \ 9: a,##b; \ .section __ex_table,"a"; \ - PTR 9b,bad_stack; \ - PTR 9b+4,bad_stack; \ + PTR 9b,fault; \ + PTR 9b+4,fault; \ .previous .set mips1 diff --git a/arch/mips/kernel/syscall.c b/arch/mips/kernel/syscall.c index 3f16f38230310..9a28e79f2e66e 100644 --- a/arch/mips/kernel/syscall.c +++ b/arch/mips/kernel/syscall.c @@ -239,12 +239,3 @@ SYSCALL_DEFINE3(cachectl, char *, addr, int, nbytes, int, op) { return -ENOSYS; } - -/* - * If we ever come here the user sp is bad. Zap the process right away. - * Due to the bad stack signaling wouldn't work. - */ -asmlinkage void bad_stack(void) -{ - do_exit(SIGSEGV); -} From 29a56bcc7ec0ab87e90b51114a44b9588fbd358f Mon Sep 17 00:00:00 2001 From: Sebastian Krzyszkowiak Date: Tue, 14 Sep 2021 14:18:06 +0200 Subject: [PATCH 1418/5344] power: supply: max17042_battery: Prevent int underflow in set_soc_threshold commit e660dbb68c6b3f7b9eb8b9775846a44f9798b719 upstream. max17042_set_soc_threshold gets called with offset set to 1, which means that minimum threshold value would underflow once SOC got down to 0, causing invalid alerts from the gauge. Fixes: e5f3872d2044 ("max17042: Add support for signalling change in SOC") Cc: Signed-off-by: Sebastian Krzyszkowiak Reviewed-by: Krzysztof Kozlowski Signed-off-by: Sebastian Reichel Signed-off-by: Greg Kroah-Hartman --- drivers/power/supply/max17042_battery.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/power/supply/max17042_battery.c b/drivers/power/supply/max17042_battery.c index f8f8207a1895e..8fe67490366dc 100644 --- a/drivers/power/supply/max17042_battery.c +++ b/drivers/power/supply/max17042_battery.c @@ -834,7 +834,8 @@ static void max17042_set_soc_threshold(struct max17042_chip *chip, u16 off) regmap_read(map, MAX17042_RepSOC, &soc); soc >>= 8; soc_tr = (soc + off) << 8; - soc_tr |= (soc - off); + if (off < soc) + soc_tr |= soc - off; regmap_write(map, MAX17042_SALRT_Th, soc_tr); } From e0ee467ac27632168c638c7a0121b9ca3e6f31f0 Mon Sep 17 00:00:00 2001 From: Henrik Grimler Date: Wed, 29 Sep 2021 20:14:17 +0200 Subject: [PATCH 1419/5344] power: supply: max17042_battery: use VFSOC for capacity when no rsns commit 223a3b82834f036a62aa831f67cbf1f1d644c6e2 upstream. On Galaxy S3 (i9300/i9305), which has the max17047 fuel gauge and no current sense resistor (rsns), the RepSOC register does not provide an accurate state of charge value. The reported value is wrong, and does not change over time. VFSOC however, which uses the voltage fuel gauge to determine the state of charge, always shows an accurate value. For devices without current sense, VFSOC is already used for the soc-alert (0x0003 is written to MiscCFG register), so with this change the source of the alert and the PROP_CAPACITY value match. Fixes: 359ab9f5b154 ("power_supply: Add MAX17042 Fuel Gauge Driver") Cc: Reviewed-by: Krzysztof Kozlowski Suggested-by: Wolfgang Wiedmeyer Signed-off-by: Henrik Grimler Reviewed-by: Hans de Goede Signed-off-by: Sebastian Reichel Signed-off-by: Greg Kroah-Hartman --- drivers/power/supply/max17042_battery.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/power/supply/max17042_battery.c b/drivers/power/supply/max17042_battery.c index 8fe67490366dc..170639a1e734a 100644 --- a/drivers/power/supply/max17042_battery.c +++ b/drivers/power/supply/max17042_battery.c @@ -312,7 +312,10 @@ static int max17042_get_property(struct power_supply *psy, val->intval = data * 625 / 8; break; case POWER_SUPPLY_PROP_CAPACITY: - ret = regmap_read(map, MAX17042_RepSOC, &data); + if (chip->pdata->enable_current_sense) + ret = regmap_read(map, MAX17042_RepSOC, &data); + else + ret = regmap_read(map, MAX17042_VFSOC, &data); if (ret < 0) return ret; From 25abda2748085d4720115cda4c928bd44784a405 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 9 Nov 2021 01:30:44 +0000 Subject: [PATCH 1420/5344] KVM: nVMX: Query current VMCS when determining if MSR bitmaps are in use commit 7dfbc624eb5726367900c8d86deff50836240361 upstream. Check the current VMCS controls to determine if an MSR write will be intercepted due to MSR bitmaps being disabled. In the nested VMX case, KVM will disable MSR bitmaps in vmcs02 if they're disabled in vmcs12 or if KVM can't map L1's bitmaps for whatever reason. Note, the bad behavior is relatively benign in the current code base as KVM sets all bits in vmcs02's MSR bitmap by default, clears bits if and only if L0 KVM also disables interception of an MSR, and only uses the buggy helper for MSR_IA32_SPEC_CTRL. Because KVM explicitly tests WRMSR before disabling interception of MSR_IA32_SPEC_CTRL, the flawed check will only result in KVM reading MSR_IA32_SPEC_CTRL from hardware when it isn't strictly necessary. Tag the fix for stable in case a future fix wants to use msr_write_intercepted(), in which case a buggy implementation in older kernels could prove subtly problematic. Fixes: d28b387fb74d ("KVM/VMX: Allow direct access to MSR_IA32_SPEC_CTRL") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20211109013047.2041518-2-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx/vmx.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d32bd16bb3203..44058b8a3d842 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -785,15 +785,15 @@ void update_exception_bitmap(struct kvm_vcpu *vcpu) /* * Check if MSR is intercepted for currently loaded MSR bitmap. */ -static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) +static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr) { unsigned long *msr_bitmap; int f = sizeof(unsigned long); - if (!cpu_has_vmx_msr_bitmap()) + if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS)) return true; - msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap; + msr_bitmap = vmx->loaded_vmcs->msr_bitmap; if (msr <= 0x1fff) { return !!test_bit(msr, msr_bitmap + 0x800 / f); @@ -6572,7 +6572,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) * If the L02 MSR bitmap does not intercept the MSR, then we need to * save it. */ - if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) + if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))) vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0); From 71d6166678641eac04e6310ef2e1a765a70e91e8 Mon Sep 17 00:00:00 2001 From: Zhang Changzhong Date: Thu, 28 Oct 2021 22:38:25 +0800 Subject: [PATCH 1421/5344] can: j1939: j1939_tp_cmd_recv(): ignore abort message in the BAM transport commit c0f49d98006f2db3333b917caac65bce2af9865c upstream. This patch prevents BAM transport from being closed by receiving abort message, as specified in SAE-J1939-82 2015 (A.3.3 Row 4). Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Link: https://lore.kernel.org/all/1635431907-15617-2-git-send-email-zhangchangzhong@huawei.com Cc: stable@vger.kernel.org Signed-off-by: Zhang Changzhong Acked-by: Oleksij Rempel Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- net/can/j1939/transport.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 7a54170da09df..811682e06951b 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -2065,6 +2065,12 @@ static void j1939_tp_cmd_recv(struct j1939_priv *priv, struct sk_buff *skb) break; case J1939_ETP_CMD_ABORT: /* && J1939_TP_CMD_ABORT */ + if (j1939_cb_is_broadcast(skcb)) { + netdev_err_once(priv->ndev, "%s: abort to broadcast (%02x), ignoring!\n", + __func__, skcb->addr.sa); + return; + } + if (j1939_tp_im_transmitter(skcb)) j1939_xtp_rx_abort(priv, skb, true); From 69a0579e786ede106f3b2e943332ea2a69f8bda6 Mon Sep 17 00:00:00 2001 From: Zhang Changzhong Date: Thu, 28 Oct 2021 22:38:26 +0800 Subject: [PATCH 1422/5344] can: j1939: j1939_can_recv(): ignore messages with invalid source address commit a79305e156db3d24fcd8eb649cdb3c3b2350e5c2 upstream. According to SAE-J1939-82 2015 (A.3.6 Row 2), a receiver should never send TP.CM_CTS to the global address, so we can add a check in j1939_can_recv() to drop messages with invalid source address. Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Link: https://lore.kernel.org/all/1635431907-15617-3-git-send-email-zhangchangzhong@huawei.com Cc: stable@vger.kernel.org Signed-off-by: Zhang Changzhong Acked-by: Oleksij Rempel Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- net/can/j1939/main.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c index 266c189f1e809..ca75d1b8f415c 100644 --- a/net/can/j1939/main.c +++ b/net/can/j1939/main.c @@ -75,6 +75,13 @@ static void j1939_can_recv(struct sk_buff *iskb, void *data) skcb->addr.pgn = (cf->can_id >> 8) & J1939_PGN_MAX; /* set default message type */ skcb->addr.type = J1939_TP; + + if (!j1939_address_is_valid(skcb->addr.sa)) { + netdev_err_once(priv->ndev, "%s: sa is broadcast address, ignoring!\n", + __func__); + goto done; + } + if (j1939_pgn_is_pdu1(skcb->addr.pgn)) { /* Type 1: with destination address */ skcb->addr.da = skcb->addr.pgn; From c42d77f40bd547817da8c2d906f7c236241378cb Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Wed, 29 Sep 2021 11:36:45 +0800 Subject: [PATCH 1423/5344] powerpc/85xx: Fix oops when mpc85xx_smp_guts_ids node cannot be found commit 3c2172c1c47b4079c29f0e6637d764a99355ebcd upstream. When the field described in mpc85xx_smp_guts_ids[] is not configured in dtb, the mpc85xx_setup_pmc() does not assign a value to the "guts" variable. As a result, the oops is triggered when mpc85xx_freeze_time_base() is executed. Fixes: 56f1ba280719 ("powerpc/mpc85xx: refactor the PM operations") Cc: stable@vger.kernel.org # v4.6+ Signed-off-by: Xiaoming Ni Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210929033646.39630-2-nixiaoming@huawei.com Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/platforms/85xx/mpc85xx_pm_ops.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/85xx/mpc85xx_pm_ops.c b/arch/powerpc/platforms/85xx/mpc85xx_pm_ops.c index 7c0133f558d02..ffa8a7a6a2db7 100644 --- a/arch/powerpc/platforms/85xx/mpc85xx_pm_ops.c +++ b/arch/powerpc/platforms/85xx/mpc85xx_pm_ops.c @@ -94,9 +94,8 @@ int __init mpc85xx_setup_pmc(void) pr_err("Could not map guts node address\n"); return -ENOMEM; } + qoriq_pm_ops = &mpc85xx_pm_ops; } - qoriq_pm_ops = &mpc85xx_pm_ops; - return 0; } From 5acffc73c5f0ef89f3ca8d0fc0a68a53f60d4366 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Sat, 2 Oct 2021 15:09:00 +0200 Subject: [PATCH 1424/5344] serial: core: Fix initializing and restoring termios speed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 027b57170bf8bb6999a28e4a5f3d78bf1db0f90c upstream. Since commit edc6afc54968 ("tty: switch to ktermios and new framework") termios speed is no longer stored only in c_cflag member but also in new additional c_ispeed and c_ospeed members. If BOTHER flag is set in c_cflag then termios speed is stored only in these new members. Therefore to correctly restore termios speed it is required to store also ispeed and ospeed members, not only cflag member. In case only cflag member with BOTHER flag is restored then functions tty_termios_baud_rate() and tty_termios_input_baud_rate() returns baudrate stored in c_ospeed / c_ispeed member, which is zero as it was not restored too. If reported baudrate is invalid (e.g. zero) then serial core functions report fallback baudrate value 9600. So it means that in this case original baudrate is lost and kernel changes it to value 9600. Simple reproducer of this issue is to boot kernel with following command line argument: "console=ttyXXX,86400" (where ttyXXX is the device name). For speed 86400 there is no Bnnn constant and therefore kernel has to represent this speed via BOTHER c_cflag. Which means that speed is stored only in c_ospeed and c_ispeed members, not in c_cflag anymore. If bootloader correctly configures serial device to speed 86400 then kernel prints boot log to early console at speed speed 86400 without any issue. But after kernel starts initializing real console device ttyXXX then speed is changed to fallback value 9600 because information about speed was lost. This patch fixes above issue by storing and restoring also ispeed and ospeed members, which are required for BOTHER flag. Fixes: edc6afc54968 ("[PATCH] tty: switch to ktermios and new framework") Cc: stable@vger.kernel.org Signed-off-by: Pali Rohár Link: https://lore.kernel.org/r/20211002130900.9518-1-pali@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_core.c | 16 ++++++++++++++-- include/linux/console.h | 2 ++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index fa3bd8a97b244..38ee13bbcab81 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -220,7 +220,11 @@ static int uart_port_startup(struct tty_struct *tty, struct uart_state *state, if (retval == 0) { if (uart_console(uport) && uport->cons->cflag) { tty->termios.c_cflag = uport->cons->cflag; + tty->termios.c_ispeed = uport->cons->ispeed; + tty->termios.c_ospeed = uport->cons->ospeed; uport->cons->cflag = 0; + uport->cons->ispeed = 0; + uport->cons->ospeed = 0; } /* * Initialise the hardware port settings. @@ -288,8 +292,11 @@ static void uart_shutdown(struct tty_struct *tty, struct uart_state *state) /* * Turn off DTR and RTS early. */ - if (uport && uart_console(uport) && tty) + if (uport && uart_console(uport) && tty) { uport->cons->cflag = tty->termios.c_cflag; + uport->cons->ispeed = tty->termios.c_ispeed; + uport->cons->ospeed = tty->termios.c_ospeed; + } if (!tty || C_HUPCL(tty)) uart_port_dtr_rts(uport, 0); @@ -2110,8 +2117,11 @@ uart_set_options(struct uart_port *port, struct console *co, * Allow the setting of the UART parameters with a NULL console * too: */ - if (co) + if (co) { co->cflag = termios.c_cflag; + co->ispeed = termios.c_ispeed; + co->ospeed = termios.c_ospeed; + } return 0; } @@ -2245,6 +2255,8 @@ int uart_resume_port(struct uart_driver *drv, struct uart_port *uport) */ memset(&termios, 0, sizeof(struct ktermios)); termios.c_cflag = uport->cons->cflag; + termios.c_ispeed = uport->cons->ispeed; + termios.c_ospeed = uport->cons->ospeed; /* * If that's unset, use the tty termios setting. diff --git a/include/linux/console.h b/include/linux/console.h index d09951d5a94e4..d1d03c9c7a512 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -153,6 +153,8 @@ struct console { short flags; short index; int cflag; + uint ispeed; + uint ospeed; void *data; struct console *next; }; From 04e1b3114a4cf5ec4f7e0139a7740c0365afad3f Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 20 Oct 2021 18:48:46 +0200 Subject: [PATCH 1425/5344] ALSA: mixer: oss: Fix racy access to slots commit 411cef6adfb38a5bb6bd9af3941b28198e7fb680 upstream. The OSS mixer can reassign the mapping slots dynamically via proc file. Although the addition and deletion of those slots are protected by mixer->reg_mutex, the access to slots aren't, hence this may cause UAF when the slots in use are deleted concurrently. This patch applies the mixer->reg_mutex in all appropriate code paths (i.e. the ioctl functions) that may access slots. Reported-by: syzbot+9988f17cf72a1045a189@syzkaller.appspotmail.com Reviewed-by: Jaroslav Kysela Cc: Link: https://lore.kernel.org/r/00000000000036adc005ceca9175@google.com Link: https://lore.kernel.org/r/20211020164846.922-1-tiwai@suse.de Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/oss/mixer_oss.c | 43 +++++++++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/sound/core/oss/mixer_oss.c b/sound/core/oss/mixer_oss.c index 7eb54df5556df..fd567611f67ed 100644 --- a/sound/core/oss/mixer_oss.c +++ b/sound/core/oss/mixer_oss.c @@ -130,11 +130,13 @@ static int snd_mixer_oss_devmask(struct snd_mixer_oss_file *fmixer) if (mixer == NULL) return -EIO; + mutex_lock(&mixer->reg_mutex); for (chn = 0; chn < 31; chn++) { pslot = &mixer->slots[chn]; if (pslot->put_volume || pslot->put_recsrc) result |= 1 << chn; } + mutex_unlock(&mixer->reg_mutex); return result; } @@ -146,11 +148,13 @@ static int snd_mixer_oss_stereodevs(struct snd_mixer_oss_file *fmixer) if (mixer == NULL) return -EIO; + mutex_lock(&mixer->reg_mutex); for (chn = 0; chn < 31; chn++) { pslot = &mixer->slots[chn]; if (pslot->put_volume && pslot->stereo) result |= 1 << chn; } + mutex_unlock(&mixer->reg_mutex); return result; } @@ -161,6 +165,7 @@ static int snd_mixer_oss_recmask(struct snd_mixer_oss_file *fmixer) if (mixer == NULL) return -EIO; + mutex_lock(&mixer->reg_mutex); if (mixer->put_recsrc && mixer->get_recsrc) { /* exclusive */ result = mixer->mask_recsrc; } else { @@ -172,6 +177,7 @@ static int snd_mixer_oss_recmask(struct snd_mixer_oss_file *fmixer) result |= 1 << chn; } } + mutex_unlock(&mixer->reg_mutex); return result; } @@ -182,11 +188,12 @@ static int snd_mixer_oss_get_recsrc(struct snd_mixer_oss_file *fmixer) if (mixer == NULL) return -EIO; + mutex_lock(&mixer->reg_mutex); if (mixer->put_recsrc && mixer->get_recsrc) { /* exclusive */ - int err; unsigned int index; - if ((err = mixer->get_recsrc(fmixer, &index)) < 0) - return err; + result = mixer->get_recsrc(fmixer, &index); + if (result < 0) + goto unlock; result = 1 << index; } else { struct snd_mixer_oss_slot *pslot; @@ -201,7 +208,10 @@ static int snd_mixer_oss_get_recsrc(struct snd_mixer_oss_file *fmixer) } } } - return mixer->oss_recsrc = result; + mixer->oss_recsrc = result; + unlock: + mutex_unlock(&mixer->reg_mutex); + return result; } static int snd_mixer_oss_set_recsrc(struct snd_mixer_oss_file *fmixer, int recsrc) @@ -214,6 +224,7 @@ static int snd_mixer_oss_set_recsrc(struct snd_mixer_oss_file *fmixer, int recsr if (mixer == NULL) return -EIO; + mutex_lock(&mixer->reg_mutex); if (mixer->get_recsrc && mixer->put_recsrc) { /* exclusive input */ if (recsrc & ~mixer->oss_recsrc) recsrc &= ~mixer->oss_recsrc; @@ -239,6 +250,7 @@ static int snd_mixer_oss_set_recsrc(struct snd_mixer_oss_file *fmixer, int recsr } } } + mutex_unlock(&mixer->reg_mutex); return result; } @@ -250,6 +262,7 @@ static int snd_mixer_oss_get_volume(struct snd_mixer_oss_file *fmixer, int slot) if (mixer == NULL || slot > 30) return -EIO; + mutex_lock(&mixer->reg_mutex); pslot = &mixer->slots[slot]; left = pslot->volume[0]; right = pslot->volume[1]; @@ -257,15 +270,21 @@ static int snd_mixer_oss_get_volume(struct snd_mixer_oss_file *fmixer, int slot) result = pslot->get_volume(fmixer, pslot, &left, &right); if (!pslot->stereo) right = left; - if (snd_BUG_ON(left < 0 || left > 100)) - return -EIO; - if (snd_BUG_ON(right < 0 || right > 100)) - return -EIO; + if (snd_BUG_ON(left < 0 || left > 100)) { + result = -EIO; + goto unlock; + } + if (snd_BUG_ON(right < 0 || right > 100)) { + result = -EIO; + goto unlock; + } if (result >= 0) { pslot->volume[0] = left; pslot->volume[1] = right; result = (left & 0xff) | ((right & 0xff) << 8); } + unlock: + mutex_unlock(&mixer->reg_mutex); return result; } @@ -278,6 +297,7 @@ static int snd_mixer_oss_set_volume(struct snd_mixer_oss_file *fmixer, if (mixer == NULL || slot > 30) return -EIO; + mutex_lock(&mixer->reg_mutex); pslot = &mixer->slots[slot]; if (left > 100) left = 100; @@ -288,10 +308,13 @@ static int snd_mixer_oss_set_volume(struct snd_mixer_oss_file *fmixer, if (pslot->put_volume) result = pslot->put_volume(fmixer, pslot, left, right); if (result < 0) - return result; + goto unlock; pslot->volume[0] = left; pslot->volume[1] = right; - return (left & 0xff) | ((right & 0xff) << 8); + result = (left & 0xff) | ((right & 0xff) << 8); + unlock: + mutex_lock(&mixer->reg_mutex); + return result; } static int snd_mixer_oss_ioctl1(struct snd_mixer_oss_file *fmixer, unsigned int cmd, unsigned long arg) From f1fb04f32b431cfc495d1745b2099d4b3ed53f36 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Sun, 24 Oct 2021 17:03:15 +0300 Subject: [PATCH 1426/5344] ALSA: mixer: fix deadlock in snd_mixer_oss_set_volume commit 3ab7992018455ac63c33e9b3eaa7264e293e40f4 upstream. In commit 411cef6adfb3 ("ALSA: mixer: oss: Fix racy access to slots") added mutex protection in snd_mixer_oss_set_volume(). Second mutex_lock() in same function looks like typo, fix it. Reported-by: syzbot+ace149a75a9a0a399ac7@syzkaller.appspotmail.com Fixes: 411cef6adfb3 ("ALSA: mixer: oss: Fix racy access to slots") Cc: Signed-off-by: Pavel Skripkin Link: https://lore.kernel.org/r/20211024140315.16704-1-paskripkin@gmail.com Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/oss/mixer_oss.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/core/oss/mixer_oss.c b/sound/core/oss/mixer_oss.c index fd567611f67ed..50ec8b8ff68c9 100644 --- a/sound/core/oss/mixer_oss.c +++ b/sound/core/oss/mixer_oss.c @@ -313,7 +313,7 @@ static int snd_mixer_oss_set_volume(struct snd_mixer_oss_file *fmixer, pslot->volume[1] = right; result = (left & 0xff) | ((right & 0xff) << 8); unlock: - mutex_lock(&mixer->reg_mutex); + mutex_unlock(&mixer->reg_mutex); return result; } From bbf68008e84eb43d0c4f5c6a9657a54c84a8a0c3 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 2 Nov 2021 10:19:44 +0100 Subject: [PATCH 1427/5344] xen/balloon: add late_initcall_sync() for initial ballooning done MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 40fdea0284bb20814399da0484a658a96c735d90 upstream. When running as PVH or HVM guest with actual memory < max memory the hypervisor is using "populate on demand" in order to allow the guest to balloon down from its maximum memory size. For this to work correctly the guest must not touch more memory pages than its target memory size as otherwise the PoD cache will be exhausted and the guest is crashed as a result of that. In extreme cases ballooning down might not be finished today before the init process is started, which can consume lots of memory. In order to avoid random boot crashes in such cases, add a late init call to wait for ballooning down having finished for PVH/HVM guests. Warn on console if initial ballooning fails, panic() after stalling for more than 3 minutes per default. Add a module parameter for changing this timeout. [boris: replaced pr_info() with pr_notice()] Cc: Reported-by: Marek Marczykowski-Górecki Signed-off-by: Juergen Gross Link: https://lore.kernel.org/r/20211102091944.17487-1-jgross@suse.com Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky Signed-off-by: Greg Kroah-Hartman --- .../admin-guide/kernel-parameters.txt | 7 ++ drivers/xen/balloon.c | 86 ++++++++++++++----- 2 files changed, 70 insertions(+), 23 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 87443ffe7e5e4..a8b3f8d244942 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5504,6 +5504,13 @@ as generic guest with no PV drivers. Currently support XEN HVM, KVM, HYPER_V and VMWARE guest. + xen.balloon_boot_timeout= [XEN] + The time (in seconds) to wait before giving up to boot + in case initial ballooning fails to free enough memory. + Applies only when running as HVM or PVH guest and + started with less memory configured than allowed at + max. Default is 180. + xen.event_eoi_delay= [XEN] How long to delay EOI handling in case of event storms (jiffies). Default is 10. diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 07f362c63ae90..39bbb127852f9 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -58,6 +58,7 @@ #include #include #include +#include #include #include @@ -75,6 +76,12 @@ #include #include +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "xen." + +static uint __read_mostly balloon_boot_timeout = 180; +module_param(balloon_boot_timeout, uint, 0444); + static int xen_hotplug_unpopulated; #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG @@ -127,12 +134,12 @@ static struct ctl_table xen_root[] = { * BP_ECANCELED: error, balloon operation canceled. */ -enum bp_state { +static enum bp_state { BP_DONE, BP_WAIT, BP_EAGAIN, BP_ECANCELED -}; +} balloon_state = BP_DONE; /* Main waiting point for xen-balloon thread. */ static DECLARE_WAIT_QUEUE_HEAD(balloon_thread_wq); @@ -201,18 +208,15 @@ static struct page *balloon_next_page(struct page *page) return list_entry(next, struct page, lru); } -static enum bp_state update_schedule(enum bp_state state) +static void update_schedule(void) { - if (state == BP_WAIT) - return BP_WAIT; - - if (state == BP_ECANCELED) - return BP_ECANCELED; + if (balloon_state == BP_WAIT || balloon_state == BP_ECANCELED) + return; - if (state == BP_DONE) { + if (balloon_state == BP_DONE) { balloon_stats.schedule_delay = 1; balloon_stats.retry_count = 1; - return BP_DONE; + return; } ++balloon_stats.retry_count; @@ -221,7 +225,8 @@ static enum bp_state update_schedule(enum bp_state state) balloon_stats.retry_count > balloon_stats.max_retry_count) { balloon_stats.schedule_delay = 1; balloon_stats.retry_count = 1; - return BP_ECANCELED; + balloon_state = BP_ECANCELED; + return; } balloon_stats.schedule_delay <<= 1; @@ -229,7 +234,7 @@ static enum bp_state update_schedule(enum bp_state state) if (balloon_stats.schedule_delay > balloon_stats.max_schedule_delay) balloon_stats.schedule_delay = balloon_stats.max_schedule_delay; - return BP_EAGAIN; + balloon_state = BP_EAGAIN; } #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG @@ -511,9 +516,9 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) * Stop waiting if either state is BP_DONE and ballooning action is * needed, or if the credit has changed while state is not BP_DONE. */ -static bool balloon_thread_cond(enum bp_state state, long credit) +static bool balloon_thread_cond(long credit) { - if (state == BP_DONE) + if (balloon_state == BP_DONE) credit = 0; return current_credit() != credit || kthread_should_stop(); @@ -527,13 +532,12 @@ static bool balloon_thread_cond(enum bp_state state, long credit) */ static int balloon_thread(void *unused) { - enum bp_state state = BP_DONE; long credit; unsigned long timeout; set_freezable(); for (;;) { - switch (state) { + switch (balloon_state) { case BP_DONE: case BP_ECANCELED: timeout = 3600 * HZ; @@ -549,7 +553,7 @@ static int balloon_thread(void *unused) credit = current_credit(); wait_event_freezable_timeout(balloon_thread_wq, - balloon_thread_cond(state, credit), timeout); + balloon_thread_cond(credit), timeout); if (kthread_should_stop()) return 0; @@ -560,22 +564,23 @@ static int balloon_thread(void *unused) if (credit > 0) { if (balloon_is_inflated()) - state = increase_reservation(credit); + balloon_state = increase_reservation(credit); else - state = reserve_additional_memory(); + balloon_state = reserve_additional_memory(); } if (credit < 0) { long n_pages; n_pages = min(-credit, si_mem_available()); - state = decrease_reservation(n_pages, GFP_BALLOON); - if (state == BP_DONE && n_pages != -credit && + balloon_state = decrease_reservation(n_pages, + GFP_BALLOON); + if (balloon_state == BP_DONE && n_pages != -credit && n_pages < totalreserve_pages) - state = BP_EAGAIN; + balloon_state = BP_EAGAIN; } - state = update_schedule(state); + update_schedule(); mutex_unlock(&balloon_mutex); @@ -782,3 +787,38 @@ static int __init balloon_init(void) return 0; } subsys_initcall(balloon_init); + +static int __init balloon_wait_finish(void) +{ + long credit, last_credit = 0; + unsigned long last_changed = 0; + + if (!xen_domain()) + return -ENODEV; + + /* PV guests don't need to wait. */ + if (xen_pv_domain() || !current_credit()) + return 0; + + pr_notice("Waiting for initial ballooning down having finished.\n"); + + while ((credit = current_credit()) < 0) { + if (credit != last_credit) { + last_changed = jiffies; + last_credit = credit; + } + if (balloon_state == BP_ECANCELED) { + pr_warn_once("Initial ballooning failed, %ld pages need to be freed.\n", + -credit); + if (jiffies - last_changed >= HZ * balloon_boot_timeout) + panic("Initial ballooning failed!\n"); + } + + schedule_timeout_interruptible(HZ / 10); + } + + pr_notice("Initial ballooning down finished.\n"); + + return 0; +} +late_initcall_sync(balloon_wait_finish); From 42f3e80c86d25d5f9157576b42b310719b6596d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Thu, 28 Oct 2021 20:56:53 +0200 Subject: [PATCH 1428/5344] PCI: pci-bridge-emul: Fix emulation of W1C bits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 7a41ae80bdcb17e14dd7d83239b8a0cf368f18be upstream. The pci_bridge_emul_conf_write() function correctly clears W1C bits in cfgspace cache, but it does not inform the underlying implementation about the clear request: the .write_op() method is given the value with these bits cleared. This is wrong if the .write_op() needs to know which bits were requested to be cleared. Fix the value to be passed into the .write_op() method to have requested W1C bits set, so that it can clear them. Both pci-bridge-emul users (mvebu and aardvark) are compatible with this change. Link: https://lore.kernel.org/r/20211028185659.20329-2-kabel@kernel.org Fixes: 23a5fba4d941 ("PCI: Introduce PCI bridge emulated config space common logic") Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org Cc: Russell King Signed-off-by: Greg Kroah-Hartman --- drivers/pci/pci-bridge-emul.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/pci/pci-bridge-emul.c b/drivers/pci/pci-bridge-emul.c index 06c800595e036..b3d63e319bb39 100644 --- a/drivers/pci/pci-bridge-emul.c +++ b/drivers/pci/pci-bridge-emul.c @@ -432,8 +432,21 @@ int pci_bridge_emul_conf_write(struct pci_bridge_emul *bridge, int where, /* Clear the W1C bits */ new &= ~((value << shift) & (behavior[reg / 4].w1c & mask)); + /* Save the new value with the cleared W1C bits into the cfgspace */ cfgspace[reg / 4] = cpu_to_le32(new); + /* + * Clear the W1C bits not specified by the write mask, so that the + * write_op() does not clear them. + */ + new &= ~(behavior[reg / 4].w1c & ~mask); + + /* + * Set the W1C bits specified by the write mask, so that write_op() + * knows about that they are to be cleared. + */ + new |= (value << shift) & (behavior[reg / 4].w1c & mask); + if (write_op) write_op(bridge, reg, old, new, mask); From eca443e63f497c474e47d52c28800d7923aa05d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Tue, 5 Oct 2021 20:09:45 +0200 Subject: [PATCH 1429/5344] PCI: aardvark: Do not clear status bits of masked interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit a7ca6d7fa3c02c032db5440ff392d96c04684c21 upstream. The PCIE_ISR1_REG says which interrupts are currently set / active, including those which are masked. The driver currently reads this register and looks if some unmasked interrupts are active, and if not, it clears status bits of _all_ interrupts, including the masked ones. This is incorrect, since, for example, some drivers may poll these bits. Remove this clearing, and also remove this early return statement completely, since it does not change functionality in any way. Link: https://lore.kernel.org/r/20211005180952.6812-7-kabel@kernel.org Fixes: 8c39d710363c ("PCI: aardvark: Add Aardvark PCI host controller driver") Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marek Behún Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-aardvark.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 18753fd218a31..bb53fcd638568 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -1055,12 +1055,6 @@ static void advk_pcie_handle_int(struct advk_pcie *pcie) isr1_mask = advk_readl(pcie, PCIE_ISR1_MASK_REG); isr1_status = isr1_val & ((~isr1_mask) & PCIE_ISR1_ALL_MASK); - if (!isr0_status && !isr1_status) { - advk_writel(pcie, isr0_val, PCIE_ISR0_REG); - advk_writel(pcie, isr1_val, PCIE_ISR1_REG); - return; - } - /* Process MSI interrupts */ if (isr0_status & PCIE_ISR0_MSI_INT_PENDING) advk_pcie_handle_msi(pcie); From 5eb4351643d611d9a25f74e6884f9d8ec9a9711f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Tue, 5 Oct 2021 20:09:51 +0200 Subject: [PATCH 1430/5344] PCI: aardvark: Fix checking for link up via LTSSM state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 661c399a651c11aaf83c45cbfe0b4a1fb7bc3179 upstream. Current implementation of advk_pcie_link_up() is wrong as it marks also link disabled or hot reset states as link up. Fix it by marking link up only to those states which are defined in PCIe Base specification 3.0, Table 4-14: Link Status Mapped to the LTSSM. To simplify implementation, Define macros for every LTSSM state which aardvark hardware can return in CFG_REG register. Fix also checking for link training according to the same Table 4-14. Define a new function advk_pcie_link_training() for this purpose. Link: https://lore.kernel.org/r/20211005180952.6812-13-kabel@kernel.org Fixes: 8c39d710363c ("PCI: aardvark: Add Aardvark PCI host controller driver") Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marek Behún Cc: stable@vger.kernel.org Cc: Remi Pommarel Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-aardvark.c | 76 ++++++++++++++++++++++++--- 1 file changed, 70 insertions(+), 6 deletions(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index bb53fcd638568..a5b0a952ee6d2 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -126,8 +126,50 @@ #define CFG_REG (LMI_BASE_ADDR + 0x0) #define LTSSM_SHIFT 24 #define LTSSM_MASK 0x3f -#define LTSSM_L0 0x10 #define RC_BAR_CONFIG 0x300 + +/* LTSSM values in CFG_REG */ +enum { + LTSSM_DETECT_QUIET = 0x0, + LTSSM_DETECT_ACTIVE = 0x1, + LTSSM_POLLING_ACTIVE = 0x2, + LTSSM_POLLING_COMPLIANCE = 0x3, + LTSSM_POLLING_CONFIGURATION = 0x4, + LTSSM_CONFIG_LINKWIDTH_START = 0x5, + LTSSM_CONFIG_LINKWIDTH_ACCEPT = 0x6, + LTSSM_CONFIG_LANENUM_ACCEPT = 0x7, + LTSSM_CONFIG_LANENUM_WAIT = 0x8, + LTSSM_CONFIG_COMPLETE = 0x9, + LTSSM_CONFIG_IDLE = 0xa, + LTSSM_RECOVERY_RCVR_LOCK = 0xb, + LTSSM_RECOVERY_SPEED = 0xc, + LTSSM_RECOVERY_RCVR_CFG = 0xd, + LTSSM_RECOVERY_IDLE = 0xe, + LTSSM_L0 = 0x10, + LTSSM_RX_L0S_ENTRY = 0x11, + LTSSM_RX_L0S_IDLE = 0x12, + LTSSM_RX_L0S_FTS = 0x13, + LTSSM_TX_L0S_ENTRY = 0x14, + LTSSM_TX_L0S_IDLE = 0x15, + LTSSM_TX_L0S_FTS = 0x16, + LTSSM_L1_ENTRY = 0x17, + LTSSM_L1_IDLE = 0x18, + LTSSM_L2_IDLE = 0x19, + LTSSM_L2_TRANSMIT_WAKE = 0x1a, + LTSSM_DISABLED = 0x20, + LTSSM_LOOPBACK_ENTRY_MASTER = 0x21, + LTSSM_LOOPBACK_ACTIVE_MASTER = 0x22, + LTSSM_LOOPBACK_EXIT_MASTER = 0x23, + LTSSM_LOOPBACK_ENTRY_SLAVE = 0x24, + LTSSM_LOOPBACK_ACTIVE_SLAVE = 0x25, + LTSSM_LOOPBACK_EXIT_SLAVE = 0x26, + LTSSM_HOT_RESET = 0x27, + LTSSM_RECOVERY_EQUALIZATION_PHASE0 = 0x28, + LTSSM_RECOVERY_EQUALIZATION_PHASE1 = 0x29, + LTSSM_RECOVERY_EQUALIZATION_PHASE2 = 0x2a, + LTSSM_RECOVERY_EQUALIZATION_PHASE3 = 0x2b, +}; + #define VENDOR_ID_REG (LMI_BASE_ADDR + 0x44) /* PCIe core controller registers */ @@ -219,13 +261,35 @@ static inline u32 advk_readl(struct advk_pcie *pcie, u64 reg) return readl(pcie->base + reg); } -static int advk_pcie_link_up(struct advk_pcie *pcie) +static u8 advk_pcie_ltssm_state(struct advk_pcie *pcie) { - u32 val, ltssm_state; + u32 val; + u8 ltssm_state; val = advk_readl(pcie, CFG_REG); ltssm_state = (val >> LTSSM_SHIFT) & LTSSM_MASK; - return ltssm_state >= LTSSM_L0; + return ltssm_state; +} + +static inline bool advk_pcie_link_up(struct advk_pcie *pcie) +{ + /* check if LTSSM is in normal operation - some L* state */ + u8 ltssm_state = advk_pcie_ltssm_state(pcie); + return ltssm_state >= LTSSM_L0 && ltssm_state < LTSSM_DISABLED; +} + +static inline bool advk_pcie_link_training(struct advk_pcie *pcie) +{ + /* + * According to PCIe Base specification 3.0, Table 4-14: Link + * Status Mapped to the LTSSM is Link Training mapped to LTSSM + * Configuration and Recovery states. + */ + u8 ltssm_state = advk_pcie_ltssm_state(pcie); + return ((ltssm_state >= LTSSM_CONFIG_LINKWIDTH_START && + ltssm_state < LTSSM_L0) || + (ltssm_state >= LTSSM_RECOVERY_EQUALIZATION_PHASE0 && + ltssm_state <= LTSSM_RECOVERY_EQUALIZATION_PHASE3)); } static int advk_pcie_wait_for_link(struct advk_pcie *pcie) @@ -252,7 +316,7 @@ static void advk_pcie_wait_for_retrain(struct advk_pcie *pcie) size_t retries; for (retries = 0; retries < RETRAIN_WAIT_MAX_RETRIES; ++retries) { - if (!advk_pcie_link_up(pcie)) + if (advk_pcie_link_training(pcie)) break; udelay(RETRAIN_WAIT_USLEEP_US); } @@ -516,7 +580,7 @@ advk_pci_bridge_emul_pcie_conf_read(struct pci_bridge_emul *bridge, /* u32 contains both PCI_EXP_LNKCTL and PCI_EXP_LNKSTA */ u32 val = advk_readl(pcie, PCIE_CORE_PCIEXP_CAP + reg) & ~(PCI_EXP_LNKSTA_LT << 16); - if (!advk_pcie_link_up(pcie)) + if (advk_pcie_link_training(pcie)) val |= (PCI_EXP_LNKSTA_LT << 16); *value = val; return PCI_BRIDGE_EMUL_HANDLED; From bc590f1c7511444ab85ee8738d0891d5652664c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Tue, 5 Oct 2021 20:09:46 +0200 Subject: [PATCH 1431/5344] PCI: aardvark: Do not unmask unused interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 1fb95d7d3c7a926b002fe8a6bd27a1cb428b46dc upstream. There are lot of undocumented interrupt bits. To prevent unwanted spurious interrupts, fix all *_ALL_MASK macros to define all interrupt bits, so that driver can properly mask all interrupts, including those which are undocumented. Link: https://lore.kernel.org/r/20211005180952.6812-8-kabel@kernel.org Fixes: 8c39d710363c ("PCI: aardvark: Add Aardvark PCI host controller driver") Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marek Behún Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-aardvark.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index a5b0a952ee6d2..88be131fd22db 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -108,13 +108,13 @@ #define PCIE_ISR0_MSI_INT_PENDING BIT(24) #define PCIE_ISR0_INTX_ASSERT(val) BIT(16 + (val)) #define PCIE_ISR0_INTX_DEASSERT(val) BIT(20 + (val)) -#define PCIE_ISR0_ALL_MASK GENMASK(26, 0) +#define PCIE_ISR0_ALL_MASK GENMASK(31, 0) #define PCIE_ISR1_REG (CONTROL_BASE_ADDR + 0x48) #define PCIE_ISR1_MASK_REG (CONTROL_BASE_ADDR + 0x4C) #define PCIE_ISR1_POWER_STATE_CHANGE BIT(4) #define PCIE_ISR1_FLUSH BIT(5) #define PCIE_ISR1_INTX_ASSERT(val) BIT(8 + (val)) -#define PCIE_ISR1_ALL_MASK GENMASK(11, 4) +#define PCIE_ISR1_ALL_MASK GENMASK(31, 0) #define PCIE_MSI_ADDR_LOW_REG (CONTROL_BASE_ADDR + 0x50) #define PCIE_MSI_ADDR_HIGH_REG (CONTROL_BASE_ADDR + 0x54) #define PCIE_MSI_STATUS_REG (CONTROL_BASE_ADDR + 0x58) @@ -202,7 +202,7 @@ enum { #define PCIE_IRQ_MSI_INT2_DET BIT(21) #define PCIE_IRQ_RC_DBELL_DET BIT(22) #define PCIE_IRQ_EP_STATUS BIT(23) -#define PCIE_IRQ_ALL_MASK 0xfff0fb +#define PCIE_IRQ_ALL_MASK GENMASK(31, 0) #define PCIE_IRQ_ENABLE_INTS_MASK PCIE_IRQ_CORE_INT /* Transaction types */ From 905d2d4d95cfb718194bd37d513347833f810369 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Tue, 5 Oct 2021 20:09:52 +0200 Subject: [PATCH 1432/5344] PCI: aardvark: Fix reporting Data Link Layer Link Active MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 2b650b7ff20eb7ea8ef9031d20fb657286ab90cc upstream. Add support for reporting PCI_EXP_LNKSTA_DLLLA bit in Link Control register on emulated bridge via current LTSSM state. Also correctly indicate DLLLA capability via PCI_EXP_LNKCAP_DLLLARC bit in Link Control Capability register. Link: https://lore.kernel.org/r/20211005180952.6812-14-kabel@kernel.org Fixes: 8a3ebd8de328 ("PCI: aardvark: Implement emulated root PCI bridge config space") Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marek Behún Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-aardvark.c | 29 ++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 88be131fd22db..50670ac268847 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -278,6 +278,20 @@ static inline bool advk_pcie_link_up(struct advk_pcie *pcie) return ltssm_state >= LTSSM_L0 && ltssm_state < LTSSM_DISABLED; } +static inline bool advk_pcie_link_active(struct advk_pcie *pcie) +{ + /* + * According to PCIe Base specification 3.0, Table 4-14: Link + * Status Mapped to the LTSSM, and 4.2.6.3.6 Configuration.Idle + * is Link Up mapped to LTSSM Configuration.Idle, Recovery, L0, + * L0s, L1 and L2 states. And according to 3.2.1. Data Link + * Control and Management State Machine Rules is DL Up status + * reported in DL Active state. + */ + u8 ltssm_state = advk_pcie_ltssm_state(pcie); + return ltssm_state >= LTSSM_CONFIG_IDLE && ltssm_state < LTSSM_DISABLED; +} + static inline bool advk_pcie_link_training(struct advk_pcie *pcie) { /* @@ -576,12 +590,26 @@ advk_pci_bridge_emul_pcie_conf_read(struct pci_bridge_emul *bridge, return PCI_BRIDGE_EMUL_HANDLED; } + case PCI_EXP_LNKCAP: { + u32 val = advk_readl(pcie, PCIE_CORE_PCIEXP_CAP + reg); + /* + * PCI_EXP_LNKCAP_DLLLARC bit is hardwired in aardvark HW to 0. + * But support for PCI_EXP_LNKSTA_DLLLA is emulated via ltssm + * state so explicitly enable PCI_EXP_LNKCAP_DLLLARC flag. + */ + val |= PCI_EXP_LNKCAP_DLLLARC; + *value = val; + return PCI_BRIDGE_EMUL_HANDLED; + } + case PCI_EXP_LNKCTL: { /* u32 contains both PCI_EXP_LNKCTL and PCI_EXP_LNKSTA */ u32 val = advk_readl(pcie, PCIE_CORE_PCIEXP_CAP + reg) & ~(PCI_EXP_LNKSTA_LT << 16); if (advk_pcie_link_training(pcie)) val |= (PCI_EXP_LNKSTA_LT << 16); + if (advk_pcie_link_active(pcie)) + val |= (PCI_EXP_LNKSTA_DLLLA << 16); *value = val; return PCI_BRIDGE_EMUL_HANDLED; } @@ -589,7 +617,6 @@ advk_pci_bridge_emul_pcie_conf_read(struct pci_bridge_emul *bridge, case PCI_CAP_LIST_ID: case PCI_EXP_DEVCAP: case PCI_EXP_DEVCTL: - case PCI_EXP_LNKCAP: *value = advk_readl(pcie, PCIE_CORE_PCIEXP_CAP + reg); return PCI_BRIDGE_EMUL_HANDLED; default: From 85079223d0579638d41aa2de2e613004b6a5e562 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Thu, 28 Oct 2021 20:56:54 +0200 Subject: [PATCH 1433/5344] PCI: aardvark: Fix return value of MSI domain .alloc() method MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit e4313be1599d397625c14fb7826996813622decf upstream. MSI domain callback .alloc() (implemented by advk_msi_irq_domain_alloc() function) should return zero on success, since non-zero value indicates failure. When the driver was converted to generic MSI API in commit f21a8b1b6837 ("PCI: aardvark: Move to MSI handling using generic MSI support"), it was converted so that it returns hwirq number. Fix this. Link: https://lore.kernel.org/r/20211028185659.20329-3-kabel@kernel.org Fixes: f21a8b1b6837 ("PCI: aardvark: Move to MSI handling using generic MSI support") Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-aardvark.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 50670ac268847..e619d235406b3 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -942,7 +942,7 @@ static int advk_msi_irq_domain_alloc(struct irq_domain *domain, domain->host_data, handle_simple_irq, NULL, NULL); - return hwirq; + return 0; } static void advk_msi_irq_domain_free(struct irq_domain *domain, From b22e3363af6e40f9a6ba12447a89c8b09186aa37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Thu, 28 Oct 2021 20:56:55 +0200 Subject: [PATCH 1434/5344] PCI: aardvark: Read all 16-bits from PCIE_MSI_PAYLOAD_REG MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 95997723b6402cd6c53e0f9e7ac640ec64eaaff8 upstream. The PCIE_MSI_PAYLOAD_REG contains 16-bit MSI number, not only lower 8 bits. Fix reading content of this register and add a comment describing the access to this register. Link: https://lore.kernel.org/r/20211028185659.20329-4-kabel@kernel.org Fixes: 8c39d710363c ("PCI: aardvark: Add Aardvark PCI host controller driver") Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-aardvark.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index e619d235406b3..7f4c83bde3fe0 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -120,6 +120,7 @@ #define PCIE_MSI_STATUS_REG (CONTROL_BASE_ADDR + 0x58) #define PCIE_MSI_MASK_REG (CONTROL_BASE_ADDR + 0x5C) #define PCIE_MSI_PAYLOAD_REG (CONTROL_BASE_ADDR + 0x9C) +#define PCIE_MSI_DATA_MASK GENMASK(15, 0) /* LMI registers base address and register offsets */ #define LMI_BASE_ADDR 0x6000 @@ -1123,8 +1124,12 @@ static void advk_pcie_handle_msi(struct advk_pcie *pcie) if (!(BIT(msi_idx) & msi_status)) continue; + /* + * msi_idx contains bits [4:0] of the msi_data and msi_data + * contains 16bit MSI interrupt number + */ advk_writel(pcie, BIT(msi_idx), PCIE_MSI_STATUS_REG); - msi_data = advk_readl(pcie, PCIE_MSI_PAYLOAD_REG) & 0xFF; + msi_data = advk_readl(pcie, PCIE_MSI_PAYLOAD_REG) & PCIE_MSI_DATA_MASK; generic_handle_irq(msi_data); } From 95f398466c7ff905b00373f2bf8a016d641ff775 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 8 Oct 2021 17:38:20 +0800 Subject: [PATCH 1435/5344] quota: check block number when reading the block in quota file commit 9bf3d20331295b1ecb81f4ed9ef358c51699a050 upstream. The block number in the quota tree on disk should be smaller than the v2_disk_dqinfo.dqi_blocks. If the quota file was corrupted, we may be allocating an 'allocated' block and that would lead to a loop in a tree, which will probably trigger oops later. This patch adds a check for the block number in the quota tree to prevent such potential issue. Link: https://lore.kernel.org/r/20211008093821.1001186-2-yi.zhang@huawei.com Signed-off-by: Zhang Yi Cc: stable@kernel.org Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/quota/quota_tree.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index c5562c871c8be..c459ac0cb2c71 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -488,6 +488,13 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, goto out_buf; } newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); + if (newblk < QT_TREEOFF || newblk >= info->dqi_blocks) { + quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)", + newblk, info->dqi_blocks); + ret = -EUCLEAN; + goto out_buf; + } + if (depth == info->dqi_qtree_depth - 1) { ret = free_dqentry(info, dquot, newblk); newblk = 0; @@ -587,6 +594,13 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info, blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); if (!blk) /* No reference? */ goto out_buf; + if (blk < QT_TREEOFF || blk >= info->dqi_blocks) { + quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)", + blk, info->dqi_blocks); + ret = -EUCLEAN; + goto out_buf; + } + if (depth < info->dqi_qtree_depth - 1) ret = find_tree_dqentry(info, dquot, blk, depth+1); else From 5f97c5f02287cc8773727046938b3cba46c13931 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 8 Oct 2021 17:38:21 +0800 Subject: [PATCH 1436/5344] quota: correct error number in free_dqentry() commit d0e36a62bd4c60c09acc40e06ba4831a4d0bc75b upstream. Fix the error path in free_dqentry(), pass out the error number if the block to free is not correct. Fixes: 1ccd14b9c271 ("quota: Split off quota tree handling into a separate file") Link: https://lore.kernel.org/r/20211008093821.1001186-3-yi.zhang@huawei.com Signed-off-by: Zhang Yi Cc: stable@kernel.org Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/quota/quota_tree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index c459ac0cb2c71..1a188fbdf34e5 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -423,6 +423,7 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, quota_error(dquot->dq_sb, "Quota structure has offset to " "other block (%u) than it should (%u)", blk, (uint)(dquot->dq_off >> info->dqi_blocksize_bits)); + ret = -EIO; goto out_buf; } ret = read_blk(info, blk, buf); From be497d704bc789f205a2d4ca46ff8102e03adb62 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Fri, 22 Oct 2021 09:43:23 +0800 Subject: [PATCH 1437/5344] pinctrl: core: fix possible memory leak in pinctrl_enable() commit c7892ae13e461ed20154321eb792e07ebe38f5b3 upstream. I got memory leak as follows when doing fault injection test: unreferenced object 0xffff888020a7a680 (size 64): comm "i2c-mcp23018-41", pid 23090, jiffies 4295160544 (age 8.680s) hex dump (first 32 bytes): 00 48 d3 1e 80 88 ff ff 00 1a 56 c1 ff ff ff ff .H........V..... 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<0000000083c79b35>] kmem_cache_alloc_trace+0x16d/0x360 [<0000000051803c95>] pinctrl_init_controller+0x6ed/0xb70 [<0000000064346707>] pinctrl_register+0x27/0x80 [<0000000029b0e186>] devm_pinctrl_register+0x5b/0xe0 [<00000000391f5a3e>] mcp23s08_probe_one+0x968/0x118a [pinctrl_mcp23s08] [<000000006112c039>] mcp230xx_probe+0x266/0x560 [pinctrl_mcp23s08_i2c] If pinctrl_claim_hogs() fails, the 'pindesc' allocated in pinctrl_register_one_pin() need be freed. Cc: stable@vger.kernel.org Reported-by: Hulk Robot Fixes: 950b0d91dc10 ("pinctrl: core: Fix regression caused by delayed work for hogs") Signed-off-by: Yang Yingliang Link: https://lore.kernel.org/r/20211022014323.1156924-1-yangyingliang@huawei.com Signed-off-by: Linus Walleij Signed-off-by: Greg Kroah-Hartman --- drivers/pinctrl/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c index 6381745e3bb18..9ebeef2ac7b18 100644 --- a/drivers/pinctrl/core.c +++ b/drivers/pinctrl/core.c @@ -2055,6 +2055,8 @@ int pinctrl_enable(struct pinctrl_dev *pctldev) if (error) { dev_err(pctldev->dev, "could not claim hogs: %i\n", error); + pinctrl_free_pindescs(pctldev, pctldev->desc->pins, + pctldev->desc->npins); mutex_destroy(&pctldev->mutex); kfree(pctldev); From 2970a9221f34689c3b7eabdfc6f96eaeaa1ccd20 Mon Sep 17 00:00:00 2001 From: Pekka Korpinen Date: Wed, 29 Sep 2021 21:57:55 +0300 Subject: [PATCH 1438/5344] iio: dac: ad5446: Fix ad5622_write() return value commit 558df982d4ead9cac628153d0d7b60feae05ddc8 upstream. On success i2c_master_send() returns the number of bytes written. The call from iio_write_channel_info(), however, expects the return value to be zero on success. This bug causes incorrect consumption of the sysfs buffer in iio_write_channel_info(). When writing more than two characters to out_voltage0_raw, the ad5446 write handler is called multiple times causing unexpected behavior. Fixes: 3ec36a2cf0d5 ("iio:ad5446: Add support for I2C based DACs") Signed-off-by: Pekka Korpinen Link: https://lore.kernel.org/r/20210929185755.2384-1-pekka.korpinen@iki.fi Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/dac/ad5446.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/iio/dac/ad5446.c b/drivers/iio/dac/ad5446.c index 61c670f7fc5f1..1f55cd8abb558 100644 --- a/drivers/iio/dac/ad5446.c +++ b/drivers/iio/dac/ad5446.c @@ -527,8 +527,15 @@ static int ad5622_write(struct ad5446_state *st, unsigned val) { struct i2c_client *client = to_i2c_client(st->dev); __be16 data = cpu_to_be16(val); + int ret; + + ret = i2c_master_send(client, (char *)&data, sizeof(data)); + if (ret < 0) + return ret; + if (ret != sizeof(data)) + return -EIO; - return i2c_master_send(client, (char *)&data, sizeof(data)); + return 0; } /** From 0d7a611018134042d1f208c591a7d70c38722268 Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Fri, 15 Oct 2021 16:55:43 +0800 Subject: [PATCH 1439/5344] USB: serial: keyspan: fix memleak on probe errors commit 910c996335c37552ee30fcb837375b808bb4f33b upstream. I got memory leak as follows when doing fault injection test: unreferenced object 0xffff888258228440 (size 64): comm "kworker/7:2", pid 2005, jiffies 4294989509 (age 824.540s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] slab_post_alloc_hook+0x9c/0x490 [] kmem_cache_alloc_trace+0x1f7/0x470 [] keyspan_port_probe+0xa4/0x5d0 [keyspan] [] usb_serial_device_probe+0x97/0x1d0 [usbserial] [] really_probe+0x167/0x460 [] __driver_probe_device+0xf9/0x180 [] driver_probe_device+0x53/0x130 [] __device_attach_driver+0x105/0x130 [] bus_for_each_drv+0x129/0x190 [] __device_attach+0x1c9/0x270 [] device_initial_probe+0x20/0x30 [] bus_probe_device+0x142/0x160 [] device_add+0x829/0x1300 [] usb_serial_probe.cold+0xc9b/0x14ac [usbserial] [] usb_probe_interface+0x1aa/0x3c0 [usbcore] [] really_probe+0x167/0x460 If keyspan_port_probe() fails to allocate memory for an out_buffer[i] or in_buffer[i], the previously allocated memory for out_buffer or in_buffer needs to be freed on the error handling path, otherwise a memory leak will result. Fixes: bad41a5bf177 ("USB: keyspan: fix port DMA-buffer allocations") Reported-by: Hulk Robot Signed-off-by: Wang Hai Link: https://lore.kernel.org/r/20211015085543.1203011-1-wanghai38@huawei.com Cc: stable@vger.kernel.org # 3.12 Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/keyspan.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/usb/serial/keyspan.c b/drivers/usb/serial/keyspan.c index aa3dbce22cfbe..451759f38b573 100644 --- a/drivers/usb/serial/keyspan.c +++ b/drivers/usb/serial/keyspan.c @@ -2910,22 +2910,22 @@ static int keyspan_port_probe(struct usb_serial_port *port) for (i = 0; i < ARRAY_SIZE(p_priv->in_buffer); ++i) { p_priv->in_buffer[i] = kzalloc(IN_BUFLEN, GFP_KERNEL); if (!p_priv->in_buffer[i]) - goto err_in_buffer; + goto err_free_in_buffer; } for (i = 0; i < ARRAY_SIZE(p_priv->out_buffer); ++i) { p_priv->out_buffer[i] = kzalloc(OUT_BUFLEN, GFP_KERNEL); if (!p_priv->out_buffer[i]) - goto err_out_buffer; + goto err_free_out_buffer; } p_priv->inack_buffer = kzalloc(INACK_BUFLEN, GFP_KERNEL); if (!p_priv->inack_buffer) - goto err_inack_buffer; + goto err_free_out_buffer; p_priv->outcont_buffer = kzalloc(OUTCONT_BUFLEN, GFP_KERNEL); if (!p_priv->outcont_buffer) - goto err_outcont_buffer; + goto err_free_inack_buffer; p_priv->device_details = d_details; @@ -2971,15 +2971,14 @@ static int keyspan_port_probe(struct usb_serial_port *port) return 0; -err_outcont_buffer: +err_free_inack_buffer: kfree(p_priv->inack_buffer); -err_inack_buffer: +err_free_out_buffer: for (i = 0; i < ARRAY_SIZE(p_priv->out_buffer); ++i) kfree(p_priv->out_buffer[i]); -err_out_buffer: +err_free_in_buffer: for (i = 0; i < ARRAY_SIZE(p_priv->in_buffer); ++i) kfree(p_priv->in_buffer[i]); -err_in_buffer: kfree(p_priv); return -ENOMEM; From d1e64b0d7b77388ee74dbd0d35665d9e8c1ebfe6 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 25 Oct 2021 13:51:59 +0200 Subject: [PATCH 1440/5344] USB: iowarrior: fix control-message timeouts commit 79a4479a17b83310deb0b1a2a274fe5be12d2318 upstream. USB control-message timeouts are specified in milliseconds and should specifically not vary with CONFIG_HZ. Use the common control-message timeout define for the five-second timeout and drop the driver-specific one. Fixes: 946b960d13c1 ("USB: add driver for iowarrior devices.") Cc: stable@vger.kernel.org # 2.6.21 Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20211025115159.4954-3-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/misc/iowarrior.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/usb/misc/iowarrior.c b/drivers/usb/misc/iowarrior.c index 103c69c692bad..888defdea546f 100644 --- a/drivers/usb/misc/iowarrior.c +++ b/drivers/usb/misc/iowarrior.c @@ -99,10 +99,6 @@ struct iowarrior { /* globals */ /*--------------*/ -/* - * USB spec identifies 5 second timeouts. - */ -#define GET_TIMEOUT 5 #define USB_REQ_GET_REPORT 0x01 //#if 0 static int usb_get_report(struct usb_device *dev, @@ -114,7 +110,7 @@ static int usb_get_report(struct usb_device *dev, USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_INTERFACE, (type << 8) + id, inter->desc.bInterfaceNumber, buf, size, - GET_TIMEOUT*HZ); + USB_CTRL_GET_TIMEOUT); } //#endif @@ -129,7 +125,7 @@ static int usb_set_report(struct usb_interface *intf, unsigned char type, USB_TYPE_CLASS | USB_RECIP_INTERFACE, (type << 8) + id, intf->cur_altsetting->desc.bInterfaceNumber, buf, - size, HZ); + size, 1000); } /*---------------------*/ From a372b9a477ee5f534459f9adac4387df2725ba53 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 21 Oct 2021 10:34:47 +0200 Subject: [PATCH 1441/5344] USB: chipidea: fix interrupt deadlock commit 9aaa81c3366e8393a62374e3a1c67c69edc07b8a upstream. Chipidea core was calling the interrupt handler from non-IRQ context with interrupts enabled, something which can lead to a deadlock if there's an actual interrupt trying to take a lock that's already held (e.g. the controller lock in udc_irq()). Add a wrapper that can be used to fake interrupts instead of calling the handler directly. Fixes: 3ecb3e09b042 ("usb: chipidea: Use extcon framework for VBUS and ID detect") Fixes: 876d4e1e8298 ("usb: chipidea: core: add wakeup support for extcon") Cc: Peter Chen Cc: stable@vger.kernel.org # 4.4 Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20211021083447.20078-1-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/chipidea/core.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/drivers/usb/chipidea/core.c b/drivers/usb/chipidea/core.c index b7da2a273c451..74cdc7ef476ae 100644 --- a/drivers/usb/chipidea/core.c +++ b/drivers/usb/chipidea/core.c @@ -534,7 +534,7 @@ int hw_device_reset(struct ci_hdrc *ci) return 0; } -static irqreturn_t ci_irq(int irq, void *data) +static irqreturn_t ci_irq_handler(int irq, void *data) { struct ci_hdrc *ci = data; irqreturn_t ret = IRQ_NONE; @@ -587,6 +587,15 @@ static irqreturn_t ci_irq(int irq, void *data) return ret; } +static void ci_irq(struct ci_hdrc *ci) +{ + unsigned long flags; + + local_irq_save(flags); + ci_irq_handler(ci->irq, ci); + local_irq_restore(flags); +} + static int ci_cable_notifier(struct notifier_block *nb, unsigned long event, void *ptr) { @@ -596,7 +605,7 @@ static int ci_cable_notifier(struct notifier_block *nb, unsigned long event, cbl->connected = event; cbl->changed = true; - ci_irq(ci->irq, ci); + ci_irq(ci); return NOTIFY_DONE; } @@ -634,7 +643,7 @@ static int ci_usb_role_switch_set(struct device *dev, enum usb_role role) if (cable) { cable->changed = true; cable->connected = false; - ci_irq(ci->irq, ci); + ci_irq(ci); spin_unlock_irqrestore(&ci->lock, flags); if (ci->wq && role != USB_ROLE_NONE) flush_workqueue(ci->wq); @@ -652,7 +661,7 @@ static int ci_usb_role_switch_set(struct device *dev, enum usb_role role) if (cable) { cable->changed = true; cable->connected = true; - ci_irq(ci->irq, ci); + ci_irq(ci); } spin_unlock_irqrestore(&ci->lock, flags); pm_runtime_put_sync(ci->dev); @@ -1156,7 +1165,7 @@ static int ci_hdrc_probe(struct platform_device *pdev) } } - ret = devm_request_irq(dev, ci->irq, ci_irq, IRQF_SHARED, + ret = devm_request_irq(dev, ci->irq, ci_irq_handler, IRQF_SHARED, ci->platdata->name, ci); if (ret) goto stop; @@ -1277,11 +1286,11 @@ static void ci_extcon_wakeup_int(struct ci_hdrc *ci) if (!IS_ERR(cable_id->edev) && ci->is_otg && (otgsc & OTGSC_IDIE) && (otgsc & OTGSC_IDIS)) - ci_irq(ci->irq, ci); + ci_irq(ci); if (!IS_ERR(cable_vbus->edev) && ci->is_otg && (otgsc & OTGSC_BSVIE) && (otgsc & OTGSC_BSVIS)) - ci_irq(ci->irq, ci); + ci_irq(ci); } static int ci_controller_resume(struct device *dev) From a2d025fdbf44f65c5b04f72f2d35c3383481c768 Mon Sep 17 00:00:00 2001 From: Charan Teja Reddy Date: Fri, 23 Jul 2021 18:01:08 +0530 Subject: [PATCH 1442/5344] dma-buf: WARN on dmabuf release with pending attachments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit f492283b157053e9555787262f058ae33096f568 ] It is expected from the clients to follow the below steps on an imported dmabuf fd: a) dmabuf = dma_buf_get(fd) // Get the dmabuf from fd b) dma_buf_attach(dmabuf); // Clients attach to the dmabuf o Here the kernel does some slab allocations, say for dma_buf_attachment and may be some other slab allocation in the dmabuf->ops->attach(). c) Client may need to do dma_buf_map_attachment(). d) Accordingly dma_buf_unmap_attachment() should be called. e) dma_buf_detach () // Clients detach to the dmabuf. o Here the slab allocations made in b) are freed. f) dma_buf_put(dmabuf) // Can free the dmabuf if it is the last reference. Now say an erroneous client failed at step c) above thus it directly called dma_buf_put(), step f) above. Considering that it may be the last reference to the dmabuf, buffer will be freed with pending attachments left to the dmabuf which can show up as the 'memory leak'. This should at least be reported as the WARN(). Signed-off-by: Charan Teja Reddy Reviewed-by: Christian König Link: https://patchwork.freedesktop.org/patch/msgid/1627043468-16381-1-git-send-email-charante@codeaurora.org Signed-off-by: Christian König Signed-off-by: Sasha Levin --- drivers/dma-buf/dma-buf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 758de0e9b2ddc..16bbc9bc9e6d1 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -79,6 +79,7 @@ static void dma_buf_release(struct dentry *dentry) if (dmabuf->resv == (struct dma_resv *)&dmabuf[1]) dma_resv_fini(dmabuf->resv); + WARN_ON(!list_empty(&dmabuf->attachments)); module_put(dmabuf->owner); kfree(dmabuf->name); kfree(dmabuf); From f3904274fcad92241cce03343d950299a03cdca2 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 30 May 2021 13:04:25 +0200 Subject: [PATCH 1443/5344] drm: panel-orientation-quirks: Update the Lenovo Ideapad D330 quirk (v2) [ Upstream commit 820a2ab23d5eab4ccfb82581eda8ad4acf18458f ] 2 improvements to the Lenovo Ideapad D330 panel-orientation quirks: 1. Some versions of the Lenovo Ideapad D330 have a DMI_PRODUCT_NAME of "81H3" and others have "81MD". Testing has shown that the "81MD" also has a 90 degree mounted panel. Drop the DMI_PRODUCT_NAME from the existing quirk so that the existing quirk matches both variants. 2. Some of the Lenovo Ideapad D330 models have a HD (800x1280) screen instead of a FHD (1200x1920) screen (both are mounted right-side-up) add a second Lenovo Ideapad D330 quirk for the HD version. Changes in v2: - Add a new quirk for Lenovo Ideapad D330 models with a HD screen instead of a FHD screen Link: https://github.com/systemd/systemd/pull/18884 Acked-by: Simon Ser Signed-off-by: Hans de Goede Link: https://patchwork.freedesktop.org/patch/msgid/20210530110428.12994-2-hdegoede@redhat.com Signed-off-by: Sasha Levin --- drivers/gpu/drm/drm_panel_orientation_quirks.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/drm_panel_orientation_quirks.c b/drivers/gpu/drm/drm_panel_orientation_quirks.c index e1b2ce4921ae7..5d0942e3985b2 100644 --- a/drivers/gpu/drm/drm_panel_orientation_quirks.c +++ b/drivers/gpu/drm/drm_panel_orientation_quirks.c @@ -223,10 +223,15 @@ static const struct dmi_system_id orientation_data[] = { DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, "Lenovo MIIX 320-10ICR"), }, .driver_data = (void *)&lcd800x1280_rightside_up, - }, { /* Lenovo Ideapad D330 */ + }, { /* Lenovo Ideapad D330-10IGM (HD) */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, "Lenovo ideapad D330-10IGM"), + }, + .driver_data = (void *)&lcd800x1280_rightside_up, + }, { /* Lenovo Ideapad D330-10IGM (FHD) */ .matches = { DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"), - DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "81H3"), DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, "Lenovo ideapad D330-10IGM"), }, .driver_data = (void *)&lcd1200x1920_rightside_up, From 6fcdde4ec52e6320b0f6073cdf08e78ca1f57345 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 30 May 2021 13:04:26 +0200 Subject: [PATCH 1444/5344] drm: panel-orientation-quirks: Add quirk for KD Kurio Smart C15200 2-in-1 [ Upstream commit a53f1dd3ab9fec715c6c2e8e01bf4d3c07eef8e5 ] The KD Kurio Smart C15200 2-in-1 uses a panel which has been mounted 90 degrees rotated. Add a quirk for this. Signed-off-by: Hans de Goede Acked-by: Simon Ser Link: https://patchwork.freedesktop.org/patch/msgid/20210530110428.12994-3-hdegoede@redhat.com Signed-off-by: Sasha Levin --- drivers/gpu/drm/drm_panel_orientation_quirks.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/drm/drm_panel_orientation_quirks.c b/drivers/gpu/drm/drm_panel_orientation_quirks.c index 5d0942e3985b2..cf4db2cdebbbd 100644 --- a/drivers/gpu/drm/drm_panel_orientation_quirks.c +++ b/drivers/gpu/drm/drm_panel_orientation_quirks.c @@ -205,6 +205,13 @@ static const struct dmi_system_id orientation_data[] = { DMI_EXACT_MATCH(DMI_BOARD_NAME, "TW891"), }, .driver_data = (void *)&itworks_tw891, + }, { /* KD Kurio Smart C15200 2-in-1 */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "KD Interactive"), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "Kurio Smart"), + DMI_EXACT_MATCH(DMI_BOARD_NAME, "KDM960BCP"), + }, + .driver_data = (void *)&lcd800x1280_rightside_up, }, { /* * Lenovo Ideapad Miix 310 laptop, only some production batches * have a portrait screen, the resolution checks makes the quirk From 04a7910b006eb934e88e61c9a94bccf78b7ef261 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 30 May 2021 13:04:27 +0200 Subject: [PATCH 1445/5344] drm: panel-orientation-quirks: Add quirk for the Samsung Galaxy Book 10.6 [ Upstream commit 88fa1fde918951c175ae5ea0f31efc4bb1736ab9 ] The Samsung Galaxy Book 10.6 uses a panel which has been mounted 90 degrees rotated. Add a quirk for this. Signed-off-by: Hans de Goede Acked-by: Simon Ser Link: https://patchwork.freedesktop.org/patch/msgid/20210530110428.12994-4-hdegoede@redhat.com Signed-off-by: Sasha Levin --- drivers/gpu/drm/drm_panel_orientation_quirks.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/gpu/drm/drm_panel_orientation_quirks.c b/drivers/gpu/drm/drm_panel_orientation_quirks.c index cf4db2cdebbbd..926094b83e2f4 100644 --- a/drivers/gpu/drm/drm_panel_orientation_quirks.c +++ b/drivers/gpu/drm/drm_panel_orientation_quirks.c @@ -109,6 +109,12 @@ static const struct drm_dmi_panel_orientation_data lcd1200x1920_rightside_up = { .orientation = DRM_MODE_PANEL_ORIENTATION_RIGHT_UP, }; +static const struct drm_dmi_panel_orientation_data lcd1280x1920_rightside_up = { + .width = 1280, + .height = 1920, + .orientation = DRM_MODE_PANEL_ORIENTATION_RIGHT_UP, +}; + static const struct dmi_system_id orientation_data[] = { { /* Acer One 10 (S1003) */ .matches = { @@ -249,6 +255,12 @@ static const struct dmi_system_id orientation_data[] = { DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, "Default string"), }, .driver_data = (void *)&onegx1_pro, + }, { /* Samsung GalaxyBook 10.6 */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "SAMSUNG ELECTRONICS CO., LTD."), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "Galaxy Book 10.6"), + }, + .driver_data = (void *)&lcd1280x1920_rightside_up, }, { /* VIOS LTH17 */ .matches = { DMI_EXACT_MATCH(DMI_SYS_VENDOR, "VIOS"), From bbff9db4bfd57b35db033af649509ff3c6cac757 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sat, 28 Aug 2021 18:18:18 +0200 Subject: [PATCH 1446/5344] Bluetooth: sco: Fix lock_sock() blockage by memcpy_from_msg() [ Upstream commit 99c23da0eed4fd20cae8243f2b51e10e66aa0951 ] The sco_send_frame() also takes lock_sock() during memcpy_from_msg() call that may be endlessly blocked by a task with userfaultd technique, and this will result in a hung task watchdog trigger. Just like the similar fix for hci_sock_sendmsg() in commit 92c685dc5de0 ("Bluetooth: reorganize functions..."), this patch moves the memcpy_from_msg() out of lock_sock() for addressing the hang. This should be the last piece for fixing CVE-2021-3640 after a few already queued fixes. Signed-off-by: Takashi Iwai Signed-off-by: Marcel Holtmann Signed-off-by: Sasha Levin --- net/bluetooth/sco.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 1915943bb646a..cc5a1d2545679 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -280,7 +280,8 @@ static int sco_connect(struct hci_dev *hdev, struct sock *sk) return err; } -static int sco_send_frame(struct sock *sk, struct msghdr *msg, int len) +static int sco_send_frame(struct sock *sk, void *buf, int len, + unsigned int msg_flags) { struct sco_conn *conn = sco_pi(sk)->conn; struct sk_buff *skb; @@ -292,15 +293,11 @@ static int sco_send_frame(struct sock *sk, struct msghdr *msg, int len) BT_DBG("sk %p len %d", sk, len); - skb = bt_skb_send_alloc(sk, len, msg->msg_flags & MSG_DONTWAIT, &err); + skb = bt_skb_send_alloc(sk, len, msg_flags & MSG_DONTWAIT, &err); if (!skb) return err; - if (memcpy_from_msg(skb_put(skb, len), msg, len)) { - kfree_skb(skb); - return -EFAULT; - } - + memcpy(skb_put(skb, len), buf, len); hci_send_sco(conn->hcon, skb); return len; @@ -714,6 +711,7 @@ static int sco_sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; + void *buf; int err; BT_DBG("sock %p, sk %p", sock, sk); @@ -725,14 +723,24 @@ static int sco_sock_sendmsg(struct socket *sock, struct msghdr *msg, if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (memcpy_from_msg(buf, msg, len)) { + kfree(buf); + return -EFAULT; + } + lock_sock(sk); if (sk->sk_state == BT_CONNECTED) - err = sco_send_frame(sk, msg, len); + err = sco_send_frame(sk, buf, len, msg->msg_flags); else err = -ENOTCONN; release_sock(sk); + kfree(buf); return err; } From 1877e6c6d216b03bc7127f2e9e1d84298bc76960 Mon Sep 17 00:00:00 2001 From: Wang ShaoBo Date: Tue, 31 Aug 2021 17:35:37 -0700 Subject: [PATCH 1447/5344] Bluetooth: fix use-after-free error in lock_sock_nested() [ Upstream commit 1bff51ea59a9afb67d2dd78518ab0582a54a472c ] use-after-free error in lock_sock_nested is reported: [ 179.140137][ T3731] ===================================================== [ 179.142675][ T3731] BUG: KMSAN: use-after-free in lock_sock_nested+0x280/0x2c0 [ 179.145494][ T3731] CPU: 4 PID: 3731 Comm: kworker/4:2 Not tainted 5.12.0-rc6+ #54 [ 179.148432][ T3731] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 [ 179.151806][ T3731] Workqueue: events l2cap_chan_timeout [ 179.152730][ T3731] Call Trace: [ 179.153301][ T3731] dump_stack+0x24c/0x2e0 [ 179.154063][ T3731] kmsan_report+0xfb/0x1e0 [ 179.154855][ T3731] __msan_warning+0x5c/0xa0 [ 179.155579][ T3731] lock_sock_nested+0x280/0x2c0 [ 179.156436][ T3731] ? kmsan_get_metadata+0x116/0x180 [ 179.157257][ T3731] l2cap_sock_teardown_cb+0xb8/0x890 [ 179.158154][ T3731] ? __msan_metadata_ptr_for_load_8+0x10/0x20 [ 179.159141][ T3731] ? kmsan_get_metadata+0x116/0x180 [ 179.159994][ T3731] ? kmsan_get_shadow_origin_ptr+0x84/0xb0 [ 179.160959][ T3731] ? l2cap_sock_recv_cb+0x420/0x420 [ 179.161834][ T3731] l2cap_chan_del+0x3e1/0x1d50 [ 179.162608][ T3731] ? kmsan_get_metadata+0x116/0x180 [ 179.163435][ T3731] ? kmsan_get_shadow_origin_ptr+0x84/0xb0 [ 179.164406][ T3731] l2cap_chan_close+0xeea/0x1050 [ 179.165189][ T3731] ? kmsan_internal_unpoison_shadow+0x42/0x70 [ 179.166180][ T3731] l2cap_chan_timeout+0x1da/0x590 [ 179.167066][ T3731] ? __msan_metadata_ptr_for_load_8+0x10/0x20 [ 179.168023][ T3731] ? l2cap_chan_create+0x560/0x560 [ 179.168818][ T3731] process_one_work+0x121d/0x1ff0 [ 179.169598][ T3731] worker_thread+0x121b/0x2370 [ 179.170346][ T3731] kthread+0x4ef/0x610 [ 179.171010][ T3731] ? process_one_work+0x1ff0/0x1ff0 [ 179.171828][ T3731] ? kthread_blkcg+0x110/0x110 [ 179.172587][ T3731] ret_from_fork+0x1f/0x30 [ 179.173348][ T3731] [ 179.173752][ T3731] Uninit was created at: [ 179.174409][ T3731] kmsan_internal_poison_shadow+0x5c/0xf0 [ 179.175373][ T3731] kmsan_slab_free+0x76/0xc0 [ 179.176060][ T3731] kfree+0x3a5/0x1180 [ 179.176664][ T3731] __sk_destruct+0x8af/0xb80 [ 179.177375][ T3731] __sk_free+0x812/0x8c0 [ 179.178032][ T3731] sk_free+0x97/0x130 [ 179.178686][ T3731] l2cap_sock_release+0x3d5/0x4d0 [ 179.179457][ T3731] sock_close+0x150/0x450 [ 179.180117][ T3731] __fput+0x6bd/0xf00 [ 179.180787][ T3731] ____fput+0x37/0x40 [ 179.181481][ T3731] task_work_run+0x140/0x280 [ 179.182219][ T3731] do_exit+0xe51/0x3e60 [ 179.182930][ T3731] do_group_exit+0x20e/0x450 [ 179.183656][ T3731] get_signal+0x2dfb/0x38f0 [ 179.184344][ T3731] arch_do_signal_or_restart+0xaa/0xe10 [ 179.185266][ T3731] exit_to_user_mode_prepare+0x2d2/0x560 [ 179.186136][ T3731] syscall_exit_to_user_mode+0x35/0x60 [ 179.186984][ T3731] do_syscall_64+0xc5/0x140 [ 179.187681][ T3731] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 179.188604][ T3731] ===================================================== In our case, there are two Thread A and B: Context: Thread A: Context: Thread B: l2cap_chan_timeout() __se_sys_shutdown() l2cap_chan_close() l2cap_sock_shutdown() l2cap_chan_del() l2cap_chan_close() l2cap_sock_teardown_cb() l2cap_sock_teardown_cb() Once l2cap_sock_teardown_cb() excuted, this sock will be marked as SOCK_ZAPPED, and can be treated as killable in l2cap_sock_kill() if sock_orphan() has excuted, at this time we close sock through sock_close() which end to call l2cap_sock_kill() like Thread C: Context: Thread C: sock_close() l2cap_sock_release() sock_orphan() l2cap_sock_kill() #free sock if refcnt is 1 If C completed, Once A or B reaches l2cap_sock_teardown_cb() again, use-after-free happened. We should set chan->data to NULL if sock is destructed, for telling teardown operation is not allowed in l2cap_sock_teardown_cb(), and also we should avoid killing an already killed socket in l2cap_sock_close_cb(). Signed-off-by: Wang ShaoBo Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann Signed-off-by: Sasha Levin --- net/bluetooth/l2cap_sock.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 82e76ff01267a..08e9f332adad3 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1347,6 +1347,9 @@ static void l2cap_sock_close_cb(struct l2cap_chan *chan) { struct sock *sk = chan->data; + if (!sk) + return; + l2cap_sock_kill(sk); } @@ -1355,6 +1358,9 @@ static void l2cap_sock_teardown_cb(struct l2cap_chan *chan, int err) struct sock *sk = chan->data; struct sock *parent; + if (!sk) + return; + BT_DBG("chan %p state %s", chan, state_to_string(chan->state)); /* This callback can be called both for server (BT_LISTEN) @@ -1538,8 +1544,10 @@ static void l2cap_sock_destruct(struct sock *sk) { BT_DBG("sk %p", sk); - if (l2cap_pi(sk)->chan) + if (l2cap_pi(sk)->chan) { + l2cap_pi(sk)->chan->data = NULL; l2cap_chan_put(l2cap_pi(sk)->chan); + } if (l2cap_pi(sk)->rx_busy_skb) { kfree_skb(l2cap_pi(sk)->rx_busy_skb); From 18416a7407e9018528d6c990da8bf5177ecafd98 Mon Sep 17 00:00:00 2001 From: Simon Ser Date: Sat, 11 Sep 2021 10:24:40 +0000 Subject: [PATCH 1448/5344] drm/panel-orientation-quirks: add Valve Steam Deck [ Upstream commit 9eeb7b4e40bfd69d8aaa920c7e9df751c9e11dce ] Valve's Steam Deck has a 800x1280 LCD screen. Signed-off-by: Simon Ser Cc: Jared Baldridge Cc: Emil Velikov Cc: Daniel Vetter Cc: Hans de Goede Acked-by: Sam Ravnborg Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Link: https://patchwork.freedesktop.org/patch/msgid/20210911102430.253986-1-contact@emersion.fr Signed-off-by: Sasha Levin --- drivers/gpu/drm/drm_panel_orientation_quirks.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/drm/drm_panel_orientation_quirks.c b/drivers/gpu/drm/drm_panel_orientation_quirks.c index 926094b83e2f4..a950d5db211c5 100644 --- a/drivers/gpu/drm/drm_panel_orientation_quirks.c +++ b/drivers/gpu/drm/drm_panel_orientation_quirks.c @@ -261,6 +261,13 @@ static const struct dmi_system_id orientation_data[] = { DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "Galaxy Book 10.6"), }, .driver_data = (void *)&lcd1280x1920_rightside_up, + }, { /* Valve Steam Deck */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Valve"), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "Jupiter"), + DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, "1"), + }, + .driver_data = (void *)&lcd800x1280_rightside_up, }, { /* VIOS LTH17 */ .matches = { DMI_EXACT_MATCH(DMI_SYS_VENDOR, "VIOS"), From ee38ff938bfadb85de80c1d4865796201cabde57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Barnab=C3=A1s=20P=C5=91cze?= Date: Sat, 4 Sep 2021 17:56:26 +0000 Subject: [PATCH 1449/5344] platform/x86: wmi: do not fail if disabling fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 1975718c488a39128f1f515b23ae61a5a214cc3d ] Previously, `__query_block()` would fail if the second WCxx method call failed. However, the WQxx method might have succeeded, and potentially allocated memory for the result. Instead of throwing away the result and potentially leaking memory, ignore the result of the second WCxx call. Signed-off-by: Barnabás Pőcze Link: https://lore.kernel.org/r/20210904175450.156801-25-pobrn@protonmail.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Signed-off-by: Sasha Levin --- drivers/platform/x86/wmi.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c index 59e9aa0f96436..cb029126a68c6 100644 --- a/drivers/platform/x86/wmi.c +++ b/drivers/platform/x86/wmi.c @@ -353,7 +353,14 @@ static acpi_status __query_block(struct wmi_block *wblock, u8 instance, * the WQxx method failed - we should disable collection anyway. */ if ((block->flags & ACPI_WMI_EXPENSIVE) && ACPI_SUCCESS(wc_status)) { - status = acpi_execute_simple_method(handle, wc_method, 0); + /* + * Ignore whether this WCxx call succeeds or not since + * the previously executed WQxx method call might have + * succeeded, and returning the failing status code + * of this call would throw away the result of the WQxx + * call, potentially leaking memory. + */ + acpi_execute_simple_method(handle, wc_method, 0); } return status; From 355c5ac1526e20b53dbe1f32872ea56b7f401dda Mon Sep 17 00:00:00 2001 From: Aleksander Jan Bajkowski Date: Tue, 14 Sep 2021 23:20:58 +0200 Subject: [PATCH 1450/5344] MIPS: lantiq: dma: add small delay after reset [ Upstream commit c12aa581f6d5e80c3c3675ab26a52c2b3b62f76e ] Reading the DMA registers immediately after the reset causes Data Bus Error. Adding a small delay fixes this issue. Signed-off-by: Aleksander Jan Bajkowski Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- arch/mips/lantiq/xway/dma.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/mips/lantiq/xway/dma.c b/arch/mips/lantiq/xway/dma.c index aeb1b989cd4ee..24c6267f78698 100644 --- a/arch/mips/lantiq/xway/dma.c +++ b/arch/mips/lantiq/xway/dma.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -221,6 +222,8 @@ ltq_dma_init(struct platform_device *pdev) clk_enable(clk); ltq_dma_w32_mask(0, DMA_RESET, LTQ_DMA_CTRL); + usleep_range(1, 10); + /* disable all interrupts */ ltq_dma_w32(0, LTQ_DMA_IRNEN); From ed85efa5d5965c909291d06d4f554e12320057c0 Mon Sep 17 00:00:00 2001 From: Aleksander Jan Bajkowski Date: Tue, 14 Sep 2021 23:20:59 +0200 Subject: [PATCH 1451/5344] MIPS: lantiq: dma: reset correct number of channel [ Upstream commit 5ca9ce2ba4d5884cd94d1a856c675ab1242cd242 ] Different SoCs have a different number of channels, e.g .: * amazon-se has 10 channels, * danube+ar9 have 20 channels, * vr9 has 28 channels, * ar10 has 24 channels. We can read the ID register and, depending on the reported number of channels, reset the appropriate number of channels. Signed-off-by: Aleksander Jan Bajkowski Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- arch/mips/lantiq/xway/dma.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/mips/lantiq/xway/dma.c b/arch/mips/lantiq/xway/dma.c index 24c6267f78698..e45077aecf83a 100644 --- a/arch/mips/lantiq/xway/dma.c +++ b/arch/mips/lantiq/xway/dma.c @@ -30,6 +30,7 @@ #define LTQ_DMA_PCTRL 0x44 #define LTQ_DMA_IRNEN 0xf4 +#define DMA_ID_CHNR GENMASK(26, 20) /* channel number */ #define DMA_DESCPT BIT(3) /* descriptor complete irq */ #define DMA_TX BIT(8) /* TX channel direction */ #define DMA_CHAN_ON BIT(0) /* channel on / off bit */ @@ -40,7 +41,6 @@ #define DMA_POLL BIT(31) /* turn on channel polling */ #define DMA_CLK_DIV4 BIT(6) /* polling clock divider */ #define DMA_2W_BURST BIT(1) /* 2 word burst length */ -#define DMA_MAX_CHANNEL 20 /* the soc has 20 channels */ #define DMA_ETOP_ENDIANNESS (0xf << 8) /* endianness swap etop channels */ #define DMA_WEIGHT (BIT(17) | BIT(16)) /* default channel wheight */ @@ -206,7 +206,7 @@ ltq_dma_init(struct platform_device *pdev) { struct clk *clk; struct resource *res; - unsigned id; + unsigned int id, nchannels; int i; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); @@ -228,17 +228,18 @@ ltq_dma_init(struct platform_device *pdev) ltq_dma_w32(0, LTQ_DMA_IRNEN); /* reset/configure each channel */ - for (i = 0; i < DMA_MAX_CHANNEL; i++) { + id = ltq_dma_r32(LTQ_DMA_ID); + nchannels = ((id & DMA_ID_CHNR) >> 20); + for (i = 0; i < nchannels; i++) { ltq_dma_w32(i, LTQ_DMA_CS); ltq_dma_w32(DMA_CHAN_RST, LTQ_DMA_CCTRL); ltq_dma_w32(DMA_POLL | DMA_CLK_DIV4, LTQ_DMA_CPOLL); ltq_dma_w32_mask(DMA_CHAN_ON, 0, LTQ_DMA_CCTRL); } - id = ltq_dma_r32(LTQ_DMA_ID); dev_info(&pdev->dev, "Init done - hw rev: %X, ports: %d, channels: %d\n", - id & 0x1f, (id >> 16) & 0xf, id >> 20); + id & 0x1f, (id >> 16) & 0xf, nchannels); return 0; } From a94646c62dd3960cdb5ba5c86d52a77571679ea5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Jun 2021 11:41:10 +0200 Subject: [PATCH 1452/5344] locking/lockdep: Avoid RCU-induced noinstr fail [ Upstream commit ce0b9c805dd66d5e49fd53ec5415ae398f4c56e6 ] vmlinux.o: warning: objtool: look_up_lock_class()+0xc7: call to rcu_read_lock_any_held() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210624095148.311980536@infradead.org Signed-off-by: Sasha Levin --- kernel/locking/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index c73bfb1a77b7c..806b25a2a6913 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -832,7 +832,7 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return NULL; - hlist_for_each_entry_rcu(class, hash_head, hash_entry) { + hlist_for_each_entry_rcu_notrace(class, hash_head, hash_entry) { if (class->key == key) { /* * Huh! same key, different name? Did someone trample From 813accc2afd82f90b15e1cca7f0781de61413815 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 13 Sep 2021 15:53:30 -0700 Subject: [PATCH 1453/5344] net: sched: update default qdisc visibility after Tx queue cnt changes [ Upstream commit 1e080f17750d1083e8a32f7b350584ae1cd7ff20 ] mq / mqprio make the default child qdiscs visible. They only do so for the qdiscs which are within real_num_tx_queues when the device is registered. Depending on order of calls in the driver, or if user space changes config via ethtool -L the number of qdiscs visible under tc qdisc show will differ from the number of queues. This is confusing to users and potentially to system configuration scripts which try to make sure qdiscs have the right parameters. Add a new Qdisc_ops callback and make relevant qdiscs TTRT. Note that this uncovers the "shortcut" created by commit 1f27cde313d7 ("net: sched: use pfifo_fast for non real queues") The default child qdiscs beyond initial real_num_tx are always pfifo_fast, no matter what the sysfs setting is. Fixing this gets a little tricky because we'd need to keep a reference on whatever the default qdisc was at the time of creation. In practice this is likely an non-issue the qdiscs likely have to be configured to non-default settings, so whatever user space is doing such configuration can replace the pfifos... now that it will see them. Reported-by: Matthew Massey Reviewed-by: Dave Taht Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/net/sch_generic.h | 4 ++++ net/core/dev.c | 2 ++ net/sched/sch_generic.c | 9 +++++++++ net/sched/sch_mq.c | 24 ++++++++++++++++++++++++ net/sched/sch_mqprio.c | 23 +++++++++++++++++++++++ 5 files changed, 62 insertions(+) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 0cb0a4bcb5447..939fda8f97215 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -299,6 +299,8 @@ struct Qdisc_ops { struct netlink_ext_ack *extack); void (*attach)(struct Qdisc *sch); int (*change_tx_queue_len)(struct Qdisc *, unsigned int); + void (*change_real_num_tx)(struct Qdisc *sch, + unsigned int new_real_tx); int (*dump)(struct Qdisc *, struct sk_buff *); int (*dump_stats)(struct Qdisc *, struct gnet_dump *); @@ -675,6 +677,8 @@ void qdisc_class_hash_grow(struct Qdisc *, struct Qdisc_class_hash *); void qdisc_class_hash_destroy(struct Qdisc_class_hash *); int dev_qdisc_change_tx_queue_len(struct net_device *dev); +void dev_qdisc_change_real_num_tx(struct net_device *dev, + unsigned int new_real_tx); void dev_init_scheduler(struct net_device *dev); void dev_shutdown(struct net_device *dev); void dev_activate(struct net_device *dev); diff --git a/net/core/dev.c b/net/core/dev.c index a0e7752e27d59..ccfee33336ab2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2589,6 +2589,8 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) if (dev->num_tc) netif_setup_tc(dev, txq); + dev_qdisc_change_real_num_tx(dev, txq); + dev->real_num_tx_queues = txq; if (disabling) { diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index ccf67c3e0596c..5285390f2f0b9 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -1313,6 +1313,15 @@ static int qdisc_change_tx_queue_len(struct net_device *dev, return 0; } +void dev_qdisc_change_real_num_tx(struct net_device *dev, + unsigned int new_real_tx) +{ + struct Qdisc *qdisc = dev->qdisc; + + if (qdisc->ops->change_real_num_tx) + qdisc->ops->change_real_num_tx(qdisc, new_real_tx); +} + int dev_qdisc_change_tx_queue_len(struct net_device *dev) { bool up = dev->flags & IFF_UP; diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index e79f1afe0cfd6..db18d8a860f9c 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -125,6 +125,29 @@ static void mq_attach(struct Qdisc *sch) priv->qdiscs = NULL; } +static void mq_change_real_num_tx(struct Qdisc *sch, unsigned int new_real_tx) +{ +#ifdef CONFIG_NET_SCHED + struct net_device *dev = qdisc_dev(sch); + struct Qdisc *qdisc; + unsigned int i; + + for (i = new_real_tx; i < dev->real_num_tx_queues; i++) { + qdisc = netdev_get_tx_queue(dev, i)->qdisc_sleeping; + /* Only update the default qdiscs we created, + * qdiscs with handles are always hashed. + */ + if (qdisc != &noop_qdisc && !qdisc->handle) + qdisc_hash_del(qdisc); + } + for (i = dev->real_num_tx_queues; i < new_real_tx; i++) { + qdisc = netdev_get_tx_queue(dev, i)->qdisc_sleeping; + if (qdisc != &noop_qdisc && !qdisc->handle) + qdisc_hash_add(qdisc, false); + } +#endif +} + static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) { struct net_device *dev = qdisc_dev(sch); @@ -288,6 +311,7 @@ struct Qdisc_ops mq_qdisc_ops __read_mostly = { .init = mq_init, .destroy = mq_destroy, .attach = mq_attach, + .change_real_num_tx = mq_change_real_num_tx, .dump = mq_dump, .owner = THIS_MODULE, }; diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 5eb3b1b7ae5e7..50e15add6068f 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -306,6 +306,28 @@ static void mqprio_attach(struct Qdisc *sch) priv->qdiscs = NULL; } +static void mqprio_change_real_num_tx(struct Qdisc *sch, + unsigned int new_real_tx) +{ + struct net_device *dev = qdisc_dev(sch); + struct Qdisc *qdisc; + unsigned int i; + + for (i = new_real_tx; i < dev->real_num_tx_queues; i++) { + qdisc = netdev_get_tx_queue(dev, i)->qdisc_sleeping; + /* Only update the default qdiscs we created, + * qdiscs with handles are always hashed. + */ + if (qdisc != &noop_qdisc && !qdisc->handle) + qdisc_hash_del(qdisc); + } + for (i = dev->real_num_tx_queues; i < new_real_tx; i++) { + qdisc = netdev_get_tx_queue(dev, i)->qdisc_sleeping; + if (qdisc != &noop_qdisc && !qdisc->handle) + qdisc_hash_add(qdisc, false); + } +} + static struct netdev_queue *mqprio_queue_get(struct Qdisc *sch, unsigned long cl) { @@ -629,6 +651,7 @@ static struct Qdisc_ops mqprio_qdisc_ops __read_mostly = { .init = mqprio_init, .destroy = mqprio_destroy, .attach = mqprio_attach, + .change_real_num_tx = mqprio_change_real_num_tx, .dump = mqprio_dump, .owner = THIS_MODULE, }; From df11d9c77c09589f8a4aed18b0eff371003e42c5 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Sat, 28 Aug 2021 23:41:40 -0700 Subject: [PATCH 1454/5344] smackfs: Fix use-after-free in netlbl_catmap_walk() [ Upstream commit 0817534ff9ea809fac1322c5c8c574be8483ea57 ] Syzkaller reported use-after-free bug as described in [1]. The bug is triggered when smk_set_cipso() tries to free stale category bitmaps while there are concurrent reader(s) using the same bitmaps. Wait for RCU grace period to finish before freeing the category bitmaps in smk_set_cipso(). This makes sure that there are no more readers using the stale bitmaps and freeing them should be safe. [1] https://lore.kernel.org/netdev/000000000000a814c505ca657a4e@google.com/ Reported-by: syzbot+3f91de0b813cc3d19a80@syzkaller.appspotmail.com Signed-off-by: Pawan Gupta Signed-off-by: Casey Schaufler Signed-off-by: Sasha Levin --- security/smack/smackfs.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index 3823ab2c4e4be..cec3f56739dc2 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -831,6 +831,7 @@ static int smk_open_cipso(struct inode *inode, struct file *file) static ssize_t smk_set_cipso(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int format) { + struct netlbl_lsm_catmap *old_cat; struct smack_known *skp; struct netlbl_lsm_secattr ncats; char mapcatset[SMK_CIPSOLEN]; @@ -920,9 +921,11 @@ static ssize_t smk_set_cipso(struct file *file, const char __user *buf, rc = smk_netlbl_mls(maplevel, mapcatset, &ncats, SMK_CIPSOLEN); if (rc >= 0) { - netlbl_catmap_free(skp->smk_netlabel.attr.mls.cat); + old_cat = skp->smk_netlabel.attr.mls.cat; skp->smk_netlabel.attr.mls.cat = ncats.attr.mls.cat; skp->smk_netlabel.attr.mls.lvl = ncats.attr.mls.lvl; + synchronize_rcu(); + netlbl_catmap_free(old_cat); rc = count; } From 82ee48eb6c3a91e174c7b7fb39e382385cd628a0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 15 Sep 2021 16:19:46 +0200 Subject: [PATCH 1455/5344] x86: Increase exception stack sizes [ Upstream commit 7fae4c24a2b84a66c7be399727aca11e7a888462 ] It turns out that a single page of stack is trivial to overflow with all the tracing gunk enabled. Raise the exception stacks to 2 pages, which is still half the interrupt stacks, which are at 4 pages. Reported-by: Michael Wang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/YUIO9Ye98S5Eb68w@hirez.programming.kicks-ass.net Signed-off-by: Sasha Levin --- arch/x86/include/asm/page_64_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 288b065955b72..9d0b479452720 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -15,7 +15,7 @@ #define THREAD_SIZE_ORDER (2 + KASAN_STACK_ORDER) #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) -#define EXCEPTION_STACK_ORDER (0 + KASAN_STACK_ORDER) +#define EXCEPTION_STACK_ORDER (1 + KASAN_STACK_ORDER) #define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER) #define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER) From f81a326926b27d43d0f18c8b8a093974f11218d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonas=20Dre=C3=9Fler?= Date: Tue, 14 Sep 2021 21:59:03 +0200 Subject: [PATCH 1456/5344] mwifiex: Run SET_BSS_MODE when changing from P2P to STATION vif-type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit c2e9666cdffd347460a2b17988db4cfaf2a68fb9 ] We currently handle changing from the P2P to the STATION virtual interface type slightly different than changing from P2P to ADHOC: When changing to STATION, we don't send the SET_BSS_MODE command. We do send that command on all other type-changes though, and it probably makes sense to send the command since after all we just changed our BSS_MODE. Looking at prior changes to this part of the code, it seems that this is simply a leftover from old refactorings. Since sending the SET_BSS_MODE command is the only difference between mwifiex_change_vif_to_sta_adhoc() and the current code, we can now use mwifiex_change_vif_to_sta_adhoc() for both switching to ADHOC and STATION interface type. This does not fix any particular bug and just "looked right", so there's a small chance it might be a regression. Signed-off-by: Jonas Dreßler Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210914195909.36035-4-verdre@v0yd.nl Signed-off-by: Sasha Levin --- .../net/wireless/marvell/mwifiex/cfg80211.c | 22 ++++--------------- 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c b/drivers/net/wireless/marvell/mwifiex/cfg80211.c index 9e6dc289ec3e8..b5134f11fc32b 100644 --- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c +++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c @@ -1233,29 +1233,15 @@ mwifiex_cfg80211_change_virtual_intf(struct wiphy *wiphy, break; case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_P2P_GO: + if (mwifiex_cfg80211_deinit_p2p(priv)) + return -EFAULT; + switch (type) { - case NL80211_IFTYPE_STATION: - if (mwifiex_cfg80211_deinit_p2p(priv)) - return -EFAULT; - priv->adapter->curr_iface_comb.p2p_intf--; - priv->adapter->curr_iface_comb.sta_intf++; - dev->ieee80211_ptr->iftype = type; - if (mwifiex_deinit_priv_params(priv)) - return -1; - if (mwifiex_init_new_priv_params(priv, dev, type)) - return -1; - if (mwifiex_sta_init_cmd(priv, false, false)) - return -1; - break; case NL80211_IFTYPE_ADHOC: - if (mwifiex_cfg80211_deinit_p2p(priv)) - return -EFAULT; + case NL80211_IFTYPE_STATION: return mwifiex_change_vif_to_sta_adhoc(dev, curr_iftype, type, params); - break; case NL80211_IFTYPE_AP: - if (mwifiex_cfg80211_deinit_p2p(priv)) - return -EFAULT; return mwifiex_change_vif_to_ap(dev, curr_iftype, type, params); case NL80211_IFTYPE_UNSPECIFIED: From b70695114476dadfd542756fd0bbea7d28d27e0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonas=20Dre=C3=9Fler?= Date: Tue, 14 Sep 2021 21:59:08 +0200 Subject: [PATCH 1457/5344] mwifiex: Properly initialize private structure on interface type changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit c606008b70627a2fc485732a53cc22f0f66d0981 ] When creating a new virtual interface in mwifiex_add_virtual_intf(), we update our internal driver states like bss_type, bss_priority, bss_role and bss_mode to reflect the mode the firmware will be set to. When switching virtual interface mode using mwifiex_init_new_priv_params() though, we currently only update bss_mode and bss_role. In order for the interface mode switch to actually work, we also need to update bss_type to its proper value, so do that. This fixes a crash of the firmware (because the driver tries to execute commands that are invalid in AP mode) when switching from station mode to AP mode. Signed-off-by: Jonas Dreßler Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210914195909.36035-9-verdre@v0yd.nl Signed-off-by: Sasha Levin --- drivers/net/wireless/marvell/mwifiex/cfg80211.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c b/drivers/net/wireless/marvell/mwifiex/cfg80211.c index b5134f11fc32b..1599ae74b066b 100644 --- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c +++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c @@ -912,16 +912,20 @@ mwifiex_init_new_priv_params(struct mwifiex_private *priv, switch (type) { case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_ADHOC: - priv->bss_role = MWIFIEX_BSS_ROLE_STA; + priv->bss_role = MWIFIEX_BSS_ROLE_STA; + priv->bss_type = MWIFIEX_BSS_TYPE_STA; break; case NL80211_IFTYPE_P2P_CLIENT: - priv->bss_role = MWIFIEX_BSS_ROLE_STA; + priv->bss_role = MWIFIEX_BSS_ROLE_STA; + priv->bss_type = MWIFIEX_BSS_TYPE_P2P; break; case NL80211_IFTYPE_P2P_GO: - priv->bss_role = MWIFIEX_BSS_ROLE_UAP; + priv->bss_role = MWIFIEX_BSS_ROLE_UAP; + priv->bss_type = MWIFIEX_BSS_TYPE_P2P; break; case NL80211_IFTYPE_AP: priv->bss_role = MWIFIEX_BSS_ROLE_UAP; + priv->bss_type = MWIFIEX_BSS_TYPE_UAP; break; default: mwifiex_dbg(adapter, ERROR, From 5250158b2257a77663a0a0e80ae6a545ad8c4ab5 Mon Sep 17 00:00:00 2001 From: Alagu Sankar Date: Tue, 28 Sep 2021 14:00:47 +0300 Subject: [PATCH 1458/5344] ath10k: high latency fixes for beacon buffer [ Upstream commit e263bdab9c0e8025fb7f41f153709a9cda51f6b6 ] Beacon buffer for high latency devices does not use DMA. other similar buffer allocation methods in the driver have already been modified for high latency path. Fix the beacon buffer allocation left out in the earlier high latency changes. Signed-off-by: Alagu Sankar Signed-off-by: Erik Stromdahl [fabio: adapt it to use ar->bus_param.dev_type ] Signed-off-by: Fabio Estevam Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210818232627.2040121-1-festevam@denx.de Signed-off-by: Sasha Levin --- drivers/net/wireless/ath/ath10k/mac.c | 31 ++++++++++++++++++++------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index 20e248fd43642..603f817ae3a59 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -985,8 +985,12 @@ static void ath10k_mac_vif_beacon_cleanup(struct ath10k_vif *arvif) ath10k_mac_vif_beacon_free(arvif); if (arvif->beacon_buf) { - dma_free_coherent(ar->dev, IEEE80211_MAX_FRAME_LEN, - arvif->beacon_buf, arvif->beacon_paddr); + if (ar->bus_param.dev_type == ATH10K_DEV_TYPE_HL) + kfree(arvif->beacon_buf); + else + dma_free_coherent(ar->dev, IEEE80211_MAX_FRAME_LEN, + arvif->beacon_buf, + arvif->beacon_paddr); arvif->beacon_buf = NULL; } } @@ -5251,10 +5255,17 @@ static int ath10k_add_interface(struct ieee80211_hw *hw, if (vif->type == NL80211_IFTYPE_ADHOC || vif->type == NL80211_IFTYPE_MESH_POINT || vif->type == NL80211_IFTYPE_AP) { - arvif->beacon_buf = dma_alloc_coherent(ar->dev, - IEEE80211_MAX_FRAME_LEN, - &arvif->beacon_paddr, - GFP_ATOMIC); + if (ar->bus_param.dev_type == ATH10K_DEV_TYPE_HL) { + arvif->beacon_buf = kmalloc(IEEE80211_MAX_FRAME_LEN, + GFP_KERNEL); + arvif->beacon_paddr = (dma_addr_t)arvif->beacon_buf; + } else { + arvif->beacon_buf = + dma_alloc_coherent(ar->dev, + IEEE80211_MAX_FRAME_LEN, + &arvif->beacon_paddr, + GFP_ATOMIC); + } if (!arvif->beacon_buf) { ret = -ENOMEM; ath10k_warn(ar, "failed to allocate beacon buffer: %d\n", @@ -5469,8 +5480,12 @@ static int ath10k_add_interface(struct ieee80211_hw *hw, err: if (arvif->beacon_buf) { - dma_free_coherent(ar->dev, IEEE80211_MAX_FRAME_LEN, - arvif->beacon_buf, arvif->beacon_paddr); + if (ar->bus_param.dev_type == ATH10K_DEV_TYPE_HL) + kfree(arvif->beacon_buf); + else + dma_free_coherent(ar->dev, IEEE80211_MAX_FRAME_LEN, + arvif->beacon_buf, + arvif->beacon_paddr); arvif->beacon_buf = NULL; } From f3c7d78a6d9fd0723062dd9ee02d7932788b0e18 Mon Sep 17 00:00:00 2001 From: Dirk Bender Date: Mon, 26 Jul 2021 09:35:15 +0200 Subject: [PATCH 1459/5344] media: mt9p031: Fix corrupted frame after restarting stream [ Upstream commit 0961ba6dd211a4a52d1dd4c2d59be60ac2dc08c7 ] To prevent corrupted frames after starting and stopping the sensor its datasheet specifies a specific pause sequence to follow: Stopping: Set Pause_Restart Bit -> Set Restart Bit -> Set Chip_Enable Off Restarting: Set Chip_Enable On -> Clear Pause_Restart Bit The Restart Bit is cleared automatically and must not be cleared manually as this would cause undefined behavior. Signed-off-by: Dirk Bender Signed-off-by: Stefan Riedmueller Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/i2c/mt9p031.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/drivers/media/i2c/mt9p031.c b/drivers/media/i2c/mt9p031.c index dc23b9ed510a4..18440c5104ad9 100644 --- a/drivers/media/i2c/mt9p031.c +++ b/drivers/media/i2c/mt9p031.c @@ -78,7 +78,9 @@ #define MT9P031_PIXEL_CLOCK_INVERT (1 << 15) #define MT9P031_PIXEL_CLOCK_SHIFT(n) ((n) << 8) #define MT9P031_PIXEL_CLOCK_DIVIDE(n) ((n) << 0) -#define MT9P031_FRAME_RESTART 0x0b +#define MT9P031_RESTART 0x0b +#define MT9P031_FRAME_PAUSE_RESTART (1 << 1) +#define MT9P031_FRAME_RESTART (1 << 0) #define MT9P031_SHUTTER_DELAY 0x0c #define MT9P031_RST 0x0d #define MT9P031_RST_ENABLE 1 @@ -445,9 +447,23 @@ static int mt9p031_set_params(struct mt9p031 *mt9p031) static int mt9p031_s_stream(struct v4l2_subdev *subdev, int enable) { struct mt9p031 *mt9p031 = to_mt9p031(subdev); + struct i2c_client *client = v4l2_get_subdevdata(subdev); + int val; int ret; if (!enable) { + /* enable pause restart */ + val = MT9P031_FRAME_PAUSE_RESTART; + ret = mt9p031_write(client, MT9P031_RESTART, val); + if (ret < 0) + return ret; + + /* enable restart + keep pause restart set */ + val |= MT9P031_FRAME_RESTART; + ret = mt9p031_write(client, MT9P031_RESTART, val); + if (ret < 0) + return ret; + /* Stop sensor readout */ ret = mt9p031_set_output_control(mt9p031, MT9P031_OUTPUT_CONTROL_CEN, 0); @@ -467,6 +483,16 @@ static int mt9p031_s_stream(struct v4l2_subdev *subdev, int enable) if (ret < 0) return ret; + /* + * - clear pause restart + * - don't clear restart as clearing restart manually can cause + * undefined behavior + */ + val = MT9P031_FRAME_RESTART; + ret = mt9p031_write(client, MT9P031_RESTART, val); + if (ret < 0) + return ret; + return mt9p031_pll_enable(mt9p031); } From e1e5c86fce53bc5a63563b84ebccda43734c9383 Mon Sep 17 00:00:00 2001 From: Zheyu Ma Date: Wed, 23 Jun 2021 08:01:05 +0200 Subject: [PATCH 1460/5344] media: netup_unidvb: handle interrupt properly according to the firmware [ Upstream commit dbb4cfea6efe979ed153bd59a6a527a90d3d0ab3 ] The interrupt handling should be related to the firmware version. If the driver matches an old firmware, then the driver should not handle interrupt such as i2c or dma, otherwise it will cause some errors. This log reveals it: [ 27.708641] INFO: trying to register non-static key. [ 27.710851] The code is fine but needs lockdep annotation, or maybe [ 27.712010] you didn't initialize this object before use? [ 27.712396] turning off the locking correctness validator. [ 27.712787] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 5.12.4-g70e7f0549188-dirty #169 [ 27.713349] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014 [ 27.714149] Call Trace: [ 27.714329] [ 27.714480] dump_stack+0xba/0xf5 [ 27.714737] register_lock_class+0x873/0x8f0 [ 27.715052] ? __lock_acquire+0x323/0x1930 [ 27.715353] __lock_acquire+0x75/0x1930 [ 27.715636] lock_acquire+0x1dd/0x3e0 [ 27.715905] ? netup_i2c_interrupt+0x19/0x310 [ 27.716226] _raw_spin_lock_irqsave+0x4b/0x60 [ 27.716544] ? netup_i2c_interrupt+0x19/0x310 [ 27.716863] netup_i2c_interrupt+0x19/0x310 [ 27.717178] netup_unidvb_isr+0xd3/0x160 [ 27.717467] __handle_irq_event_percpu+0x53/0x3e0 [ 27.717808] handle_irq_event_percpu+0x35/0x90 [ 27.718129] handle_irq_event+0x39/0x60 [ 27.718409] handle_fasteoi_irq+0xc2/0x1d0 [ 27.718707] __common_interrupt+0x7f/0x150 [ 27.719008] common_interrupt+0xb4/0xd0 [ 27.719289] [ 27.719446] asm_common_interrupt+0x1e/0x40 [ 27.719747] RIP: 0010:native_safe_halt+0x17/0x20 [ 27.720084] Code: 07 0f 00 2d 8b ee 4c 00 f4 5d c3 0f 1f 84 00 00 00 00 00 8b 05 72 95 17 02 55 48 89 e5 85 c0 7e 07 0f 00 2d 6b ee 4c 00 fb f4 <5d> c3 cc cc cc cc cc cc cc 55 48 89 e5 e8 67 53 ff ff 8b 0d 29 f6 [ 27.721386] RSP: 0018:ffffc9000008fe90 EFLAGS: 00000246 [ 27.721758] RAX: 0000000000000000 RBX: 0000000000000002 RCX: 0000000000000000 [ 27.722262] RDX: 0000000000000000 RSI: ffffffff85f7c054 RDI: ffffffff85ded4e6 [ 27.722770] RBP: ffffc9000008fe90 R08: 0000000000000001 R09: 0000000000000001 [ 27.723277] R10: 0000000000000000 R11: 0000000000000001 R12: ffffffff86a75408 [ 27.723781] R13: 0000000000000000 R14: 0000000000000000 R15: ffff888100260000 [ 27.724289] default_idle+0x9/0x10 [ 27.724537] arch_cpu_idle+0xa/0x10 [ 27.724791] default_idle_call+0x6e/0x250 [ 27.725082] do_idle+0x1f0/0x2d0 [ 27.725326] cpu_startup_entry+0x18/0x20 [ 27.725613] start_secondary+0x11f/0x160 [ 27.725902] secondary_startup_64_no_verify+0xb0/0xbb [ 27.726272] BUG: kernel NULL pointer dereference, address: 0000000000000002 [ 27.726768] #PF: supervisor read access in kernel mode [ 27.727138] #PF: error_code(0x0000) - not-present page [ 27.727507] PGD 8000000118688067 P4D 8000000118688067 PUD 10feab067 PMD 0 [ 27.727999] Oops: 0000 [#1] PREEMPT SMP PTI [ 27.728302] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 5.12.4-g70e7f0549188-dirty #169 [ 27.728861] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014 [ 27.729660] RIP: 0010:netup_i2c_interrupt+0x23/0x310 [ 27.730019] Code: 0f 1f 80 00 00 00 00 55 48 89 e5 41 55 41 54 53 48 89 fb e8 af 6e 95 fd 48 89 df e8 e7 9f 1c 01 49 89 c5 48 8b 83 48 08 00 00 <66> 44 8b 60 02 44 89 e0 48 8b 93 48 08 00 00 83 e0 f8 66 89 42 02 [ 27.731339] RSP: 0018:ffffc90000118e90 EFLAGS: 00010046 [ 27.731716] RAX: 0000000000000000 RBX: ffff88810803c4d8 RCX: 0000000000000000 [ 27.732223] RDX: 0000000000000001 RSI: ffffffff85d37b94 RDI: ffff88810803c4d8 [ 27.732727] RBP: ffffc90000118ea8 R08: 0000000000000000 R09: 0000000000000001 [ 27.733239] R10: ffff88810803c4f0 R11: 61646e6f63657320 R12: 0000000000000000 [ 27.733745] R13: 0000000000000046 R14: ffff888101041000 R15: ffff8881081b2400 [ 27.734251] FS: 0000000000000000(0000) GS:ffff88817bc80000(0000) knlGS:0000000000000000 [ 27.734821] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 27.735228] CR2: 0000000000000002 CR3: 0000000108194000 CR4: 00000000000006e0 [ 27.735735] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 27.736241] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 27.736744] Call Trace: [ 27.736924] [ 27.737074] netup_unidvb_isr+0xd3/0x160 [ 27.737363] __handle_irq_event_percpu+0x53/0x3e0 [ 27.737706] handle_irq_event_percpu+0x35/0x90 [ 27.738028] handle_irq_event+0x39/0x60 [ 27.738306] handle_fasteoi_irq+0xc2/0x1d0 [ 27.738602] __common_interrupt+0x7f/0x150 [ 27.738899] common_interrupt+0xb4/0xd0 [ 27.739176] [ 27.739331] asm_common_interrupt+0x1e/0x40 [ 27.739633] RIP: 0010:native_safe_halt+0x17/0x20 [ 27.739967] Code: 07 0f 00 2d 8b ee 4c 00 f4 5d c3 0f 1f 84 00 00 00 00 00 8b 05 72 95 17 02 55 48 89 e5 85 c0 7e 07 0f 00 2d 6b ee 4c 00 fb f4 <5d> c3 cc cc cc cc cc cc cc 55 48 89 e5 e8 67 53 ff ff 8b 0d 29 f6 [ 27.741275] RSP: 0018:ffffc9000008fe90 EFLAGS: 00000246 [ 27.741647] RAX: 0000000000000000 RBX: 0000000000000002 RCX: 0000000000000000 [ 27.742148] RDX: 0000000000000000 RSI: ffffffff85f7c054 RDI: ffffffff85ded4e6 [ 27.742652] RBP: ffffc9000008fe90 R08: 0000000000000001 R09: 0000000000000001 [ 27.743154] R10: 0000000000000000 R11: 0000000000000001 R12: ffffffff86a75408 [ 27.743652] R13: 0000000000000000 R14: 0000000000000000 R15: ffff888100260000 [ 27.744157] default_idle+0x9/0x10 [ 27.744405] arch_cpu_idle+0xa/0x10 [ 27.744658] default_idle_call+0x6e/0x250 [ 27.744948] do_idle+0x1f0/0x2d0 [ 27.745190] cpu_startup_entry+0x18/0x20 [ 27.745475] start_secondary+0x11f/0x160 [ 27.745761] secondary_startup_64_no_verify+0xb0/0xbb [ 27.746123] Modules linked in: [ 27.746348] Dumping ftrace buffer: [ 27.746596] (ftrace buffer empty) [ 27.746852] CR2: 0000000000000002 [ 27.747094] ---[ end trace ebafd46f83ab946d ]--- [ 27.747424] RIP: 0010:netup_i2c_interrupt+0x23/0x310 [ 27.747778] Code: 0f 1f 80 00 00 00 00 55 48 89 e5 41 55 41 54 53 48 89 fb e8 af 6e 95 fd 48 89 df e8 e7 9f 1c 01 49 89 c5 48 8b 83 48 08 00 00 <66> 44 8b 60 02 44 89 e0 48 8b 93 48 08 00 00 83 e0 f8 66 89 42 02 [ 27.749082] RSP: 0018:ffffc90000118e90 EFLAGS: 00010046 [ 27.749461] RAX: 0000000000000000 RBX: ffff88810803c4d8 RCX: 0000000000000000 [ 27.749966] RDX: 0000000000000001 RSI: ffffffff85d37b94 RDI: ffff88810803c4d8 [ 27.750471] RBP: ffffc90000118ea8 R08: 0000000000000000 R09: 0000000000000001 [ 27.750976] R10: ffff88810803c4f0 R11: 61646e6f63657320 R12: 0000000000000000 [ 27.751480] R13: 0000000000000046 R14: ffff888101041000 R15: ffff8881081b2400 [ 27.751986] FS: 0000000000000000(0000) GS:ffff88817bc80000(0000) knlGS:0000000000000000 [ 27.752560] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 27.752970] CR2: 0000000000000002 CR3: 0000000108194000 CR4: 00000000000006e0 [ 27.753481] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 27.753984] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 27.754487] Kernel panic - not syncing: Fatal exception in interrupt [ 27.755033] Dumping ftrace buffer: [ 27.755279] (ftrace buffer empty) [ 27.755534] Kernel Offset: disabled [ 27.755785] Rebooting in 1 seconds.. Signed-off-by: Zheyu Ma Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- .../pci/netup_unidvb/netup_unidvb_core.c | 27 +++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/drivers/media/pci/netup_unidvb/netup_unidvb_core.c b/drivers/media/pci/netup_unidvb/netup_unidvb_core.c index 80a7c41baa901..eb5621c9ebf85 100644 --- a/drivers/media/pci/netup_unidvb/netup_unidvb_core.c +++ b/drivers/media/pci/netup_unidvb/netup_unidvb_core.c @@ -258,19 +258,24 @@ static irqreturn_t netup_unidvb_isr(int irq, void *dev_id) if ((reg40 & AVL_IRQ_ASSERTED) != 0) { /* IRQ is being signaled */ reg_isr = readw(ndev->bmmio0 + REG_ISR); - if (reg_isr & NETUP_UNIDVB_IRQ_I2C0) { - iret = netup_i2c_interrupt(&ndev->i2c[0]); - } else if (reg_isr & NETUP_UNIDVB_IRQ_I2C1) { - iret = netup_i2c_interrupt(&ndev->i2c[1]); - } else if (reg_isr & NETUP_UNIDVB_IRQ_SPI) { + if (reg_isr & NETUP_UNIDVB_IRQ_SPI) iret = netup_spi_interrupt(ndev->spi); - } else if (reg_isr & NETUP_UNIDVB_IRQ_DMA1) { - iret = netup_dma_interrupt(&ndev->dma[0]); - } else if (reg_isr & NETUP_UNIDVB_IRQ_DMA2) { - iret = netup_dma_interrupt(&ndev->dma[1]); - } else if (reg_isr & NETUP_UNIDVB_IRQ_CI) { - iret = netup_ci_interrupt(ndev); + else if (!ndev->old_fw) { + if (reg_isr & NETUP_UNIDVB_IRQ_I2C0) { + iret = netup_i2c_interrupt(&ndev->i2c[0]); + } else if (reg_isr & NETUP_UNIDVB_IRQ_I2C1) { + iret = netup_i2c_interrupt(&ndev->i2c[1]); + } else if (reg_isr & NETUP_UNIDVB_IRQ_DMA1) { + iret = netup_dma_interrupt(&ndev->dma[0]); + } else if (reg_isr & NETUP_UNIDVB_IRQ_DMA2) { + iret = netup_dma_interrupt(&ndev->dma[1]); + } else if (reg_isr & NETUP_UNIDVB_IRQ_CI) { + iret = netup_ci_interrupt(ndev); + } else { + goto err; + } } else { +err: dev_err(&pci_dev->dev, "%s(): unknown interrupt 0x%x\n", __func__, reg_isr); From 5bce8b89a2a6ab666ce6d27fd6d1e8424c3e1f26 Mon Sep 17 00:00:00 2001 From: Dmitriy Ulitin Date: Thu, 27 May 2021 17:06:26 +0200 Subject: [PATCH 1461/5344] media: stm32: Potential NULL pointer dereference in dcmi_irq_thread() [ Upstream commit 548fa43a58696450c15b8f5564e99589c5144664 ] At the moment of enabling irq handling: 1922 ret = devm_request_threaded_irq(&pdev->dev, irq, dcmi_irq_callback, 1923 dcmi_irq_thread, IRQF_ONESHOT, 1924 dev_name(&pdev->dev), dcmi); there is still uninitialized field sd_format of struct stm32_dcmi *dcmi. If an interrupt occurs in the interval between the installation of the interrupt handler and the initialization of this field, NULL pointer dereference happens. This field is dereferenced in the handler function without any check: 457 if (dcmi->sd_format->fourcc == V4L2_PIX_FMT_JPEG && 458 dcmi->misr & IT_FRAME) { The patch moves interrupt handler installation after initialization of the sd_format field that happens in dcmi_graph_notify_complete() via dcmi_set_default_fmt(). Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Dmitriy Ulitin Signed-off-by: Alexey Khoroshilov Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/platform/stm32/stm32-dcmi.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/media/platform/stm32/stm32-dcmi.c b/drivers/media/platform/stm32/stm32-dcmi.c index d41475f56ab54..72798aae7a628 100644 --- a/drivers/media/platform/stm32/stm32-dcmi.c +++ b/drivers/media/platform/stm32/stm32-dcmi.c @@ -135,6 +135,7 @@ struct stm32_dcmi { int sequence; struct list_head buffers; struct dcmi_buf *active; + int irq; struct v4l2_device v4l2_dev; struct video_device *vdev; @@ -1720,6 +1721,14 @@ static int dcmi_graph_notify_complete(struct v4l2_async_notifier *notifier) return ret; } + ret = devm_request_threaded_irq(dcmi->dev, dcmi->irq, dcmi_irq_callback, + dcmi_irq_thread, IRQF_ONESHOT, + dev_name(dcmi->dev), dcmi); + if (ret) { + dev_err(dcmi->dev, "Unable to request irq %d\n", dcmi->irq); + return ret; + } + return 0; } @@ -1881,6 +1890,8 @@ static int dcmi_probe(struct platform_device *pdev) if (irq <= 0) return irq ? irq : -ENXIO; + dcmi->irq = irq; + dcmi->res = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!dcmi->res) { dev_err(&pdev->dev, "Could not get resource\n"); @@ -1893,14 +1904,6 @@ static int dcmi_probe(struct platform_device *pdev) return PTR_ERR(dcmi->regs); } - ret = devm_request_threaded_irq(&pdev->dev, irq, dcmi_irq_callback, - dcmi_irq_thread, IRQF_ONESHOT, - dev_name(&pdev->dev), dcmi); - if (ret) { - dev_err(&pdev->dev, "Unable to request irq %d\n", irq); - return ret; - } - mclk = devm_clk_get(&pdev->dev, "mclk"); if (IS_ERR(mclk)) { if (PTR_ERR(mclk) != -EPROBE_DEFER) From 21414bfef90347cde6e43a12c0ad30f304d4e8b1 Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Date: Fri, 18 Jun 2021 14:29:08 +0200 Subject: [PATCH 1462/5344] media: uvcvideo: Set capability in s_param [ Upstream commit 97a2777a96070afb7da5d587834086c0b586c8cc ] Fixes v4l2-compliance: Format ioctls (Input 0): warn: v4l2-test-formats.cpp(1339): S_PARM is supported but doesn't report V4L2_CAP_TIMEPERFRAME fail: v4l2-test-formats.cpp(1241): node->has_frmintervals && !cap->capability Reviewed-by: Hans Verkuil Signed-off-by: Ricardo Ribalda Signed-off-by: Laurent Pinchart Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/usb/uvc/uvc_v4l2.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/media/usb/uvc/uvc_v4l2.c b/drivers/media/usb/uvc/uvc_v4l2.c index db7f8f8ee2f9f..3126ee9e965c9 100644 --- a/drivers/media/usb/uvc/uvc_v4l2.c +++ b/drivers/media/usb/uvc/uvc_v4l2.c @@ -467,10 +467,13 @@ static int uvc_v4l2_set_streamparm(struct uvc_streaming *stream, uvc_simplify_fraction(&timeperframe.numerator, &timeperframe.denominator, 8, 333); - if (parm->type == V4L2_BUF_TYPE_VIDEO_CAPTURE) + if (parm->type == V4L2_BUF_TYPE_VIDEO_CAPTURE) { parm->parm.capture.timeperframe = timeperframe; - else + parm->parm.capture.capability = V4L2_CAP_TIMEPERFRAME; + } else { parm->parm.output.timeperframe = timeperframe; + parm->parm.output.capability = V4L2_CAP_TIMEPERFRAME; + } return 0; } From faa2a53df0edd2db46aaffc5ca4bc32943f1489e Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Date: Fri, 18 Jun 2021 14:29:09 +0200 Subject: [PATCH 1463/5344] media: uvcvideo: Return -EIO for control errors [ Upstream commit ffccdde5f0e17d2f0d788a9d831a027187890eaa ] The device is doing something unexpected with the control. Either because the protocol is not properly implemented or there has been a HW error. Fixes v4l2-compliance: Control ioctls (Input 0): fail: v4l2-test-controls.cpp(448): s_ctrl returned an error (22) test VIDIOC_G/S_CTRL: FAIL fail: v4l2-test-controls.cpp(698): s_ext_ctrls returned an error (22) test VIDIOC_G/S/TRY_EXT_CTRLS: FAIL Reviewed-by: Hans Verkuil Signed-off-by: Ricardo Ribalda Signed-off-by: Laurent Pinchart Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/usb/uvc/uvc_video.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/media/usb/uvc/uvc_video.c b/drivers/media/usb/uvc/uvc_video.c index 5d095b2a03464..96b85d66e7a87 100644 --- a/drivers/media/usb/uvc/uvc_video.c +++ b/drivers/media/usb/uvc/uvc_video.c @@ -112,6 +112,11 @@ int uvc_query_ctrl(struct uvc_device *dev, u8 query, u8 unit, case 5: /* Invalid unit */ case 6: /* Invalid control */ case 7: /* Invalid Request */ + /* + * The firmware has not properly implemented + * the control or there has been a HW error. + */ + return -EIO; case 8: /* Invalid value within range */ return -EINVAL; default: /* reserved or unknown */ From 829fa170e0573af7af63d1fc660fdb108d968c1f Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Date: Fri, 18 Jun 2021 14:29:13 +0200 Subject: [PATCH 1464/5344] media: uvcvideo: Set unique vdev name based in type [ Upstream commit e3f60e7e1a2b451f538f9926763432249bcf39c4 ] All the entities must have a unique name. We can have a descriptive and unique name by appending the function and the entity->id. This is even resilent to multi chain devices. Fixes v4l2-compliance: Media Controller ioctls: fail: v4l2-test-media.cpp(205): v2_entity_names_set.find(key) != v2_entity_names_set.end() test MEDIA_IOC_G_TOPOLOGY: FAIL fail: v4l2-test-media.cpp(394): num_data_links != num_links test MEDIA_IOC_ENUM_ENTITIES/LINKS: FAIL Signed-off-by: Ricardo Ribalda Reviewed-by: Hans Verkuil Signed-off-by: Laurent Pinchart Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/usb/uvc/uvc_driver.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/media/usb/uvc/uvc_driver.c b/drivers/media/usb/uvc/uvc_driver.c index 40ca1d4e03483..378cfc46fc195 100644 --- a/drivers/media/usb/uvc/uvc_driver.c +++ b/drivers/media/usb/uvc/uvc_driver.c @@ -1972,6 +1972,7 @@ int uvc_register_video_device(struct uvc_device *dev, const struct v4l2_file_operations *fops, const struct v4l2_ioctl_ops *ioctl_ops) { + const char *name; int ret; /* Initialize the video buffers queue. */ @@ -2000,16 +2001,20 @@ int uvc_register_video_device(struct uvc_device *dev, case V4L2_BUF_TYPE_VIDEO_CAPTURE: default: vdev->device_caps = V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_STREAMING; + name = "Video Capture"; break; case V4L2_BUF_TYPE_VIDEO_OUTPUT: vdev->device_caps = V4L2_CAP_VIDEO_OUTPUT | V4L2_CAP_STREAMING; + name = "Video Output"; break; case V4L2_BUF_TYPE_META_CAPTURE: vdev->device_caps = V4L2_CAP_META_CAPTURE | V4L2_CAP_STREAMING; + name = "Metadata"; break; } - strscpy(vdev->name, dev->name, sizeof(vdev->name)); + snprintf(vdev->name, sizeof(vdev->name), "%s %u", name, + stream->header.bTerminalLink); /* * Set the driver data before calling video_register_device, otherwise From 9401c0df1ade13e416a4e34a3f94e5c8edd5fbba Mon Sep 17 00:00:00 2001 From: Tuo Li Date: Thu, 5 Aug 2021 09:55:35 +0200 Subject: [PATCH 1465/5344] media: s5p-mfc: fix possible null-pointer dereference in s5p_mfc_probe() [ Upstream commit 8515965e5e33f4feb56134348c95953f3eadfb26 ] The variable pdev is assigned to dev->plat_dev, and dev->plat_dev is checked in: if (!dev->plat_dev) This indicates both dev->plat_dev and pdev can be NULL. If so, the function dev_err() is called to print error information. dev_err(&pdev->dev, "No platform data specified\n"); However, &pdev->dev is an illegal address, and it is dereferenced in dev_err(). To fix this possible null-pointer dereference, replace dev_err() with mfc_err(). Reported-by: TOTE Robot Signed-off-by: Tuo Li Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/platform/s5p-mfc/s5p_mfc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/platform/s5p-mfc/s5p_mfc.c b/drivers/media/platform/s5p-mfc/s5p_mfc.c index b776f83e395e0..f8a5ed6bb9d7a 100644 --- a/drivers/media/platform/s5p-mfc/s5p_mfc.c +++ b/drivers/media/platform/s5p-mfc/s5p_mfc.c @@ -1279,7 +1279,7 @@ static int s5p_mfc_probe(struct platform_device *pdev) spin_lock_init(&dev->condlock); dev->plat_dev = pdev; if (!dev->plat_dev) { - dev_err(&pdev->dev, "No platform data specified\n"); + mfc_err("No platform data specified\n"); return -ENODEV; } From ec4c4fe2b4fcf6a08305a9c783070fda8ee0dc56 Mon Sep 17 00:00:00 2001 From: Nadezda Lutovinova Date: Wed, 11 Aug 2021 15:32:28 +0200 Subject: [PATCH 1466/5344] media: s5p-mfc: Add checking to s5p_mfc_probe(). [ Upstream commit cdfaf4752e6915a4b455ad4400133e540e4dc965 ] If of_device_get_match_data() return NULL, then null pointer dereference occurs in s5p_mfc_init_pm(). The patch adds checking if dev->variant is NULL. Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Nadezda Lutovinova Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/platform/s5p-mfc/s5p_mfc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/media/platform/s5p-mfc/s5p_mfc.c b/drivers/media/platform/s5p-mfc/s5p_mfc.c index f8a5ed6bb9d7a..9faecd049002f 100644 --- a/drivers/media/platform/s5p-mfc/s5p_mfc.c +++ b/drivers/media/platform/s5p-mfc/s5p_mfc.c @@ -1284,6 +1284,10 @@ static int s5p_mfc_probe(struct platform_device *pdev) } dev->variant = of_device_get_match_data(&pdev->dev); + if (!dev->variant) { + dev_err(&pdev->dev, "Failed to get device MFC hardware variant information\n"); + return -ENOENT; + } res = platform_get_resource(pdev, IORESOURCE_MEM, 0); dev->regs_base = devm_ioremap_resource(&pdev->dev, res); From 22fd00ee7c9787a224e0b918d05385c376b4cbba Mon Sep 17 00:00:00 2001 From: Martin Kepplinger Date: Wed, 8 Sep 2021 10:47:46 +0200 Subject: [PATCH 1467/5344] media: imx: set a media_device bus_info string [ Upstream commit 6d0d779b212c27293d9ccb4da092ff0ccb6efa39 ] Some tools like v4l2-compliance let users select a media device based on the bus_info string which can be quite convenient. Use a unique string for that. This also fixes the following v4l2-compliance warning: warn: v4l2-test-media.cpp(52): empty bus_info Signed-off-by: Martin Kepplinger Reviewed-by: Laurent Pinchart Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/staging/media/imx/imx-media-dev-common.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/staging/media/imx/imx-media-dev-common.c b/drivers/staging/media/imx/imx-media-dev-common.c index 66b505f7e8dff..137e414cda186 100644 --- a/drivers/staging/media/imx/imx-media-dev-common.c +++ b/drivers/staging/media/imx/imx-media-dev-common.c @@ -373,6 +373,8 @@ struct imx_media_dev *imx_media_dev_init(struct device *dev, imxmd->v4l2_dev.notify = imx_media_notify; strscpy(imxmd->v4l2_dev.name, "imx-media", sizeof(imxmd->v4l2_dev.name)); + snprintf(imxmd->md.bus_info, sizeof(imxmd->md.bus_info), + "platform:%s", dev_name(imxmd->md.dev)); media_device_init(&imxmd->md); From 7441908860e04ee6ca2d93b85194e2dca21c3a9d Mon Sep 17 00:00:00 2001 From: Rajat Asthana Date: Wed, 18 Aug 2021 22:31:10 +0200 Subject: [PATCH 1468/5344] media: mceusb: return without resubmitting URB in case of -EPROTO error. [ Upstream commit 476db72e521983ecb847e4013b263072bb1110fc ] Syzkaller reported a warning called "rcu detected stall in dummy_timer". The error seems to be an error in mceusb_dev_recv(). In the case of -EPROTO error, the routine immediately resubmits the URB. Instead it should return without resubmitting URB. Reported-by: syzbot+4d3749e9612c2cfab956@syzkaller.appspotmail.com Signed-off-by: Rajat Asthana Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/rc/mceusb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/media/rc/mceusb.c b/drivers/media/rc/mceusb.c index c68e52c17ae13..31e56f4f34791 100644 --- a/drivers/media/rc/mceusb.c +++ b/drivers/media/rc/mceusb.c @@ -1386,6 +1386,7 @@ static void mceusb_dev_recv(struct urb *urb) case -ECONNRESET: case -ENOENT: case -EILSEQ: + case -EPROTO: case -ESHUTDOWN: usb_unlink_urb(urb); return; From 1349822f5b885b074eb14aba526a9fba7cb287c6 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 26 Sep 2021 10:12:24 -0700 Subject: [PATCH 1469/5344] ia64: don't do IA64_CMPXCHG_DEBUG without CONFIG_PRINTK [ Upstream commit c15b5fc054c3d6c97e953617605235c5cb8ce979 ] When CONFIG_PRINTK is not set, the CMPXCHG_BUGCHECK() macro calls _printk(), but _printk() is a static inline function, not available as an extern. Since the purpose of the macro is to print the BUGCHECK info, make this config option depend on PRINTK. Fixes multiple occurrences of this build error: ../include/linux/printk.h:208:5: error: static declaration of '_printk' follows non-static declaration 208 | int _printk(const char *s, ...) | ^~~~~~~ In file included from ../arch/ia64/include/asm/cmpxchg.h:5, ../arch/ia64/include/uapi/asm/cmpxchg.h:146:28: note: previous declaration of '_printk' with type 'int(const char *, ...)' 146 | extern int _printk(const char *fmt, ...); Cc: linux-ia64@vger.kernel.org Cc: Andrew Morton Cc: Tony Luck Cc: Chris Down Cc: Paul Gortmaker Cc: John Paul Adrian Glaubitz Signed-off-by: Randy Dunlap Signed-off-by: Petr Mladek Signed-off-by: Sasha Levin --- arch/ia64/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/ia64/Kconfig.debug b/arch/ia64/Kconfig.debug index 40ca23bd228d6..2ce008e2d1644 100644 --- a/arch/ia64/Kconfig.debug +++ b/arch/ia64/Kconfig.debug @@ -39,7 +39,7 @@ config DISABLE_VHPT config IA64_DEBUG_CMPXCHG bool "Turn on compare-and-exchange bug checking (slow!)" - depends on DEBUG_KERNEL + depends on DEBUG_KERNEL && PRINTK help Selecting this option turns on bug checking for the IA-64 compare-and-exchange instructions. This is slow! Itaniums From d470a541631dd0e59848f3c9f93b230099b84b82 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 28 Sep 2021 18:06:33 +0200 Subject: [PATCH 1470/5344] brcmfmac: Add DMI nvram filename quirk for Cyberbook T116 tablet [ Upstream commit 49c3eb3036e6359c5c20fe76c611a2c0e0d4710e ] The Cyberbook T116 tablet contains quite generic names in the sys_vendor and product_name DMI strings, without this patch brcmfmac will try to load: "brcmfmac43455-sdio.Default string-Default string.txt" as nvram file which is way too generic. The nvram file shipped on the factory Android image contains the exact same settings as those used on the AcePC T8 mini PC, so point the new DMI nvram filename quirk to the acepc-t8 nvram file. Signed-off-by: Hans de Goede Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210928160633.96928-1-hdegoede@redhat.com Signed-off-by: Sasha Levin --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/dmi.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/dmi.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/dmi.c index 6d5188b78f2de..0af452dca7664 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/dmi.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/dmi.c @@ -75,6 +75,16 @@ static const struct dmi_system_id dmi_platform_data[] = { }, .driver_data = (void *)&acepc_t8_data, }, + { + /* Cyberbook T116 rugged tablet */ + .matches = { + DMI_EXACT_MATCH(DMI_BOARD_VENDOR, "Default string"), + DMI_EXACT_MATCH(DMI_BOARD_NAME, "Cherry Trail CR"), + DMI_EXACT_MATCH(DMI_PRODUCT_SKU, "20170531"), + }, + /* The factory image nvram file is identical to the ACEPC T8 one */ + .driver_data = (void *)&acepc_t8_data, + }, { /* Match for the GPDwin which unfortunately uses somewhat * generic dmi strings, which is why we test for 4 strings. From 400054a1471816be9dd14ea535470caf6b854743 Mon Sep 17 00:00:00 2001 From: Nadezda Lutovinova Date: Wed, 11 Aug 2021 19:18:16 +0200 Subject: [PATCH 1471/5344] media: rcar-csi2: Add checking to rcsi2_start_receiver() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit fc41665498332ad394b7db37f23e9394096ddc71 ] If rcsi2_code_to_fmt() return NULL, then null pointer dereference occurs in the next cycle. That should not be possible now but adding checking protects from future bugs. The patch adds checking if format is NULL. Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Nadezda Lutovinova Reviewed-by: Jacopo Mondi Reviewed-by: Niklas Söderlund Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/platform/rcar-vin/rcar-csi2.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/media/platform/rcar-vin/rcar-csi2.c b/drivers/media/platform/rcar-vin/rcar-csi2.c index d27eccfa57cae..e01f22bf826d4 100644 --- a/drivers/media/platform/rcar-vin/rcar-csi2.c +++ b/drivers/media/platform/rcar-vin/rcar-csi2.c @@ -488,6 +488,8 @@ static int rcsi2_start_receiver(struct rcar_csi2 *priv) /* Code is validated in set_fmt. */ format = rcsi2_code_to_fmt(priv->mf.code); + if (!format) + return -EINVAL; /* * Enable all supported CSI-2 channels with virtual channel and From a2f707db5781438343bb3860996cee5619b5e683 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Thu, 16 Sep 2021 11:36:20 -0500 Subject: [PATCH 1472/5344] ipmi: Disable some operations during a panic [ Upstream commit b36eb5e7b75a756baa64909a176dd4269ee05a8b ] Don't do kfree or other risky things when oops_in_progress is set. It's easy enough to avoid doing them Signed-off-by: Corey Minyard Signed-off-by: Sasha Levin --- drivers/char/ipmi/ipmi_msghandler.c | 10 +++++++--- drivers/char/ipmi/ipmi_watchdog.c | 17 ++++++++++++----- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index 331c11a157d68..283d6b4498a50 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -4799,7 +4799,9 @@ static atomic_t recv_msg_inuse_count = ATOMIC_INIT(0); static void free_smi_msg(struct ipmi_smi_msg *msg) { atomic_dec(&smi_msg_inuse_count); - kfree(msg); + /* Try to keep as much stuff out of the panic path as possible. */ + if (!oops_in_progress) + kfree(msg); } struct ipmi_smi_msg *ipmi_alloc_smi_msg(void) @@ -4818,7 +4820,9 @@ EXPORT_SYMBOL(ipmi_alloc_smi_msg); static void free_recv_msg(struct ipmi_recv_msg *msg) { atomic_dec(&recv_msg_inuse_count); - kfree(msg); + /* Try to keep as much stuff out of the panic path as possible. */ + if (!oops_in_progress) + kfree(msg); } static struct ipmi_recv_msg *ipmi_alloc_recv_msg(void) @@ -4836,7 +4840,7 @@ static struct ipmi_recv_msg *ipmi_alloc_recv_msg(void) void ipmi_free_recv_msg(struct ipmi_recv_msg *msg) { - if (msg->user) + if (msg->user && !oops_in_progress) kref_put(&msg->user->refcount, free_user); msg->done(msg); } diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c index ae06e5402e9d5..72ad7fff64a7a 100644 --- a/drivers/char/ipmi/ipmi_watchdog.c +++ b/drivers/char/ipmi/ipmi_watchdog.c @@ -337,13 +337,17 @@ static atomic_t msg_tofree = ATOMIC_INIT(0); static DECLARE_COMPLETION(msg_wait); static void msg_free_smi(struct ipmi_smi_msg *msg) { - if (atomic_dec_and_test(&msg_tofree)) - complete(&msg_wait); + if (atomic_dec_and_test(&msg_tofree)) { + if (!oops_in_progress) + complete(&msg_wait); + } } static void msg_free_recv(struct ipmi_recv_msg *msg) { - if (atomic_dec_and_test(&msg_tofree)) - complete(&msg_wait); + if (atomic_dec_and_test(&msg_tofree)) { + if (!oops_in_progress) + complete(&msg_wait); + } } static struct ipmi_smi_msg smi_msg = { .done = msg_free_smi @@ -429,8 +433,10 @@ static int _ipmi_set_timeout(int do_heartbeat) rv = __ipmi_set_timeout(&smi_msg, &recv_msg, &send_heartbeat_now); - if (rv) + if (rv) { + atomic_set(&msg_tofree, 0); return rv; + } wait_for_completion(&msg_wait); @@ -575,6 +581,7 @@ static int __ipmi_heartbeat(void) &recv_msg, 1); if (rv) { + atomic_set(&msg_tofree, 0); pr_warn("heartbeat send failure: %d\n", rv); return rv; } From f8937ebb4348a3754f2ffc6081dbd3e0c62b720a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 29 Sep 2021 18:31:25 +0200 Subject: [PATCH 1473/5344] ACPICA: Avoid evaluating methods too early during system resume [ Upstream commit d3c4b6f64ad356c0d9ddbcf73fa471e6a841cc5c ] ACPICA commit 0762982923f95eb652cf7ded27356b247c9774de During wakeup from system-wide sleep states, acpi_get_sleep_type_data() is called and it tries to get memory from the slab allocator in order to evaluate a control method, but if KFENCE is enabled in the kernel, the memory allocation attempt causes an IRQ work to be queued and a self-IPI to be sent to the CPU running the code which requires the memory controller to be ready, so if that happens too early in the wakeup path, it doesn't work. Prevent that from taking place by calling acpi_get_sleep_type_data() for S0 upfront, when preparing to enter a given sleep state, and saving the data obtained by it for later use during system wakeup. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=214271 Reported-by: Reik Keutterling Tested-by: Reik Keutterling Signed-off-by: Rafael J. Wysocki Signed-off-by: Sasha Levin --- drivers/acpi/acpica/acglobal.h | 2 ++ drivers/acpi/acpica/hwesleep.c | 8 ++------ drivers/acpi/acpica/hwsleep.c | 11 ++++------- drivers/acpi/acpica/hwxfsleep.c | 7 +++++++ 4 files changed, 15 insertions(+), 13 deletions(-) diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h index fd3beea934213..42c4bfd796f42 100644 --- a/drivers/acpi/acpica/acglobal.h +++ b/drivers/acpi/acpica/acglobal.h @@ -220,6 +220,8 @@ extern struct acpi_bit_register_info acpi_gbl_bit_register_info[ACPI_NUM_BITREG]; ACPI_GLOBAL(u8, acpi_gbl_sleep_type_a); ACPI_GLOBAL(u8, acpi_gbl_sleep_type_b); +ACPI_GLOBAL(u8, acpi_gbl_sleep_type_a_s0); +ACPI_GLOBAL(u8, acpi_gbl_sleep_type_b_s0); /***************************************************************************** * diff --git a/drivers/acpi/acpica/hwesleep.c b/drivers/acpi/acpica/hwesleep.c index dee3affaca491..aa502ae3b6b31 100644 --- a/drivers/acpi/acpica/hwesleep.c +++ b/drivers/acpi/acpica/hwesleep.c @@ -147,17 +147,13 @@ acpi_status acpi_hw_extended_sleep(u8 sleep_state) acpi_status acpi_hw_extended_wake_prep(u8 sleep_state) { - acpi_status status; u8 sleep_type_value; ACPI_FUNCTION_TRACE(hw_extended_wake_prep); - status = acpi_get_sleep_type_data(ACPI_STATE_S0, - &acpi_gbl_sleep_type_a, - &acpi_gbl_sleep_type_b); - if (ACPI_SUCCESS(status)) { + if (acpi_gbl_sleep_type_a_s0 != ACPI_SLEEP_TYPE_INVALID) { sleep_type_value = - ((acpi_gbl_sleep_type_a << ACPI_X_SLEEP_TYPE_POSITION) & + ((acpi_gbl_sleep_type_a_s0 << ACPI_X_SLEEP_TYPE_POSITION) & ACPI_X_SLEEP_TYPE_MASK); (void)acpi_write((u64)(sleep_type_value | ACPI_X_SLEEP_ENABLE), diff --git a/drivers/acpi/acpica/hwsleep.c b/drivers/acpi/acpica/hwsleep.c index b62db8ec446fa..5f7d63badbe9d 100644 --- a/drivers/acpi/acpica/hwsleep.c +++ b/drivers/acpi/acpica/hwsleep.c @@ -179,7 +179,7 @@ acpi_status acpi_hw_legacy_sleep(u8 sleep_state) acpi_status acpi_hw_legacy_wake_prep(u8 sleep_state) { - acpi_status status; + acpi_status status = AE_OK; struct acpi_bit_register_info *sleep_type_reg_info; struct acpi_bit_register_info *sleep_enable_reg_info; u32 pm1a_control; @@ -192,10 +192,7 @@ acpi_status acpi_hw_legacy_wake_prep(u8 sleep_state) * This is unclear from the ACPI Spec, but it is required * by some machines. */ - status = acpi_get_sleep_type_data(ACPI_STATE_S0, - &acpi_gbl_sleep_type_a, - &acpi_gbl_sleep_type_b); - if (ACPI_SUCCESS(status)) { + if (acpi_gbl_sleep_type_a_s0 != ACPI_SLEEP_TYPE_INVALID) { sleep_type_reg_info = acpi_hw_get_bit_register_info(ACPI_BITREG_SLEEP_TYPE); sleep_enable_reg_info = @@ -216,9 +213,9 @@ acpi_status acpi_hw_legacy_wake_prep(u8 sleep_state) /* Insert the SLP_TYP bits */ - pm1a_control |= (acpi_gbl_sleep_type_a << + pm1a_control |= (acpi_gbl_sleep_type_a_s0 << sleep_type_reg_info->bit_position); - pm1b_control |= (acpi_gbl_sleep_type_b << + pm1b_control |= (acpi_gbl_sleep_type_b_s0 << sleep_type_reg_info->bit_position); /* Write the control registers and ignore any errors */ diff --git a/drivers/acpi/acpica/hwxfsleep.c b/drivers/acpi/acpica/hwxfsleep.c index abbf9702aa7f2..79731efbe8fe2 100644 --- a/drivers/acpi/acpica/hwxfsleep.c +++ b/drivers/acpi/acpica/hwxfsleep.c @@ -214,6 +214,13 @@ acpi_status acpi_enter_sleep_state_prep(u8 sleep_state) return_ACPI_STATUS(status); } + status = acpi_get_sleep_type_data(ACPI_STATE_S0, + &acpi_gbl_sleep_type_a_s0, + &acpi_gbl_sleep_type_b_s0); + if (ACPI_FAILURE(status)) { + acpi_gbl_sleep_type_a_s0 = ACPI_SLEEP_TYPE_INVALID; + } + /* Execute the _PTS method (Prepare To Sleep) */ arg_list.count = 1; From 68d128ac750f483b12f7c9a7f59879ff6a1ebb97 Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Date: Thu, 7 Oct 2021 00:26:21 +0200 Subject: [PATCH 1474/5344] media: ipu3-imgu: imgu_fmt: Handle properly try [ Upstream commit 553481e38045f349bb9aa596d03bebd020020c9c ] For a try_fmt call, the node noes not need to be enabled. Fixes v4l2-compliance fail: v4l2-test-formats.cpp(717): Video Output Multiplanar is valid, but no TRY_FMT was implemented test VIDIOC_TRY_FMT: FAIL Signed-off-by: Ricardo Ribalda Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/staging/media/ipu3/ipu3-v4l2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/media/ipu3/ipu3-v4l2.c b/drivers/staging/media/ipu3/ipu3-v4l2.c index 908ae74aa970d..dd214fcd80cf8 100644 --- a/drivers/staging/media/ipu3/ipu3-v4l2.c +++ b/drivers/staging/media/ipu3/ipu3-v4l2.c @@ -695,7 +695,7 @@ static int imgu_fmt(struct imgu_device *imgu, unsigned int pipe, int node, /* CSS expects some format on OUT queue */ if (i != IPU3_CSS_QUEUE_OUT && - !imgu_pipe->nodes[inode].enabled) { + !imgu_pipe->nodes[inode].enabled && !try) { fmts[i] = NULL; continue; } From 278ecc922faa91544fbfe7b08989164fad55c340 Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Date: Thu, 7 Oct 2021 00:26:22 +0200 Subject: [PATCH 1475/5344] media: ipu3-imgu: VIDIOC_QUERYCAP: Fix bus_info [ Upstream commit ea2b9a33711604e91f8c826f4dcb3c12baa1990a ] bus_info field had a different value for the media entity and the video device. Fixes v4l2-compliance: v4l2-compliance.cpp(637): media bus_info 'PCI:0000:00:05.0' differs from V4L2 bus_info 'PCI:viewfinder' Reviewed-by: Bingbu Cao Signed-off-by: Ricardo Ribalda Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/staging/media/ipu3/ipu3-v4l2.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/staging/media/ipu3/ipu3-v4l2.c b/drivers/staging/media/ipu3/ipu3-v4l2.c index dd214fcd80cf8..53239ea67fe48 100644 --- a/drivers/staging/media/ipu3/ipu3-v4l2.c +++ b/drivers/staging/media/ipu3/ipu3-v4l2.c @@ -594,11 +594,12 @@ static const struct imgu_fmt *find_format(struct v4l2_format *f, u32 type) static int imgu_vidioc_querycap(struct file *file, void *fh, struct v4l2_capability *cap) { - struct imgu_video_device *node = file_to_intel_imgu_node(file); + struct imgu_device *imgu = video_drvdata(file); strscpy(cap->driver, IMGU_NAME, sizeof(cap->driver)); strscpy(cap->card, IMGU_NAME, sizeof(cap->card)); - snprintf(cap->bus_info, sizeof(cap->bus_info), "PCI:%s", node->name); + snprintf(cap->bus_info, sizeof(cap->bus_info), "PCI:%s", + pci_name(imgu->pci_dev)); return 0; } From ff52291e7215649e651a6b56ea2f2d5a806c0d0e Mon Sep 17 00:00:00 2001 From: Anant Thazhemadam Date: Mon, 7 Dec 2020 07:16:06 +0100 Subject: [PATCH 1476/5344] media: usb: dvd-usb: fix uninit-value bug in dibusb_read_eeprom_byte() [ Upstream commit 899a61a3305d49e8a712e9ab20d0db94bde5929f ] In dibusb_read_eeprom_byte(), if dibusb_i2c_msg() fails, val gets assigned an value that's not properly initialized. Using kzalloc() in place of kmalloc() for the buffer fixes this issue, as the val can now be set to 0 in the event dibusb_i2c_msg() fails. Reported-by: syzbot+e27b4fd589762b0b9329@syzkaller.appspotmail.com Tested-by: syzbot+e27b4fd589762b0b9329@syzkaller.appspotmail.com Signed-off-by: Anant Thazhemadam Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/usb/dvb-usb/dibusb-common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/usb/dvb-usb/dibusb-common.c b/drivers/media/usb/dvb-usb/dibusb-common.c index 59ce2dec11e98..9c1ebea68b544 100644 --- a/drivers/media/usb/dvb-usb/dibusb-common.c +++ b/drivers/media/usb/dvb-usb/dibusb-common.c @@ -223,7 +223,7 @@ int dibusb_read_eeprom_byte(struct dvb_usb_device *d, u8 offs, u8 *val) u8 *buf; int rc; - buf = kmalloc(2, GFP_KERNEL); + buf = kzalloc(2, GFP_KERNEL); if (!buf) return -ENOMEM; From 6683eefdb701d436054c82cac6075094e259a3e3 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 7 Oct 2021 16:00:51 +0200 Subject: [PATCH 1477/5344] net-sysfs: try not to restart the syscall if it will fail eventually [ Upstream commit 146e5e733310379f51924111068f08a3af0db830 ] Due to deadlocks in the networking subsystem spotted 12 years ago[1], a workaround was put in place[2] to avoid taking the rtnl lock when it was not available and restarting the syscall (back to VFS, letting userspace spin). The following construction is found a lot in the net sysfs and sysctl code: if (!rtnl_trylock()) return restart_syscall(); This can be problematic when multiple userspace threads use such interfaces in a short period, making them to spin a lot. This happens for example when adding and moving virtual interfaces: userspace programs listening on events, such as systemd-udevd and NetworkManager, do trigger actions reading files in sysfs. It gets worse when a lot of virtual interfaces are created concurrently, say when creating containers at boot time. Returning early without hitting the above pattern when the syscall will fail eventually does make things better. While it is not a fix for the issue, it does ease things. [1] https://lore.kernel.org/netdev/49A4D5D5.5090602@trash.net/ https://lore.kernel.org/netdev/m14oyhis31.fsf@fess.ebiederm.org/ and https://lore.kernel.org/netdev/20090226084924.16cb3e08@nehalam/ [2] Rightfully, those deadlocks are *hard* to solve. Signed-off-by: Antoine Tenart Reviewed-by: Paolo Abeni Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/core/net-sysfs.c | 55 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 4a80408b4b368..ccd1d8a8e8b79 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -174,6 +174,14 @@ static int change_carrier(struct net_device *dev, unsigned long new_carrier) static ssize_t carrier_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { + struct net_device *netdev = to_net_dev(dev); + + /* The check is also done in change_carrier; this helps returning early + * without hitting the trylock/restart in netdev_store. + */ + if (!netdev->netdev_ops->ndo_change_carrier) + return -EOPNOTSUPP; + return netdev_store(dev, attr, buf, len, change_carrier); } @@ -195,6 +203,12 @@ static ssize_t speed_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); int ret = -EINVAL; + /* The check is also done in __ethtool_get_link_ksettings; this helps + * returning early without hitting the trylock/restart below. + */ + if (!netdev->ethtool_ops->get_link_ksettings) + return ret; + if (!rtnl_trylock()) return restart_syscall(); @@ -215,6 +229,12 @@ static ssize_t duplex_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); int ret = -EINVAL; + /* The check is also done in __ethtool_get_link_ksettings; this helps + * returning early without hitting the trylock/restart below. + */ + if (!netdev->ethtool_ops->get_link_ksettings) + return ret; + if (!rtnl_trylock()) return restart_syscall(); @@ -438,6 +458,14 @@ static ssize_t proto_down_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { + struct net_device *netdev = to_net_dev(dev); + + /* The check is also done in change_proto_down; this helps returning + * early without hitting the trylock/restart in netdev_store. + */ + if (!netdev->netdev_ops->ndo_change_proto_down) + return -EOPNOTSUPP; + return netdev_store(dev, attr, buf, len, change_proto_down); } NETDEVICE_SHOW_RW(proto_down, fmt_dec); @@ -448,6 +476,12 @@ static ssize_t phys_port_id_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); ssize_t ret = -EINVAL; + /* The check is also done in dev_get_phys_port_id; this helps returning + * early without hitting the trylock/restart below. + */ + if (!netdev->netdev_ops->ndo_get_phys_port_id) + return -EOPNOTSUPP; + if (!rtnl_trylock()) return restart_syscall(); @@ -470,6 +504,13 @@ static ssize_t phys_port_name_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); ssize_t ret = -EINVAL; + /* The checks are also done in dev_get_phys_port_name; this helps + * returning early without hitting the trylock/restart below. + */ + if (!netdev->netdev_ops->ndo_get_phys_port_name && + !netdev->netdev_ops->ndo_get_devlink_port) + return -EOPNOTSUPP; + if (!rtnl_trylock()) return restart_syscall(); @@ -492,6 +533,14 @@ static ssize_t phys_switch_id_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); ssize_t ret = -EINVAL; + /* The checks are also done in dev_get_phys_port_name; this helps + * returning early without hitting the trylock/restart below. This works + * because recurse is false when calling dev_get_port_parent_id. + */ + if (!netdev->netdev_ops->ndo_get_port_parent_id && + !netdev->netdev_ops->ndo_get_devlink_port) + return -EOPNOTSUPP; + if (!rtnl_trylock()) return restart_syscall(); @@ -1164,6 +1213,12 @@ static ssize_t tx_maxrate_store(struct netdev_queue *queue, if (!capable(CAP_NET_ADMIN)) return -EPERM; + /* The check is also done later; this helps returning early without + * hitting the trylock/restart below. + */ + if (!dev->netdev_ops->ndo_set_tx_maxrate) + return -EOPNOTSUPP; + err = kstrtou32(buf, 10, &rate); if (err < 0) return err; From 2ab9d81dd29adc71b2cc67bd3991bf3ad319f271 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 18 Aug 2021 11:24:50 -0400 Subject: [PATCH 1478/5344] tracefs: Have tracefs directories not set OTH permission bits by default [ Upstream commit 49d67e445742bbcb03106b735b2ab39f6e5c56bc ] The tracefs file system is by default mounted such that only root user can access it. But there are legitimate reasons to create a group and allow those added to the group to have access to tracing. By changing the permissions of the tracefs mount point to allow access, it will allow group access to the tracefs directory. There should not be any real reason to allow all access to the tracefs directory as it contains sensitive information. Have the default permission of directories being created not have any OTH (other) bits set, such that an admin that wants to give permission to a group has to first disable all OTH bits in the file system. Link: https://lkml.kernel.org/r/20210818153038.664127804@goodmis.org Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Sasha Levin --- fs/tracefs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 0caa151cae4ee..efe078fe5d4a9 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -427,7 +427,8 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent, if (unlikely(!inode)) return failed_creating(dentry); - inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; + /* Do not set bits for OTH */ + inode->i_mode = S_IFDIR | S_IRWXU | S_IRUSR| S_IRGRP | S_IXUSR | S_IXGRP; inode->i_op = ops; inode->i_fop = &simple_dir_operations; From 7296857a8ceba28011db3a47e8a39ec23e27c661 Mon Sep 17 00:00:00 2001 From: Tuo Li Date: Thu, 5 Aug 2021 08:38:53 -0700 Subject: [PATCH 1479/5344] ath: dfs_pattern_detector: Fix possible null-pointer dereference in channel_detector_create() [ Upstream commit 4b6012a7830b813799a7faf40daa02a837e0fd5b ] kzalloc() is used to allocate memory for cd->detectors, and if it fails, channel_detector_exit() behind the label fail will be called: channel_detector_exit(dpd, cd); In channel_detector_exit(), cd->detectors is dereferenced through: struct pri_detector *de = cd->detectors[i]; To fix this possible null-pointer dereference, check cd->detectors before the for loop to dereference cd->detectors. Reported-by: TOTE Robot Signed-off-by: Tuo Li Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210805153854.154066-1-islituo@gmail.com Signed-off-by: Sasha Levin --- drivers/net/wireless/ath/dfs_pattern_detector.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/ath/dfs_pattern_detector.c b/drivers/net/wireless/ath/dfs_pattern_detector.c index a274eb0d19688..a0ad6e48a35b4 100644 --- a/drivers/net/wireless/ath/dfs_pattern_detector.c +++ b/drivers/net/wireless/ath/dfs_pattern_detector.c @@ -182,10 +182,12 @@ static void channel_detector_exit(struct dfs_pattern_detector *dpd, if (cd == NULL) return; list_del(&cd->head); - for (i = 0; i < dpd->num_radar_types; i++) { - struct pri_detector *de = cd->detectors[i]; - if (de != NULL) - de->exit(de); + if (cd->detectors) { + for (i = 0; i < dpd->num_radar_types; i++) { + struct pri_detector *de = cd->detectors[i]; + if (de != NULL) + de->exit(de); + } } kfree(cd->detectors); kfree(cd); From e228bcfd210b4b70df8bd39a3d597374db732f21 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 21 Jul 2021 19:03:47 +0200 Subject: [PATCH 1480/5344] iov_iter: Fix iov_iter_get_pages{,_alloc} page fault return value [ Upstream commit 814a66741b9ffb5e1ba119e368b178edb0b7322d ] Both iov_iter_get_pages and iov_iter_get_pages_alloc return the number of bytes of the iovec they could get the pages for. When they cannot get any pages, they're supposed to return 0, but when the start of the iovec isn't page aligned, the calculation goes wrong and they return a negative value. Fix both functions. In addition, change iov_iter_get_pages_alloc to return NULL in that case to prevent resource leaks. Signed-off-by: Andreas Gruenbacher Reviewed-by: Christoph Hellwig Signed-off-by: Sasha Levin --- lib/iov_iter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 973e5b12b37e8..9d3bda3d49fed 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1304,7 +1304,7 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, res = get_user_pages_fast(addr, n, iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, pages); - if (unlikely(res < 0)) + if (unlikely(res <= 0)) return res; return (res == n ? len : res * PAGE_SIZE) - *start; 0;}),({ @@ -1386,8 +1386,9 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, return -ENOMEM; res = get_user_pages_fast(addr, n, iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, p); - if (unlikely(res < 0)) { + if (unlikely(res <= 0)) { kvfree(p); + *pages = NULL; return res; } *pages = p; From f61d6adf236ac2ec271cd27b76b9eae8f90f8b9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Fri, 8 Oct 2021 00:05:29 -0300 Subject: [PATCH 1481/5344] ACPI: battery: Accept charges over the design capacity as full MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 2835f327bd1240508db2c89fe94a056faa53c49a ] Some buggy firmware and/or brand new batteries can support a charge that's slightly over the reported design capacity. In such cases, the kernel will report to userspace that the charging state of the battery is "Unknown", when in reality the battery charge is "Full", at least from the design capacity point of view. Make the fallback condition accepts capacities over the designed capacity so userspace knows that is full. Signed-off-by: André Almeida Reviewed-by: Hans de Goede Reviewed-by: Sebastian Reichel Signed-off-by: Rafael J. Wysocki Signed-off-by: Sasha Levin --- drivers/acpi/battery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index 254a7d98b9d4c..6e96ed68b3379 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -185,7 +185,7 @@ static int acpi_battery_is_charged(struct acpi_battery *battery) return 1; /* fallback to using design values for broken batteries */ - if (battery->design_capacity == battery->capacity_now) + if (battery->design_capacity <= battery->capacity_now) return 1; /* we don't do any sort of metric based on percentages */ From 8808492b3b627d02f0b61bb02949d9529559ee81 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 29 Sep 2021 15:02:18 -0700 Subject: [PATCH 1482/5344] leaking_addresses: Always print a trailing newline [ Upstream commit cf2a85efdade117e2169d6e26641016cbbf03ef0 ] For files that lack trailing newlines and match a leaking address (e.g. wchan[1]), the leaking_addresses.pl report would run together with the next line, making things look corrupted. Unconditionally remove the newline on input, and write it back out on output. [1] https://lore.kernel.org/all/20210103142726.GC30643@xsang-OptiPlex-9020/ Signed-off-by: Kees Cook Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211008111626.151570317@infradead.org Signed-off-by: Sasha Levin --- scripts/leaking_addresses.pl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/leaking_addresses.pl b/scripts/leaking_addresses.pl index b2d8b8aa2d99e..8f636a23bc3f2 100755 --- a/scripts/leaking_addresses.pl +++ b/scripts/leaking_addresses.pl @@ -455,8 +455,9 @@ sub parse_file open my $fh, "<", $file or return; while ( <$fh> ) { + chomp; if (may_leak_address($_)) { - print $file . ': ' . $_; + printf("$file: $_\n"); } } close $fh; From 3449913f05e8d5b2211f373ff2100aabd0696135 Mon Sep 17 00:00:00 2001 From: Zheyu Ma Date: Sat, 16 Oct 2021 11:26:21 +0000 Subject: [PATCH 1483/5344] memstick: r592: Fix a UAF bug when removing the driver [ Upstream commit 738216c1953e802aa9f930c5d15b8f9092c847ff ] In r592_remove(), the driver will free dma after freeing the host, which may cause a UAF bug. The following log reveals it: [ 45.361796 ] BUG: KASAN: use-after-free in r592_remove+0x269/0x350 [r592] [ 45.364286 ] Call Trace: [ 45.364472 ] dump_stack_lvl+0xa8/0xd1 [ 45.364751 ] print_address_description+0x87/0x3b0 [ 45.365137 ] kasan_report+0x172/0x1c0 [ 45.365415 ] ? r592_remove+0x269/0x350 [r592] [ 45.365834 ] ? r592_remove+0x269/0x350 [r592] [ 45.366168 ] __asan_report_load8_noabort+0x14/0x20 [ 45.366531 ] r592_remove+0x269/0x350 [r592] [ 45.378785 ] [ 45.378903 ] Allocated by task 4674: [ 45.379162 ] ____kasan_kmalloc+0xb5/0xe0 [ 45.379455 ] __kasan_kmalloc+0x9/0x10 [ 45.379730 ] __kmalloc+0x150/0x280 [ 45.379984 ] memstick_alloc_host+0x2a/0x190 [ 45.380664 ] [ 45.380781 ] Freed by task 5509: [ 45.381014 ] kasan_set_track+0x3d/0x70 [ 45.381293 ] kasan_set_free_info+0x23/0x40 [ 45.381635 ] ____kasan_slab_free+0x10b/0x140 [ 45.381950 ] __kasan_slab_free+0x11/0x20 [ 45.382241 ] slab_free_freelist_hook+0x81/0x150 [ 45.382575 ] kfree+0x13e/0x290 [ 45.382805 ] memstick_free+0x1c/0x20 [ 45.383070 ] device_release+0x9c/0x1d0 [ 45.383349 ] kobject_put+0x2ef/0x4c0 [ 45.383616 ] put_device+0x1f/0x30 [ 45.383865 ] memstick_free_host+0x24/0x30 [ 45.384162 ] r592_remove+0x242/0x350 [r592] [ 45.384473 ] pci_device_remove+0xa9/0x250 Signed-off-by: Zheyu Ma Link: https://lore.kernel.org/r/1634383581-11055-1-git-send-email-zheyuma97@gmail.com Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin --- drivers/memstick/host/r592.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/memstick/host/r592.c b/drivers/memstick/host/r592.c index d2ef46337191c..eaa2a94d18be4 100644 --- a/drivers/memstick/host/r592.c +++ b/drivers/memstick/host/r592.c @@ -837,15 +837,15 @@ static void r592_remove(struct pci_dev *pdev) } memstick_remove_host(dev->host); + if (dev->dummy_dma_page) + dma_free_coherent(&pdev->dev, PAGE_SIZE, dev->dummy_dma_page, + dev->dummy_dma_page_physical_address); + free_irq(dev->irq, dev); iounmap(dev->mmio); pci_release_regions(pdev); pci_disable_device(pdev); memstick_free_host(dev->host); - - if (dev->dummy_dma_page) - dma_free_coherent(&pdev->dev, PAGE_SIZE, dev->dummy_dma_page, - dev->dummy_dma_page_physical_address); } #ifdef CONFIG_PM_SLEEP From 520f449b934ce45100ee1af4ec4e24e4b7dd9da8 Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Mon, 11 Oct 2021 05:31:39 +0800 Subject: [PATCH 1484/5344] lib/xz: Avoid overlapping memcpy() with invalid input with in-place decompression [ Upstream commit 83d3c4f22a36d005b55f44628f46cc0d319a75e8 ] With valid files, the safety margin described in lib/decompress_unxz.c ensures that these buffers cannot overlap. But if the uncompressed size of the input is larger than the caller thought, which is possible when the input file is invalid/corrupt, the buffers can overlap. Obviously the result will then be garbage (and usually the decoder will return an error too) but no other harm will happen when such an over-run occurs. This change only affects uncompressed LZMA2 chunks and so this should have no effect on performance. Link: https://lore.kernel.org/r/20211010213145.17462-2-xiang@kernel.org Signed-off-by: Lasse Collin Signed-off-by: Gao Xiang Signed-off-by: Sasha Levin --- lib/decompress_unxz.c | 2 +- lib/xz/xz_dec_lzma2.c | 21 +++++++++++++++++++-- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/lib/decompress_unxz.c b/lib/decompress_unxz.c index 25d59a95bd668..abea25310ac73 100644 --- a/lib/decompress_unxz.c +++ b/lib/decompress_unxz.c @@ -167,7 +167,7 @@ * memeq and memzero are not used much and any remotely sane implementation * is fast enough. memcpy/memmove speed matters in multi-call mode, but * the kernel image is decompressed in single-call mode, in which only - * memcpy speed can matter and only if there is a lot of uncompressible data + * memmove speed can matter and only if there is a lot of uncompressible data * (LZMA2 stores uncompressible chunks in uncompressed form). Thus, the * functions below should just be kept small; it's probably not worth * optimizing for speed. diff --git a/lib/xz/xz_dec_lzma2.c b/lib/xz/xz_dec_lzma2.c index 156f26fdc4c91..dd80989ca5a6b 100644 --- a/lib/xz/xz_dec_lzma2.c +++ b/lib/xz/xz_dec_lzma2.c @@ -387,7 +387,14 @@ static void dict_uncompressed(struct dictionary *dict, struct xz_buf *b, *left -= copy_size; - memcpy(dict->buf + dict->pos, b->in + b->in_pos, copy_size); + /* + * If doing in-place decompression in single-call mode and the + * uncompressed size of the file is larger than the caller + * thought (i.e. it is invalid input!), the buffers below may + * overlap and cause undefined behavior with memcpy(). + * With valid inputs memcpy() would be fine here. + */ + memmove(dict->buf + dict->pos, b->in + b->in_pos, copy_size); dict->pos += copy_size; if (dict->full < dict->pos) @@ -397,7 +404,11 @@ static void dict_uncompressed(struct dictionary *dict, struct xz_buf *b, if (dict->pos == dict->end) dict->pos = 0; - memcpy(b->out + b->out_pos, b->in + b->in_pos, + /* + * Like above but for multi-call mode: use memmove() + * to avoid undefined behavior with invalid input. + */ + memmove(b->out + b->out_pos, b->in + b->in_pos, copy_size); } @@ -421,6 +432,12 @@ static uint32_t dict_flush(struct dictionary *dict, struct xz_buf *b) if (dict->pos == dict->end) dict->pos = 0; + /* + * These buffers cannot overlap even if doing in-place + * decompression because in multi-call mode dict->buf + * has been allocated by us in this file; it's not + * provided by the caller like in single-call mode. + */ memcpy(b->out + b->out_pos, dict->buf + dict->start, copy_size); } From ce97ac49a9ba054c05c1db99038118e12783974b Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Mon, 11 Oct 2021 05:31:40 +0800 Subject: [PATCH 1485/5344] lib/xz: Validate the value before assigning it to an enum variable [ Upstream commit 4f8d7abaa413c34da9d751289849dbfb7c977d05 ] This might matter, for example, if the underlying type of enum xz_check was a signed char. In such a case the validation wouldn't have caught an unsupported header. I don't know if this problem can occur in the kernel on any arch but it's still good to fix it because some people might copy the XZ code to their own projects from Linux instead of the upstream XZ Embedded repository. This change may increase the code size by a few bytes. An alternative would have been to use an unsigned int instead of enum xz_check but using an enumeration looks cleaner. Link: https://lore.kernel.org/r/20211010213145.17462-3-xiang@kernel.org Signed-off-by: Lasse Collin Signed-off-by: Gao Xiang Signed-off-by: Sasha Levin --- lib/xz/xz_dec_stream.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/xz/xz_dec_stream.c b/lib/xz/xz_dec_stream.c index bd1d182419d7e..0b161f90d8d80 100644 --- a/lib/xz/xz_dec_stream.c +++ b/lib/xz/xz_dec_stream.c @@ -402,12 +402,12 @@ static enum xz_ret dec_stream_header(struct xz_dec *s) * we will accept other check types too, but then the check won't * be verified and a warning (XZ_UNSUPPORTED_CHECK) will be given. */ + if (s->temp.buf[HEADER_MAGIC_SIZE + 1] > XZ_CHECK_MAX) + return XZ_OPTIONS_ERROR; + s->check_type = s->temp.buf[HEADER_MAGIC_SIZE + 1]; #ifdef XZ_DEC_ANY_CHECK - if (s->check_type > XZ_CHECK_MAX) - return XZ_OPTIONS_ERROR; - if (s->check_type > XZ_CHECK_CRC32) return XZ_UNSUPPORTED_CHECK; #else From 625a91200e97bb4b3131099093da2c3170e6fb37 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sun, 17 Oct 2021 20:04:02 +0800 Subject: [PATCH 1486/5344] workqueue: make sysfs of unbound kworker cpumask more clever [ Upstream commit d25302e46592c97d29f70ccb1be558df31a9a360 ] Some unfriendly component, such as dpdk, write the same mask to unbound kworker cpumask again and again. Every time it write to this interface some work is queue to cpu, even though the mask is same with the original mask. So, fix it by return success and do nothing if the cpumask is equal with the old one. Signed-off-by: Mengen Sun Signed-off-by: Menglong Dong Signed-off-by: Tejun Heo Signed-off-by: Sasha Levin --- kernel/workqueue.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 885d4792abdfc..77e6964ae1a99 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5302,9 +5302,6 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) int ret = -EINVAL; cpumask_var_t saved_cpumask; - if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) - return -ENOMEM; - /* * Not excluding isolated cpus on purpose. * If the user wishes to include them, we allow that. @@ -5312,6 +5309,15 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) cpumask_and(cpumask, cpumask, cpu_possible_mask); if (!cpumask_empty(cpumask)) { apply_wqattrs_lock(); + if (cpumask_equal(cpumask, wq_unbound_cpumask)) { + ret = 0; + goto out_unlock; + } + + if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) { + ret = -ENOMEM; + goto out_unlock; + } /* save the old wq_unbound_cpumask. */ cpumask_copy(saved_cpumask, wq_unbound_cpumask); @@ -5324,10 +5330,11 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) if (ret < 0) cpumask_copy(wq_unbound_cpumask, saved_cpumask); + free_cpumask_var(saved_cpumask); +out_unlock: apply_wqattrs_unlock(); } - free_cpumask_var(saved_cpumask); return ret; } From 97431ee0f9fd88f150c9433c68284319998b9ff9 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Wed, 13 Oct 2021 21:52:17 -0700 Subject: [PATCH 1487/5344] tracing/cfi: Fix cmp_entries_* functions signature mismatch [ Upstream commit 7ce1bb83a14019f8c396d57ec704d19478747716 ] If CONFIG_CFI_CLANG=y, attempting to read an event histogram will cause the kernel to panic due to failed CFI check. 1. echo 'hist:keys=common_pid' >> events/sched/sched_switch/trigger 2. cat events/sched/sched_switch/hist 3. kernel panics on attempting to read hist This happens because the sort() function expects a generic int (*)(const void *, const void *) pointer for the compare function. To prevent this CFI failure, change tracing map cmp_entries_* function signatures to match this. Also, fix the build error reported by the kernel test robot [1]. [1] https://lore.kernel.org/r/202110141140.zzi4dRh4-lkp@intel.com/ Link: https://lkml.kernel.org/r/20211014045217.3265162-1-kaleshsingh@google.com Signed-off-by: Kalesh Singh Reported-by: kernel test robot Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Sasha Levin --- kernel/trace/tracing_map.c | 40 ++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 9e31bfc818ff8..10657b8dc2c2d 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -834,29 +834,35 @@ int tracing_map_init(struct tracing_map *map) return err; } -static int cmp_entries_dup(const struct tracing_map_sort_entry **a, - const struct tracing_map_sort_entry **b) +static int cmp_entries_dup(const void *A, const void *B) { + const struct tracing_map_sort_entry *a, *b; int ret = 0; - if (memcmp((*a)->key, (*b)->key, (*a)->elt->map->key_size)) + a = *(const struct tracing_map_sort_entry **)A; + b = *(const struct tracing_map_sort_entry **)B; + + if (memcmp(a->key, b->key, a->elt->map->key_size)) ret = 1; return ret; } -static int cmp_entries_sum(const struct tracing_map_sort_entry **a, - const struct tracing_map_sort_entry **b) +static int cmp_entries_sum(const void *A, const void *B) { const struct tracing_map_elt *elt_a, *elt_b; + const struct tracing_map_sort_entry *a, *b; struct tracing_map_sort_key *sort_key; struct tracing_map_field *field; tracing_map_cmp_fn_t cmp_fn; void *val_a, *val_b; int ret = 0; - elt_a = (*a)->elt; - elt_b = (*b)->elt; + a = *(const struct tracing_map_sort_entry **)A; + b = *(const struct tracing_map_sort_entry **)B; + + elt_a = a->elt; + elt_b = b->elt; sort_key = &elt_a->map->sort_key; @@ -873,18 +879,21 @@ static int cmp_entries_sum(const struct tracing_map_sort_entry **a, return ret; } -static int cmp_entries_key(const struct tracing_map_sort_entry **a, - const struct tracing_map_sort_entry **b) +static int cmp_entries_key(const void *A, const void *B) { const struct tracing_map_elt *elt_a, *elt_b; + const struct tracing_map_sort_entry *a, *b; struct tracing_map_sort_key *sort_key; struct tracing_map_field *field; tracing_map_cmp_fn_t cmp_fn; void *val_a, *val_b; int ret = 0; - elt_a = (*a)->elt; - elt_b = (*b)->elt; + a = *(const struct tracing_map_sort_entry **)A; + b = *(const struct tracing_map_sort_entry **)B; + + elt_a = a->elt; + elt_b = b->elt; sort_key = &elt_a->map->sort_key; @@ -989,10 +998,8 @@ static void sort_secondary(struct tracing_map *map, struct tracing_map_sort_key *primary_key, struct tracing_map_sort_key *secondary_key) { - int (*primary_fn)(const struct tracing_map_sort_entry **, - const struct tracing_map_sort_entry **); - int (*secondary_fn)(const struct tracing_map_sort_entry **, - const struct tracing_map_sort_entry **); + int (*primary_fn)(const void *, const void *); + int (*secondary_fn)(const void *, const void *); unsigned i, start = 0, n_sub = 1; if (is_key(map, primary_key->field_idx)) @@ -1061,8 +1068,7 @@ int tracing_map_sort_entries(struct tracing_map *map, unsigned int n_sort_keys, struct tracing_map_sort_entry ***sort_entries) { - int (*cmp_entries_fn)(const struct tracing_map_sort_entry **, - const struct tracing_map_sort_entry **); + int (*cmp_entries_fn)(const void *, const void *); struct tracing_map_sort_entry *sort_entry, **entries; int i, n_entries, ret; From 53e501f9901820f30e3321de8209c6c987e5b84f Mon Sep 17 00:00:00 2001 From: Zheyu Ma Date: Sat, 16 Oct 2021 04:02:59 +0000 Subject: [PATCH 1488/5344] mwl8k: Fix use-after-free in mwl8k_fw_state_machine() [ Upstream commit 257051a235c17e33782b6e24a4b17f2d7915aaec ] When the driver fails to request the firmware, it calls its error handler. In the error handler, the driver detaches device from driver first before releasing the firmware, which can cause a use-after-free bug. Fix this by releasing firmware first. The following log reveals it: [ 9.007301 ] BUG: KASAN: use-after-free in mwl8k_fw_state_machine+0x320/0xba0 [ 9.010143 ] Workqueue: events request_firmware_work_func [ 9.010830 ] Call Trace: [ 9.010830 ] dump_stack_lvl+0xa8/0xd1 [ 9.010830 ] print_address_description+0x87/0x3b0 [ 9.010830 ] kasan_report+0x172/0x1c0 [ 9.010830 ] ? mutex_unlock+0xd/0x10 [ 9.010830 ] ? mwl8k_fw_state_machine+0x320/0xba0 [ 9.010830 ] ? mwl8k_fw_state_machine+0x320/0xba0 [ 9.010830 ] __asan_report_load8_noabort+0x14/0x20 [ 9.010830 ] mwl8k_fw_state_machine+0x320/0xba0 [ 9.010830 ] ? mwl8k_load_firmware+0x5f0/0x5f0 [ 9.010830 ] request_firmware_work_func+0x172/0x250 [ 9.010830 ] ? read_lock_is_recursive+0x20/0x20 [ 9.010830 ] ? process_one_work+0x7a1/0x1100 [ 9.010830 ] ? request_firmware_nowait+0x460/0x460 [ 9.010830 ] ? __this_cpu_preempt_check+0x13/0x20 [ 9.010830 ] process_one_work+0x9bb/0x1100 Signed-off-by: Zheyu Ma Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1634356979-6211-1-git-send-email-zheyuma97@gmail.com Signed-off-by: Sasha Levin --- drivers/net/wireless/marvell/mwl8k.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/marvell/mwl8k.c b/drivers/net/wireless/marvell/mwl8k.c index 1b76b24191866..14ac2384218df 100644 --- a/drivers/net/wireless/marvell/mwl8k.c +++ b/drivers/net/wireless/marvell/mwl8k.c @@ -5796,8 +5796,8 @@ static void mwl8k_fw_state_machine(const struct firmware *fw, void *context) fail: priv->fw_state = FW_STATE_ERROR; complete(&priv->firmware_loading_complete); - device_release_driver(&priv->pdev->dev); mwl8k_release_firmware(priv); + device_release_driver(&priv->pdev->dev); } #define MAX_RESTART_ATTEMPTS 1 From 7e0bc08a25865c76b274d094ce7317985e9f842c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 20 Oct 2021 08:21:40 -0600 Subject: [PATCH 1489/5344] block: remove inaccurate requeue check [ Upstream commit 037057a5a979c7eeb2ee5d12cf4c24b805192c75 ] This check is meant to catch cases where a requeue is attempted on a request that is still inserted. It's never really been useful to catch any misuse, and now it's actively wrong. Outside of that, this should not be a BUG_ON() to begin with. Remove the check as it's now causing active harm, as requeue off the plug path will trigger it even though the request state is just fine. Reported-by: Yi Zhang Link: https://lore.kernel.org/linux-block/CAHj4cs80zAUc2grnCZ015-2Rvd-=gXRfB_dFKy=RTm+wRo09HQ@mail.gmail.com/ Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- block/blk-mq.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 0674f53c60528..84798d09ca464 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -733,7 +733,6 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list) /* this request will be re-inserted to io scheduler queue */ blk_mq_sched_requeue_request(rq); - BUG_ON(!list_empty(&rq->queuelist)); blk_mq_add_to_requeue_list(rq, true, kick_requeue_list); } EXPORT_SYMBOL(blk_mq_requeue_request); From aa2e1d536a682ff5f99bf86cec455cd2b3f21eff Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 20 Oct 2021 07:59:10 +0200 Subject: [PATCH 1490/5344] nvme: drop scan_lock and always kick requeue list when removing namespaces [ Upstream commit 2b81a5f015199f3d585ce710190a9e87714d3c1e ] When reading the partition table on initial scan hits an I/O error the I/O will hang with the scan_mutex held: [<0>] do_read_cache_page+0x49b/0x790 [<0>] read_part_sector+0x39/0xe0 [<0>] read_lba+0xf9/0x1d0 [<0>] efi_partition+0xf1/0x7f0 [<0>] bdev_disk_changed+0x1ee/0x550 [<0>] blkdev_get_whole+0x81/0x90 [<0>] blkdev_get_by_dev+0x128/0x2e0 [<0>] device_add_disk+0x377/0x3c0 [<0>] nvme_mpath_set_live+0x130/0x1b0 [nvme_core] [<0>] nvme_mpath_add_disk+0x150/0x160 [nvme_core] [<0>] nvme_alloc_ns+0x417/0x950 [nvme_core] [<0>] nvme_validate_or_alloc_ns+0xe9/0x1e0 [nvme_core] [<0>] nvme_scan_work+0x168/0x310 [nvme_core] [<0>] process_one_work+0x231/0x420 and trying to delete the controller will deadlock as it tries to grab the scan mutex: [<0>] nvme_mpath_clear_ctrl_paths+0x25/0x80 [nvme_core] [<0>] nvme_remove_namespaces+0x31/0xf0 [nvme_core] [<0>] nvme_do_delete_ctrl+0x4b/0x80 [nvme_core] As we're now properly ordering the namespace list there is no need to hold the scan_mutex in nvme_mpath_clear_ctrl_paths() anymore. And we always need to kick the requeue list as the path will be marked as unusable and I/O will be requeued _without_ a current path. Signed-off-by: Hannes Reinecke Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Sasha Levin --- drivers/nvme/host/multipath.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 016a67fd41989..9f01af2f03e68 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -156,13 +156,12 @@ void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; - mutex_lock(&ctrl->scan_lock); down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) - if (nvme_mpath_clear_current_path(ns)) - kblockd_schedule_work(&ns->head->requeue_work); + list_for_each_entry(ns, &ctrl->namespaces, list) { + nvme_mpath_clear_current_path(ns); + kblockd_schedule_work(&ns->head->requeue_work); + } up_read(&ctrl->namespaces_rwsem); - mutex_unlock(&ctrl->scan_lock); } static bool nvme_path_is_disabled(struct nvme_ns *ns) From 4c551a083d055e66b99f5c2817d62072b31cebed Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Wed, 13 Oct 2021 20:19:14 +0800 Subject: [PATCH 1491/5344] PM: hibernate: Get block device exclusively in swsusp_check() [ Upstream commit 39fbef4b0f77f9c89c8f014749ca533643a37c9f ] The following kernel crash can be triggered: [ 89.266592] ------------[ cut here ]------------ [ 89.267427] kernel BUG at fs/buffer.c:3020! [ 89.268264] invalid opcode: 0000 [#1] SMP KASAN PTI [ 89.269116] CPU: 7 PID: 1750 Comm: kmmpd-loop0 Not tainted 5.10.0-862.14.0.6.x86_64-08610-gc932cda3cef4-dirty #20 [ 89.273169] RIP: 0010:submit_bh_wbc.isra.0+0x538/0x6d0 [ 89.277157] RSP: 0018:ffff888105ddfd08 EFLAGS: 00010246 [ 89.278093] RAX: 0000000000000005 RBX: ffff888124231498 RCX: ffffffffb2772612 [ 89.279332] RDX: 1ffff11024846293 RSI: 0000000000000008 RDI: ffff888124231498 [ 89.280591] RBP: ffff8881248cc000 R08: 0000000000000001 R09: ffffed1024846294 [ 89.281851] R10: ffff88812423149f R11: ffffed1024846293 R12: 0000000000003800 [ 89.283095] R13: 0000000000000001 R14: 0000000000000000 R15: ffff8881161f7000 [ 89.284342] FS: 0000000000000000(0000) GS:ffff88839b5c0000(0000) knlGS:0000000000000000 [ 89.285711] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 89.286701] CR2: 00007f166ebc01a0 CR3: 0000000435c0e000 CR4: 00000000000006e0 [ 89.287919] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 89.289138] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 89.290368] Call Trace: [ 89.290842] write_mmp_block+0x2ca/0x510 [ 89.292218] kmmpd+0x433/0x9a0 [ 89.294902] kthread+0x2dd/0x3e0 [ 89.296268] ret_from_fork+0x22/0x30 [ 89.296906] Modules linked in: by running the following commands: 1. mkfs.ext4 -O mmp /dev/sda -b 1024 2. mount /dev/sda /home/test 3. echo "/dev/sda" > /sys/power/resume That happens because swsusp_check() calls set_blocksize() on the target partition which confuses the file system: Thread1 Thread2 mount /dev/sda /home/test get s_mmp_bh --> has mapped flag start kmmpd thread echo "/dev/sda" > /sys/power/resume resume_store software_resume swsusp_check set_blocksize truncate_inode_pages_range truncate_cleanup_page block_invalidatepage discard_buffer --> clean mapped flag write_mmp_block submit_bh submit_bh_wbc BUG_ON(!buffer_mapped(bh)) To address this issue, modify swsusp_check() to open the target block device with exclusive access. Signed-off-by: Ye Bin [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki Signed-off-by: Sasha Levin --- kernel/power/swap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 0516c422206d8..d32cd03d5ff8c 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -1509,9 +1509,10 @@ int swsusp_read(unsigned int *flags_p) int swsusp_check(void) { int error; + void *holder; hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, - FMODE_READ, NULL); + FMODE_READ | FMODE_EXCL, &holder); if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); clear_page(swsusp_header); @@ -1533,7 +1534,7 @@ int swsusp_check(void) put: if (error) - blkdev_put(hib_resume_bdev, FMODE_READ); + blkdev_put(hib_resume_bdev, FMODE_READ | FMODE_EXCL); else pr_debug("Image signature found, resuming\n"); } else { From 80d03c4769ff451d71698e2c8d522bd355ff7444 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 21 Oct 2021 11:56:03 -0600 Subject: [PATCH 1492/5344] selftests: kvm: fix mismatched fclose() after popen() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit c3867ab5924b7a9a0b4a117902a08669d8be7c21 ] get_warnings_count() does fclose() using File * returned from popen(). Fix it to call pclose() as it should. tools/testing/selftests/kvm/x86_64/mmio_warning_test x86_64/mmio_warning_test.c: In function ‘get_warnings_count’: x86_64/mmio_warning_test.c:87:9: warning: ‘fclose’ called on pointer returned from a mismatched allocation function [-Wmismatched-dealloc] 87 | fclose(f); | ^~~~~~~~~ x86_64/mmio_warning_test.c:84:13: note: returned from ‘popen’ 84 | f = popen("dmesg | grep \"WARNING:\" | wc -l", "r"); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Shuah Khan Acked-by: Paolo Bonzini Signed-off-by: Shuah Khan Signed-off-by: Sasha Levin --- tools/testing/selftests/kvm/x86_64/mmio_warning_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c b/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c index 2cbc09aad7f64..92419b66b0578 100644 --- a/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c +++ b/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c @@ -84,7 +84,7 @@ int get_warnings_count(void) f = popen("dmesg | grep \"WARNING:\" | wc -l", "r"); if (fscanf(f, "%d", &warnings) < 1) warnings = 0; - fclose(f); + pclose(f); return warnings; } From 0a2a29a8fb2e2e88052c3d1a79dcf63e4355111b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sun, 17 Oct 2021 11:43:40 +0300 Subject: [PATCH 1493/5344] iwlwifi: mvm: disable RX-diversity in powersave [ Upstream commit e5322b9ab5f63536c41301150b7ce64605ce52cc ] Just like we have default SMPS mode as dynamic in powersave, we should not enable RX-diversity in powersave, to reduce power consumption when connected to a non-MIMO AP. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20211017113927.fc896bc5cdaa.I1d11da71b8a5cbe921a37058d5f578f1b14a2023@changeid Signed-off-by: Luca Coelho Signed-off-by: Sasha Levin --- drivers/net/wireless/intel/iwlwifi/mvm/utils.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c index 8686107da1168..a637d7fb4b261 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c @@ -758,6 +758,9 @@ bool iwl_mvm_rx_diversity_allowed(struct iwl_mvm *mvm) lockdep_assert_held(&mvm->mutex); + if (iwlmvm_mod_params.power_scheme != IWL_POWER_SCHEME_CAM) + return false; + if (num_of_ant(iwl_mvm_get_valid_rx_ant(mvm)) == 1) return false; From 239e484c595008b375664b06afbb55e6718ecc8d Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 19 Oct 2021 20:54:31 +0900 Subject: [PATCH 1494/5344] smackfs: use __GFP_NOFAIL for smk_cipso_doi() [ Upstream commit f91488ee15bd3cac467e2d6a361fc2d34d1052ae ] syzbot is reporting kernel panic at smk_cipso_doi() due to memory allocation fault injection [1]. The reason for need to use panic() was not explained. But since no fix was proposed for 18 months, for now let's use __GFP_NOFAIL for utilizing syzbot resource on other bugs. Link: https://syzkaller.appspot.com/bug?extid=89731ccb6fec15ce1c22 [1] Reported-by: syzbot Signed-off-by: Tetsuo Handa Signed-off-by: Casey Schaufler Signed-off-by: Sasha Levin --- security/smack/smackfs.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index cec3f56739dc2..fdf5f336f834a 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -693,9 +693,7 @@ static void smk_cipso_doi(void) printk(KERN_WARNING "%s:%d remove rc = %d\n", __func__, __LINE__, rc); - doip = kmalloc(sizeof(struct cipso_v4_doi), GFP_KERNEL); - if (doip == NULL) - panic("smack: Failed to initialize cipso DOI.\n"); + doip = kmalloc(sizeof(struct cipso_v4_doi), GFP_KERNEL | __GFP_NOFAIL); doip->map.std = NULL; doip->doi = smk_cipso_doi_value; doip->type = CIPSO_V4_MAP_PASS; From 939104ff44892b65373be3a6240c95e24efdb396 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 21 Oct 2021 09:55:17 +0900 Subject: [PATCH 1495/5344] ARM: clang: Do not rely on lr register for stacktrace [ Upstream commit b3ea5d56f212ad81328c82454829a736197ebccc ] Currently the stacktrace on clang compiled arm kernel uses the 'lr' register to find the first frame address from pt_regs. However, that is wrong after calling another function, because the 'lr' register is used by 'bl' instruction and never be recovered. As same as gcc arm kernel, directly use the frame pointer (r11) of the pt_regs to find the first frame address. Note that this fixes kretprobe stacktrace issue only with CONFIG_UNWINDER_FRAME_POINTER=y. For the CONFIG_UNWINDER_ARM, we need another fix. Signed-off-by: Masami Hiramatsu Reviewed-by: Nick Desaulniers Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Sasha Levin --- arch/arm/kernel/stacktrace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm/kernel/stacktrace.c b/arch/arm/kernel/stacktrace.c index 76ea4178a55cb..db798eac74315 100644 --- a/arch/arm/kernel/stacktrace.c +++ b/arch/arm/kernel/stacktrace.c @@ -54,8 +54,7 @@ int notrace unwind_frame(struct stackframe *frame) frame->sp = frame->fp; frame->fp = *(unsigned long *)(fp); - frame->pc = frame->lr; - frame->lr = *(unsigned long *)(fp + 4); + frame->pc = *(unsigned long *)(fp + 4); #else /* check current frame pointer is within bounds */ if (fp < low + 12 || fp > high - 4) From 9eb89211e298a8a61077a42b6ae15717a64378b6 Mon Sep 17 00:00:00 2001 From: Stephen Suryaputra Date: Wed, 20 Oct 2021 16:06:18 -0400 Subject: [PATCH 1496/5344] gre/sit: Don't generate link-local addr if addr_gen_mode is IN6_ADDR_GEN_MODE_NONE [ Upstream commit 61e18ce7348bfefb5688a8bcd4b4d6b37c0f9b2a ] When addr_gen_mode is set to IN6_ADDR_GEN_MODE_NONE, the link-local addr should not be generated. But it isn't the case for GRE (as well as GRE6) and SIT tunnels. Make it so that tunnels consider the addr_gen_mode, especially for IN6_ADDR_GEN_MODE_NONE. Do this in add_v4_addrs() to cover both GRE and SIT only if the addr scope is link. Signed-off-by: Stephen Suryaputra Acked-by: Antonio Quartulli Link: https://lore.kernel.org/r/20211020200618.467342-1-ssuryaextr@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/ipv6/addrconf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 366c3792b8604..d1f29a3eb70be 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3111,6 +3111,9 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) memcpy(&addr.s6_addr32[3], idev->dev->dev_addr, 4); if (idev->dev->flags&IFF_POINTOPOINT) { + if (idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_NONE) + return; + addr.s6_addr32[0] = htonl(0xfe800000); scope = IFA_LINK; plen = 64; From e3910766d7f0f89d21683ab052841204b29ecfa7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 18 Oct 2021 15:30:06 +0100 Subject: [PATCH 1497/5344] ARM: 9136/1: ARMv7-M uses BE-8, not BE-32 [ Upstream commit 345dac33f58894a56d17b92a41be10e16585ceff ] When configuring the kernel for big-endian, we set either BE-8 or BE-32 based on the CPU architecture level. Until linux-4.4, we did not have any ARMv7-M platform allowing big-endian builds, but now i.MX/Vybrid is in that category, adn we get a build error because of this: arch/arm/kernel/module-plts.c: In function 'get_module_plt': arch/arm/kernel/module-plts.c:60:46: error: implicit declaration of function '__opcode_to_mem_thumb32' [-Werror=implicit-function-declaration] This comes down to picking the wrong default, ARMv7-M uses BE8 like ARMv7-A does. Changing the default gets the kernel to compile and presumably works. https://lore.kernel.org/all/1455804123-2526139-2-git-send-email-arnd@arndb.de/ Tested-by: Vladimir Murzin Signed-off-by: Arnd Bergmann Signed-off-by: Russell King (Oracle) Signed-off-by: Sasha Levin --- arch/arm/mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig index b2d522f207826..7126139aa300e 100644 --- a/arch/arm/mm/Kconfig +++ b/arch/arm/mm/Kconfig @@ -752,7 +752,7 @@ config CPU_BIG_ENDIAN config CPU_ENDIAN_BE8 bool depends on CPU_BIG_ENDIAN - default CPU_V6 || CPU_V6K || CPU_V7 + default CPU_V6 || CPU_V6K || CPU_V7 || CPU_V7M help Support for the BE-8 (big-endian) mode on ARMv6 and ARMv7 processors. From 3074c797527fb8af098c17e9d4b05b82c317a5bf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 25 Oct 2021 16:14:00 +0200 Subject: [PATCH 1498/5344] vrf: run conntrack only in context of lower/physdev for locally generated packets [ Upstream commit 8c9c296adfae9ea05f655d69e9f6e13daa86fb4a ] The VRF driver invokes netfilter for output+postrouting hooks so that users can create rules that check for 'oif $vrf' rather than lower device name. This is a problem when NAT rules are configured. To avoid any conntrack involvement in round 1, tag skbs as 'untracked' to prevent conntrack from picking them up. This gets cleared before the packet gets handed to the ip stack so conntrack will be active on the second iteration. One remaining issue is that a rule like output ... oif $vrfname notrack won't propagate to the second round because we can't tell 'notrack set via ruleset' and 'notrack set by vrf driver' apart. However, this isn't a regression: the 'notrack' removal happens instead of unconditional nf_reset_ct(). I'd also like to avoid leaking more vrf specific conditionals into the netfilter infra. For ingress, conntrack has already been done before the packet makes it to the vrf driver, with this patch egress does connection tracking with lower/physical device as well. Signed-off-by: Florian Westphal Acked-by: David Ahern Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/vrf.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index f08ed52d51f3f..f436b8c130611 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -33,6 +33,7 @@ #include #include #include +#include #define DRV_NAME "vrf" #define DRV_VERSION "1.0" @@ -147,12 +148,26 @@ static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev, return NETDEV_TX_OK; } +static void vrf_nf_set_untracked(struct sk_buff *skb) +{ + if (skb_get_nfct(skb) == 0) + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); +} + +static void vrf_nf_reset_ct(struct sk_buff *skb) +{ + if (skb_get_nfct(skb) == IP_CT_UNTRACKED) + nf_reset_ct(skb); +} + #if IS_ENABLED(CONFIG_IPV6) static int vrf_ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; + vrf_nf_reset_ct(skb); + err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); @@ -232,6 +247,8 @@ static int vrf_ip_local_out(struct net *net, struct sock *sk, { int err; + vrf_nf_reset_ct(skb); + err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); if (likely(err == 1)) @@ -351,8 +368,7 @@ static void vrf_finish_direct(struct sk_buff *skb) skb_pull(skb, ETH_HLEN); } - /* reset skb device */ - nf_reset_ct(skb); + vrf_nf_reset_ct(skb); } #if IS_ENABLED(CONFIG_IPV6) @@ -366,7 +382,7 @@ static int vrf_finish_output6(struct net *net, struct sock *sk, struct neighbour *neigh; int ret; - nf_reset_ct(skb); + vrf_nf_reset_ct(skb); skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; @@ -477,6 +493,8 @@ static struct sk_buff *vrf_ip6_out_direct(struct net_device *vrf_dev, skb->dev = vrf_dev; + vrf_nf_set_untracked(skb); + err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, vrf_dev, vrf_ip6_out_direct_finish); @@ -584,7 +602,7 @@ static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *s bool is_v6gw = false; int ret = -EINVAL; - nf_reset_ct(skb); + vrf_nf_reset_ct(skb); /* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { @@ -712,6 +730,8 @@ static struct sk_buff *vrf_ip_out_direct(struct net_device *vrf_dev, skb->dev = vrf_dev; + vrf_nf_set_untracked(skb); + err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, vrf_dev, vrf_ip_out_direct_finish); From 4f8c122f2701e40a8cef0a9580c3d6af3ca6fd8d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 25 Oct 2021 11:15:55 -0700 Subject: [PATCH 1499/5344] net: annotate data-race in neigh_output() [ Upstream commit d18785e213866935b4c3dc0c33c3e18801ce0ce8 ] neigh_output() reads n->nud_state and hh->hh_len locklessly. This is fine, but we need to add annotations and document this. We evaluate skip_cache first to avoid reading these fields if the cache has to by bypassed. syzbot report: BUG: KCSAN: data-race in __neigh_event_send / ip_finish_output2 write to 0xffff88810798a885 of 1 bytes by interrupt on cpu 1: __neigh_event_send+0x40d/0xac0 net/core/neighbour.c:1128 neigh_event_send include/net/neighbour.h:444 [inline] neigh_resolve_output+0x104/0x410 net/core/neighbour.c:1476 neigh_output include/net/neighbour.h:510 [inline] ip_finish_output2+0x80a/0xaa0 net/ipv4/ip_output.c:221 ip_finish_output+0x3b5/0x510 net/ipv4/ip_output.c:309 NF_HOOK_COND include/linux/netfilter.h:296 [inline] ip_output+0xf3/0x1a0 net/ipv4/ip_output.c:423 dst_output include/net/dst.h:450 [inline] ip_local_out+0x164/0x220 net/ipv4/ip_output.c:126 __ip_queue_xmit+0x9d3/0xa20 net/ipv4/ip_output.c:525 ip_queue_xmit+0x34/0x40 net/ipv4/ip_output.c:539 __tcp_transmit_skb+0x142a/0x1a00 net/ipv4/tcp_output.c:1405 tcp_transmit_skb net/ipv4/tcp_output.c:1423 [inline] tcp_xmit_probe_skb net/ipv4/tcp_output.c:4011 [inline] tcp_write_wakeup+0x4a9/0x810 net/ipv4/tcp_output.c:4064 tcp_send_probe0+0x2c/0x2b0 net/ipv4/tcp_output.c:4079 tcp_probe_timer net/ipv4/tcp_timer.c:398 [inline] tcp_write_timer_handler+0x394/0x520 net/ipv4/tcp_timer.c:626 tcp_write_timer+0xb9/0x180 net/ipv4/tcp_timer.c:642 call_timer_fn+0x2e/0x1d0 kernel/time/timer.c:1421 expire_timers+0x135/0x240 kernel/time/timer.c:1466 __run_timers+0x368/0x430 kernel/time/timer.c:1734 run_timer_softirq+0x19/0x30 kernel/time/timer.c:1747 __do_softirq+0x12c/0x26e kernel/softirq.c:558 invoke_softirq kernel/softirq.c:432 [inline] __irq_exit_rcu kernel/softirq.c:636 [inline] irq_exit_rcu+0x4e/0xa0 kernel/softirq.c:648 sysvec_apic_timer_interrupt+0x69/0x80 arch/x86/kernel/apic/apic.c:1097 asm_sysvec_apic_timer_interrupt+0x12/0x20 native_safe_halt arch/x86/include/asm/irqflags.h:51 [inline] arch_safe_halt arch/x86/include/asm/irqflags.h:89 [inline] acpi_safe_halt drivers/acpi/processor_idle.c:109 [inline] acpi_idle_do_entry drivers/acpi/processor_idle.c:553 [inline] acpi_idle_enter+0x258/0x2e0 drivers/acpi/processor_idle.c:688 cpuidle_enter_state+0x2b4/0x760 drivers/cpuidle/cpuidle.c:237 cpuidle_enter+0x3c/0x60 drivers/cpuidle/cpuidle.c:351 call_cpuidle kernel/sched/idle.c:158 [inline] cpuidle_idle_call kernel/sched/idle.c:239 [inline] do_idle+0x1a3/0x250 kernel/sched/idle.c:306 cpu_startup_entry+0x15/0x20 kernel/sched/idle.c:403 secondary_startup_64_no_verify+0xb1/0xbb read to 0xffff88810798a885 of 1 bytes by interrupt on cpu 0: neigh_output include/net/neighbour.h:507 [inline] ip_finish_output2+0x79a/0xaa0 net/ipv4/ip_output.c:221 ip_finish_output+0x3b5/0x510 net/ipv4/ip_output.c:309 NF_HOOK_COND include/linux/netfilter.h:296 [inline] ip_output+0xf3/0x1a0 net/ipv4/ip_output.c:423 dst_output include/net/dst.h:450 [inline] ip_local_out+0x164/0x220 net/ipv4/ip_output.c:126 __ip_queue_xmit+0x9d3/0xa20 net/ipv4/ip_output.c:525 ip_queue_xmit+0x34/0x40 net/ipv4/ip_output.c:539 __tcp_transmit_skb+0x142a/0x1a00 net/ipv4/tcp_output.c:1405 tcp_transmit_skb net/ipv4/tcp_output.c:1423 [inline] tcp_xmit_probe_skb net/ipv4/tcp_output.c:4011 [inline] tcp_write_wakeup+0x4a9/0x810 net/ipv4/tcp_output.c:4064 tcp_send_probe0+0x2c/0x2b0 net/ipv4/tcp_output.c:4079 tcp_probe_timer net/ipv4/tcp_timer.c:398 [inline] tcp_write_timer_handler+0x394/0x520 net/ipv4/tcp_timer.c:626 tcp_write_timer+0xb9/0x180 net/ipv4/tcp_timer.c:642 call_timer_fn+0x2e/0x1d0 kernel/time/timer.c:1421 expire_timers+0x135/0x240 kernel/time/timer.c:1466 __run_timers+0x368/0x430 kernel/time/timer.c:1734 run_timer_softirq+0x19/0x30 kernel/time/timer.c:1747 __do_softirq+0x12c/0x26e kernel/softirq.c:558 invoke_softirq kernel/softirq.c:432 [inline] __irq_exit_rcu kernel/softirq.c:636 [inline] irq_exit_rcu+0x4e/0xa0 kernel/softirq.c:648 sysvec_apic_timer_interrupt+0x69/0x80 arch/x86/kernel/apic/apic.c:1097 asm_sysvec_apic_timer_interrupt+0x12/0x20 native_safe_halt arch/x86/include/asm/irqflags.h:51 [inline] arch_safe_halt arch/x86/include/asm/irqflags.h:89 [inline] acpi_safe_halt drivers/acpi/processor_idle.c:109 [inline] acpi_idle_do_entry drivers/acpi/processor_idle.c:553 [inline] acpi_idle_enter+0x258/0x2e0 drivers/acpi/processor_idle.c:688 cpuidle_enter_state+0x2b4/0x760 drivers/cpuidle/cpuidle.c:237 cpuidle_enter+0x3c/0x60 drivers/cpuidle/cpuidle.c:351 call_cpuidle kernel/sched/idle.c:158 [inline] cpuidle_idle_call kernel/sched/idle.c:239 [inline] do_idle+0x1a3/0x250 kernel/sched/idle.c:306 cpu_startup_entry+0x15/0x20 kernel/sched/idle.c:403 rest_init+0xee/0x100 init/main.c:734 arch_call_rest_init+0xa/0xb start_kernel+0x5e4/0x669 init/main.c:1142 secondary_startup_64_no_verify+0xb1/0xbb value changed: 0x20 -> 0x01 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.15.0-rc6-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/net/neighbour.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 2be8d6b0dfb69..4232bc8ce3d7d 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -505,10 +505,15 @@ static inline int neigh_output(struct neighbour *n, struct sk_buff *skb, { const struct hh_cache *hh = &n->hh; - if ((n->nud_state & NUD_CONNECTED) && hh->hh_len && !skip_cache) + /* n->nud_state and hh->hh_len could be changed under us. + * neigh_hh_output() is taking care of the race later. + */ + if (!skip_cache && + (READ_ONCE(n->nud_state) & NUD_CONNECTED) && + READ_ONCE(hh->hh_len)) return neigh_hh_output(hh, skb); - else - return n->output(n, skb); + + return n->output(n, skb); } static inline struct neighbour * From 7851853c4f72af2f87f316252ec39a71268a188e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 27 Jul 2021 17:01:14 -0400 Subject: [PATCH 1500/5344] btrfs: do not take the uuid_mutex in btrfs_rm_device [ Upstream commit 8ef9dc0f14ba6124c62547a4fdc59b163d8b864e ] We got the following lockdep splat while running fstests (specifically btrfs/003 and btrfs/020 in a row) with the new rc. This was uncovered by 87579e9b7d8d ("loop: use worker per cgroup instead of kworker") which converted loop to using workqueues, which comes with lockdep annotations that don't exist with kworkers. The lockdep splat is as follows: WARNING: possible circular locking dependency detected 5.14.0-rc2-custom+ #34 Not tainted ------------------------------------------------------ losetup/156417 is trying to acquire lock: ffff9c7645b02d38 ((wq_completion)loop0){+.+.}-{0:0}, at: flush_workqueue+0x84/0x600 but task is already holding lock: ffff9c7647395468 (&lo->lo_mutex){+.+.}-{3:3}, at: __loop_clr_fd+0x41/0x650 [loop] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #5 (&lo->lo_mutex){+.+.}-{3:3}: __mutex_lock+0xba/0x7c0 lo_open+0x28/0x60 [loop] blkdev_get_whole+0x28/0xf0 blkdev_get_by_dev.part.0+0x168/0x3c0 blkdev_open+0xd2/0xe0 do_dentry_open+0x163/0x3a0 path_openat+0x74d/0xa40 do_filp_open+0x9c/0x140 do_sys_openat2+0xb1/0x170 __x64_sys_openat+0x54/0x90 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae -> #4 (&disk->open_mutex){+.+.}-{3:3}: __mutex_lock+0xba/0x7c0 blkdev_get_by_dev.part.0+0xd1/0x3c0 blkdev_get_by_path+0xc0/0xd0 btrfs_scan_one_device+0x52/0x1f0 [btrfs] btrfs_control_ioctl+0xac/0x170 [btrfs] __x64_sys_ioctl+0x83/0xb0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae -> #3 (uuid_mutex){+.+.}-{3:3}: __mutex_lock+0xba/0x7c0 btrfs_rm_device+0x48/0x6a0 [btrfs] btrfs_ioctl+0x2d1c/0x3110 [btrfs] __x64_sys_ioctl+0x83/0xb0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae -> #2 (sb_writers#11){.+.+}-{0:0}: lo_write_bvec+0x112/0x290 [loop] loop_process_work+0x25f/0xcb0 [loop] process_one_work+0x28f/0x5d0 worker_thread+0x55/0x3c0 kthread+0x140/0x170 ret_from_fork+0x22/0x30 -> #1 ((work_completion)(&lo->rootcg_work)){+.+.}-{0:0}: process_one_work+0x266/0x5d0 worker_thread+0x55/0x3c0 kthread+0x140/0x170 ret_from_fork+0x22/0x30 -> #0 ((wq_completion)loop0){+.+.}-{0:0}: __lock_acquire+0x1130/0x1dc0 lock_acquire+0xf5/0x320 flush_workqueue+0xae/0x600 drain_workqueue+0xa0/0x110 destroy_workqueue+0x36/0x250 __loop_clr_fd+0x9a/0x650 [loop] lo_ioctl+0x29d/0x780 [loop] block_ioctl+0x3f/0x50 __x64_sys_ioctl+0x83/0xb0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae other info that might help us debug this: Chain exists of: (wq_completion)loop0 --> &disk->open_mutex --> &lo->lo_mutex Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&lo->lo_mutex); lock(&disk->open_mutex); lock(&lo->lo_mutex); lock((wq_completion)loop0); *** DEADLOCK *** 1 lock held by losetup/156417: #0: ffff9c7647395468 (&lo->lo_mutex){+.+.}-{3:3}, at: __loop_clr_fd+0x41/0x650 [loop] stack backtrace: CPU: 8 PID: 156417 Comm: losetup Not tainted 5.14.0-rc2-custom+ #34 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 Call Trace: dump_stack_lvl+0x57/0x72 check_noncircular+0x10a/0x120 __lock_acquire+0x1130/0x1dc0 lock_acquire+0xf5/0x320 ? flush_workqueue+0x84/0x600 flush_workqueue+0xae/0x600 ? flush_workqueue+0x84/0x600 drain_workqueue+0xa0/0x110 destroy_workqueue+0x36/0x250 __loop_clr_fd+0x9a/0x650 [loop] lo_ioctl+0x29d/0x780 [loop] ? __lock_acquire+0x3a0/0x1dc0 ? update_dl_rq_load_avg+0x152/0x360 ? lock_is_held_type+0xa5/0x120 ? find_held_lock.constprop.0+0x2b/0x80 block_ioctl+0x3f/0x50 __x64_sys_ioctl+0x83/0xb0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f645884de6b Usually the uuid_mutex exists to protect the fs_devices that map together all of the devices that match a specific uuid. In rm_device we're messing with the uuid of a device, so it makes sense to protect that here. However in doing that it pulls in a whole host of lockdep dependencies, as we call mnt_may_write() on the sb before we grab the uuid_mutex, thus we end up with the dependency chain under the uuid_mutex being added under the normal sb write dependency chain, which causes problems with loop devices. We don't need the uuid mutex here however. If we call btrfs_scan_one_device() before we scratch the super block we will find the fs_devices and not find the device itself and return EBUSY because the fs_devices is open. If we call it after the scratch happens it will not appear to be a valid btrfs file system. We do not need to worry about other fs_devices modifying operations here because we're protected by the exclusive operations locking. So drop the uuid_mutex here in order to fix the lockdep splat. A more detailed explanation from the discussion: We are worried about rm and scan racing with each other, before this change we'll zero the device out under the UUID mutex so when scan does run it'll make sure that it can go through the whole device scan thing without rm messing with us. We aren't worried if the scratch happens first, because the result is we don't think this is a btrfs device and we bail out. The only case we are concerned with is we scratch _after_ scan is able to read the superblock and gets a seemingly valid super block, so lets consider this case. Scan will call device_list_add() with the device we're removing. We'll call find_fsid_with_metadata_uuid() and get our fs_devices for this UUID. At this point we lock the fs_devices->device_list_mutex. This is what protects us in this case, but we have two cases here. 1. We aren't to the device removal part of the RM. We found our device, and device name matches our path, we go down and we set total_devices to our super number of devices, which doesn't affect anything because we haven't done the remove yet. 2. We are past the device removal part, which is protected by the device_list_mutex. Scan doesn't find the device, it goes down and does the if (fs_devices->opened) return -EBUSY; check and we bail out. Nothing about this situation is ideal, but the lockdep splat is real, and the fix is safe, tho admittedly a bit scary looking. Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ copy more from the discussion ] Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/volumes.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8af92e44deae4..344d18de1f08c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2162,8 +2162,11 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, u64 num_devices; int ret = 0; - mutex_lock(&uuid_mutex); - + /* + * The device list in fs_devices is accessed without locks (neither + * uuid_mutex nor device_list_mutex) as it won't change on a mounted + * filesystem and another device rm cannot run. + */ num_devices = btrfs_num_devices(fs_info); ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); @@ -2207,11 +2210,9 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, mutex_unlock(&fs_info->chunk_mutex); } - mutex_unlock(&uuid_mutex); ret = btrfs_shrink_device(device, 0); if (!ret) btrfs_reada_remove_dev(device); - mutex_lock(&uuid_mutex); if (ret) goto error_undo; @@ -2293,7 +2294,6 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, } out: - mutex_unlock(&uuid_mutex); return ret; error_undo: From 4e389c7dd1a7ba1deb832fec0ae98f8b53de7319 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Mon, 18 Oct 2021 15:34:13 +0800 Subject: [PATCH 1501/5344] spi: bcm-qspi: Fix missing clk_disable_unprepare() on error in bcm_qspi_probe() [ Upstream commit ca9b8f56ec089d3a436050afefd17b7237301f47 ] Fix the missing clk_disable_unprepare() before return from bcm_qspi_probe() in the error handling case. Reported-by: Hulk Robot Signed-off-by: Yang Yingliang Link: https://lore.kernel.org/r/20211018073413.2029081-1-yangyingliang@huawei.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- drivers/spi/spi-bcm-qspi.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-bcm-qspi.c b/drivers/spi/spi-bcm-qspi.c index 8a4be34bccfd2..8a1176efa4c85 100644 --- a/drivers/spi/spi-bcm-qspi.c +++ b/drivers/spi/spi-bcm-qspi.c @@ -1300,7 +1300,7 @@ int bcm_qspi_probe(struct platform_device *pdev, &qspi->dev_ids[val]); if (ret < 0) { dev_err(&pdev->dev, "IRQ %s not found\n", name); - goto qspi_probe_err; + goto qspi_unprepare_err; } qspi->dev_ids[val].dev = qspi; @@ -1315,7 +1315,7 @@ int bcm_qspi_probe(struct platform_device *pdev, if (!num_ints) { dev_err(&pdev->dev, "no IRQs registered, cannot init driver\n"); ret = -EINVAL; - goto qspi_probe_err; + goto qspi_unprepare_err; } /* @@ -1359,6 +1359,7 @@ int bcm_qspi_probe(struct platform_device *pdev, qspi_reg_err: bcm_qspi_hw_uninit(qspi); +qspi_unprepare_err: clk_disable_unprepare(qspi->clk); qspi_probe_err: kfree(qspi->dev_ids); From ede43c786425d05eacfdf404a3b8992055a30a3e Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 12 Oct 2021 17:50:05 +0200 Subject: [PATCH 1502/5344] x86/hyperv: Protect set_hv_tscchange_cb() against getting preempted [ Upstream commit 285f68afa8b20f752b0b7194d54980b5e0e27b75 ] The following issue is observed with CONFIG_DEBUG_PREEMPT when KVM loads: KVM: vmx: using Hyper-V Enlightened VMCS BUG: using smp_processor_id() in preemptible [00000000] code: systemd-udevd/488 caller is set_hv_tscchange_cb+0x16/0x80 CPU: 1 PID: 488 Comm: systemd-udevd Not tainted 5.15.0-rc5+ #396 Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.0 12/17/2019 Call Trace: dump_stack_lvl+0x6a/0x9a check_preemption_disabled+0xde/0xe0 ? kvm_gen_update_masterclock+0xd0/0xd0 [kvm] set_hv_tscchange_cb+0x16/0x80 kvm_arch_init+0x23f/0x290 [kvm] kvm_init+0x30/0x310 [kvm] vmx_init+0xaf/0x134 [kvm_intel] ... set_hv_tscchange_cb() can get preempted in between acquiring smp_processor_id() and writing to HV_X64_MSR_REENLIGHTENMENT_CONTROL. This is not an issue by itself: HV_X64_MSR_REENLIGHTENMENT_CONTROL is a partition-wide MSR and it doesn't matter which particular CPU will be used to receive reenlightenment notifications. The only real problem can (in theory) be observed if the CPU whose id was acquired with smp_processor_id() goes offline before we manage to write to the MSR, the logic in hv_cpu_die() won't be able to reassign it correctly. Reported-by: Michael Kelley Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20211012155005.1613352-1-vkuznets@redhat.com Signed-off-by: Wei Liu Signed-off-by: Sasha Levin --- arch/x86/hyperv/hv_init.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 79583bac9ac4a..812db1ac8cb11 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -155,7 +155,6 @@ void set_hv_tscchange_cb(void (*cb)(void)) struct hv_reenlightenment_control re_ctrl = { .vector = HYPERV_REENLIGHTENMENT_VECTOR, .enabled = 1, - .target_vp = hv_vp_index[smp_processor_id()] }; struct hv_tsc_emulation_control emu_ctrl = {.enabled = 1}; @@ -169,8 +168,12 @@ void set_hv_tscchange_cb(void (*cb)(void)) /* Make sure callback is registered before we write to MSRs */ wmb(); + re_ctrl.target_vp = hv_vp_index[get_cpu()]; + wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); wrmsrl(HV_X64_MSR_TSC_EMULATION_CONTROL, *((u64 *)&emu_ctrl)); + + put_cpu(); } EXPORT_SYMBOL_GPL(set_hv_tscchange_cb); From 4f458bc7b8deec293c9cc50d40127a4f4a9e587a Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Sat, 9 Oct 2021 20:24:39 +0200 Subject: [PATCH 1503/5344] parisc: fix warning in flush_tlb_all [ Upstream commit 1030d681319b43869e0d5b568b9d0226652d1a6f ] I've got the following splat after enabling preemption: [ 3.724721] BUG: using __this_cpu_add() in preemptible [00000000] code: swapper/0/1 [ 3.734630] caller is __this_cpu_preempt_check+0x38/0x50 [ 3.740635] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 5.15.0-rc4-64bit+ #324 [ 3.744605] Hardware name: 9000/785/C8000 [ 3.744605] Backtrace: [ 3.744605] [<00000000401d9d58>] show_stack+0x74/0xb0 [ 3.744605] [<0000000040c27bd4>] dump_stack_lvl+0x10c/0x188 [ 3.744605] [<0000000040c27c84>] dump_stack+0x34/0x48 [ 3.744605] [<0000000040c33438>] check_preemption_disabled+0x178/0x1b0 [ 3.744605] [<0000000040c334f8>] __this_cpu_preempt_check+0x38/0x50 [ 3.744605] [<00000000401d632c>] flush_tlb_all+0x58/0x2e0 [ 3.744605] [<00000000401075c0>] 0x401075c0 [ 3.744605] [<000000004010b8fc>] 0x4010b8fc [ 3.744605] [<00000000401080fc>] 0x401080fc [ 3.744605] [<00000000401d5224>] do_one_initcall+0x128/0x378 [ 3.744605] [<0000000040102de8>] 0x40102de8 [ 3.744605] [<0000000040c33864>] kernel_init+0x60/0x3a8 [ 3.744605] [<00000000401d1020>] ret_from_kernel_thread+0x20/0x28 [ 3.744605] Fix this by moving the __inc_irq_stat() into the locked section. Signed-off-by: Sven Schnelle Signed-off-by: Helge Deller Signed-off-by: Sasha Levin --- arch/parisc/mm/init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 3e54484797f62..d769d61cde7ca 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -892,9 +892,9 @@ void flush_tlb_all(void) { int do_recycle; - __inc_irq_stat(irq_tlb_count); do_recycle = 0; spin_lock(&sid_lock); + __inc_irq_stat(irq_tlb_count); if (dirty_space_ids > RECYCLE_THRESHOLD) { BUG_ON(recycle_inuse); /* FIXME: Use a semaphore/wait queue here */ get_dirty_sids(&recycle_ndirty,recycle_dirty_array); @@ -913,8 +913,8 @@ void flush_tlb_all(void) #else void flush_tlb_all(void) { - __inc_irq_stat(irq_tlb_count); spin_lock(&sid_lock); + __inc_irq_stat(irq_tlb_count); flush_tlb_all_local(NULL); recycle_sids(); spin_unlock(&sid_lock); From fe859e49bd9f76b9a2b2773653e4587187c05b0c Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Tue, 5 Oct 2021 00:05:43 +0200 Subject: [PATCH 1504/5344] task_stack: Fix end_of_stack() for architectures with upwards-growing stack [ Upstream commit 9cc2fa4f4a92ccc6760d764e7341be46ee8aaaa1 ] The function end_of_stack() returns a pointer to the last entry of a stack. For architectures like parisc where the stack grows upwards return the pointer to the highest address in the stack. Without this change I faced a crash on parisc, because the stackleak functionality wrote STACKLEAK_POISON to the lowest address and thus overwrote the first 4 bytes of the task_struct which included the TIF_FLAGS. Signed-off-by: Helge Deller Signed-off-by: Sasha Levin --- include/linux/sched/task_stack.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h index 2413427e439c7..d10150587d819 100644 --- a/include/linux/sched/task_stack.h +++ b/include/linux/sched/task_stack.h @@ -25,7 +25,11 @@ static inline void *task_stack_page(const struct task_struct *task) static inline unsigned long *end_of_stack(const struct task_struct *task) { +#ifdef CONFIG_STACK_GROWSUP + return (unsigned long *)((unsigned long)task->stack + THREAD_SIZE) - 1; +#else return task->stack; +#endif } #elif !defined(__HAVE_THREAD_FUNCTIONS) From c5192bde95df97002ffec78add202edd2704199c Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Sat, 9 Oct 2021 23:15:17 +0200 Subject: [PATCH 1505/5344] parisc/unwind: fix unwinder when CONFIG_64BIT is enabled [ Upstream commit 8e0ba125c2bf1030af3267058019ba86da96863f ] With 64 bit kernels unwind_special() is not working because it compares the pc to the address of the function descriptor. Add a helper function that compares pc with the dereferenced address. This fixes all of the backtraces on my c8000. Without this changes, a lot of backtraces are missing in kdb or the show-all-tasks command from /proc/sysrq-trigger. Signed-off-by: Sven Schnelle Signed-off-by: Helge Deller Signed-off-by: Sasha Levin --- arch/parisc/kernel/unwind.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/arch/parisc/kernel/unwind.c b/arch/parisc/kernel/unwind.c index 87ae476d1c4f5..86a57fb0e6fae 100644 --- a/arch/parisc/kernel/unwind.c +++ b/arch/parisc/kernel/unwind.c @@ -21,6 +21,8 @@ #include #include +#include +#include /* #define DEBUG 1 */ #ifdef DEBUG @@ -203,6 +205,11 @@ int __init unwind_init(void) return 0; } +static bool pc_is_kernel_fn(unsigned long pc, void *fn) +{ + return (unsigned long)dereference_kernel_function_descriptor(fn) == pc; +} + static int unwind_special(struct unwind_frame_info *info, unsigned long pc, int frame_size) { /* @@ -221,7 +228,7 @@ static int unwind_special(struct unwind_frame_info *info, unsigned long pc, int extern void * const _call_on_stack; #endif /* CONFIG_IRQSTACKS */ - if (pc == (unsigned long) &handle_interruption) { + if (pc_is_kernel_fn(pc, handle_interruption)) { struct pt_regs *regs = (struct pt_regs *)(info->sp - frame_size - PT_SZ_ALGN); dbg("Unwinding through handle_interruption()\n"); info->prev_sp = regs->gr[30]; @@ -229,13 +236,13 @@ static int unwind_special(struct unwind_frame_info *info, unsigned long pc, int return 1; } - if (pc == (unsigned long) &ret_from_kernel_thread || - pc == (unsigned long) &syscall_exit) { + if (pc_is_kernel_fn(pc, ret_from_kernel_thread) || + pc_is_kernel_fn(pc, syscall_exit)) { info->prev_sp = info->prev_ip = 0; return 1; } - if (pc == (unsigned long) &intr_return) { + if (pc_is_kernel_fn(pc, intr_return)) { struct pt_regs *regs; dbg("Found intr_return()\n"); @@ -246,20 +253,20 @@ static int unwind_special(struct unwind_frame_info *info, unsigned long pc, int return 1; } - if (pc == (unsigned long) &_switch_to_ret) { + if (pc_is_kernel_fn(pc, _switch_to) || + pc_is_kernel_fn(pc, _switch_to_ret)) { info->prev_sp = info->sp - CALLEE_SAVE_FRAME_SIZE; info->prev_ip = *(unsigned long *)(info->prev_sp - RP_OFFSET); return 1; } #ifdef CONFIG_IRQSTACKS - if (pc == (unsigned long) &_call_on_stack) { + if (pc_is_kernel_fn(pc, _call_on_stack)) { info->prev_sp = *(unsigned long *)(info->sp - FRAME_SIZE - REG_SZ); info->prev_ip = *(unsigned long *)(info->sp - FRAME_SIZE - RP_OFFSET); return 1; } #endif - return 0; } From c1fe5c996a401ea6c775f458bd0bc30ec13d8b16 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Fri, 15 Oct 2021 21:49:23 +0200 Subject: [PATCH 1506/5344] parisc/kgdb: add kgdb_roundup() to make kgdb work with idle polling [ Upstream commit 66e29fcda1824f0427966fbee2bd2c85bf362c82 ] With idle polling, IPIs are not sent when a CPU idle, but queued and run later from do_idle(). The default kgdb_call_nmi_hook() implementation gets the pointer to struct pt_regs from get_irq_reqs(), which doesn't work in that case because it was not called from the IPI interrupt handler. Fix it by defining our own kgdb_roundup() function which sents an IPI_ENTER_KGDB. When that IPI is received on the target CPU kgdb_nmicallback() is called. Signed-off-by: Sven Schnelle Signed-off-by: Helge Deller Signed-off-by: Sasha Levin --- arch/parisc/kernel/smp.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c index e202c37e56af3..9997465c11820 100644 --- a/arch/parisc/kernel/smp.c +++ b/arch/parisc/kernel/smp.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -71,7 +72,10 @@ enum ipi_message_type { IPI_CALL_FUNC, IPI_CPU_START, IPI_CPU_STOP, - IPI_CPU_TEST + IPI_CPU_TEST, +#ifdef CONFIG_KGDB + IPI_ENTER_KGDB, +#endif }; @@ -169,7 +173,12 @@ ipi_interrupt(int irq, void *dev_id) case IPI_CPU_TEST: smp_debug(100, KERN_DEBUG "CPU%d is alive!\n", this_cpu); break; - +#ifdef CONFIG_KGDB + case IPI_ENTER_KGDB: + smp_debug(100, KERN_DEBUG "CPU%d ENTER_KGDB\n", this_cpu); + kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs()); + break; +#endif default: printk(KERN_CRIT "Unknown IPI num on CPU%d: %lu\n", this_cpu, which); @@ -225,6 +234,12 @@ send_IPI_allbutself(enum ipi_message_type op) } } +#ifdef CONFIG_KGDB +void kgdb_roundup_cpus(void) +{ + send_IPI_allbutself(IPI_ENTER_KGDB); +} +#endif inline void smp_send_stop(void) { send_IPI_allbutself(IPI_CPU_STOP); } From db66c0b730b4c326d66c62fbf9c1430ff43497ab Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 25 Oct 2021 11:26:49 +0200 Subject: [PATCH 1507/5344] netfilter: conntrack: set on IPS_ASSURED if flows enters internal stream state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit b7b1d02fc43925a4d569ec221715db2dfa1ce4f5 ] The internal stream state sets the timeout to 120 seconds 2 seconds after the creation of the flow, attach this internal stream state to the IPS_ASSURED flag for consistent event reporting. Before this patch: [NEW] udp 17 30 src=10.246.11.13 dst=216.239.35.0 sport=37282 dport=123 [UNREPLIED] src=216.239.35.0 dst=10.246.11.13 sport=123 dport=37282 [UPDATE] udp 17 30 src=10.246.11.13 dst=216.239.35.0 sport=37282 dport=123 src=216.239.35.0 dst=10.246.11.13 sport=123 dport=37282 [UPDATE] udp 17 30 src=10.246.11.13 dst=216.239.35.0 sport=37282 dport=123 src=216.239.35.0 dst=10.246.11.13 sport=123 dport=37282 [ASSURED] [DESTROY] udp 17 src=10.246.11.13 dst=216.239.35.0 sport=37282 dport=123 src=216.239.35.0 dst=10.246.11.13 sport=123 dport=37282 [ASSURED] Note IPS_ASSURED for the flow not yet in the internal stream state. after this update: [NEW] udp 17 30 src=10.246.11.13 dst=216.239.35.0 sport=37282 dport=123 [UNREPLIED] src=216.239.35.0 dst=10.246.11.13 sport=123 dport=37282 [UPDATE] udp 17 30 src=10.246.11.13 dst=216.239.35.0 sport=37282 dport=123 src=216.239.35.0 dst=10.246.11.13 sport=123 dport=37282 [UPDATE] udp 17 120 src=10.246.11.13 dst=216.239.35.0 sport=37282 dport=123 src=216.239.35.0 dst=10.246.11.13 sport=123 dport=37282 [ASSURED] [DESTROY] udp 17 src=10.246.11.13 dst=216.239.35.0 sport=37282 dport=123 src=216.239.35.0 dst=10.246.11.13 sport=123 dport=37282 [ASSURED] Before this patch, short-lived UDP flows never entered IPS_ASSURED, so they were already candidate flow to be deleted by early_drop under stress. Before this patch, IPS_ASSURED is set on regardless the internal stream state, attach this internal stream state to IPS_ASSURED. packet #1 (original direction) enters NEW state packet #2 (reply direction) enters ESTABLISHED state, sets on IPS_SEEN_REPLY paclet #3 (any direction) sets on IPS_ASSURED (if 2 seconds since the creation has passed by). Reported-by: Maciej Żenczykowski Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/nf_conntrack_proto_udp.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 7365b43f8f980..e3a2d018f4ec5 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -105,15 +105,18 @@ int nf_conntrack_udp_packet(struct nf_conn *ct, */ if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { unsigned long extra = timeouts[UDP_CT_UNREPLIED]; + bool stream = false; /* Still active after two seconds? Extend timeout. */ - if (time_after(jiffies, ct->proto.udp.stream_ts)) + if (time_after(jiffies, ct->proto.udp.stream_ts)) { extra = timeouts[UDP_CT_REPLIED]; + stream = true; + } nf_ct_refresh_acct(ct, ctinfo, skb, extra); /* Also, more likely to be important, and not a probe */ - if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) + if (stream && !test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); } else { nf_ct_refresh_acct(ct, ctinfo, skb, From 87a4d11f47c64e8d94f9b9f48e6481738da6e1a9 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 29 Oct 2021 11:29:07 -0700 Subject: [PATCH 1508/5344] selftests/bpf: Fix strobemeta selftest regression [ Upstream commit 0133c20480b14820d43c37c0e9502da4bffcad3a ] After most recent nightly Clang update strobemeta selftests started failing with the following error (relevant portion of assembly included): 1624: (85) call bpf_probe_read_user_str#114 1625: (bf) r1 = r0 1626: (18) r2 = 0xfffffffe 1628: (5f) r1 &= r2 1629: (55) if r1 != 0x0 goto pc+7 1630: (07) r9 += 104 1631: (6b) *(u16 *)(r9 +0) = r0 1632: (67) r0 <<= 32 1633: (77) r0 >>= 32 1634: (79) r1 = *(u64 *)(r10 -456) 1635: (0f) r1 += r0 1636: (7b) *(u64 *)(r10 -456) = r1 1637: (79) r1 = *(u64 *)(r10 -368) 1638: (c5) if r1 s< 0x1 goto pc+778 1639: (bf) r6 = r8 1640: (0f) r6 += r7 1641: (b4) w1 = 0 1642: (6b) *(u16 *)(r6 +108) = r1 1643: (79) r3 = *(u64 *)(r10 -352) 1644: (79) r9 = *(u64 *)(r10 -456) 1645: (bf) r1 = r9 1646: (b4) w2 = 1 1647: (85) call bpf_probe_read_user_str#114 R1 unbounded memory access, make sure to bounds check any such access In the above code r0 and r1 are implicitly related. Clang knows that, but verifier isn't able to infer this relationship. Yonghong Song narrowed down this "regression" in code generation to a recent Clang optimization change ([0]), which for BPF target generates code pattern that BPF verifier can't handle and loses track of register boundaries. This patch works around the issue by adding an BPF assembly-based helper that helps to prove to the verifier that upper bound of the register is a given constant by controlling the exact share of generated BPF instruction sequence. This fixes the immediate issue for strobemeta selftest. [0] https://github.com/llvm/llvm-project/commit/acabad9ff6bf13e00305d9d8621ee8eafc1f8b08 Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20211029182907.166910-1-andrii@kernel.org Signed-off-by: Sasha Levin --- tools/testing/selftests/bpf/progs/strobemeta.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/strobemeta.h b/tools/testing/selftests/bpf/progs/strobemeta.h index 067eb625d01c5..938765d87528a 100644 --- a/tools/testing/selftests/bpf/progs/strobemeta.h +++ b/tools/testing/selftests/bpf/progs/strobemeta.h @@ -10,6 +10,14 @@ #include #include "bpf_helpers.h" +#define bpf_clamp_umax(VAR, UMAX) \ + asm volatile ( \ + "if %0 <= %[max] goto +1\n" \ + "%0 = %[max]\n" \ + : "+r"(VAR) \ + : [max]"i"(UMAX) \ + ) + typedef uint32_t pid_t; struct task_struct {}; @@ -404,6 +412,7 @@ static __always_inline void *read_map_var(struct strobemeta_cfg *cfg, len = bpf_probe_read_str(payload, STROBE_MAX_STR_LEN, map.tag); if (len <= STROBE_MAX_STR_LEN) { + bpf_clamp_umax(len, STROBE_MAX_STR_LEN); descr->tag_len = len; payload += len; } @@ -421,6 +430,7 @@ static __always_inline void *read_map_var(struct strobemeta_cfg *cfg, len = bpf_probe_read_str(payload, STROBE_MAX_STR_LEN, map.entries[i].key); if (len <= STROBE_MAX_STR_LEN) { + bpf_clamp_umax(len, STROBE_MAX_STR_LEN); descr->key_lens[i] = len; payload += len; } @@ -428,6 +438,7 @@ static __always_inline void *read_map_var(struct strobemeta_cfg *cfg, len = bpf_probe_read_str(payload, STROBE_MAX_STR_LEN, map.entries[i].val); if (len <= STROBE_MAX_STR_LEN) { + bpf_clamp_umax(len, STROBE_MAX_STR_LEN); descr->val_lens[i] = len; payload += len; } From 4f40d0eac74d9d7d6a8f8f29d1d20a253963a8e2 Mon Sep 17 00:00:00 2001 From: Desmond Cheong Zhi Xi Date: Thu, 2 Sep 2021 23:13:06 -0400 Subject: [PATCH 1509/5344] Bluetooth: fix init and cleanup of sco_conn.timeout_work [ Upstream commit 49d8a5606428ca0962d09050a5af81461ff90fbb ] Before freeing struct sco_conn, all delayed timeout work should be cancelled. Otherwise, sco_sock_timeout could potentially use the sco_conn after it has been freed. Additionally, sco_conn.timeout_work should be initialized when the connection is allocated, not when the channel is added. This is because an sco_conn can create channels with multiple sockets over its lifetime, which happens if sockets are released but the connection isn't deleted. Fixes: ba316be1b6a0 ("Bluetooth: schedule SCO timeouts with delayed_work") Signed-off-by: Desmond Cheong Zhi Xi Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Sasha Levin --- net/bluetooth/sco.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index cc5a1d2545679..2c616c1c62958 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -133,6 +133,7 @@ static struct sco_conn *sco_conn_add(struct hci_conn *hcon) return NULL; spin_lock_init(&conn->lock); + INIT_DELAYED_WORK(&conn->timeout_work, sco_sock_timeout); hcon->sco_data = conn; conn->hcon = hcon; @@ -196,11 +197,11 @@ static void sco_conn_del(struct hci_conn *hcon, int err) sco_chan_del(sk, err); bh_unlock_sock(sk); sock_put(sk); - - /* Ensure no more work items will run before freeing conn. */ - cancel_delayed_work_sync(&conn->timeout_work); } + /* Ensure no more work items will run before freeing conn. */ + cancel_delayed_work_sync(&conn->timeout_work); + hcon->sco_data = NULL; kfree(conn); } @@ -213,8 +214,6 @@ static void __sco_chan_add(struct sco_conn *conn, struct sock *sk, sco_pi(sk)->conn = conn; conn->sk = sk; - INIT_DELAYED_WORK(&conn->timeout_work, sco_sock_timeout); - if (parent) bt_accept_enqueue(parent, sk, true); } From 045dce36a2957ddcb46217f2c813b71ed269f2b2 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Wed, 18 Aug 2021 13:34:00 +0530 Subject: [PATCH 1510/5344] rcu: Fix existing exp request check in sync_sched_exp_online_cleanup() [ Upstream commit f0b2b2df5423fb369ac762c77900bc7765496d58 ] The sync_sched_exp_online_cleanup() checks to see if RCU needs an expedited quiescent state from the incoming CPU, sending it an IPI if so. Before sending IPI, it checks whether expedited qs need has been already requested for the incoming CPU, by checking rcu_data.cpu_no_qs.b.exp for the current cpu, on which sync_sched_exp_online_cleanup() is running. This works for the case where incoming CPU is same as self. However, for the case where incoming CPU is different from self, expedited request won't get marked, which can potentially delay reporting of expedited quiescent state for the incoming CPU. Fixes: e015a3411220 ("rcu: Avoid self-IPI in sync_sched_exp_online_cleanup()") Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney Signed-off-by: Sasha Levin --- kernel/rcu/tree_exp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index df90d4d7ad2e2..4c4d7683a4e5b 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -738,7 +738,7 @@ static void sync_sched_exp_online_cleanup(int cpu) my_cpu = get_cpu(); /* Quiescent state either not needed or already requested, leave. */ if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || - __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) { + rdp->cpu_no_qs.b.exp) { put_cpu(); return; } From 992ddd65f966cd2dd9342014a973a399e66bbcd6 Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Wed, 15 Sep 2021 12:05:07 +0200 Subject: [PATCH 1511/5344] drm/v3d: fix wait for TMU write combiner flush [ Upstream commit e4f868191138975f2fdf2f37c11318b47db4acc9 ] The hardware sets the TMUWCF bit back to 0 when the TMU write combiner flush completes so we should be checking for that instead of the L2TFLS bit. v2 (Melissa Wen): - Add Signed-off-by and Fixes tags. - Change the error message for the timeout to be more clear. Fixes spurious Vulkan CTS failures in: dEQP-VK.binding_model.descriptorset_random.* Fixes: d223f98f02099 ("drm/v3d: Add support for compute shader dispatch.") Signed-off-by: Iago Toral Quiroga Reviewed-by: Melissa Wen Signed-off-by: Melissa Wen Link: https://patchwork.freedesktop.org/patch/msgid/20210915100507.3945-1-itoral@igalia.com Signed-off-by: Sasha Levin --- drivers/gpu/drm/v3d/v3d_gem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c index 19c092d75266b..1609a85429cef 100644 --- a/drivers/gpu/drm/v3d/v3d_gem.c +++ b/drivers/gpu/drm/v3d/v3d_gem.c @@ -195,8 +195,8 @@ v3d_clean_caches(struct v3d_dev *v3d) V3D_CORE_WRITE(core, V3D_CTL_L2TCACTL, V3D_L2TCACTL_TMUWCF); if (wait_for(!(V3D_CORE_READ(core, V3D_CTL_L2TCACTL) & - V3D_L2TCACTL_L2TFLS), 100)) { - DRM_ERROR("Timeout waiting for L1T write combiner flush\n"); + V3D_L2TCACTL_TMUWCF), 100)) { + DRM_ERROR("Timeout waiting for TMU write combiner flush\n"); } mutex_lock(&v3d->cache_clean_lock); From 83aba0d41698d8542219098af7d04c91183a8f46 Mon Sep 17 00:00:00 2001 From: liuyuntao Date: Sat, 28 Aug 2021 18:43:21 +0800 Subject: [PATCH 1512/5344] virtio-gpu: fix possible memory allocation failure [ Upstream commit 5bd4f20de8acad37dbb3154feb34dbc36d506c02 ] When kmem_cache_zalloc in virtio_gpu_get_vbuf fails, it will return an error code. But none of its callers checks this error code, and a core dump will take place. Considering many of its callers can't handle such error, I add a __GFP_NOFAIL flag when calling kmem_cache_zalloc to make sure it won't fail, and delete those unused error handlings. Fixes: dc5698e80cf724 ("Add virtio gpu driver.") Signed-off-by: Yuntao Liu Link: http://patchwork.freedesktop.org/patch/msgid/20210828104321.3410312-1-liuyuntao10@huawei.com Signed-off-by: Gerd Hoffmann Signed-off-by: Sasha Levin --- drivers/gpu/drm/virtio/virtgpu_vq.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/gpu/drm/virtio/virtgpu_vq.c b/drivers/gpu/drm/virtio/virtgpu_vq.c index bb46e7a0f1b5d..0ca996e6fd5cb 100644 --- a/drivers/gpu/drm/virtio/virtgpu_vq.c +++ b/drivers/gpu/drm/virtio/virtgpu_vq.c @@ -80,9 +80,7 @@ virtio_gpu_get_vbuf(struct virtio_gpu_device *vgdev, { struct virtio_gpu_vbuffer *vbuf; - vbuf = kmem_cache_zalloc(vgdev->vbufs, GFP_KERNEL); - if (!vbuf) - return ERR_PTR(-ENOMEM); + vbuf = kmem_cache_zalloc(vgdev->vbufs, GFP_KERNEL | __GFP_NOFAIL); BUG_ON(size > MAX_INLINE_CMD_SIZE); vbuf->buf = (void *)vbuf + sizeof(*vbuf); @@ -142,10 +140,6 @@ static void *virtio_gpu_alloc_cmd_resp(struct virtio_gpu_device *vgdev, vbuf = virtio_gpu_get_vbuf(vgdev, cmd_size, resp_size, resp_buf, cb); - if (IS_ERR(vbuf)) { - *vbuffer_p = NULL; - return ERR_CAST(vbuf); - } *vbuffer_p = vbuf; return (struct virtio_gpu_command *)vbuf->buf; } From f1e7ea01d63ff54f05ddedebb8c7adc42b527f1d Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Sat, 18 Sep 2021 17:04:10 +0800 Subject: [PATCH 1513/5344] net: net_namespace: Fix undefined member in key_remove_domain() [ Upstream commit aed0826b0cf2e488900ab92193893e803d65c070 ] The key_domain member in struct net only exists if we define CONFIG_KEYS. So we should add the define when we used key_domain. Fixes: 9b242610514f ("keys: Network namespace domain tag") Signed-off-by: Yajun Deng Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/core/net_namespace.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 9bf15512601bf..cd1d40195e461 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -480,7 +480,9 @@ struct net *copy_net_ns(unsigned long flags, if (rv < 0) { put_userns: +#ifdef CONFIG_KEYS key_remove_domain(net->key_domain); +#endif put_user_ns(user_ns); net_drop_ns(net); dec_ucounts: @@ -612,7 +614,9 @@ static void cleanup_net(struct work_struct *work) list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { list_del_init(&net->exit_list); dec_net_namespaces(net->ucounts); +#ifdef CONFIG_KEYS key_remove_domain(net->key_domain); +#endif put_user_ns(net->user_ns); net_drop_ns(net); } From 057a6dc555c0c7438d879cc71b97f8243fec80cc Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sat, 18 Sep 2021 18:53:08 -0400 Subject: [PATCH 1514/5344] cgroup: Make rebind_subsystems() disable v2 controllers all at once [ Upstream commit 7ee285395b211cad474b2b989db52666e0430daf ] It was found that the following warning was displayed when remounting controllers from cgroup v2 to v1: [ 8042.997778] WARNING: CPU: 88 PID: 80682 at kernel/cgroup/cgroup.c:3130 cgroup_apply_control_disable+0x158/0x190 : [ 8043.091109] RIP: 0010:cgroup_apply_control_disable+0x158/0x190 [ 8043.096946] Code: ff f6 45 54 01 74 39 48 8d 7d 10 48 c7 c6 e0 46 5a a4 e8 7b 67 33 00 e9 41 ff ff ff 49 8b 84 24 e8 01 00 00 0f b7 40 08 eb 95 <0f> 0b e9 5f ff ff ff 48 83 c4 08 5b 5d 41 5c 41 5d 41 5e 41 5f c3 [ 8043.115692] RSP: 0018:ffffba8a47c23d28 EFLAGS: 00010202 [ 8043.120916] RAX: 0000000000000036 RBX: ffffffffa624ce40 RCX: 000000000000181a [ 8043.128047] RDX: ffffffffa63c43e0 RSI: ffffffffa63c43e0 RDI: ffff9d7284ee1000 [ 8043.135180] RBP: ffff9d72874c5800 R08: ffffffffa624b090 R09: 0000000000000004 [ 8043.142314] R10: ffffffffa624b080 R11: 0000000000002000 R12: ffff9d7284ee1000 [ 8043.149447] R13: ffff9d7284ee1000 R14: ffffffffa624ce70 R15: ffffffffa6269e20 [ 8043.156576] FS: 00007f7747cff740(0000) GS:ffff9d7a5fc00000(0000) knlGS:0000000000000000 [ 8043.164663] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 8043.170409] CR2: 00007f7747e96680 CR3: 0000000887d60001 CR4: 00000000007706e0 [ 8043.177539] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 8043.184673] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 8043.191804] PKRU: 55555554 [ 8043.194517] Call Trace: [ 8043.196970] rebind_subsystems+0x18c/0x470 [ 8043.201070] cgroup_setup_root+0x16c/0x2f0 [ 8043.205177] cgroup1_root_to_use+0x204/0x2a0 [ 8043.209456] cgroup1_get_tree+0x3e/0x120 [ 8043.213384] vfs_get_tree+0x22/0xb0 [ 8043.216883] do_new_mount+0x176/0x2d0 [ 8043.220550] __x64_sys_mount+0x103/0x140 [ 8043.224474] do_syscall_64+0x38/0x90 [ 8043.228063] entry_SYSCALL_64_after_hwframe+0x44/0xae It was caused by the fact that rebind_subsystem() disables controllers to be rebound one by one. If more than one disabled controllers are originally from the default hierarchy, it means that cgroup_apply_control_disable() will be called multiple times for the same default hierarchy. A controller may be killed by css_kill() in the first round. In the second round, the killed controller may not be completely dead yet leading to the warning. To avoid this problem, we collect all the ssid's of controllers that needed to be disabled from the default hierarchy and then disable them in one go instead of one by one. Fixes: 334c3679ec4b ("cgroup: reimplement rebind_subsystems() using cgroup_apply_control() and friends") Signed-off-by: Waiman Long Signed-off-by: Tejun Heo Signed-off-by: Sasha Levin --- kernel/cgroup/cgroup.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 007b861b75516..f1f1c169d167f 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1736,6 +1736,7 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; int ssid, i, ret; + u16 dfl_disable_ss_mask = 0; lockdep_assert_held(&cgroup_mutex); @@ -1752,8 +1753,28 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) /* can't move between two non-dummy roots either */ if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root) return -EBUSY; + + /* + * Collect ssid's that need to be disabled from default + * hierarchy. + */ + if (ss->root == &cgrp_dfl_root) + dfl_disable_ss_mask |= 1 << ssid; + } while_each_subsys_mask(); + if (dfl_disable_ss_mask) { + struct cgroup *scgrp = &cgrp_dfl_root.cgrp; + + /* + * Controllers from default hierarchy that need to be rebound + * are all disabled together in one go. + */ + cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask; + WARN_ON(cgroup_apply_control(scgrp)); + cgroup_finalize_control(scgrp, 0); + } + do_each_subsys_mask(ss, ssid, ss_mask) { struct cgroup_root *src_root = ss->root; struct cgroup *scgrp = &src_root->cgrp; @@ -1762,10 +1783,12 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) WARN_ON(!css || cgroup_css(dcgrp, ss)); - /* disable from the source */ - src_root->subsys_mask &= ~(1 << ssid); - WARN_ON(cgroup_apply_control(scgrp)); - cgroup_finalize_control(scgrp, 0); + if (src_root != &cgrp_dfl_root) { + /* disable from the source */ + src_root->subsys_mask &= ~(1 << ssid); + WARN_ON(cgroup_apply_control(scgrp)); + cgroup_finalize_control(scgrp, 0); + } /* rebind */ RCU_INIT_POINTER(scgrp->subsys[ssid], NULL); From 720e64bf51d4e742d2ba466872cacde7acd9374f Mon Sep 17 00:00:00 2001 From: Ajay Singh Date: Thu, 16 Sep 2021 16:49:18 +0000 Subject: [PATCH 1515/5344] wilc1000: fix possible memory leak in cfg_scan_result() [ Upstream commit 3c719fed0f3a5e95b1d164609ecc81c4191ade70 ] When the BSS reference holds a valid reference, it is not freed. The 'if' condition is wrong. Instead of the 'if (bss)' check, the 'if (!bss)' check is used. The issue is solved by removing the unnecessary 'if' check because cfg80211_put_bss() already performs the NULL validation. Fixes: 6cd4fa5ab691 ("staging: wilc1000: make use of cfg80211_inform_bss_frame()") Signed-off-by: Ajay Singh Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210916164902.74629-3-ajay.kathat@microchip.com Signed-off-by: Sasha Levin --- drivers/staging/wilc1000/wilc_wfi_cfgoperations.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c b/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c index c3cd6f389a989..2a369fdaf0cbb 100644 --- a/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c +++ b/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c @@ -97,8 +97,7 @@ static void cfg_scan_result(enum scan_event scan_event, info->frame_len, (s32)info->rssi * 100, GFP_KERNEL); - if (!bss) - cfg80211_put_bss(wiphy, bss); + cfg80211_put_bss(wiphy, bss); } else if (scan_event == SCAN_EVENT_DONE) { mutex_lock(&priv->scan_req_lock); From 94d4922080b489ead1712d4a9c442e55993960ef Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Wed, 22 Sep 2021 21:49:45 +0800 Subject: [PATCH 1516/5344] Bluetooth: btmtkuart: fix a memleak in mtk_hci_wmt_sync [ Upstream commit 3e5f2d90c28f9454e421108554707620bc23269d ] bdev->evt_skb will get freed in the normal path and one error path of mtk_hci_wmt_sync, while the other error paths do not free it, which may cause a memleak. This bug is suggested by a static analysis tool, please advise. Fixes: e0b67035a90b ("Bluetooth: mediatek: update the common setup between MT7622 and other devices") Signed-off-by: Dinghao Liu Signed-off-by: Marcel Holtmann Signed-off-by: Sasha Levin --- drivers/bluetooth/btmtkuart.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/bluetooth/btmtkuart.c b/drivers/bluetooth/btmtkuart.c index 8a81fbca5c9d8..2beb2321825e3 100644 --- a/drivers/bluetooth/btmtkuart.c +++ b/drivers/bluetooth/btmtkuart.c @@ -158,8 +158,10 @@ static int mtk_hci_wmt_sync(struct hci_dev *hdev, int err; hlen = sizeof(*hdr) + wmt_params->dlen; - if (hlen > 255) - return -EINVAL; + if (hlen > 255) { + err = -EINVAL; + goto err_free_skb; + } hdr = (struct mtk_wmt_hdr *)&wc; hdr->dir = 1; @@ -173,7 +175,7 @@ static int mtk_hci_wmt_sync(struct hci_dev *hdev, err = __hci_cmd_send(hdev, 0xfc6f, hlen, &wc); if (err < 0) { clear_bit(BTMTKUART_TX_WAIT_VND_EVT, &bdev->tx_state); - return err; + goto err_free_skb; } /* The vendor specific WMT commands are all answered by a vendor @@ -190,13 +192,14 @@ static int mtk_hci_wmt_sync(struct hci_dev *hdev, if (err == -EINTR) { bt_dev_err(hdev, "Execution of wmt command interrupted"); clear_bit(BTMTKUART_TX_WAIT_VND_EVT, &bdev->tx_state); - return err; + goto err_free_skb; } if (err) { bt_dev_err(hdev, "Execution of wmt command timed out"); clear_bit(BTMTKUART_TX_WAIT_VND_EVT, &bdev->tx_state); - return -ETIMEDOUT; + err = -ETIMEDOUT; + goto err_free_skb; } /* Parse and handle the return WMT event */ From 1113ea48c581da1095fcbe213033cf014913939c Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Thu, 16 Sep 2021 00:03:07 +0200 Subject: [PATCH 1517/5344] crypto: caam - disable pkc for non-E SoCs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit f20311cc9c58052e0b215013046cbf390937910c ] On newer CAAM versions, not all accelerators are disabled if the SoC is a non-E variant. While the driver checks most of the modules for availability, there is one - PKHA - which sticks out. On non-E variants it is still reported as available, that is the number of instances is non-zero, but it has limited functionality. In particular it doesn't support encryption and decryption, but just signing and verifying. This is indicated by a bit in the PKHA_MISC field. Take this bit into account if we are checking for availability. This will the following error: [ 8.167817] caam_jr 8020000.jr: 20000b0f: CCB: desc idx 11: : Invalid CHA selected. Tested on an NXP LS1028A (non-E) SoC. Fixes: d239b10d4ceb ("crypto: caam - add register map changes cf. Era 10") Signed-off-by: Michael Walle Reviewed-by: Horia Geantă Signed-off-by: Herbert Xu Signed-off-by: Sasha Levin --- drivers/crypto/caam/caampkc.c | 19 +++++++++++++++---- drivers/crypto/caam/regs.h | 3 +++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/drivers/crypto/caam/caampkc.c b/drivers/crypto/caam/caampkc.c index 83f96d4f86e03..30e3f41ed8721 100644 --- a/drivers/crypto/caam/caampkc.c +++ b/drivers/crypto/caam/caampkc.c @@ -1087,16 +1087,27 @@ static struct caam_akcipher_alg caam_rsa = { int caam_pkc_init(struct device *ctrldev) { struct caam_drv_private *priv = dev_get_drvdata(ctrldev); - u32 pk_inst; + u32 pk_inst, pkha; int err; init_done = false; /* Determine public key hardware accelerator presence. */ - if (priv->era < 10) + if (priv->era < 10) { pk_inst = (rd_reg32(&priv->ctrl->perfmon.cha_num_ls) & CHA_ID_LS_PK_MASK) >> CHA_ID_LS_PK_SHIFT; - else - pk_inst = rd_reg32(&priv->ctrl->vreg.pkha) & CHA_VER_NUM_MASK; + } else { + pkha = rd_reg32(&priv->ctrl->vreg.pkha); + pk_inst = pkha & CHA_VER_NUM_MASK; + + /* + * Newer CAAMs support partially disabled functionality. If this is the + * case, the number is non-zero, but this bit is set to indicate that + * no encryption or decryption is supported. Only signing and verifying + * is supported. + */ + if (pkha & CHA_VER_MISC_PKHA_NO_CRYPT) + pk_inst = 0; + } /* Do not register algorithms if PKHA is not present. */ if (!pk_inst) diff --git a/drivers/crypto/caam/regs.h b/drivers/crypto/caam/regs.h index 05127b70527d7..43975f01465d2 100644 --- a/drivers/crypto/caam/regs.h +++ b/drivers/crypto/caam/regs.h @@ -317,6 +317,9 @@ struct version_regs { /* CHA Miscellaneous Information - AESA_MISC specific */ #define CHA_VER_MISC_AES_GCM BIT(1 + CHA_VER_MISC_SHIFT) +/* CHA Miscellaneous Information - PKHA_MISC specific */ +#define CHA_VER_MISC_PKHA_NO_CRYPT BIT(7 + CHA_VER_MISC_SHIFT) + /* * caam_perfmon - Performance Monitor/Secure Memory Status/ * CAAM Global Status/Component Version IDs From adf48a8fbc075f0c5593ee1810c22b7de6a436cb Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang Date: Fri, 24 Sep 2021 03:18:37 +0000 Subject: [PATCH 1518/5344] rxrpc: Fix _usecs_to_jiffies() by using usecs_to_jiffies() [ Upstream commit acde891c243c1ed85b19d4d5042bdf00914f5739 ] Directly using _usecs_to_jiffies() might be unsafe, so it's better to use usecs_to_jiffies() instead. Because we can see that the result of _usecs_to_jiffies() could be larger than MAX_JIFFY_OFFSET values without the check of the input. Fixes: c410bf01933e ("Fix the excessive initial retransmission timeout") Signed-off-by: Jiasheng Jiang Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/rxrpc/rtt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rxrpc/rtt.c b/net/rxrpc/rtt.c index 928d8b34a3eee..f3f87c9f0209d 100644 --- a/net/rxrpc/rtt.c +++ b/net/rxrpc/rtt.c @@ -23,7 +23,7 @@ static u32 rxrpc_rto_min_us(struct rxrpc_peer *peer) static u32 __rxrpc_set_rto(const struct rxrpc_peer *peer) { - return _usecs_to_jiffies((peer->srtt_us >> 3) + peer->rttvar_us); + return usecs_to_jiffies((peer->srtt_us >> 3) + peer->rttvar_us); } static u32 rxrpc_bound_rto(u32 rto) From 64c00888b68be001523eb41b8bb03bd6932401c7 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sun, 26 Sep 2021 00:59:27 +0200 Subject: [PATCH 1519/5344] net: dsa: rtl8366rb: Fix off-by-one bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 5f5f12f5d4b108399130bb5c11f07765851d9cdb ] The max VLAN number with non-4K VLAN activated is 15, and the range is 0..15. Not 16. The impact should be low since we by default have 4K VLAN and thus have 4095 VLANs to play with in this switch. There will not be a problem unless the code is rewritten to only use 16 VLANs. Fixes: d8652956cf37 ("net: dsa: realtek-smi: Add Realtek SMI driver") Cc: Mauri Sandberg Cc: DENG Qingfang Cc: Florian Fainelli Reviewed-by: Alvin Šipraga Reviewed-by: Vladimir Oltean Signed-off-by: Linus Walleij Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/dsa/rtl8366rb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/rtl8366rb.c b/drivers/net/dsa/rtl8366rb.c index 7f731bf369980..d047004360615 100644 --- a/drivers/net/dsa/rtl8366rb.c +++ b/drivers/net/dsa/rtl8366rb.c @@ -1264,7 +1264,7 @@ static int rtl8366rb_set_mc_index(struct realtek_smi *smi, int port, int index) static bool rtl8366rb_is_vlan_valid(struct realtek_smi *smi, unsigned int vlan) { - unsigned int max = RTL8366RB_NUM_VLANS; + unsigned int max = RTL8366RB_NUM_VLANS - 1; if (smi->vlan4k_enabled) max = RTL8366RB_NUM_VIDS - 1; From 1322fac3b134924aa175db06a533e02fa6d9d125 Mon Sep 17 00:00:00 2001 From: Loic Poulain Date: Tue, 28 Sep 2021 14:00:47 +0300 Subject: [PATCH 1520/5344] ath10k: Fix missing frame timestamp for beacon/probe-resp [ Upstream commit e6dfbc3ba90cc2b619229be56b485f085a0a8e1c ] When receiving a beacon or probe response, we should update the boottime_ns field which is the timestamp the frame was received at. (cf mac80211.h) This fixes a scanning issue with Android since it relies on this timestamp to determine when the AP has been seen for the last time (via the nl80211 BSS_LAST_SEEN_BOOTTIME parameter). Fixes: 5e3dd157d7e7 ("ath10k: mac80211 driver for Qualcomm Atheros 802.11ac CQA98xx devices") Signed-off-by: Loic Poulain Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1629811733-7927-1-git-send-email-loic.poulain@linaro.org Signed-off-by: Sasha Levin --- drivers/net/wireless/ath/ath10k/wmi.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c index 91604a14a8f46..796bd93c599b1 100644 --- a/drivers/net/wireless/ath/ath10k/wmi.c +++ b/drivers/net/wireless/ath/ath10k/wmi.c @@ -2541,6 +2541,10 @@ int ath10k_wmi_event_mgmt_rx(struct ath10k *ar, struct sk_buff *skb) if (ieee80211_is_beacon(hdr->frame_control)) ath10k_mac_handle_beacon(ar, skb); + if (ieee80211_is_beacon(hdr->frame_control) || + ieee80211_is_probe_resp(hdr->frame_control)) + status->boottime_ns = ktime_get_boottime_ns(); + ath10k_dbg(ar, ATH10K_DBG_MGMT, "event mgmt rx skb %pK len %d ftype %02x stype %02x\n", skb, skb->len, From 3ba0a29c1c719620dc4dd1be453bc5f489478410 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 27 Sep 2021 14:58:10 +0200 Subject: [PATCH 1521/5344] drm/amdgpu: fix warning for overflow check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 335aea75b0d95518951cad7c4c676e6f1c02c150 ] The overflow check in amdgpu_bo_list_create() causes a warning with clang-14 on 64-bit architectures, since the limit can never be exceeded. drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c:74:18: error: result of comparison of constant 256204778801521549 with expression of type 'unsigned int' is always false [-Werror,-Wtautological-constant-out-of-range-compare] if (num_entries > (SIZE_MAX - sizeof(struct amdgpu_bo_list)) ~~~~~~~~~~~ ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The check remains useful for 32-bit architectures, so just avoid the warning by using size_t as the type for the count. Fixes: 920990cb080a ("drm/amdgpu: allocate the bo_list array after the list") Reviewed-by: Christian König Signed-off-by: Arnd Bergmann Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin --- drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c index 85b0515c0fdcf..e0d2f79571ef5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c @@ -61,7 +61,7 @@ static void amdgpu_bo_list_free(struct kref *ref) int amdgpu_bo_list_create(struct amdgpu_device *adev, struct drm_file *filp, struct drm_amdgpu_bo_list_entry *info, - unsigned num_entries, struct amdgpu_bo_list **result) + size_t num_entries, struct amdgpu_bo_list **result) { unsigned last_entry = 0, first_userptr = num_entries; struct amdgpu_bo_list_entry *array; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h index a130e766cbdbe..529d52a204cf4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h @@ -60,7 +60,7 @@ int amdgpu_bo_create_list_entry_array(struct drm_amdgpu_bo_list_in *in, int amdgpu_bo_list_create(struct amdgpu_device *adev, struct drm_file *filp, struct drm_amdgpu_bo_list_entry *info, - unsigned num_entries, + size_t num_entries, struct amdgpu_bo_list **list); static inline struct amdgpu_bo_list_entry * From 190a2b3ebedc52906d6fcd2125ed4c46165b59fa Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Thu, 29 Jul 2021 22:23:33 +0200 Subject: [PATCH 1522/5344] media: em28xx: add missing em28xx_close_extension [ Upstream commit 2c98b8a3458df03abdc6945bbef67ef91d181938 ] If em28xx dev has ->dev_next pointer, we need to delete ->dev_next list node from em28xx_extension_devlist on disconnect to avoid UAF bugs and corrupted list bugs, since driver frees this pointer on disconnect. Reported-and-tested-by: syzbot+a6969ef522a36d3344c9@syzkaller.appspotmail.com Fixes: 1a23f81b7dc3 ("V4L/DVB (9979): em28xx: move usb probe code to a proper place") Signed-off-by: Pavel Skripkin Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/usb/em28xx/em28xx-cards.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/media/usb/em28xx/em28xx-cards.c b/drivers/media/usb/em28xx/em28xx-cards.c index 5983e72a0622c..3e96b4b711d75 100644 --- a/drivers/media/usb/em28xx/em28xx-cards.c +++ b/drivers/media/usb/em28xx/em28xx-cards.c @@ -4029,8 +4029,11 @@ static void em28xx_usb_disconnect(struct usb_interface *intf) em28xx_close_extension(dev); - if (dev->dev_next) + if (dev->dev_next) { + em28xx_close_extension(dev->dev_next); em28xx_release_resources(dev->dev_next); + } + em28xx_release_resources(dev); if (dev->dev_next) { From f2386178bc07a505e686b75efb6ceb1688453059 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 20 Jul 2021 18:07:49 +0200 Subject: [PATCH 1523/5344] media: cxd2880-spi: Fix a null pointer dereference on error handling path [ Upstream commit 11b982e950d2138e90bd120501df10a439006ff8 ] Currently the null pointer check on dvb_spi->vcc_supply is inverted and this leads to only null values of the dvb_spi->vcc_supply being passed to the call of regulator_disable causing null pointer dereferences. Fix this by only calling regulator_disable if dvb_spi->vcc_supply is not null. Addresses-Coverity: ("Dereference after null check") Fixes: dcb014582101 ("media: cxd2880-spi: Fix an error handling path") Signed-off-by: Colin Ian King Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/spi/cxd2880-spi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/spi/cxd2880-spi.c b/drivers/media/spi/cxd2880-spi.c index 93194f03764d2..11273be702b6e 100644 --- a/drivers/media/spi/cxd2880-spi.c +++ b/drivers/media/spi/cxd2880-spi.c @@ -618,7 +618,7 @@ cxd2880_spi_probe(struct spi_device *spi) fail_attach: dvb_unregister_adapter(&dvb_spi->adapter); fail_adapter: - if (!dvb_spi->vcc_supply) + if (dvb_spi->vcc_supply) regulator_disable(dvb_spi->vcc_supply); fail_regulator: kfree(dvb_spi); From f499fdec55458375c4c27e6e66060444cf179a1d Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Fri, 13 Aug 2021 16:34:20 +0200 Subject: [PATCH 1524/5344] media: dvb-usb: fix ununit-value in az6027_rc_query [ Upstream commit afae4ef7d5ad913cab1316137854a36bea6268a5 ] Syzbot reported ununit-value bug in az6027_rc_query(). The problem was in missing state pointer initialization. Since this function does nothing we can simply initialize state to REMOTE_NO_KEY_PRESSED. Reported-and-tested-by: syzbot+2cd8c5db4a85f0a04142@syzkaller.appspotmail.com Fixes: 76f9a820c867 ("V4L/DVB: AZ6027: Initial import of the driver") Signed-off-by: Pavel Skripkin Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/usb/dvb-usb/az6027.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/media/usb/dvb-usb/az6027.c b/drivers/media/usb/dvb-usb/az6027.c index 8de18da0c4bd1..5aa9c501ed9c9 100644 --- a/drivers/media/usb/dvb-usb/az6027.c +++ b/drivers/media/usb/dvb-usb/az6027.c @@ -391,6 +391,7 @@ static struct rc_map_table rc_map_az6027_table[] = { /* remote control stuff (does not work with my box) */ static int az6027_rc_query(struct dvb_usb_device *d, u32 *event, int *state) { + *state = REMOTE_NO_KEY_PRESSED; return 0; } From d8ade2d097e3d0d9a8c8b42d5bd1af5321a3c94b Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Thu, 12 Aug 2021 19:00:43 +0200 Subject: [PATCH 1525/5344] media: TDA1997x: handle short reads of hdmi info frame. [ Upstream commit 48d219f9cc667bc6fbc3e3af0b1bfd75db94fce4 ] Static analysis reports this representative problem tda1997x.c:1939: warning: 7th function call argument is an uninitialized value The 7th argument is buffer[0], which is set in the earlier call to io_readn(). When io_readn() call to io_read() fails with the first read, buffer[0] is not set and 0 is returned and stored in len. The later call to hdmi_infoframe_unpack()'s size parameter is the static size of buffer, always 40, so a short read is not caught in hdmi_infoframe_unpacks()'s checking. The variable len should be used instead. Zero initialize buffer to 0 so it is in a known start state. Fixes: 9ac0038db9a7 ("media: i2c: Add TDA1997x HDMI receiver driver") Signed-off-by: Tom Rix Reviewed-by: Tim Harvey Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/i2c/tda1997x.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/media/i2c/tda1997x.c b/drivers/media/i2c/tda1997x.c index 18a2027ba1450..5faffedb0feba 100644 --- a/drivers/media/i2c/tda1997x.c +++ b/drivers/media/i2c/tda1997x.c @@ -1247,13 +1247,13 @@ tda1997x_parse_infoframe(struct tda1997x_state *state, u16 addr) { struct v4l2_subdev *sd = &state->sd; union hdmi_infoframe frame; - u8 buffer[40]; + u8 buffer[40] = { 0 }; u8 reg; int len, err; /* read data */ len = io_readn(sd, addr, sizeof(buffer), buffer); - err = hdmi_infoframe_unpack(&frame, buffer, sizeof(buffer)); + err = hdmi_infoframe_unpack(&frame, buffer, len); if (err) { v4l_err(state->client, "failed parsing %d byte infoframe: 0x%04x/0x%02x\n", @@ -1927,13 +1927,13 @@ static int tda1997x_log_infoframe(struct v4l2_subdev *sd, int addr) { struct tda1997x_state *state = to_state(sd); union hdmi_infoframe frame; - u8 buffer[40]; + u8 buffer[40] = { 0 }; int len, err; /* read data */ len = io_readn(sd, addr, sizeof(buffer), buffer); v4l2_dbg(1, debug, sd, "infoframe: addr=%d len=%d\n", addr, len); - err = hdmi_infoframe_unpack(&frame, buffer, sizeof(buffer)); + err = hdmi_infoframe_unpack(&frame, buffer, len); if (err) { v4l_err(state->client, "failed parsing %d byte infoframe: 0x%04x/0x%02x\n", From 32aaa9fdaffb7b8c7acfb44eaf1423e62f8eac0d Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 19 Aug 2021 22:21:25 +0200 Subject: [PATCH 1526/5344] media: mtk-vpu: Fix a resource leak in the error handling path of 'mtk_vpu_probe()' [ Upstream commit 2143ad413c05c7be24c3a92760e367b7f6aaac92 ] A successful 'clk_prepare()' call should be balanced by a corresponding 'clk_unprepare()' call in the error handling path of the probe, as already done in the remove function. Update the error handling path accordingly. Fixes: 3003a180ef6b ("[media] VPU: mediatek: support Mediatek VPU") Signed-off-by: Christophe JAILLET Reviewed-by: Houlong Wei Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/platform/mtk-vpu/mtk_vpu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/media/platform/mtk-vpu/mtk_vpu.c b/drivers/media/platform/mtk-vpu/mtk_vpu.c index cc2ff40d060d1..acf64723f9381 100644 --- a/drivers/media/platform/mtk-vpu/mtk_vpu.c +++ b/drivers/media/platform/mtk-vpu/mtk_vpu.c @@ -809,7 +809,8 @@ static int mtk_vpu_probe(struct platform_device *pdev) vpu->wdt.wq = create_singlethread_workqueue("vpu_wdt"); if (!vpu->wdt.wq) { dev_err(dev, "initialize wdt workqueue failed\n"); - return -ENOMEM; + ret = -ENOMEM; + goto clk_unprepare; } INIT_WORK(&vpu->wdt.ws, vpu_wdt_reset_func); mutex_init(&vpu->vpu_mutex); @@ -908,6 +909,8 @@ static int mtk_vpu_probe(struct platform_device *pdev) vpu_clock_disable(vpu); workqueue_destroy: destroy_workqueue(vpu->wdt.wq); +clk_unprepare: + clk_unprepare(vpu->clk); return ret; } From d3112dc37e05f368cbbfb0e8b9052fbde5f9dea0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 3 Aug 2021 21:46:08 +0200 Subject: [PATCH 1527/5344] media: radio-wl1273: Avoid card name truncation [ Upstream commit dfadec236aa99f6086141949c9dc3ec50f3ff20d ] The "card" string only holds 31 characters (and the terminating NUL). In order to avoid truncation, use a shorter card description instead of the current result, "Texas Instruments Wl1273 FM Rad". Suggested-by: Hans Verkuil Fixes: 87d1a50ce451 ("[media] V4L2: WL1273 FM Radio: TI WL1273 FM radio driver") Signed-off-by: Kees Cook Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/radio/radio-wl1273.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/radio/radio-wl1273.c b/drivers/media/radio/radio-wl1273.c index 1123768731676..484046471c03f 100644 --- a/drivers/media/radio/radio-wl1273.c +++ b/drivers/media/radio/radio-wl1273.c @@ -1279,7 +1279,7 @@ static int wl1273_fm_vidioc_querycap(struct file *file, void *priv, strscpy(capability->driver, WL1273_FM_DRIVER_NAME, sizeof(capability->driver)); - strscpy(capability->card, "Texas Instruments Wl1273 FM Radio", + strscpy(capability->card, "TI Wl1273 FM Radio", sizeof(capability->card)); strscpy(capability->bus_info, radio->bus_type, sizeof(capability->bus_info)); From e79ffedeb417510576b5c58c95c38c04f620a278 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 3 Aug 2021 21:46:09 +0200 Subject: [PATCH 1528/5344] media: si470x: Avoid card name truncation [ Upstream commit 2908249f3878a591f7918368fdf0b7b0a6c3158c ] The "card" string only holds 31 characters (and the terminating NUL). In order to avoid truncation, use a shorter card description instead of the current result, "Silicon Labs Si470x FM Radio Re". Suggested-by: Hans Verkuil Fixes: 78656acdcf48 ("V4L/DVB (7038): USB radio driver for Silicon Labs Si470x FM Radio Receivers") Fixes: cc35bbddfe10 ("V4L/DVB (12416): radio-si470x: add i2c driver for si470x") Signed-off-by: Kees Cook Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/radio/si470x/radio-si470x-i2c.c | 2 +- drivers/media/radio/si470x/radio-si470x-usb.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/media/radio/si470x/radio-si470x-i2c.c b/drivers/media/radio/si470x/radio-si470x-i2c.c index f491420d7b538..a972c0705ac79 100644 --- a/drivers/media/radio/si470x/radio-si470x-i2c.c +++ b/drivers/media/radio/si470x/radio-si470x-i2c.c @@ -11,7 +11,7 @@ /* driver definitions */ #define DRIVER_AUTHOR "Joonyoung Shim "; -#define DRIVER_CARD "Silicon Labs Si470x FM Radio Receiver" +#define DRIVER_CARD "Silicon Labs Si470x FM Radio" #define DRIVER_DESC "I2C radio driver for Si470x FM Radio Receivers" #define DRIVER_VERSION "1.0.2" diff --git a/drivers/media/radio/si470x/radio-si470x-usb.c b/drivers/media/radio/si470x/radio-si470x-usb.c index fedff68d8c496..3f8634a465730 100644 --- a/drivers/media/radio/si470x/radio-si470x-usb.c +++ b/drivers/media/radio/si470x/radio-si470x-usb.c @@ -16,7 +16,7 @@ /* driver definitions */ #define DRIVER_AUTHOR "Tobias Lorenz " -#define DRIVER_CARD "Silicon Labs Si470x FM Radio Receiver" +#define DRIVER_CARD "Silicon Labs Si470x FM Radio" #define DRIVER_DESC "USB radio driver for Si470x FM Radio Receivers" #define DRIVER_VERSION "1.0.10" From d7d4702c35949166707191955b9f10adf569ed00 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 3 Aug 2021 21:46:10 +0200 Subject: [PATCH 1529/5344] media: tm6000: Avoid card name truncation [ Upstream commit 42bb98e420d454fef3614b70ea11cc59068395f6 ] The "card" string only holds 31 characters (and the terminating NUL). In order to avoid truncation, use a shorter card description instead of the current result, "Trident TVMaster TM5600/6000/60". Suggested-by: Hans Verkuil Fixes: e28f49b0b2a8 ("V4L/DVB: tm6000: fix some info messages") Signed-off-by: Kees Cook Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/usb/tm6000/tm6000-video.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/media/usb/tm6000/tm6000-video.c b/drivers/media/usb/tm6000/tm6000-video.c index c46cbcfafab3f..8874b0b922eee 100644 --- a/drivers/media/usb/tm6000/tm6000-video.c +++ b/drivers/media/usb/tm6000/tm6000-video.c @@ -854,8 +854,7 @@ static int vidioc_querycap(struct file *file, void *priv, struct tm6000_core *dev = ((struct tm6000_fh *)priv)->dev; strscpy(cap->driver, "tm6000", sizeof(cap->driver)); - strscpy(cap->card, "Trident TVMaster TM5600/6000/6010", - sizeof(cap->card)); + strscpy(cap->card, "Trident TM5600/6000/6010", sizeof(cap->card)); usb_make_path(dev->udev, cap->bus_info, sizeof(cap->bus_info)); cap->capabilities = V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_READWRITE | V4L2_CAP_DEVICE_CAPS; From 51bc2f670d514378fcfa0ebd95599661267e64d2 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 4 Aug 2021 10:50:10 +0200 Subject: [PATCH 1530/5344] media: cx23885: Fix snd_card_free call on null card pointer [ Upstream commit 7266dda2f1dfe151b12ef0c14eb4d4e622fb211c ] Currently a call to snd_card_new that fails will set card with a NULL pointer, this causes a null pointer dereference on the error cleanup path when card it passed to snd_card_free. Fix this by adding a new error exit path that does not call snd_card_free and exiting via this new path. Addresses-Coverity: ("Explicit null dereference") Fixes: 9e44d63246a9 ("[media] cx23885: Add ALSA support") Signed-off-by: Colin Ian King Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/pci/cx23885/cx23885-alsa.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/media/pci/cx23885/cx23885-alsa.c b/drivers/media/pci/cx23885/cx23885-alsa.c index a8e980c6dacb9..50772c2611cad 100644 --- a/drivers/media/pci/cx23885/cx23885-alsa.c +++ b/drivers/media/pci/cx23885/cx23885-alsa.c @@ -550,7 +550,7 @@ struct cx23885_audio_dev *cx23885_audio_register(struct cx23885_dev *dev) SNDRV_DEFAULT_IDX1, SNDRV_DEFAULT_STR1, THIS_MODULE, sizeof(struct cx23885_audio_dev), &card); if (err < 0) - goto error; + goto error_msg; chip = (struct cx23885_audio_dev *) card->private_data; chip->dev = dev; @@ -576,6 +576,7 @@ struct cx23885_audio_dev *cx23885_audio_register(struct cx23885_dev *dev) error: snd_card_free(card); +error_msg: pr_err("%s(): Failed to register analog audio adapter\n", __func__); From 4fcaaeef3320846f1f2cdd1e5a9185052b39e251 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 14 Sep 2021 23:38:37 +0900 Subject: [PATCH 1531/5344] kprobes: Do not use local variable when creating debugfs file [ Upstream commit 8f7262cd66699a4b02eb7549b35c81b2116aad95 ] debugfs_create_file() takes a pointer argument that can be used during file operation callbacks (accessible via i_private in the inode structure). An obvious requirement is for the pointer to refer to valid memory when used. When creating the debugfs file to dynamically enable / disable kprobes, a pointer to local variable is passed to debugfs_create_file(); which will go out of scope when the init function returns. The reason this hasn't triggered random memory corruption is because the pointer is not accessed during the debugfs file callbacks. Since the enabled state is managed by the kprobes_all_disabled global variable, the local variable is not needed. Fix the incorrect (and unnecessary) usage of local variable during debugfs_file_create() by passing NULL instead. Link: https://lkml.kernel.org/r/163163031686.489837.4476867635937014973.stgit@devnote2 Fixes: bf8f6e5b3e51 ("Kprobes: The ON/OFF knob thru debugfs") Signed-off-by: Punit Agrawal Acked-by: Masami Hiramatsu Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Sasha Levin --- kernel/kprobes.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index a7812c115e487..1668439b269d3 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2712,14 +2712,13 @@ static const struct file_operations fops_kp = { static int __init debugfs_kprobe_init(void) { struct dentry *dir; - unsigned int value = 1; dir = debugfs_create_dir("kprobes", NULL); debugfs_create_file("list", 0400, dir, NULL, &debugfs_kprobes_operations); - debugfs_create_file("enabled", 0600, dir, &value, &fops_kp); + debugfs_create_file("enabled", 0600, dir, NULL, &fops_kp); debugfs_create_file("blacklist", 0400, dir, NULL, &debugfs_kprobe_blacklist_ops); From c7ea0a711aa78ebf24d8e4c07f0735f70efce606 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 20 Sep 2021 12:05:35 +0200 Subject: [PATCH 1532/5344] crypto: ecc - fix CRYPTO_DEFAULT_RNG dependency [ Upstream commit 38aa192a05f22f9778f9420e630f0322525ef12e ] The ecc.c file started out as part of the ECDH algorithm but got moved out into a standalone module later. It does not build without CRYPTO_DEFAULT_RNG, so now that other modules are using it as well we can run into this link error: aarch64-linux-ld: ecc.c:(.text+0xfc8): undefined reference to `crypto_default_rng' aarch64-linux-ld: ecc.c:(.text+0xff4): undefined reference to `crypto_put_default_rng' Move the 'select CRYPTO_DEFAULT_RNG' statement into the correct symbol. Fixes: 0d7a78643f69 ("crypto: ecrdsa - add EC-RDSA (GOST 34.10) algorithm") Fixes: 4e6602916bc6 ("crypto: ecdsa - Add support for ECDSA signature verification") Signed-off-by: Arnd Bergmann Reviewed-by: Stefan Berger Signed-off-by: Herbert Xu Signed-off-by: Sasha Levin --- crypto/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/Kconfig b/crypto/Kconfig index fda4faf5fb6d0..b75e382e8508f 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -242,12 +242,12 @@ config CRYPTO_DH config CRYPTO_ECC tristate + select CRYPTO_RNG_DEFAULT config CRYPTO_ECDH tristate "ECDH algorithm" select CRYPTO_ECC select CRYPTO_KPP - select CRYPTO_RNG_DEFAULT help Generic implementation of the ECDH algorithm From ee6d96625a07a545afc5a358a52d01f806c2f957 Mon Sep 17 00:00:00 2001 From: Anel Orazgaliyeva Date: Mon, 6 Sep 2021 18:34:40 +0000 Subject: [PATCH 1533/5344] cpuidle: Fix kobject memory leaks in error paths [ Upstream commit e5f5a66c9aa9c331da5527c2e3fd9394e7091e01 ] Commit c343bf1ba5ef ("cpuidle: Fix three reference count leaks") fixes the cleanup of kobjects; however, it removes kfree() calls altogether, leading to memory leaks. Fix those and also defer the initialization of dev->kobj_dev until after the error check, so that we do not end up with a dangling pointer. Fixes: c343bf1ba5ef ("cpuidle: Fix three reference count leaks") Signed-off-by: Anel Orazgaliyeva Suggested-by: Aman Priyadarshi [ rjw: Subject edits ] Signed-off-by: Rafael J. Wysocki Signed-off-by: Sasha Levin --- drivers/cpuidle/sysfs.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c index 306a6213df1c5..86e89eba8b449 100644 --- a/drivers/cpuidle/sysfs.c +++ b/drivers/cpuidle/sysfs.c @@ -491,6 +491,7 @@ static int cpuidle_add_state_sysfs(struct cpuidle_device *device) &kdev->kobj, "state%d", i); if (ret) { kobject_put(&kobj->kobj); + kfree(kobj); goto error_state; } cpuidle_add_s2idle_attr_group(kobj); @@ -622,6 +623,7 @@ static int cpuidle_add_driver_sysfs(struct cpuidle_device *dev) &kdev->kobj, "driver"); if (ret) { kobject_put(&kdrv->kobj); + kfree(kdrv); return ret; } @@ -708,7 +710,6 @@ int cpuidle_add_sysfs(struct cpuidle_device *dev) if (!kdev) return -ENOMEM; kdev->dev = dev; - dev->kobj_dev = kdev; init_completion(&kdev->kobj_unregister); @@ -716,9 +717,11 @@ int cpuidle_add_sysfs(struct cpuidle_device *dev) "cpuidle"); if (error) { kobject_put(&kdev->kobj); + kfree(kdev); return error; } + dev->kobj_dev = kdev; kobject_uevent(&kdev->kobj, KOBJ_ADD); return 0; From a5ddd30f4187663eda69ab4c19abc9a66a997c7f Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 17 Sep 2021 18:07:02 +0200 Subject: [PATCH 1534/5344] media: em28xx: Don't use ops->suspend if it is NULL [ Upstream commit 51fa3b70d27342baf1ea8aaab3e96e5f4f26d5b2 ] The call to ops->suspend for the dev->dev_next case can currently trigger a call on a null function pointer if ops->suspend is null. Skip over the use of function ops->suspend if it is null. Addresses-Coverity: ("Dereference after null check") Fixes: be7fd3c3a8c5 ("media: em28xx: Hauppauge DualHD second tuner functionality") Signed-off-by: Colin Ian King Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/usb/em28xx/em28xx-core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/media/usb/em28xx/em28xx-core.c b/drivers/media/usb/em28xx/em28xx-core.c index 3daa64bb1e1d9..af9216278024f 100644 --- a/drivers/media/usb/em28xx/em28xx-core.c +++ b/drivers/media/usb/em28xx/em28xx-core.c @@ -1152,8 +1152,9 @@ int em28xx_suspend_extension(struct em28xx *dev) dev_info(&dev->intf->dev, "Suspending extensions\n"); mutex_lock(&em28xx_devlist_mutex); list_for_each_entry(ops, &em28xx_extension_devlist, next) { - if (ops->suspend) - ops->suspend(dev); + if (!ops->suspend) + continue; + ops->suspend(dev); if (dev->dev_next) ops->suspend(dev->dev_next); } From d7378d21dbc02e5812379005f57af034243b69d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20L=C3=BCssing?= Date: Tue, 5 Oct 2021 16:55:53 +0300 Subject: [PATCH 1535/5344] ath9k: Fix potential interrupt storm on queue reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 4925642d541278575ad1948c5924d71ffd57ef14 ] In tests with two Lima boards from 8devices (QCA4531 based) on OpenWrt 19.07 we could force a silent restart of a device with no serial output when we were sending a high amount of UDP traffic (iperf3 at 80 MBit/s in both directions from external hosts, saturating the wifi and causing a load of about 4.5 to 6) and were then triggering an ath9k_queue_reset(). Further debugging showed that the restart was caused by the ath79 watchdog. With disabled watchdog we could observe that the device was constantly going into ath_isr() interrupt handler and was returning early after the ATH_OP_HW_RESET flag test, without clearing any interrupts. Even though ath9k_queue_reset() calls ath9k_hw_kill_interrupts(). With JTAG we could observe the following race condition: 1) ath9k_queue_reset() ... -> ath9k_hw_kill_interrupts() -> set_bit(ATH_OP_HW_RESET, &common->op_flags); ... <- returns 2) ath9k_tasklet() ... -> ath9k_hw_resume_interrupts() ... <- returns 3) loops around: ... handle_int() -> ath_isr() ... -> if (test_bit(ATH_OP_HW_RESET, &common->op_flags)) return IRQ_HANDLED; x) ath_reset_internal(): => never reached <= And in ath_isr() we would typically see the following interrupts / interrupt causes: * status: 0x00111030 or 0x00110030 * async_cause: 2 (AR_INTR_MAC_IPQ) * sync_cause: 0 So the ath9k_tasklet() reenables the ath9k interrupts through ath9k_hw_resume_interrupts() which ath9k_queue_reset() had just disabled. And ath_isr() then keeps firing because it returns IRQ_HANDLED without actually clearing the interrupt. To fix this IRQ storm also clear/disable the interrupts again when we are in reset state. Cc: Sven Eckelmann Cc: Simon Wunderlich Cc: Linus Lüssing Fixes: 872b5d814f99 ("ath9k: do not access hardware on IRQs during reset") Signed-off-by: Linus Lüssing Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210914192515.9273-3-linus.luessing@c0d3.blue Signed-off-by: Sasha Levin --- drivers/net/wireless/ath/ath9k/main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c index 28ccdcb197de2..ec13bd8d5487d 100644 --- a/drivers/net/wireless/ath/ath9k/main.c +++ b/drivers/net/wireless/ath/ath9k/main.c @@ -530,8 +530,10 @@ irqreturn_t ath_isr(int irq, void *dev) ath9k_debug_sync_cause(sc, sync_cause); status &= ah->imask; /* discard unasked-for bits */ - if (test_bit(ATH_OP_HW_RESET, &common->op_flags)) + if (test_bit(ATH_OP_HW_RESET, &common->op_flags)) { + ath9k_hw_kill_interrupts(sc->sc_ah); return IRQ_HANDLED; + } /* * If there are no status bits set, then this interrupt was not From a73ec5c4c3522f6a49697c5df8d6751a7bd5c764 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 5 Oct 2021 15:44:19 +0000 Subject: [PATCH 1536/5344] EDAC/amd64: Handle three rank interleaving mode [ Upstream commit 9f4873fb6af7966de8fcbd95c36b61351c1c4b1f ] AMD Rome systems and later support interleaving between three identical ranks within a channel. Check for this mode by counting the number of enabled chip selects and comparing their masks. If there are exactly three enabled chip selects and their masks are identical, then three rank interleaving is enabled. The size of a rank is determined from its mask value. However, three rank interleaving doesn't follow the method of swapping an interleave bit with the most significant bit. Rather, the interleave bit is flipped and the most significant bit remains the same. There is only a single interleave bit in this case. Account for this when determining the chip select size by keeping the most significant bit at its original value and ignoring any zero bits. This will return a full bitmask in [MSB:1]. Fixes: e53a3b267fb0 ("EDAC/amd64: Find Chip Select memory size using Address Mask") Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211005154419.2060504-1-yazen.ghannam@amd.com Signed-off-by: Sasha Levin --- drivers/edac/amd64_edac.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 18b046c902428..8a6c5f98775af 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -790,12 +790,14 @@ static void debug_dump_dramcfg_low(struct amd64_pvt *pvt, u32 dclr, int chan) #define CS_ODD_PRIMARY BIT(1) #define CS_EVEN_SECONDARY BIT(2) #define CS_ODD_SECONDARY BIT(3) +#define CS_3R_INTERLEAVE BIT(4) #define CS_EVEN (CS_EVEN_PRIMARY | CS_EVEN_SECONDARY) #define CS_ODD (CS_ODD_PRIMARY | CS_ODD_SECONDARY) static int f17_get_cs_mode(int dimm, u8 ctrl, struct amd64_pvt *pvt) { + u8 base, count = 0; int cs_mode = 0; if (csrow_enabled(2 * dimm, ctrl, pvt)) @@ -808,6 +810,20 @@ static int f17_get_cs_mode(int dimm, u8 ctrl, struct amd64_pvt *pvt) if (csrow_sec_enabled(2 * dimm + 1, ctrl, pvt)) cs_mode |= CS_ODD_SECONDARY; + /* + * 3 Rank inteleaving support. + * There should be only three bases enabled and their two masks should + * be equal. + */ + for_each_chip_select(base, ctrl, pvt) + count += csrow_enabled(base, ctrl, pvt); + + if (count == 3 && + pvt->csels[ctrl].csmasks[0] == pvt->csels[ctrl].csmasks[1]) { + edac_dbg(1, "3R interleaving in use.\n"); + cs_mode |= CS_3R_INTERLEAVE; + } + return cs_mode; } @@ -1616,10 +1632,14 @@ static int f17_addr_mask_to_cs_size(struct amd64_pvt *pvt, u8 umc, * * The MSB is the number of bits in the full mask because BIT[0] is * always 0. + * + * In the special 3 Rank interleaving case, a single bit is flipped + * without swapping with the most significant bit. This can be handled + * by keeping the MSB where it is and ignoring the single zero bit. */ msb = fls(addr_mask_orig) - 1; weight = hweight_long(addr_mask_orig); - num_zero_bits = msb - weight; + num_zero_bits = msb - weight - !!(cs_mode & CS_3R_INTERLEAVE); /* Take the number of zero bits off from the top of the mask. */ addr_mask_deinterleaved = GENMASK_ULL(msb - num_zero_bits, 1); From 4b3d4c38b8734c52a7c6fad11dd5b6286d0797f4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 25 Sep 2021 22:40:26 +0200 Subject: [PATCH 1537/5344] netfilter: nft_dynset: relax superfluous check on set updates [ Upstream commit 7b1394892de8d95748d05e3ee41e85edb4abbfa1 ] Relax this condition to make add and update commands idempotent for sets with no timeout. The eval function already checks if the set element timeout is available and updates it if the update command is used. Fixes: 22fe54d5fefc ("netfilter: nf_tables: add support for dynamic set updates") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/nft_dynset.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 95415d2b81c93..6fdea0e57db8a 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -164,17 +164,8 @@ static int nft_dynset_init(const struct nft_ctx *ctx, return -EBUSY; priv->op = ntohl(nla_get_be32(tb[NFTA_DYNSET_OP])); - switch (priv->op) { - case NFT_DYNSET_OP_ADD: - case NFT_DYNSET_OP_DELETE: - break; - case NFT_DYNSET_OP_UPDATE: - if (!(set->flags & NFT_SET_TIMEOUT)) - return -EOPNOTSUPP; - break; - default: + if (priv->op > NFT_DYNSET_OP_DELETE) return -EOPNOTSUPP; - } timeout = 0; if (tb[NFTA_DYNSET_TIMEOUT] != NULL) { From 53abe8b1256f2b6a5d19716f1f271898d2740fa2 Mon Sep 17 00:00:00 2001 From: Evgeny Novikov Date: Sun, 22 Aug 2021 11:48:03 +0200 Subject: [PATCH 1538/5344] media: dvb-frontends: mn88443x: Handle errors of clk_prepare_enable() [ Upstream commit 69a10678e2fba3d182e78ea041f2d1b1a6058764 ] mn88443x_cmn_power_on() did not handle possible errors of clk_prepare_enable() and always finished successfully so that its caller mn88443x_probe() did not care about failed preparing/enabling of clocks as well. Add missed error handling in both mn88443x_cmn_power_on() and mn88443x_probe(). This required to change the return value of the former from "void" to "int". Found by Linux Driver Verification project (linuxtesting.org). Fixes: 0f408ce8941f ("media: dvb-frontends: add Socionext MN88443x ISDB-S/T demodulator driver") Signed-off-by: Evgeny Novikov Co-developed-by: Kirill Shilimanov Signed-off-by: Kirill Shilimanov Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/dvb-frontends/mn88443x.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/media/dvb-frontends/mn88443x.c b/drivers/media/dvb-frontends/mn88443x.c index e4528784f8477..fff212c0bf3b5 100644 --- a/drivers/media/dvb-frontends/mn88443x.c +++ b/drivers/media/dvb-frontends/mn88443x.c @@ -204,11 +204,18 @@ struct mn88443x_priv { struct regmap *regmap_t; }; -static void mn88443x_cmn_power_on(struct mn88443x_priv *chip) +static int mn88443x_cmn_power_on(struct mn88443x_priv *chip) { + struct device *dev = &chip->client_s->dev; struct regmap *r_t = chip->regmap_t; + int ret; - clk_prepare_enable(chip->mclk); + ret = clk_prepare_enable(chip->mclk); + if (ret) { + dev_err(dev, "Failed to prepare and enable mclk: %d\n", + ret); + return ret; + } gpiod_set_value_cansleep(chip->reset_gpio, 1); usleep_range(100, 1000); @@ -222,6 +229,8 @@ static void mn88443x_cmn_power_on(struct mn88443x_priv *chip) } else { regmap_write(r_t, HIZSET3, 0x8f); } + + return 0; } static void mn88443x_cmn_power_off(struct mn88443x_priv *chip) @@ -738,7 +747,10 @@ static int mn88443x_probe(struct i2c_client *client, chip->fe.demodulator_priv = chip; i2c_set_clientdata(client, chip); - mn88443x_cmn_power_on(chip); + ret = mn88443x_cmn_power_on(chip); + if (ret) + goto err_i2c_t; + mn88443x_s_sleep(chip); mn88443x_t_sleep(chip); From ddfb4b593235d2ff03e3151efe073fa9e10c645b Mon Sep 17 00:00:00 2001 From: Giovanni Cabiddu Date: Tue, 28 Sep 2021 12:44:29 +0100 Subject: [PATCH 1539/5344] crypto: qat - detect PFVF collision after ACK [ Upstream commit 9b768e8a3909ac1ab39ed44a3933716da7761a6f ] Detect a PFVF collision between the local and the remote function by checking if the message on the PFVF CSR has been overwritten. This is done after the remote function confirms that the message has been received, by clearing the interrupt bit, or the maximum number of attempts (ADF_IOV_MSG_ACK_MAX_RETRY) to check the CSR has been exceeded. Fixes: ed8ccaef52fa ("crypto: qat - Add support for SRIOV") Signed-off-by: Giovanni Cabiddu Co-developed-by: Marco Chiappero Signed-off-by: Marco Chiappero Signed-off-by: Herbert Xu Signed-off-by: Sasha Levin --- drivers/crypto/qat/qat_common/adf_pf2vf_msg.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/crypto/qat/qat_common/adf_pf2vf_msg.c b/drivers/crypto/qat/qat_common/adf_pf2vf_msg.c index c64481160b711..72fd2bbbe704e 100644 --- a/drivers/crypto/qat/qat_common/adf_pf2vf_msg.c +++ b/drivers/crypto/qat/qat_common/adf_pf2vf_msg.c @@ -195,6 +195,13 @@ static int __adf_iov_putmsg(struct adf_accel_dev *accel_dev, u32 msg, u8 vf_nr) val = ADF_CSR_RD(pmisc_bar_addr, pf2vf_offset); } while ((val & int_bit) && (count++ < ADF_IOV_MSG_ACK_MAX_RETRY)); + if (val != msg) { + dev_dbg(&GET_DEV(accel_dev), + "Collision - PFVF CSR overwritten by remote function\n"); + ret = -EIO; + goto out; + } + if (val & int_bit) { dev_dbg(&GET_DEV(accel_dev), "ACK not received from remote\n"); val &= ~int_bit; From e7951b89170df81f4f97965e4da2b9c199ea0e35 Mon Sep 17 00:00:00 2001 From: Giovanni Cabiddu Date: Tue, 28 Sep 2021 12:44:30 +0100 Subject: [PATCH 1540/5344] crypto: qat - disregard spurious PFVF interrupts [ Upstream commit 18fcba469ba5359c1de7e3fb16f7b9e8cd1b8e02 ] Upon receiving a PFVF message, check if the interrupt bit is set in the message. If it is not, that means that the interrupt was probably triggered by a collision. In this case, disregard the message and re-enable the interrupts. Fixes: ed8ccaef52fa ("crypto: qat - Add support for SRIOV") Signed-off-by: Giovanni Cabiddu Reviewed-by: Marco Chiappero Signed-off-by: Herbert Xu Signed-off-by: Sasha Levin --- drivers/crypto/qat/qat_common/adf_pf2vf_msg.c | 6 ++++++ drivers/crypto/qat/qat_common/adf_vf_isr.c | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/drivers/crypto/qat/qat_common/adf_pf2vf_msg.c b/drivers/crypto/qat/qat_common/adf_pf2vf_msg.c index 72fd2bbbe704e..180016e157771 100644 --- a/drivers/crypto/qat/qat_common/adf_pf2vf_msg.c +++ b/drivers/crypto/qat/qat_common/adf_pf2vf_msg.c @@ -250,6 +250,11 @@ void adf_vf2pf_req_hndl(struct adf_accel_vf_info *vf_info) /* Read message from the VF */ msg = ADF_CSR_RD(pmisc_addr, hw_data->get_pf2vf_offset(vf_nr)); + if (!(msg & ADF_VF2PF_INT)) { + dev_info(&GET_DEV(accel_dev), + "Spurious VF2PF interrupt, msg %X. Ignored\n", msg); + goto out; + } /* To ACK, clear the VF2PFINT bit */ msg &= ~ADF_VF2PF_INT; @@ -333,6 +338,7 @@ void adf_vf2pf_req_hndl(struct adf_accel_vf_info *vf_info) if (resp && adf_iov_putmsg(accel_dev, resp, vf_nr)) dev_err(&GET_DEV(accel_dev), "Failed to send response to VF\n"); +out: /* re-enable interrupt on PF from this VF */ adf_enable_vf2pf_interrupts(accel_dev, (1 << vf_nr)); return; diff --git a/drivers/crypto/qat/qat_common/adf_vf_isr.c b/drivers/crypto/qat/qat_common/adf_vf_isr.c index ef90902c8200d..86274e3c6781d 100644 --- a/drivers/crypto/qat/qat_common/adf_vf_isr.c +++ b/drivers/crypto/qat/qat_common/adf_vf_isr.c @@ -123,6 +123,11 @@ static void adf_pf2vf_bh_handler(void *data) /* Read the message from PF */ msg = ADF_CSR_RD(pmisc_bar_addr, hw_data->get_pf2vf_offset(0)); + if (!(msg & ADF_PF2VF_INT)) { + dev_info(&GET_DEV(accel_dev), + "Spurious PF2VF interrupt, msg %X. Ignored\n", msg); + goto out; + } if (!(msg & ADF_PF2VF_MSGORIGIN_SYSTEM)) /* Ignore legacy non-system (non-kernel) PF2VF messages */ @@ -171,6 +176,7 @@ static void adf_pf2vf_bh_handler(void *data) msg &= ~ADF_PF2VF_INT; ADF_CSR_WR(pmisc_bar_addr, hw_data->get_pf2vf_offset(0), msg); +out: /* Re-enable PF2VF interrupts */ adf_enable_pf2vf_interrupts(accel_dev); return; From de62a75dbbbdee02068b9c19d04e789d76e1a0e4 Mon Sep 17 00:00:00 2001 From: Markus Schneider-Pargmann Date: Thu, 30 Sep 2021 21:12:42 +0200 Subject: [PATCH 1541/5344] hwrng: mtk - Force runtime pm ops for sleep ops [ Upstream commit b6f5f0c8f72d348b2d07b20d7b680ef13a7ffe98 ] Currently mtk_rng_runtime_suspend/resume is called for both runtime pm and system sleep operations. This is wrong as these should only be runtime ops as the name already suggests. Currently freezing the system will lead to a call to mtk_rng_runtime_suspend even if the device currently isn't active. This leads to a clock warning because it is disabled/unprepared although it isn't enabled/prepared currently. This patch fixes this by only setting the runtime pm ops and forces to call the runtime pm ops from the system sleep ops as well if active but not otherwise. Fixes: 81d2b34508c6 ("hwrng: mtk - add runtime PM support") Signed-off-by: Markus Schneider-Pargmann Signed-off-by: Herbert Xu Signed-off-by: Sasha Levin --- drivers/char/hw_random/mtk-rng.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/char/hw_random/mtk-rng.c b/drivers/char/hw_random/mtk-rng.c index e649be5a5f132..6670516fa194d 100644 --- a/drivers/char/hw_random/mtk-rng.c +++ b/drivers/char/hw_random/mtk-rng.c @@ -173,8 +173,13 @@ static int mtk_rng_runtime_resume(struct device *dev) return mtk_rng_init(&priv->rng); } -static UNIVERSAL_DEV_PM_OPS(mtk_rng_pm_ops, mtk_rng_runtime_suspend, - mtk_rng_runtime_resume, NULL); +static const struct dev_pm_ops mtk_rng_pm_ops = { + SET_RUNTIME_PM_OPS(mtk_rng_runtime_suspend, + mtk_rng_runtime_resume, NULL) + SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, + pm_runtime_force_resume) +}; + #define MTK_RNG_PM_OPS (&mtk_rng_pm_ops) #else /* CONFIG_PM */ #define MTK_RNG_PM_OPS NULL From 30c8974c04b959542a261fbf82c78664c189dae2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 6 Oct 2021 10:35:42 +0300 Subject: [PATCH 1542/5344] b43legacy: fix a lower bounds test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit c1c8380b0320ab757e60ed90efc8b1992a943256 ] The problem is that "channel" is an unsigned int, when it's less 5 the value of "channel - 5" is not a negative number as one would expect but is very high positive value instead. This means that "start" becomes a very high positive value. The result of that is that we never enter the "for (i = start; i <= end; i++) {" loop. Instead of storing the result from b43legacy_radio_aci_detect() it just uses zero. Fixes: 75388acd0cd8 ("[B43LEGACY]: add mac80211-based driver for legacy BCM43xx devices") Signed-off-by: Dan Carpenter Acked-by: Michael Büsch Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20211006073542.GD8404@kili Signed-off-by: Sasha Levin --- drivers/net/wireless/broadcom/b43legacy/radio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/broadcom/b43legacy/radio.c b/drivers/net/wireless/broadcom/b43legacy/radio.c index da40d1ca8723d..e4dce76a4481d 100644 --- a/drivers/net/wireless/broadcom/b43legacy/radio.c +++ b/drivers/net/wireless/broadcom/b43legacy/radio.c @@ -283,7 +283,7 @@ u8 b43legacy_radio_aci_scan(struct b43legacy_wldev *dev) & 0x7FFF); b43legacy_set_all_gains(dev, 3, 8, 1); - start = (channel - 5 > 0) ? channel - 5 : 1; + start = (channel > 5) ? channel - 5 : 1; end = (channel + 5 < 14) ? channel + 5 : 13; for (i = start; i <= end; i++) { From 7c43e5e87813eada620c7cfcebbeb6b1e1048db1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 6 Oct 2021 10:36:22 +0300 Subject: [PATCH 1543/5344] b43: fix a lower bounds test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 9b793db5fca44d01f72d3564a168171acf7c4076 ] The problem is that "channel" is an unsigned int, when it's less 5 the value of "channel - 5" is not a negative number as one would expect but is very high positive value instead. This means that "start" becomes a very high positive value. The result of that is that we never enter the "for (i = start; i <= end; i++) {" loop. Instead of storing the result from b43legacy_radio_aci_detect() it just uses zero. Fixes: ef1a628d83fc ("b43: Implement dynamic PHY API") Signed-off-by: Dan Carpenter Acked-by: Michael Büsch Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20211006073621.GE8404@kili Signed-off-by: Sasha Levin --- drivers/net/wireless/broadcom/b43/phy_g.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/broadcom/b43/phy_g.c b/drivers/net/wireless/broadcom/b43/phy_g.c index 1e022ec733a37..8b8fdb4965bde 100644 --- a/drivers/net/wireless/broadcom/b43/phy_g.c +++ b/drivers/net/wireless/broadcom/b43/phy_g.c @@ -2297,7 +2297,7 @@ static u8 b43_gphy_aci_scan(struct b43_wldev *dev) b43_phy_mask(dev, B43_PHY_G_CRS, 0x7FFF); b43_set_all_gains(dev, 3, 8, 1); - start = (channel - 5 > 0) ? channel - 5 : 1; + start = (channel > 5) ? channel - 5 : 1; end = (channel + 5 < 14) ? channel + 5 : 13; for (i = start; i <= end; i++) { From f1c0528781d7958f2e2a4c7d6be26f7b26efb6df Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Tue, 21 Sep 2021 14:00:25 +0300 Subject: [PATCH 1544/5344] mmc: sdhci-omap: Fix NULL pointer exception if regulator is not configured [ Upstream commit 8e0e7bd38b1ec7f9e5d18725ad41828be4e09859 ] If sdhci-omap is configured for an unused device instance and the device is not set as disabled, we can get a NULL pointer dereference: Unable to handle kernel NULL pointer dereference at virtual address 00000045 ... (regulator_set_voltage) from [] (mmc_regulator_set_ocr+0x44/0xd0) (mmc_regulator_set_ocr) from [] (sdhci_set_ios+0xa4/0x490) (sdhci_set_ios) from [] (sdhci_omap_set_ios+0x124/0x160) (sdhci_omap_set_ios) from [] (mmc_power_up.part.0+0x3c/0x154) (mmc_power_up.part.0) from [] (mmc_start_host+0x88/0x9c) (mmc_start_host) from [] (mmc_add_host+0x58/0x7c) (mmc_add_host) from [] (__sdhci_add_host+0xf0/0x22c) (__sdhci_add_host) from [] (sdhci_omap_probe+0x318/0x72c) (sdhci_omap_probe) from [] (platform_probe+0x58/0xb8) AFAIK we are not seeing this with the devices configured in the mainline kernel but this can cause issues for folks bringing up their boards. Fixes: 7d326930d352 ("mmc: sdhci-omap: Add OMAP SDHCI driver") Signed-off-by: Tony Lindgren Link: https://lore.kernel.org/r/20210921110029.21944-2-tony@atomide.com Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin --- drivers/mmc/host/sdhci-omap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/sdhci-omap.c b/drivers/mmc/host/sdhci-omap.c index d3135249b2e40..346ca41b28f8b 100644 --- a/drivers/mmc/host/sdhci-omap.c +++ b/drivers/mmc/host/sdhci-omap.c @@ -675,7 +675,8 @@ static void sdhci_omap_set_power(struct sdhci_host *host, unsigned char mode, { struct mmc_host *mmc = host->mmc; - mmc_regulator_set_ocr(mmc, mmc->supply.vmmc, vdd); + if (!IS_ERR(mmc->supply.vmmc)) + mmc_regulator_set_ocr(mmc, mmc->supply.vmmc, vdd); } static int sdhci_omap_enable_dma(struct sdhci_host *host) From 9e63d860c7a8640b240f6340195a90b7f9196f70 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 27 Sep 2021 11:44:47 +0200 Subject: [PATCH 1545/5344] memstick: avoid out-of-range warning [ Upstream commit 4853396f03c3019eccf5cd113e464231e9ddf0b3 ] clang-14 complains about a sanity check that always passes when the page size is 64KB or larger: drivers/memstick/core/ms_block.c:1739:21: error: result of comparison of constant 65536 with expression of type 'unsigned short' is always false [-Werror,-Wtautological-constant-out-of-range-compare] if (msb->page_size > PAGE_SIZE) { ~~~~~~~~~~~~~~ ^ ~~~~~~~~~ This is fine, it will still work on all architectures, so just shut up that warning with a cast. Fixes: 0ab30494bc4f ("memstick: add support for legacy memorysticks") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20210927094520.696665-1-arnd@kernel.org Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin --- drivers/memstick/core/ms_block.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c index d9ee8e3dc72da..55907e4c36b18 100644 --- a/drivers/memstick/core/ms_block.c +++ b/drivers/memstick/core/ms_block.c @@ -1727,7 +1727,7 @@ static int msb_init_card(struct memstick_dev *card) msb->pages_in_block = boot_block->attr.block_size * 2; msb->block_size = msb->page_size * msb->pages_in_block; - if (msb->page_size > PAGE_SIZE) { + if ((size_t)msb->page_size > PAGE_SIZE) { /* this isn't supported by linux at all, anyway*/ dbg("device page %d size isn't supported", msb->page_size); return -EINVAL; From 056f42acc477a2f314275d3df7e697f4c5589a8a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 11 Oct 2021 15:39:12 +0300 Subject: [PATCH 1546/5344] memstick: jmb38x_ms: use appropriate free function in jmb38x_ms_alloc_host() [ Upstream commit beae4a6258e64af609ad5995cc6b6056eb0d898e ] The "msh" pointer is device managed, meaning that memstick_alloc_host() calls device_initialize() on it. That means that it can't be free using kfree() but must instead be freed with memstick_free_host(). Otherwise it leads to a tiny memory leak of device resources. Fixes: 60fdd931d577 ("memstick: add support for JMicron jmb38x MemoryStick host controller") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/20211011123912.GD15188@kili Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin --- drivers/memstick/host/jmb38x_ms.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/memstick/host/jmb38x_ms.c b/drivers/memstick/host/jmb38x_ms.c index 64fff6abe60e8..74d6686b35f77 100644 --- a/drivers/memstick/host/jmb38x_ms.c +++ b/drivers/memstick/host/jmb38x_ms.c @@ -899,7 +899,7 @@ static struct memstick_host *jmb38x_ms_alloc_host(struct jmb38x_ms *jm, int cnt) iounmap(host->addr); err_out_free: - kfree(msh); + memstick_free_host(msh); return NULL; } From dde91ffbc8e0b68c20f53193a506c88d21fb3cac Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 11 Oct 2021 14:12:35 +0200 Subject: [PATCH 1547/5344] net, neigh: Fix NTF_EXT_LEARNED in combination with NTF_USE [ Upstream commit e4400bbf5b15750e1b59bf4722d18d99be60c69f ] The NTF_EXT_LEARNED neigh flag is usually propagated back to user space upon dump of the neighbor table. However, when used in combination with NTF_USE flag this is not the case despite exempting the entry from the garbage collector. This results in inconsistent state since entries are typically marked in neigh->flags with NTF_EXT_LEARNED, but here they are not. Fix it by propagating the creation flag to ___neigh_create(). Before fix: # ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn # ./ip/ip n 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE [...] After fix: # ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn # ./ip/ip n 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a extern_learn REACHABLE [...] Fixes: 9ce33e46531d ("neighbour: support for NTF_EXT_LEARNED flag") Signed-off-by: Daniel Borkmann Acked-by: Roopa Prabhu Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/core/neighbour.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index f94d405358a21..3a4cf53e38416 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -380,7 +380,7 @@ EXPORT_SYMBOL(neigh_ifdown); static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev, - bool exempt_from_gc) + u8 flags, bool exempt_from_gc) { struct neighbour *n = NULL; unsigned long now = jiffies; @@ -413,6 +413,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, n->updated = n->used = now; n->nud_state = NUD_NONE; n->output = neigh_blackhole; + n->flags = flags; seqlock_init(&n->hh.hh_lock); n->parms = neigh_parms_clone(&tbl->parms); timer_setup(&n->timer, neigh_timer_handler, 0); @@ -576,19 +577,18 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, } EXPORT_SYMBOL(neigh_lookup_nodev); -static struct neighbour *___neigh_create(struct neigh_table *tbl, - const void *pkey, - struct net_device *dev, - bool exempt_from_gc, bool want_ref) +static struct neighbour * +___neigh_create(struct neigh_table *tbl, const void *pkey, + struct net_device *dev, u8 flags, + bool exempt_from_gc, bool want_ref) { - struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev, exempt_from_gc); - u32 hash_val; - unsigned int key_len = tbl->key_len; - int error; + u32 hash_val, key_len = tbl->key_len; + struct neighbour *n1, *rc, *n; struct neigh_hash_table *nht; + int error; + n = neigh_alloc(tbl, dev, flags, exempt_from_gc); trace_neigh_create(tbl, dev, pkey, n, exempt_from_gc); - if (!n) { rc = ERR_PTR(-ENOBUFS); goto out; @@ -675,7 +675,7 @@ static struct neighbour *___neigh_create(struct neigh_table *tbl, struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, bool want_ref) { - return ___neigh_create(tbl, pkey, dev, false, want_ref); + return ___neigh_create(tbl, pkey, dev, 0, false, want_ref); } EXPORT_SYMBOL(__neigh_create); @@ -1945,7 +1945,9 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, exempt_from_gc = ndm->ndm_state & NUD_PERMANENT || ndm->ndm_flags & NTF_EXT_LEARNED; - neigh = ___neigh_create(tbl, dst, dev, exempt_from_gc, true); + neigh = ___neigh_create(tbl, dst, dev, + ndm->ndm_flags & NTF_EXT_LEARNED, + exempt_from_gc, true); if (IS_ERR(neigh)) { err = PTR_ERR(neigh); goto out; From cc5d9abf675883d1532eaac1a115a1b302c21ddc Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 12 Oct 2021 19:27:58 +0800 Subject: [PATCH 1548/5344] hwmon: Fix possible memleak in __hwmon_device_register() [ Upstream commit ada61aa0b1184a8fda1a89a340c7d6cc4e59aee5 ] I got memory leak as follows when doing fault injection test: unreferenced object 0xffff888102740438 (size 8): comm "27", pid 859, jiffies 4295031351 (age 143.992s) hex dump (first 8 bytes): 68 77 6d 6f 6e 30 00 00 hwmon0.. backtrace: [<00000000544b5996>] __kmalloc_track_caller+0x1a6/0x300 [<00000000df0d62b9>] kvasprintf+0xad/0x140 [<00000000d3d2a3da>] kvasprintf_const+0x62/0x190 [<000000005f8f0f29>] kobject_set_name_vargs+0x56/0x140 [<00000000b739e4b9>] dev_set_name+0xb0/0xe0 [<0000000095b69c25>] __hwmon_device_register+0xf19/0x1e50 [hwmon] [<00000000a7e65b52>] hwmon_device_register_with_info+0xcb/0x110 [hwmon] [<000000006f181e86>] devm_hwmon_device_register_with_info+0x85/0x100 [hwmon] [<0000000081bdc567>] tmp421_probe+0x2d2/0x465 [tmp421] [<00000000502cc3f8>] i2c_device_probe+0x4e1/0xbb0 [<00000000f90bda3b>] really_probe+0x285/0xc30 [<000000007eac7b77>] __driver_probe_device+0x35f/0x4f0 [<000000004953d43d>] driver_probe_device+0x4f/0x140 [<000000002ada2d41>] __device_attach_driver+0x24c/0x330 [<00000000b3977977>] bus_for_each_drv+0x15d/0x1e0 [<000000005bf2a8e3>] __device_attach+0x267/0x410 When device_register() returns an error, the name allocated in dev_set_name() will be leaked, the put_device() should be used instead of calling hwmon_dev_release() to give up the device reference, then the name will be freed in kobject_cleanup(). Reported-by: Hulk Robot Fixes: bab2243ce189 ("hwmon: Introduce hwmon_device_register_with_groups") Signed-off-by: Yang Yingliang Link: https://lore.kernel.org/r/20211012112758.2681084-1-yangyingliang@huawei.com Signed-off-by: Guenter Roeck Signed-off-by: Sasha Levin --- drivers/hwmon/hwmon.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/hwmon/hwmon.c b/drivers/hwmon/hwmon.c index d018b20089ecd..a2175394cd253 100644 --- a/drivers/hwmon/hwmon.c +++ b/drivers/hwmon/hwmon.c @@ -645,8 +645,10 @@ __hwmon_device_register(struct device *dev, const char *name, void *drvdata, dev_set_drvdata(hdev, drvdata); dev_set_name(hdev, HWMON_ID_FORMAT, id); err = device_register(hdev); - if (err) - goto free_hwmon; + if (err) { + put_device(hdev); + goto ida_remove; + } if (dev && dev->of_node && chip && chip->ops->read && chip->info[0]->type == hwmon_chip && From e30ca60df1241eb406fd9cea8e23a862644ebc3e Mon Sep 17 00:00:00 2001 From: Zev Weiss Date: Tue, 28 Sep 2021 02:22:38 -0700 Subject: [PATCH 1549/5344] hwmon: (pmbus/lm25066) Let compiler determine outer dimension of lm25066_coeff [ Upstream commit b7931a7b0e0df4d2a25fedd895ad32c746b77bc1 ] Maintaining this manually is error prone (there are currently only five chips supported, not six); gcc can do it for us automatically. Signed-off-by: Zev Weiss Fixes: 666c14906b49 ("hwmon: (pmbus/lm25066) Drop support for LM25063") Link: https://lore.kernel.org/r/20210928092242.30036-5-zev@bewilderbeest.net Signed-off-by: Guenter Roeck Signed-off-by: Sasha Levin --- drivers/hwmon/pmbus/lm25066.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/pmbus/lm25066.c b/drivers/hwmon/pmbus/lm25066.c index eba043c59fc73..41c4bbb9c0572 100644 --- a/drivers/hwmon/pmbus/lm25066.c +++ b/drivers/hwmon/pmbus/lm25066.c @@ -51,7 +51,7 @@ struct __coeff { #define PSC_CURRENT_IN_L (PSC_NUM_CLASSES) #define PSC_POWER_L (PSC_NUM_CLASSES + 1) -static struct __coeff lm25066_coeff[6][PSC_NUM_CLASSES + 2] = { +static struct __coeff lm25066_coeff[][PSC_NUM_CLASSES + 2] = { [lm25056] = { [PSC_VOLTAGE_IN] = { .m = 16296, From b30e3e0e846639b073e6bab2d9c5fd2c3022de40 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Tue, 11 Jun 2019 19:21:31 +0200 Subject: [PATCH 1550/5344] ath10k: fix max antenna gain unit [ Upstream commit 0a491167fe0cf9f26062462de2a8688b96125d48 ] Most of the txpower for the ath10k firmware is stored as twicepower (0.5 dB steps). This isn't the case for max_antenna_gain - which is still expected by the firmware as dB. The firmware is converting it from dB to the internal (twicepower) representation when it calculates the limits of a channel. This can be seen in tpc_stats when configuring "12" as max_antenna_gain. Instead of the expected 12 (6 dB), the tpc_stats shows 24 (12 dB). Tested on QCA9888 and IPQ4019 with firmware 10.4-3.5.3-00057. Fixes: 02256930d9b8 ("ath10k: use proper tx power unit") Signed-off-by: Sven Eckelmann Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20190611172131.6064-1-sven@narfation.org Signed-off-by: Sasha Levin --- drivers/net/wireless/ath/ath10k/mac.c | 6 +++--- drivers/net/wireless/ath/ath10k/wmi.h | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index 603f817ae3a59..9daaacf789d60 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -1044,7 +1044,7 @@ static int ath10k_monitor_vdev_start(struct ath10k *ar, int vdev_id) arg.channel.min_power = 0; arg.channel.max_power = channel->max_power * 2; arg.channel.max_reg_power = channel->max_reg_power * 2; - arg.channel.max_antenna_gain = channel->max_antenna_gain * 2; + arg.channel.max_antenna_gain = channel->max_antenna_gain; reinit_completion(&ar->vdev_setup_done); reinit_completion(&ar->vdev_delete_done); @@ -1490,7 +1490,7 @@ static int ath10k_vdev_start_restart(struct ath10k_vif *arvif, arg.channel.min_power = 0; arg.channel.max_power = chandef->chan->max_power * 2; arg.channel.max_reg_power = chandef->chan->max_reg_power * 2; - arg.channel.max_antenna_gain = chandef->chan->max_antenna_gain * 2; + arg.channel.max_antenna_gain = chandef->chan->max_antenna_gain; if (arvif->vdev_type == WMI_VDEV_TYPE_AP) { arg.ssid = arvif->u.ap.ssid; @@ -3149,7 +3149,7 @@ static int ath10k_update_channel_list(struct ath10k *ar) ch->min_power = 0; ch->max_power = channel->max_power * 2; ch->max_reg_power = channel->max_reg_power * 2; - ch->max_antenna_gain = channel->max_antenna_gain * 2; + ch->max_antenna_gain = channel->max_antenna_gain; ch->reg_class_id = 0; /* FIXME */ /* FIXME: why use only legacy modes, why not any diff --git a/drivers/net/wireless/ath/ath10k/wmi.h b/drivers/net/wireless/ath/ath10k/wmi.h index 761bc4a7064df..de22396d085ce 100644 --- a/drivers/net/wireless/ath/ath10k/wmi.h +++ b/drivers/net/wireless/ath/ath10k/wmi.h @@ -2045,7 +2045,9 @@ struct wmi_channel { union { __le32 reginfo1; struct { + /* note: power unit is 1 dBm */ u8 antenna_max; + /* note: power unit is 0.5 dBm */ u8 max_tx_power; } __packed; } __packed; @@ -2065,6 +2067,7 @@ struct wmi_channel_arg { u32 min_power; u32 max_power; u32 max_reg_power; + /* note: power unit is 1 dBm */ u32 max_antenna_gain; u32 reg_class_id; enum wmi_phy_mode mode; From 41f99346bc42f186ab9e9828d440113154ee43c9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 13 Oct 2021 11:13:15 +0300 Subject: [PATCH 1551/5344] drm/msm: uninitialized variable in msm_gem_import() [ Upstream commit 2203bd0e5c12ffc53ffdd4fbd7b12d6ba27e0424 ] The msm_gem_new_impl() function cleans up after itself so there is no need to call drm_gem_object_put(). Conceptually, it does not make sense to call a kref_put() function until after the reference counting has been initialized which happens immediately after this call in the drm_gem_(private_)object_init() functions. In the msm_gem_import() function the "obj" pointer is uninitialized, so it will lead to a crash. Fixes: 05b849111c07 ("drm/msm: prime support") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/20211013081315.GG6010@kili Signed-off-by: Rob Clark Signed-off-by: Sasha Levin --- drivers/gpu/drm/msm/msm_gem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index d92a0ffe2a767..8e6a4d5f3a405 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -1036,7 +1036,7 @@ static struct drm_gem_object *_msm_gem_new(struct drm_device *dev, ret = msm_gem_new_impl(dev, size, flags, &obj); if (ret) - goto fail; + return ERR_PTR(ret); msm_obj = to_msm_bo(obj); @@ -1124,7 +1124,7 @@ struct drm_gem_object *msm_gem_import(struct drm_device *dev, ret = msm_gem_new_impl(dev, size, MSM_BO_WC, &obj); if (ret) - goto fail; + return ERR_PTR(ret); drm_gem_private_object_init(dev, obj, size); From f883609b6bad70ee93f6edd08ed81021114b92d7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 15 Oct 2021 06:37:39 -0700 Subject: [PATCH 1552/5344] net: stream: don't purge sk_error_queue in sk_stream_kill_queues() [ Upstream commit 24bcbe1cc69fa52dc4f7b5b2456678ed464724d8 ] sk_stream_kill_queues() can be called on close when there are still outstanding skbs to transmit. Those skbs may try to queue notifications to the error queue (e.g. timestamps). If sk_stream_kill_queues() purges the queue without taking its lock the queue may get corrupted, and skbs leaked. This shows up as a warning about an rmem leak: WARNING: CPU: 24 PID: 0 at net/ipv4/af_inet.c:154 inet_sock_destruct+0x... The leak is always a multiple of 0x300 bytes (the value is in %rax on my builds, so RAX: 0000000000000300). 0x300 is truesize of an empty sk_buff. Indeed if we dump the socket state at the time of the warning the sk_error_queue is often (but not always) corrupted. The ->next pointer points back at the list head, but not the ->prev pointer. Indeed we can find the leaked skb by scanning the kernel memory for something that looks like an skb with ->sk = socket in question, and ->truesize = 0x300. The contents of ->cb[] of the skb confirms the suspicion that it is indeed a timestamp notification (as generated in __skb_complete_tx_timestamp()). Removing purging of sk_error_queue should be okay, since inet_sock_destruct() does it again once all socket refs are gone. Eric suggests this may cause sockets that go thru disconnect() to maintain notifications from the previous incarnations of the socket, but that should be okay since the race was there anyway, and disconnect() is not exactly dependable. Thanks to Jonathan Lemon and Omar Sandoval for help at various stages of tracing the issue. Fixes: cb9eff097831 ("net: new user space API for time stamping of incoming and outgoing packets") Signed-off-by: Jakub Kicinski Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/core/stream.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/core/stream.c b/net/core/stream.c index 4f1d4aa5fb38d..a166a32b411fa 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -195,9 +195,6 @@ void sk_stream_kill_queues(struct sock *sk) /* First the read buffer. */ __skb_queue_purge(&sk->sk_receive_queue); - /* Next, the error queue. */ - __skb_queue_purge(&sk->sk_error_queue); - /* Next, the write queue. */ WARN_ON(!skb_queue_empty(&sk->sk_write_queue)); From 11b53da11cf53655b3dccb9c7b39f3058ede3ea6 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 16 Oct 2021 08:21:44 +0200 Subject: [PATCH 1553/5344] mmc: mxs-mmc: disable regulator on error and in the remove function [ Upstream commit ce5f6c2c9b0fcb4094f8e162cfd37fb4294204f7 ] The 'reg_vmmc' regulator is enabled in the probe. It is never disabled. Neither in the error handling path of the probe nor in the remove function. Register a devm_action to disable it when needed. Fixes: 4dc5a79f1350 ("mmc: mxs-mmc: enable regulator for mmc slot") Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/4aadb3c97835f7b80f00819c3d549e6130384e67.1634365151.git.christophe.jaillet@wanadoo.fr Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin --- drivers/mmc/host/mxs-mmc.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/mmc/host/mxs-mmc.c b/drivers/mmc/host/mxs-mmc.c index 52054931c3507..3a90037254a4d 100644 --- a/drivers/mmc/host/mxs-mmc.c +++ b/drivers/mmc/host/mxs-mmc.c @@ -565,6 +565,11 @@ static const struct of_device_id mxs_mmc_dt_ids[] = { }; MODULE_DEVICE_TABLE(of, mxs_mmc_dt_ids); +static void mxs_mmc_regulator_disable(void *regulator) +{ + regulator_disable(regulator); +} + static int mxs_mmc_probe(struct platform_device *pdev) { const struct of_device_id *of_id = @@ -606,6 +611,11 @@ static int mxs_mmc_probe(struct platform_device *pdev) "Failed to enable vmmc regulator: %d\n", ret); goto out_mmc_free; } + + ret = devm_add_action_or_reset(&pdev->dev, mxs_mmc_regulator_disable, + reg_vmmc); + if (ret) + goto out_mmc_free; } ssp->clk = devm_clk_get(&pdev->dev, NULL); From 6e6321a2fafea4632b10387ad97c6b4e7c501913 Mon Sep 17 00:00:00 2001 From: Michael Schmitz Date: Tue, 19 Oct 2021 19:13:21 +1300 Subject: [PATCH 1554/5344] block: ataflop: fix breakage introduced at blk-mq refactoring [ Upstream commit 86d46fdaa12ae5befc16b8d73fc85a3ca0399ea6 ] Refactoring of the Atari floppy driver when converting to blk-mq has broken the state machine in not-so-subtle ways: finish_fdc() must be called when operations on the floppy device have completed. This is crucial in order to relase the ST-DMA lock, which protects against concurrent access to the ST-DMA controller by other drivers (some DMA related, most just related to device register access - broken beyond compare, I know). When rewriting the driver's old do_request() function, the fact that finish_fdc() was called only when all queued requests had completed appears to have been overlooked. Instead, the new request function calls finish_fdc() immediately after the last request has been queued. finish_fdc() executes a dummy seek after most requests, and this overwrites the state machine's interrupt hander that was set up to wait for completion of the read/write request just prior. To make matters worse, finish_fdc() is called before device interrupts are re-enabled, making certain that the read/write interupt is missed. Shifting the finish_fdc() call into the read/write request completion handler ensures the driver waits for the request to actually complete. With a queue depth of 2, we won't see long request sequences, so calling finish_fdc() unconditionally just adds a little overhead for the dummy seeks, and keeps the code simple. While we're at it, kill ataflop_commit_rqs() which does nothing but run finish_fdc() unconditionally, again likely wiping out an in-flight request. Signed-off-by: Michael Schmitz Fixes: 6ec3938cff95 ("ataflop: convert to blk-mq") CC: linux-block@vger.kernel.org CC: Tetsuo Handa Link: https://lore.kernel.org/r/20211019061321.26425-1-schmitzmic@gmail.com Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- drivers/block/ataflop.c | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index bd7d3bb8b890b..ad4cf10749100 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -653,9 +653,6 @@ static inline void copy_buffer(void *from, void *to) *p2++ = *p1++; } - - - /* General Interrupt Handling */ static void (*FloppyIRQHandler)( int status ) = NULL; @@ -1225,6 +1222,7 @@ static void fd_rwsec_done1(int status) } else { /* all sectors finished */ + finish_fdc(); fd_end_request_cur(BLK_STS_OK); } return; @@ -1472,15 +1470,6 @@ static void setup_req_params( int drive ) ReqTrack, ReqSector, (unsigned long)ReqData )); } -static void ataflop_commit_rqs(struct blk_mq_hw_ctx *hctx) -{ - spin_lock_irq(&ataflop_lock); - atari_disable_irq(IRQ_MFP_FDC); - finish_fdc(); - atari_enable_irq(IRQ_MFP_FDC); - spin_unlock_irq(&ataflop_lock); -} - static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -1488,6 +1477,8 @@ static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx, int drive = floppy - unit; int type = floppy->type; + DPRINT(("Queue request: drive %d type %d last %d\n", drive, type, bd->last)); + spin_lock_irq(&ataflop_lock); if (fd_request) { spin_unlock_irq(&ataflop_lock); @@ -1547,8 +1538,6 @@ static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx, setup_req_params( drive ); do_fd_action( drive ); - if (bd->last) - finish_fdc(); atari_enable_irq( IRQ_MFP_FDC ); out: @@ -1958,7 +1947,6 @@ static const struct block_device_operations floppy_fops = { static const struct blk_mq_ops ataflop_mq_ops = { .queue_rq = ataflop_queue_rq, - .commit_rqs = ataflop_commit_rqs, }; static struct kobject *floppy_find(dev_t dev, int *part, void *data) From 615ec303054d3c0f134bdb5b1727aa4b5c8d431e Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 18 Oct 2021 11:25:37 -0700 Subject: [PATCH 1555/5344] platform/x86: thinkpad_acpi: Fix bitwise vs. logical warning [ Upstream commit fd96e35ea7b95f1e216277805be89d66e4ae962d ] A new warning in clang points out a use of bitwise OR with boolean expressions in this driver: drivers/platform/x86/thinkpad_acpi.c:9061:11: error: use of bitwise '|' with boolean operands [-Werror,-Wbitwise-instead-of-logical] else if ((strlencmp(cmd, "level disengaged") == 0) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ || drivers/platform/x86/thinkpad_acpi.c:9061:11: note: cast one or both operands to int to silence this warning 1 error generated. This should clearly be a logical OR so change it to fix the warning. Fixes: fe98a52ce754 ("ACPI: thinkpad-acpi: add sysfs support to fan subdriver") Link: https://github.com/ClangBuiltLinux/linux/issues/1476 Reported-by: Tor Vic Signed-off-by: Nathan Chancellor Reviewed-by: Nick Desaulniers Link: https://lore.kernel.org/r/20211018182537.2316800-1-nathan@kernel.org Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Signed-off-by: Sasha Levin --- drivers/platform/x86/thinkpad_acpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index f027609fdab6d..3028d9f1ac59c 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -9086,7 +9086,7 @@ static int fan_write_cmd_level(const char *cmd, int *rc) if (strlencmp(cmd, "level auto") == 0) level = TP_EC_FAN_AUTO; - else if ((strlencmp(cmd, "level disengaged") == 0) | + else if ((strlencmp(cmd, "level disengaged") == 0) || (strlencmp(cmd, "level full-speed") == 0)) level = TP_EC_FAN_FULLSPEED; else if (sscanf(cmd, "level %d", &level) != 1) From bc04398d75d38524297e34b7a3763842d74457f1 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 22 Jun 2021 09:48:30 +0200 Subject: [PATCH 1556/5344] mt76: mt76x02: fix endianness warnings in mt76x02_mac.c [ Upstream commit c33edef520213feccebc22c9474c685b9fb60611 ] Fix the following sparse warning in mt76x02_mac_write_txwi and mt76x02_mac_tx_rate_val routines: drivers/net/wireless/mediatek/mt76/mt76x02_mac.c:237:19: warning: restricted __le16 degrades to intege warning: cast from restricted __le16 drivers/net/wireless/mediatek/mt76/mt76x02_mac.c:383:28: warning: incorrect type in assignment (different base types) expected restricted __le16 [usertype] rate got unsigned long Fixes: db9f11d3433f7 ("mt76: store wcid tx rate info in one u32 reduce locking") Signed-off-by: Lorenzo Bianconi Signed-off-by: Felix Fietkau Signed-off-by: Sasha Levin --- drivers/net/wireless/mediatek/mt76/mt76x02_mac.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_mac.c b/drivers/net/wireless/mediatek/mt76/mt76x02_mac.c index abacb4ea7179d..5c12cd7fce940 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02_mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x02_mac.c @@ -154,7 +154,7 @@ void mt76x02_mac_wcid_set_drop(struct mt76x02_dev *dev, u8 idx, bool drop) mt76_wr(dev, MT_WCID_DROP(idx), (val & ~bit) | (bit * drop)); } -static __le16 +static u16 mt76x02_mac_tx_rate_val(struct mt76x02_dev *dev, const struct ieee80211_tx_rate *rate, u8 *nss_val) { @@ -200,14 +200,14 @@ mt76x02_mac_tx_rate_val(struct mt76x02_dev *dev, rateval |= MT_RXWI_RATE_SGI; *nss_val = nss; - return cpu_to_le16(rateval); + return rateval; } void mt76x02_mac_wcid_set_rate(struct mt76x02_dev *dev, struct mt76_wcid *wcid, const struct ieee80211_tx_rate *rate) { s8 max_txpwr_adj = mt76x02_tx_get_max_txpwr_adj(dev, rate); - __le16 rateval; + u16 rateval; u32 tx_info; s8 nss; @@ -320,7 +320,7 @@ void mt76x02_mac_write_txwi(struct mt76x02_dev *dev, struct mt76x02_txwi *txwi, struct ieee80211_key_conf *key = info->control.hw_key; u32 wcid_tx_info; u16 rate_ht_mask = FIELD_PREP(MT_RXWI_RATE_PHY, BIT(1) | BIT(2)); - u16 txwi_flags = 0; + u16 txwi_flags = 0, rateval; u8 nss; s8 txpwr_adj, max_txpwr_adj; u8 ccmp_pn[8], nstreams = dev->mt76.chainmask & 0xf; @@ -356,14 +356,15 @@ void mt76x02_mac_write_txwi(struct mt76x02_dev *dev, struct mt76x02_txwi *txwi, if (wcid && (rate->idx < 0 || !rate->count)) { wcid_tx_info = wcid->tx_info; - txwi->rate = FIELD_GET(MT_WCID_TX_INFO_RATE, wcid_tx_info); + rateval = FIELD_GET(MT_WCID_TX_INFO_RATE, wcid_tx_info); max_txpwr_adj = FIELD_GET(MT_WCID_TX_INFO_TXPWR_ADJ, wcid_tx_info); nss = FIELD_GET(MT_WCID_TX_INFO_NSS, wcid_tx_info); } else { - txwi->rate = mt76x02_mac_tx_rate_val(dev, rate, &nss); + rateval = mt76x02_mac_tx_rate_val(dev, rate, &nss); max_txpwr_adj = mt76x02_tx_get_max_txpwr_adj(dev, rate); } + txwi->rate = cpu_to_le16(rateval); txpwr_adj = mt76x02_tx_get_txpwr_adj(dev, dev->mt76.txpower_conf, max_txpwr_adj); From ec5ce25930d57ebf3533adf3e6eadcbd79272eb2 Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Fri, 15 Oct 2021 12:03:35 +0800 Subject: [PATCH 1557/5344] rsi: stop thread firstly in rsi_91x_init() error handling [ Upstream commit 515e7184bdf0a3ebf1757cc77fb046b4fe282189 ] When fail to init coex module, free 'common' and 'adapter' directly, but common->tx_thread which will access 'common' and 'adapter' is running at the same time. That will trigger the UAF bug. ================================================================== BUG: KASAN: use-after-free in rsi_tx_scheduler_thread+0x50f/0x520 [rsi_91x] Read of size 8 at addr ffff8880076dc000 by task Tx-Thread/124777 CPU: 0 PID: 124777 Comm: Tx-Thread Not tainted 5.15.0-rc5+ #19 Call Trace: dump_stack_lvl+0xe2/0x152 print_address_description.constprop.0+0x21/0x140 ? rsi_tx_scheduler_thread+0x50f/0x520 kasan_report.cold+0x7f/0x11b ? rsi_tx_scheduler_thread+0x50f/0x520 rsi_tx_scheduler_thread+0x50f/0x520 ... Freed by task 111873: kasan_save_stack+0x1b/0x40 kasan_set_track+0x1c/0x30 kasan_set_free_info+0x20/0x30 __kasan_slab_free+0x109/0x140 kfree+0x117/0x4c0 rsi_91x_init+0x741/0x8a0 [rsi_91x] rsi_probe+0x9f/0x1750 [rsi_usb] Stop thread before free 'common' and 'adapter' to fix it. Fixes: 2108df3c4b18 ("rsi: add coex support") Signed-off-by: Ziyang Xuan Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20211015040335.1021546-1-william.xuanziyang@huawei.com Signed-off-by: Sasha Levin --- drivers/net/wireless/rsi/rsi_91x_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/rsi/rsi_91x_main.c b/drivers/net/wireless/rsi/rsi_91x_main.c index aece1d3a6b055..441fda71f6289 100644 --- a/drivers/net/wireless/rsi/rsi_91x_main.c +++ b/drivers/net/wireless/rsi/rsi_91x_main.c @@ -368,6 +368,7 @@ struct rsi_hw *rsi_91x_init(u16 oper_mode) if (common->coex_mode > 1) { if (rsi_coex_attach(common)) { rsi_dbg(ERR_ZONE, "Failed to init coex module\n"); + rsi_kill_thread(&common->tx_thread); goto err; } } From 788aa5d27184c2eecf945d051b2531ec76cf3027 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonas=20Dre=C3=9Fler?= Date: Sat, 16 Oct 2021 17:32:43 +0200 Subject: [PATCH 1558/5344] mwifiex: Send DELBA requests according to spec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit cc8a8bc37466f79b24d972555237f3d591150602 ] While looking at on-air packets using Wireshark, I noticed we're never setting the initiator bit when sending DELBA requests to the AP: While we set the bit on our del_ba_param_set bitmask, we forget to actually copy that bitmask over to the command struct, which means we never actually set the initiator bit. Fix that and copy the bitmask over to the host_cmd_ds_11n_delba command struct. Fixes: 5e6e3a92b9a4 ("wireless: mwifiex: initial commit for Marvell mwifiex driver") Signed-off-by: Jonas Dreßler Acked-by: Pali Rohár Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20211016153244.24353-5-verdre@v0yd.nl Signed-off-by: Sasha Levin --- drivers/net/wireless/marvell/mwifiex/11n.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/marvell/mwifiex/11n.c b/drivers/net/wireless/marvell/mwifiex/11n.c index e435f801bc912..acbef9f1a83b6 100644 --- a/drivers/net/wireless/marvell/mwifiex/11n.c +++ b/drivers/net/wireless/marvell/mwifiex/11n.c @@ -657,14 +657,15 @@ int mwifiex_send_delba(struct mwifiex_private *priv, int tid, u8 *peer_mac, uint16_t del_ba_param_set; memset(&delba, 0, sizeof(delba)); - delba.del_ba_param_set = cpu_to_le16(tid << DELBA_TID_POS); - del_ba_param_set = le16_to_cpu(delba.del_ba_param_set); + del_ba_param_set = tid << DELBA_TID_POS; + if (initiator) del_ba_param_set |= IEEE80211_DELBA_PARAM_INITIATOR_MASK; else del_ba_param_set &= ~IEEE80211_DELBA_PARAM_INITIATOR_MASK; + delba.del_ba_param_set = cpu_to_le16(del_ba_param_set); memcpy(&delba.peer_mac_addr, peer_mac, ETH_ALEN); /* We don't wait for the response of this command */ From b70dceb0a3f0e0c8445e191d91ff272a8f07c279 Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Tue, 19 Oct 2021 21:16:47 +0200 Subject: [PATCH 1559/5344] phy: micrel: ksz8041nl: do not use power down mode [ Upstream commit 2641b62d2fab52648e34cdc6994b2eacde2d27c1 ] Some Micrel KSZ8041NL PHY chips exhibit continuous RX errors after using the power down mode bit (0.11). If the PHY is taken out of power down mode in a certain temperature range, the PHY enters a weird state which leads to continuously reporting RX errors. In that state, the MAC is not able to receive or send any Ethernet frames and the activity LED is constantly blinking. Since Linux is using the suspend callback when the interface is taken down, ending up in that state can easily happen during a normal startup. Micrel confirmed the issue in errata DS80000700A [*], caused by abnormal clock recovery when using power down mode. Even the latest revision (A4, Revision ID 0x1513) seems to suffer that problem, and according to the errata is not going to be fixed. Remove the suspend/resume callback to avoid using the power down mode completely. [*] https://ww1.microchip.com/downloads/en/DeviceDoc/80000700A.pdf Fixes: 1a5465f5d6a2 ("phy/micrel: Add suspend/resume support to Micrel PHYs") Signed-off-by: Stefan Agner Acked-by: Marcel Ziswiler Signed-off-by: Francesco Dolcini Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/phy/micrel.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index f95bd1b0fb965..0b61d80ea3f8c 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -1040,8 +1040,9 @@ static struct phy_driver ksphy_driver[] = { .get_sset_count = kszphy_get_sset_count, .get_strings = kszphy_get_strings, .get_stats = kszphy_get_stats, - .suspend = genphy_suspend, - .resume = genphy_resume, + /* No suspend/resume callbacks because of errata DS80000700A, + * receiver error following software power down. + */ }, { .phy_id = PHY_ID_KSZ8041RNLI, .phy_id_mask = MICREL_PHY_ID_MASK, From 1ff00c92027745ab683bf6e5055a7f3887b0a791 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sun, 17 Oct 2021 11:58:16 +0300 Subject: [PATCH 1560/5344] nvme-rdma: fix error code in nvme_rdma_setup_ctrl [ Upstream commit 09748122009aed7bfaa7acc33c10c083a4758322 ] In case that icdoff is not zero or mandatory keyed sgls are not supported by the NVMe/RDMA target, we'll go to error flow but we'll return 0 to the caller. Fix it by returning an appropriate error code. Fixes: c66e2998c8ca ("nvme-rdma: centralize controller setup sequence") Signed-off-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Sasha Levin --- drivers/nvme/host/rdma.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index dcc3d2393605e..08a23bb4b8b57 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1019,11 +1019,13 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) return ret; if (ctrl->ctrl.icdoff) { + ret = -EOPNOTSUPP; dev_err(ctrl->ctrl.device, "icdoff is not supported!\n"); goto destroy_admin; } if (!(ctrl->ctrl.sgls & (1 << 2))) { + ret = -EOPNOTSUPP; dev_err(ctrl->ctrl.device, "Mandatory keyed sgls are not supported!\n"); goto destroy_admin; From b5eeae405951f240e7c3098cb75dba8fbf7c98f7 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Thu, 7 Oct 2021 21:13:37 +0200 Subject: [PATCH 1561/5344] PM: hibernate: fix sparse warnings [ Upstream commit 01de5fcd8b1ac0ca28d2bb0921226a54fdd62684 ] When building the kernel with sparse enabled 'C=1' the following warnings shows up: kernel/power/swap.c:390:29: warning: incorrect type in assignment (different base types) kernel/power/swap.c:390:29: expected int ret kernel/power/swap.c:390:29: got restricted blk_status_t This is due to function hib_wait_io() returns a 'blk_status_t' which is a bitwise u8. Commit 5416da01ff6e ("PM: hibernate: Remove blk_status_to_errno in hib_wait_io") seemed to have mixed up the return type. However, the 4e4cbee93d56 ("block: switch bios to blk_status_t") actually broke the behaviour by returning the wrong type. Rework so function hib_wait_io() returns a 'int' instead of 'blk_status_t' and make sure to call function blk_status_to_errno(hb->error)' when returning from function hib_wait_io() a int gets returned. Fixes: 4e4cbee93d56 ("block: switch bios to blk_status_t") Fixes: 5416da01ff6e ("PM: hibernate: Remove blk_status_to_errno in hib_wait_io") Signed-off-by: Anders Roxell Signed-off-by: Rafael J. Wysocki Signed-off-by: Sasha Levin --- kernel/power/swap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index d32cd03d5ff8c..bcc9769e8a3b5 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -292,7 +292,7 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, return error; } -static blk_status_t hib_wait_io(struct hib_bio_batch *hb) +static int hib_wait_io(struct hib_bio_batch *hb) { wait_event(hb->wait, atomic_read(&hb->count) == 0); return blk_status_to_errno(hb->error); From a2544cb69ffdc50fd7bd4973112c40af55cdbbcd Mon Sep 17 00:00:00 2001 From: Jessica Zhang Date: Wed, 20 Oct 2021 10:57:33 -0700 Subject: [PATCH 1562/5344] drm/msm: Fix potential NULL dereference in DPU SSPP [ Upstream commit 8bf71a5719b6cc5b6ba358096081e5d50ea23ab6 ] Move initialization of sblk in _sspp_subblk_offset() after NULL check to avoid potential NULL pointer dereference. Fixes: 25fdd5933e4c ("drm/msm: Add SDM845 DPU support") Reported-by: Dan Carpenter Signed-off-by: Jessica Zhang Link: https://lore.kernel.org/r/20211020175733.3379-1-jesszhan@codeaurora.org Signed-off-by: Rob Clark Signed-off-by: Sasha Levin --- drivers/gpu/drm/msm/disp/dpu1/dpu_hw_sspp.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_sspp.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_sspp.c index 4f8b813aab810..8256f06218d0f 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_sspp.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_sspp.c @@ -137,11 +137,13 @@ static int _sspp_subblk_offset(struct dpu_hw_pipe *ctx, u32 *idx) { int rc = 0; - const struct dpu_sspp_sub_blks *sblk = ctx->cap->sblk; + const struct dpu_sspp_sub_blks *sblk; - if (!ctx) + if (!ctx || !ctx->cap || !ctx->cap->sblk) return -EINVAL; + sblk = ctx->cap->sblk; + switch (s_id) { case DPU_SSPP_SRC: *idx = sblk->src_blk.base; @@ -404,7 +406,7 @@ static void _dpu_hw_sspp_setup_scaler3(struct dpu_hw_pipe *ctx, (void)pe; if (_sspp_subblk_offset(ctx, DPU_SSPP_SCALER_QSEED3, &idx) || !sspp - || !scaler3_cfg || !ctx || !ctx->cap || !ctx->cap->sblk) + || !scaler3_cfg) return; dpu_hw_setup_scaler3(&ctx->hw, scaler3_cfg, idx, From 253b4de53179d257ecc4488cd975a0ddc75c732f Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 19 Oct 2021 20:27:26 +0900 Subject: [PATCH 1563/5344] smackfs: use netlbl_cfg_cipsov4_del() for deleting cipso_v4_doi [ Upstream commit 0934ad42bb2c5df90a1b9de690f93de735b622fe ] syzbot is reporting UAF at cipso_v4_doi_search() [1], for smk_cipso_doi() is calling kfree() without removing from the cipso_v4_doi_list list after netlbl_cfg_cipsov4_map_add() returned an error. We need to use netlbl_cfg_cipsov4_del() in order to remove from the list and wait for RCU grace period before kfree(). Link: https://syzkaller.appspot.com/bug?extid=93dba5b91f0fed312cbd [1] Reported-by: syzbot Signed-off-by: Tetsuo Handa Fixes: 6c2e8ac0953fccdd ("netlabel: Update kernel configuration API") Signed-off-by: Casey Schaufler Signed-off-by: Sasha Levin --- security/smack/smackfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index fdf5f336f834a..6b6fec04c412b 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -712,7 +712,7 @@ static void smk_cipso_doi(void) if (rc != 0) { printk(KERN_WARNING "%s:%d map add rc = %d\n", __func__, __LINE__, rc); - kfree(doip); + netlbl_cfg_cipsov4_del(doip->doi, &nai); return; } } From 54d19694a343238b14d0d8f35f49cc41c8ff3ad0 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 4 Nov 2020 20:33:57 -0800 Subject: [PATCH 1564/5344] libbpf: Fix BTF data layout checks and allow empty BTF [ Upstream commit d8123624506cd62730c9cd9c7672c698e462703d ] Make data section layout checks stricter, disallowing overlap of types and strings data. Additionally, allow BTFs with no type data. There is nothing inherently wrong with having BTF with no types (put potentially with some strings). This could be a situation with kernel module BTFs, if module doesn't introduce any new type information. Also fix invalid offset alignment check for btf->hdr->type_off. Fixes: 8a138aed4a80 ("bpf: btf: Add BTF support to libbpf") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201105043402.2530976-8-andrii@kernel.org Signed-off-by: Sasha Levin --- tools/lib/bpf/btf.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index d606a358480da..41daf0fa95b9f 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -100,22 +100,18 @@ static int btf_parse_hdr(struct btf *btf) return -EINVAL; } - if (meta_left < hdr->type_off) { - pr_debug("Invalid BTF type section offset:%u\n", hdr->type_off); + if (meta_left < hdr->str_off + hdr->str_len) { + pr_debug("Invalid BTF total size:%u\n", btf->data_size); return -EINVAL; } - if (meta_left < hdr->str_off) { - pr_debug("Invalid BTF string section offset:%u\n", hdr->str_off); + if (hdr->type_off + hdr->type_len > hdr->str_off) { + pr_debug("Invalid BTF data sections layout: type data at %u + %u, strings data at %u + %u\n", + hdr->type_off, hdr->type_len, hdr->str_off, hdr->str_len); return -EINVAL; } - if (hdr->type_off >= hdr->str_off) { - pr_debug("BTF type section offset >= string section offset. No type?\n"); - return -EINVAL; - } - - if (hdr->type_off & 0x02) { + if (hdr->type_off % 4) { pr_debug("BTF type section is not aligned to 4 bytes\n"); return -EINVAL; } From 310f467e293c325521d8e31ba3eb6595f30f98f7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 9 Sep 2021 18:22:41 +0200 Subject: [PATCH 1565/5344] s390/gmap: don't unconditionally call pte_unmap_unlock() in __gmap_zap() [ Upstream commit b159f94c86b43cf7e73e654bc527255b1f4eafc4 ] ... otherwise we will try unlocking a spinlock that was never locked via a garbage pointer. At the time we reach this code path, we usually successfully looked up a PGSTE already; however, evil user space could have manipulated the VMA layout in the meantime and triggered removal of the page table. Fixes: 1e133ab296f3 ("s390/mm: split arch/s390/mm/pgtable.c") Signed-off-by: David Hildenbrand Reviewed-by: Claudio Imbrenda Acked-by: Heiko Carstens Link: https://lore.kernel.org/r/20210909162248.14969-3-david@redhat.com Signed-off-by: Christian Borntraeger Signed-off-by: Sasha Levin --- arch/s390/mm/gmap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 7e46dca8347f6..a642a3642fdae 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -684,9 +684,10 @@ void __gmap_zap(struct gmap *gmap, unsigned long gaddr) vmaddr |= gaddr & ~PMD_MASK; /* Get pointer to the page table entry */ ptep = get_locked_pte(gmap->mm, vmaddr, &ptl); - if (likely(ptep)) + if (likely(ptep)) { ptep_zap_unused(gmap->mm, vmaddr, ptep, 0); - pte_unmap_unlock(ptep, ptl); + pte_unmap_unlock(ptep, ptl); + } } } EXPORT_SYMBOL_GPL(__gmap_zap); From 5d32c45a2a5dee3db5ca42a8afea289c78a1b539 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 20 Oct 2021 17:25:22 +0100 Subject: [PATCH 1566/5344] irq: mips: avoid nested irq_enter() [ Upstream commit c65b52d02f6c1a06ddb20cba175ad49eccd6410d ] As bcm6345_l1_irq_handle() is a chained irqchip handler, it will be invoked within the context of the root irqchip handler, which must have entered IRQ context already. When bcm6345_l1_irq_handle() calls arch/mips's do_IRQ() , this will nest another call to irq_enter(), and the resulting nested increment to `rcu_data.dynticks_nmi_nesting` will cause rcu_is_cpu_rrupt_from_idle() to fail to identify wakeups from idle, resulting in failure to preempt, and RCU stalls. Chained irqchip handlers must invoke IRQ handlers by way of thee core irqchip code, i.e. generic_handle_irq() or generic_handle_domain_irq() and should not call do_IRQ(), which is intended only for root irqchip handlers. Fix bcm6345_l1_irq_handle() by calling generic_handle_irq() directly. Fixes: c7c42ec2baa1de7a ("irqchips/bmips: Add bcm6345-l1 interrupt controller") Signed-off-by: Mark Rutland Reviewed-by: Marc Zyngier Acked-by: Thomas Bogendoerfer Cc: Thomas Gleixner Signed-off-by: Sasha Levin --- drivers/irqchip/irq-bcm6345-l1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-bcm6345-l1.c b/drivers/irqchip/irq-bcm6345-l1.c index e3483789f4df3..1bd0621c4ce2a 100644 --- a/drivers/irqchip/irq-bcm6345-l1.c +++ b/drivers/irqchip/irq-bcm6345-l1.c @@ -140,7 +140,7 @@ static void bcm6345_l1_irq_handle(struct irq_desc *desc) for_each_set_bit(hwirq, &pending, IRQS_PER_WORD) { irq = irq_linear_revmap(intc->domain, base + hwirq); if (irq) - do_IRQ(irq); + generic_handle_irq(irq); else spurious_interrupt(); } From 0cf16297697a3416419e925db88cfa5ab998fbd0 Mon Sep 17 00:00:00 2001 From: Jon Maxwell Date: Mon, 25 Oct 2021 10:59:03 +1100 Subject: [PATCH 1567/5344] tcp: don't free a FIN sk_buff in tcp_remove_empty_skb() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit cf12e6f9124629b18a6182deefc0315f0a73a199 ] v1: Implement a more general statement as recommended by Eric Dumazet. The sequence number will be advanced, so this check will fix the FIN case and other cases. A customer reported sockets stuck in the CLOSING state. A Vmcore revealed that the write_queue was not empty as determined by tcp_write_queue_empty() but the sk_buff containing the FIN flag had been freed and the socket was zombied in that state. Corresponding pcaps show no FIN from the Linux kernel on the wire. Some instrumentation was added to the kernel and it was found that there is a timing window where tcp_sendmsg() can run after tcp_send_fin(). tcp_sendmsg() will hit an error, for example: 1269 ▹ if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))↩ 1270 ▹ ▹ goto do_error;↩ tcp_remove_empty_skb() will then free the FIN sk_buff as "skb->len == 0". The TCP socket is now wedged in the FIN-WAIT-1 state because the FIN is never sent. If the other side sends a FIN packet the socket will transition to CLOSING and remain that way until the system is rebooted. Fix this by checking for the FIN flag in the sk_buff and don't free it if that is the case. Testing confirmed that fixed the issue. Fixes: fdfc5c8594c2 ("tcp: remove empty skb from write queue in error cases") Signed-off-by: Jon Maxwell Reported-by: Monir Zouaoui Reported-by: Simon Stier Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/ipv4/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1e5830db71f74..70029625b2881 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -970,7 +970,7 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) */ static void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb) { - if (skb && !skb->len) { + if (skb && TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { tcp_unlink_write_queue(skb, sk); if (tcp_write_queue_empty(sk)) tcp_chrono_stop(sk, TCP_CHRONO_BUSY); From 1edbf4297b06217bb43e7b60429671d09b7ad284 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 26 Oct 2021 09:51:28 +0800 Subject: [PATCH 1568/5344] samples/kretprobes: Fix return value if register_kretprobe() failed [ Upstream commit f76fbbbb5061fe14824ba5807c44bd7400a6b4e1 ] Use the actual return value instead of always -1 if register_kretprobe() failed. E.g. without this patch: # insmod samples/kprobes/kretprobe_example.ko func=no_such_func insmod: ERROR: could not insert module samples/kprobes/kretprobe_example.ko: Operation not permitted With this patch: # insmod samples/kprobes/kretprobe_example.ko func=no_such_func insmod: ERROR: could not insert module samples/kprobes/kretprobe_example.ko: Unknown symbol in module Link: https://lkml.kernel.org/r/1635213091-24387-2-git-send-email-yangtiezhu@loongson.cn Fixes: 804defea1c02 ("Kprobes: move kprobe examples to samples/") Signed-off-by: Tiezhu Yang Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Sasha Levin --- samples/kprobes/kretprobe_example.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/kprobes/kretprobe_example.c b/samples/kprobes/kretprobe_example.c index 186315ca88b3f..2701549ee7b3a 100644 --- a/samples/kprobes/kretprobe_example.c +++ b/samples/kprobes/kretprobe_example.c @@ -84,7 +84,7 @@ static int __init kretprobe_init(void) ret = register_kretprobe(&my_kretprobe); if (ret < 0) { pr_err("register_kretprobe failed, returned %d\n", ret); - return -1; + return ret; } pr_info("Planted return probe at %s: %p\n", my_kretprobe.kp.symbol_name, my_kretprobe.kp.addr); From be83a02492c871755882436536d2541412fd61da Mon Sep 17 00:00:00 2001 From: Janis Schoetterl-Glausch Date: Fri, 22 Oct 2021 17:26:48 +0200 Subject: [PATCH 1569/5344] KVM: s390: Fix handle_sske page fault handling [ Upstream commit 85f517b29418158d3e6e90c3f0fc01b306d2f1a1 ] If handle_sske cannot set the storage key, because there is no page table entry or no present large page entry, it calls fixup_user_fault. However, currently, if the call succeeds, handle_sske returns -EAGAIN, without having set the storage key. Instead, retry by continue'ing the loop without incrementing the address. The same issue in handle_pfmf was fixed by a11bdb1a6b78 ("KVM: s390: Fix pfmf and conditional skey emulation"). Fixes: bd096f644319 ("KVM: s390: Add skey emulation fault handling") Signed-off-by: Janis Schoetterl-Glausch Reviewed-by: Christian Borntraeger Reviewed-by: Claudio Imbrenda Link: https://lore.kernel.org/r/20211022152648.26536-1-scgl@linux.ibm.com Signed-off-by: Christian Borntraeger Signed-off-by: Sasha Levin --- arch/s390/kvm/priv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index bca3d96eb464a..5f7f047cba599 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -398,6 +398,8 @@ static int handle_sske(struct kvm_vcpu *vcpu) mmap_read_unlock(current->mm); if (rc == -EFAULT) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + if (rc == -EAGAIN) + continue; if (rc < 0) return rc; start += PAGE_SIZE; From 3f84966585cfa5f47229258e2054787be948c1ad Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Wed, 20 Oct 2021 20:03:44 +0800 Subject: [PATCH 1570/5344] libertas_tf: Fix possible memory leak in probe and disconnect [ Upstream commit d549107305b4634c81223a853701c06bcf657bc3 ] I got memory leak as follows when doing fault injection test: unreferenced object 0xffff88810a2ddc00 (size 512): comm "kworker/6:1", pid 176, jiffies 4295009893 (age 757.220s) hex dump (first 32 bytes): 00 50 05 18 81 88 ff ff 00 00 00 00 00 00 00 00 .P.............. 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] slab_post_alloc_hook+0x9c/0x490 [] kmem_cache_alloc_trace+0x1f7/0x470 [] if_usb_probe+0x60/0x37c [libertas_tf_usb] [] usb_probe_interface+0x1aa/0x3c0 [usbcore] [] really_probe+0x190/0x480 [] __driver_probe_device+0xf9/0x180 [] driver_probe_device+0x53/0x130 [] __device_attach_driver+0x105/0x130 [] bus_for_each_drv+0x129/0x190 [] __device_attach+0x1c9/0x270 [] device_initial_probe+0x20/0x30 [] bus_probe_device+0x142/0x160 [] device_add+0x829/0x1300 [] usb_set_configuration+0xb01/0xcc0 [usbcore] [] usb_generic_driver_probe+0x6e/0x90 [usbcore] [] usb_probe_device+0x6f/0x130 [usbcore] cardp is missing being freed in the error handling path of the probe and the path of the disconnect, which will cause memory leak. This patch adds the missing kfree(). Fixes: c305a19a0d0a ("libertas_tf: usb specific functions") Reported-by: Hulk Robot Signed-off-by: Wang Hai Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20211020120345.2016045-2-wanghai38@huawei.com Signed-off-by: Sasha Levin --- drivers/net/wireless/marvell/libertas_tf/if_usb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wireless/marvell/libertas_tf/if_usb.c b/drivers/net/wireless/marvell/libertas_tf/if_usb.c index bedc092150884..b30bcb28503ae 100644 --- a/drivers/net/wireless/marvell/libertas_tf/if_usb.c +++ b/drivers/net/wireless/marvell/libertas_tf/if_usb.c @@ -230,6 +230,7 @@ static int if_usb_probe(struct usb_interface *intf, dealloc: if_usb_free(cardp); + kfree(cardp); error: lbtf_deb_leave(LBTF_DEB_MAIN); return -ENOMEM; @@ -254,6 +255,7 @@ static void if_usb_disconnect(struct usb_interface *intf) /* Unlink and free urb */ if_usb_free(cardp); + kfree(cardp); usb_set_intfdata(intf, NULL); usb_put_dev(interface_to_usbdev(intf)); From 8d4dadcb092d4e8a439fb9012c55dabcfec7eb16 Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Wed, 20 Oct 2021 20:03:45 +0800 Subject: [PATCH 1571/5344] libertas: Fix possible memory leak in probe and disconnect [ Upstream commit 9692151e2fe7a326bafe99836fd1f20a2cc3a049 ] I got memory leak as follows when doing fault injection test: unreferenced object 0xffff88812c7d7400 (size 512): comm "kworker/6:1", pid 176, jiffies 4295003332 (age 822.830s) hex dump (first 32 bytes): 00 68 1e 04 81 88 ff ff 01 00 00 00 00 00 00 00 .h.............. 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] slab_post_alloc_hook+0x9c/0x490 [] kmem_cache_alloc_trace+0x1f7/0x470 [] if_usb_probe+0x63/0x446 [usb8xxx] [] usb_probe_interface+0x1aa/0x3c0 [usbcore] [] really_probe+0x190/0x480 [] __driver_probe_device+0xf9/0x180 [] driver_probe_device+0x53/0x130 [] __device_attach_driver+0x105/0x130 [] bus_for_each_drv+0x129/0x190 [] __device_attach+0x1c9/0x270 [] device_initial_probe+0x20/0x30 [] bus_probe_device+0x142/0x160 [] device_add+0x829/0x1300 [] usb_set_configuration+0xb01/0xcc0 [usbcore] [] usb_generic_driver_probe+0x6e/0x90 [usbcore] [] usb_probe_device+0x6f/0x130 [usbcore] cardp is missing being freed in the error handling path of the probe and the path of the disconnect, which will cause memory leak. This patch adds the missing kfree(). Fixes: 876c9d3aeb98 ("[PATCH] Marvell Libertas 8388 802.11b/g USB driver") Reported-by: Hulk Robot Signed-off-by: Wang Hai Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20211020120345.2016045-3-wanghai38@huawei.com Signed-off-by: Sasha Levin --- drivers/net/wireless/marvell/libertas/if_usb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wireless/marvell/libertas/if_usb.c b/drivers/net/wireless/marvell/libertas/if_usb.c index 20436a289d5cd..5d6dc1dd050d4 100644 --- a/drivers/net/wireless/marvell/libertas/if_usb.c +++ b/drivers/net/wireless/marvell/libertas/if_usb.c @@ -292,6 +292,7 @@ static int if_usb_probe(struct usb_interface *intf, if_usb_reset_device(cardp); dealloc: if_usb_free(cardp); + kfree(cardp); error: return r; @@ -316,6 +317,7 @@ static void if_usb_disconnect(struct usb_interface *intf) /* Unlink and free urb */ if_usb_free(cardp); + kfree(cardp); usb_set_intfdata(intf, NULL); usb_put_dev(interface_to_usbdev(intf)); From 15de6aa44ddc80fce6fff6d6351a53b2a138a327 Mon Sep 17 00:00:00 2001 From: Benjamin Li Date: Fri, 22 Oct 2021 17:15:28 -0700 Subject: [PATCH 1572/5344] wcn36xx: add proper DMA memory barriers in rx path [ Upstream commit 9bfe38e064af5decba2ffce66a2958ab8b10eaa4 ] This is essentially exactly following the dma_wmb()/dma_rmb() usage instructions in Documentation/memory-barriers.txt. The theoretical races here are: 1. DXE (the DMA Transfer Engine in the Wi-Fi subsystem) seeing the dxe->ctrl & WCN36xx_DXE_CTRL_VLD write before the dxe->dst_addr_l write, thus performing DMA into the wrong address. 2. CPU reading dxe->dst_addr_l before DXE unsets dxe->ctrl & WCN36xx_DXE_CTRL_VLD. This should generally be harmless since DXE doesn't write dxe->dst_addr_l (no risk of freeing the wrong skb). Fixes: 8e84c2582169 ("wcn36xx: mac80211 driver for Qualcomm WCN3660/WCN3680 hardware") Signed-off-by: Benjamin Li Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20211023001528.3077822-1-benl@squareup.com Signed-off-by: Sasha Levin --- drivers/net/wireless/ath/wcn36xx/dxe.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/wcn36xx/dxe.c b/drivers/net/wireless/ath/wcn36xx/dxe.c index bab30f7a443ce..4da25e84793b7 100644 --- a/drivers/net/wireless/ath/wcn36xx/dxe.c +++ b/drivers/net/wireless/ath/wcn36xx/dxe.c @@ -563,6 +563,10 @@ static int wcn36xx_rx_handle_packets(struct wcn36xx *wcn, dxe = ctl->desc; while (!(READ_ONCE(dxe->ctrl) & WCN36xx_DXE_CTRL_VLD)) { + /* do not read until we own DMA descriptor */ + dma_rmb(); + + /* read/modify DMA descriptor */ skb = ctl->skb; dma_addr = dxe->dst_addr_l; ret = wcn36xx_dxe_fill_skb(wcn->dev, ctl, GFP_ATOMIC); @@ -573,9 +577,15 @@ static int wcn36xx_rx_handle_packets(struct wcn36xx *wcn, dma_unmap_single(wcn->dev, dma_addr, WCN36XX_PKT_SIZE, DMA_FROM_DEVICE); wcn36xx_rx_skb(wcn, skb); - } /* else keep old skb not submitted and use it for rx DMA */ + } + /* else keep old skb not submitted and reuse it for rx DMA + * (dropping the packet that it contained) + */ + /* flush descriptor changes before re-marking as valid */ + dma_wmb(); dxe->ctrl = ctrl; + ctl = ctl->next; dxe = ctl->desc; } From f3161987f8c333eb3f76439aa535b1b46146d256 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 27 Oct 2021 13:26:19 -0400 Subject: [PATCH 1573/5344] drm/amdgpu/gmc6: fix DMA mask from 44 to 40 bits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 403475be6d8b122c3e6b8a47e075926d7299e5ef ] The DMA mask on SI parts is 40 bits not 44. Copy paste typo. Fixes: 244511f386ccb9 ("drm/amdgpu: simplify and cleanup setting the dma mask") Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1762 Acked-by: Christian König Tested-by: Paul Menzel Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin --- drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c index 9fb1765e92d15..e9f5de35f7953 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c @@ -863,12 +863,12 @@ static int gmc_v6_0_sw_init(void *handle) adev->gmc.mc_mask = 0xffffffffffULL; - r = dma_set_mask_and_coherent(adev->dev, DMA_BIT_MASK(44)); + r = dma_set_mask_and_coherent(adev->dev, DMA_BIT_MASK(40)); if (r) { dev_warn(adev->dev, "amdgpu: No suitable DMA available.\n"); return r; } - adev->need_swiotlb = drm_need_swiotlb(44); + adev->need_swiotlb = drm_need_swiotlb(40); r = gmc_v6_0_init_microcode(adev); if (r) { From 28c0f8659f943f9203f18ad0eade15a5e4610b44 Mon Sep 17 00:00:00 2001 From: Shyam Sundar S K Date: Wed, 27 Oct 2021 15:27:27 +0530 Subject: [PATCH 1574/5344] net: amd-xgbe: Toggle PLL settings during rate change [ Upstream commit daf182d360e509a494db18666799f4e85d83dda0 ] For each rate change command submission, the FW has to do a phy power off sequence internally. For this to happen correctly, the PLL re-initialization control setting has to be turned off before sending mailbox commands and re-enabled once the command submission is complete. Without the PLL control setting, the link up takes longer time in a fixed phy configuration. Fixes: 47f164deab22 ("amd-xgbe: Add PCI device support") Co-developed-by: Sudheesh Mavila Signed-off-by: Sudheesh Mavila Signed-off-by: Shyam Sundar S K Acked-by: Tom Lendacky Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/amd/xgbe/xgbe-common.h | 8 ++++++++ drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c | 20 +++++++++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-common.h b/drivers/net/ethernet/amd/xgbe/xgbe-common.h index b2cd3bdba9f89..533b8519ec352 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-common.h +++ b/drivers/net/ethernet/amd/xgbe/xgbe-common.h @@ -1331,6 +1331,10 @@ #define MDIO_VEND2_PMA_CDR_CONTROL 0x8056 #endif +#ifndef MDIO_VEND2_PMA_MISC_CTRL0 +#define MDIO_VEND2_PMA_MISC_CTRL0 0x8090 +#endif + #ifndef MDIO_CTRL1_SPEED1G #define MDIO_CTRL1_SPEED1G (MDIO_CTRL1_SPEED10G & ~BMCR_SPEED100) #endif @@ -1389,6 +1393,10 @@ #define XGBE_PMA_RX_RST_0_RESET_ON 0x10 #define XGBE_PMA_RX_RST_0_RESET_OFF 0x00 +#define XGBE_PMA_PLL_CTRL_MASK BIT(15) +#define XGBE_PMA_PLL_CTRL_ENABLE BIT(15) +#define XGBE_PMA_PLL_CTRL_DISABLE 0x0000 + /* Bit setting and getting macros * The get macro will extract the current bit field value from within * the variable diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c index d6f6afb67bcc6..0b325ae875b52 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c @@ -1972,12 +1972,26 @@ static void xgbe_phy_rx_reset(struct xgbe_prv_data *pdata) } } +static void xgbe_phy_pll_ctrl(struct xgbe_prv_data *pdata, bool enable) +{ + XMDIO_WRITE_BITS(pdata, MDIO_MMD_PMAPMD, MDIO_VEND2_PMA_MISC_CTRL0, + XGBE_PMA_PLL_CTRL_MASK, + enable ? XGBE_PMA_PLL_CTRL_ENABLE + : XGBE_PMA_PLL_CTRL_DISABLE); + + /* Wait for command to complete */ + usleep_range(100, 200); +} + static void xgbe_phy_perform_ratechange(struct xgbe_prv_data *pdata, unsigned int cmd, unsigned int sub_cmd) { unsigned int s0 = 0; unsigned int wait; + /* Disable PLL re-initialization during FW command processing */ + xgbe_phy_pll_ctrl(pdata, false); + /* Log if a previous command did not complete */ if (XP_IOREAD_BITS(pdata, XP_DRIVER_INT_RO, STATUS)) { netif_dbg(pdata, link, pdata->netdev, @@ -1998,7 +2012,7 @@ static void xgbe_phy_perform_ratechange(struct xgbe_prv_data *pdata, wait = XGBE_RATECHANGE_COUNT; while (wait--) { if (!XP_IOREAD_BITS(pdata, XP_DRIVER_INT_RO, STATUS)) - return; + goto reenable_pll; usleep_range(1000, 2000); } @@ -2008,6 +2022,10 @@ static void xgbe_phy_perform_ratechange(struct xgbe_prv_data *pdata, /* Reset on error */ xgbe_phy_rx_reset(pdata); + +reenable_pll: + /* Enable PLL re-initialization */ + xgbe_phy_pll_ctrl(pdata, true); } static void xgbe_phy_rrc(struct xgbe_prv_data *pdata) From 54b1fe1927d1213b372e3072e5d6f1db90ea7ff5 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 28 Oct 2021 15:55:34 +0100 Subject: [PATCH 1575/5344] net: phylink: avoid mvneta warning when setting pause parameters [ Upstream commit fd8d9731bcdfb22d28e45bce789bcb211c868c78 ] mvneta does not support asymetric pause modes, and it flags this by the lack of AsymPause in the supported field. When setting pause modes, we check that pause->rx_pause == pause->tx_pause, but only when pause autoneg is enabled. When pause autoneg is disabled, we still allow pause->rx_pause != pause->tx_pause, which is incorrect when the MAC does not support asymetric pause, and causes mvneta to issue a warning. Fix this by removing the test for pause->autoneg, so we always check that pause->rx_pause == pause->tx_pause for network devices that do not support AsymPause. Fixes: 9525ae83959b ("phylink: add phylink infrastructure") Signed-off-by: Russell King (Oracle) Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/phy/phylink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index bf5bbb565cf5e..7be43a1eaefda 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -1331,7 +1331,7 @@ int phylink_ethtool_set_pauseparam(struct phylink *pl, return -EOPNOTSUPP; if (!phylink_test(pl->supported, Asym_Pause) && - !pause->autoneg && pause->rx_pause != pause->tx_pause) + pause->rx_pause != pause->tx_pause) return -EINVAL; config->pause &= ~(MLO_PAUSE_AN | MLO_PAUSE_TXRX_MASK); From e8a7fdfc21ccc333896fba9c8a1df66588b34303 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Tue, 26 Oct 2021 16:34:09 +0200 Subject: [PATCH 1576/5344] selftests/bpf: Fix fclose/pclose mismatch in test_progs [ Upstream commit f48ad69097fe79d1de13c4d8fef556d4c11c5e68 ] Make sure to use pclose() to properly close the pipe opened by popen(). Fixes: 81f77fd0deeb ("bpf: add selftest for stackmap with BPF_F_STACK_BUILD_ID") Signed-off-by: Andrea Righi Signed-off-by: Daniel Borkmann Reviewed-by: Shuah Khan Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20211026143409.42666-1-andrea.righi@canonical.com Signed-off-by: Sasha Levin --- tools/testing/selftests/bpf/test_progs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 48bbe8e0ce48d..4369bc46bf9c2 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -289,7 +289,7 @@ int extract_build_id(char *build_id, size_t size) if (getline(&line, &len, fp) == -1) goto err; - fclose(fp); + pclose(fp); if (len > size) len = size; @@ -298,7 +298,7 @@ int extract_build_id(char *build_id, size_t size) free(line); return 0; err: - fclose(fp); + pclose(fp); return -1; } From 708ca18d3493c945fb37b8ca90eb34dc0347fa3b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 29 Oct 2021 08:51:34 -0700 Subject: [PATCH 1577/5344] udp6: allow SO_MARK ctrl msg to affect routing [ Upstream commit 42dcfd850e514b229d616a53dec06d0f2533217c ] Commit c6af0c227a22 ("ip: support SO_MARK cmsg") added propagation of SO_MARK from cmsg to skb->mark. For IPv4 and raw sockets the mark also affects route lookup, but in case of IPv6 the flow info is initialized before cmsg is parsed. Fixes: c6af0c227a22 ("ip: support SO_MARK cmsg") Reported-and-tested-by: Xintong Hu Signed-off-by: Jakub Kicinski Reviewed-by: David Ahern Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/ipv6/udp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 5248fd11e619c..8c15a98096703 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1366,7 +1366,6 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (!fl6.flowi6_oif) fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex; - fl6.flowi6_mark = ipc6.sockc.mark; fl6.flowi6_uid = sk->sk_uid; if (msg->msg_controllen) { @@ -1402,6 +1401,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc6.opt = opt; fl6.flowi6_proto = sk->sk_protocol; + fl6.flowi6_mark = ipc6.sockc.mark; fl6.daddr = *daddr; if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr)) fl6.saddr = np->saddr; From 8e24a47f8133487f01d623e2c1d6ae507a5ccbd1 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Fri, 29 Oct 2021 15:03:14 -0700 Subject: [PATCH 1578/5344] ibmvnic: don't stop queue in xmit [ Upstream commit 8878e46fcfd46b19964bd90e13b25dd94cbfc9be ] If adapter's resetting bit is on, discard the packet but don't stop the transmit queue - instead leave that to the reset code. With this change, it is possible that we may get several calls to ibmvnic_xmit() that simply discard packets and return. But if we stop the queue here, we might end up doing so just after __ibmvnic_open() started the queues (during a hard/soft reset) and before the ->resetting bit was cleared. If that happens, there will be no one to restart queue and transmissions will be blocked indefinitely. This can cause a TIMEOUT reset and with auto priority failover enabled, an unnecessary FAILOVER reset to less favored backing device and then a FAILOVER back to the most favored backing device. If we hit the window repeatedly, we can get stuck in a loop of TIMEOUT, FAILOVER, FAILOVER resets leaving the adapter unusable for extended periods of time. Fixes: 7f5b030830fe ("ibmvnic: Free skb's in cases of failure in transmit") Reported-by: Abdul Haleem Reported-by: Vaishnavi Bhat Signed-off-by: Sukadev Bhattiprolu Reviewed-by: Dany Madden Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/ibm/ibmvnic.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index cfe7229593ead..059eaa13e2c6d 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1462,8 +1462,6 @@ static netdev_tx_t ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) netdev_tx_t ret = NETDEV_TX_OK; if (test_bit(0, &adapter->resetting)) { - if (!netif_subqueue_stopped(netdev, skb)) - netif_stop_subqueue(netdev, queue_num); dev_kfree_skb_any(skb); tx_send_failed++; From 8593593f3f186aca5cff21a20798a53bac86b22b Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Fri, 29 Oct 2021 15:03:15 -0700 Subject: [PATCH 1579/5344] ibmvnic: Process crqs after enabling interrupts [ Upstream commit 6e20d00158f31f7631d68b86996b7e951c4451c8 ] Soon after registering a CRQ it is possible that we get a fail over or maybe a CRQ_INIT from the VIOS while interrupts were disabled. Look for any such CRQs after enabling interrupts. Otherwise we can intermittently fail to bring up ibmvnic adapters during boot, specially in kexec/kdump kernels. Fixes: 032c5e82847a ("Driver for IBM System i/p VNIC protocol") Reported-by: Vaishnavi Bhat Signed-off-by: Sukadev Bhattiprolu Reviewed-by: Dany Madden Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/ibm/ibmvnic.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 059eaa13e2c6d..9adfc0a7ab823 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -4934,6 +4934,9 @@ static int init_crq_queue(struct ibmvnic_adapter *adapter) crq->cur = 0; spin_lock_init(&crq->lock); + /* process any CRQs that were queued before we enabled interrupts */ + tasklet_schedule(&adapter->tasklet); + return retrc; req_irq_failed: From 7c5371a256f03a2a8532d9f552d77ed6f19d82b1 Mon Sep 17 00:00:00 2001 From: Junji Wei Date: Tue, 31 Aug 2021 16:32:23 +0800 Subject: [PATCH 1580/5344] RDMA/rxe: Fix wrong port_cap_flags [ Upstream commit dcd3f985b20ffcc375f82ca0ca9f241c7025eb5e ] The port->attr.port_cap_flags should be set to enum ib_port_capability_mask_bits in ib_mad.h, not RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP. Fixes: 8700e3e7c485 ("Soft RoCE driver") Link: https://lore.kernel.org/r/20210831083223.65797-1-weijunji@bytedance.com Signed-off-by: Junji Wei Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe Signed-off-by: Sasha Levin --- drivers/infiniband/sw/rxe/rxe_param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h index fe52073867006..8d3f6d93dfb8d 100644 --- a/drivers/infiniband/sw/rxe/rxe_param.h +++ b/drivers/infiniband/sw/rxe/rxe_param.h @@ -140,7 +140,7 @@ enum rxe_device_param { /* default/initial rxe port parameters */ enum rxe_port_param { RXE_PORT_GID_TBL_LEN = 1024, - RXE_PORT_PORT_CAP_FLAGS = RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP, + RXE_PORT_PORT_CAP_FLAGS = IB_PORT_CM_SUP, RXE_PORT_MAX_MSG_SZ = 0x800000, RXE_PORT_BAD_PKEY_CNTR = 0, RXE_PORT_QKEY_VIOL_CNTR = 0, From 19649354e342cf5f1a0819e4afcdaef73629694d Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 23 Apr 2021 09:02:26 +0200 Subject: [PATCH 1581/5344] clk: mvebu: ap-cpu-clk: Fix a memory leak in error handling paths [ Upstream commit af9617b419f77cf0b99702a7b2b0519da0d27715 ] If we exit the for_each_of_cpu_node loop early, the reference on the current node must be decremented, otherwise there is a leak. Fixes: f756e362d938 ("clk: mvebu: add CPU clock driver for Armada 7K/8K") Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/545df946044fc1fc05a4217cdf0054be7a79e49e.1619161112.git.christophe.jaillet@wanadoo.fr Reviewed-by: Dan Carpenter Signed-off-by: Stephen Boyd Signed-off-by: Sasha Levin --- drivers/clk/mvebu/ap-cpu-clk.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/clk/mvebu/ap-cpu-clk.c b/drivers/clk/mvebu/ap-cpu-clk.c index af5e5acad3706..bde4a7d6a1d33 100644 --- a/drivers/clk/mvebu/ap-cpu-clk.c +++ b/drivers/clk/mvebu/ap-cpu-clk.c @@ -256,12 +256,15 @@ static int ap_cpu_clock_probe(struct platform_device *pdev) int cpu, err; err = of_property_read_u32(dn, "reg", &cpu); - if (WARN_ON(err)) + if (WARN_ON(err)) { + of_node_put(dn); return err; + } /* If cpu2 or cpu3 is enabled */ if (cpu & APN806_CLUSTER_NUM_MASK) { nclusters = 2; + of_node_put(dn); break; } } @@ -288,8 +291,10 @@ static int ap_cpu_clock_probe(struct platform_device *pdev) int cpu, err; err = of_property_read_u32(dn, "reg", &cpu); - if (WARN_ON(err)) + if (WARN_ON(err)) { + of_node_put(dn); return err; + } cluster_index = cpu & APN806_CLUSTER_NUM_MASK; cluster_index >>= APN806_CLUSTER_NUM_OFFSET; @@ -301,6 +306,7 @@ static int ap_cpu_clock_probe(struct platform_device *pdev) parent = of_clk_get(np, cluster_index); if (IS_ERR(parent)) { dev_err(dev, "Could not get the clock parent\n"); + of_node_put(dn); return -EINVAL; } parent_name = __clk_get_name(parent); @@ -319,8 +325,10 @@ static int ap_cpu_clock_probe(struct platform_device *pdev) init.parent_names = &parent_name; ret = devm_clk_hw_register(dev, &ap_cpu_clk[cluster_index].hw); - if (ret) + if (ret) { + of_node_put(dn); return ret; + } ap_cpu_data->hws[cluster_index] = &ap_cpu_clk[cluster_index].hw; } From cb238b88f6fba32b87d59b8a3355f0da8f363222 Mon Sep 17 00:00:00 2001 From: Jackie Liu Date: Wed, 1 Sep 2021 20:35:57 +0800 Subject: [PATCH 1582/5344] ARM: s3c: irq-s3c24xx: Fix return value check for s3c24xx_init_intc() [ Upstream commit 2aa717473ce96c93ae43a5dc8c23cedc8ce7dd9f ] The s3c24xx_init_intc() returns an error pointer upon failure, not NULL. let's add an error pointer check in s3c24xx_handle_irq. s3c_intc[0] is not NULL or ERR, we can simplify the code. Fixes: 1f629b7a3ced ("ARM: S3C24XX: transform irq handling into a declarative form") Signed-off-by: Jackie Liu Link: https://lore.kernel.org/r/20210901123557.1043953-1-liu.yun@linux.dev Signed-off-by: Krzysztof Kozlowski Signed-off-by: Sasha Levin --- drivers/irqchip/irq-s3c24xx.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/drivers/irqchip/irq-s3c24xx.c b/drivers/irqchip/irq-s3c24xx.c index d2031fecc3861..5e97ae54782d6 100644 --- a/drivers/irqchip/irq-s3c24xx.c +++ b/drivers/irqchip/irq-s3c24xx.c @@ -359,11 +359,25 @@ static inline int s3c24xx_handle_intc(struct s3c_irq_intc *intc, asmlinkage void __exception_irq_entry s3c24xx_handle_irq(struct pt_regs *regs) { do { - if (likely(s3c_intc[0])) - if (s3c24xx_handle_intc(s3c_intc[0], regs, 0)) - continue; + /* + * For platform based machines, neither ERR nor NULL can happen here. + * The s3c24xx_handle_irq() will be set as IRQ handler iff this succeeds: + * + * s3c_intc[0] = s3c24xx_init_intc() + * + * If this fails, the next calls to s3c24xx_init_intc() won't be executed. + * + * For DT machine, s3c_init_intc_of() could set the IRQ handler without + * setting s3c_intc[0] only if it was called with num_ctrl=0. There is no + * such code path, so again the s3c_intc[0] will have a valid pointer if + * set_handle_irq() is called. + * + * Therefore in s3c24xx_handle_irq(), the s3c_intc[0] is always something. + */ + if (s3c24xx_handle_intc(s3c_intc[0], regs, 0)) + continue; - if (s3c_intc[2]) + if (!IS_ERR_OR_NULL(s3c_intc[2])) if (s3c24xx_handle_intc(s3c_intc[2], regs, 64)) continue; From 67044737cb3cd892143f0794b10387eb74efbd8f Mon Sep 17 00:00:00 2001 From: Alex Bee Date: Wed, 23 Jun 2021 13:59:26 +0200 Subject: [PATCH 1583/5344] arm64: dts: rockchip: Fix GPU register width for RK3328 [ Upstream commit 932b4610f55b49f3a158b0db451137bab7ed0e1f ] As can be seen in RK3328's TRM the register range for the GPU is 0xff300000 to 0xff330000. It would (and does in vendor kernel) overlap with the registers of the HEVC encoder (node/driver do not exist yet in upstream kernel). See already existing h265e_mmu node. Fixes: 752fbc0c8da7 ("arm64: dts: rockchip: add rk3328 mali gpu node") Signed-off-by: Alex Bee Link: https://lore.kernel.org/r/20210623115926.164861-1-knaerzche@gmail.com Signed-off-by: Heiko Stuebner Signed-off-by: Sasha Levin --- arch/arm64/boot/dts/rockchip/rk3328.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3328.dtsi b/arch/arm64/boot/dts/rockchip/rk3328.dtsi index 44ad744c4710d..6ddb6b8c1fad5 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3328.dtsi @@ -555,7 +555,7 @@ gpu: gpu@ff300000 { compatible = "rockchip,rk3328-mali", "arm,mali-450"; - reg = <0x0 0xff300000 0x0 0x40000>; + reg = <0x0 0xff300000 0x0 0x30000>; interrupts = , , , From bb9b8abe4db740e9ab52364617d54fea3bd53445 Mon Sep 17 00:00:00 2001 From: Marijn Suijten Date: Mon, 30 Aug 2021 19:57:39 +0200 Subject: [PATCH 1584/5344] ARM: dts: qcom: msm8974: Add xo_board reference clock to DSI0 PHY [ Upstream commit 8ccecf6c710b8c048eecc65709640642e5357d6e ] According to YAML validation, and for a future patchset putting this xo_board reference clock to use as VCO reference parent, add the missing clock to dsi_phy0. Fixes: 5a9fc531f6ec ("ARM: dts: msm8974: add display support") Signed-off-by: Marijn Suijten Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20210830175739.143401-1-marijn.suijten@somainline.org Signed-off-by: Sasha Levin --- arch/arm/boot/dts/qcom-msm8974.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/qcom-msm8974.dtsi b/arch/arm/boot/dts/qcom-msm8974.dtsi index 369e58f64145d..9de4f17955e31 100644 --- a/arch/arm/boot/dts/qcom-msm8974.dtsi +++ b/arch/arm/boot/dts/qcom-msm8974.dtsi @@ -1213,8 +1213,8 @@ #phy-cells = <0>; qcom,dsi-phy-index = <0>; - clocks = <&mmcc MDSS_AHB_CLK>; - clock-names = "iface"; + clocks = <&mmcc MDSS_AHB_CLK>, <&xo_board>; + clock-names = "iface", "ref"; }; }; }; From 27772b220ddf78de3b1224d2a6740ef93b7f2496 Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Wed, 15 Sep 2021 05:32:38 -0700 Subject: [PATCH 1585/5344] RDMA/bnxt_re: Fix query SRQ failure [ Upstream commit 598d16fa1bf93431ad35bbab3ed1affe4fb7b562 ] Fill the missing parameters for the FW command while querying SRQ. Fixes: 37cb11acf1f7 ("RDMA/bnxt_re: Add SRQ support for Broadcom adapters") Link: https://lore.kernel.org/r/1631709163-2287-8-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Selvin Xavier Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe Signed-off-by: Sasha Levin --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 4d07d22bfa7b1..5fc5ab7813c0f 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -642,12 +642,13 @@ int bnxt_qplib_query_srq(struct bnxt_qplib_res *res, int rc = 0; RCFW_CMD_PREP(req, QUERY_SRQ, cmd_flags); - req.srq_cid = cpu_to_le32(srq->id); /* Configure the request */ sbuf = bnxt_qplib_rcfw_alloc_sbuf(rcfw, sizeof(*sb)); if (!sbuf) return -ENOMEM; + req.resp_size = sizeof(*sb) / BNXT_QPLIB_CMDQE_UNITS; + req.srq_cid = cpu_to_le32(srq->id); sb = sbuf->sb; rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp, (void *)sbuf, 0); From 1b6a1b958dc187040e3b6798c96672da999248d4 Mon Sep 17 00:00:00 2001 From: Anand Moon Date: Sun, 19 Sep 2021 20:29:09 +0000 Subject: [PATCH 1586/5344] arm64: dts: meson-g12a: Fix the pwm regulator supply properties [ Upstream commit 085675117ecf5e02c4220698fd549024ec64ad2c ] After enabling CONFIG_REGULATOR_DEBUG=y we observe below debug logs. Changes help link VDDCPU pwm regulator to 12V regulator supply instead of dummy regulator. [ 11.602281] pwm-regulator regulator-vddcpu: Looking up pwm-supply property in node /regulator-vddcpu failed [ 11.602344] VDDCPU: supplied by regulator-dummy [ 11.602365] regulator-dummy: could not add device link regulator.11: -ENOENT [ 11.602548] VDDCPU: 721 <--> 1022 mV at 1022 mV, enabled Fixes: e9bc0765cc12 ("arm64: dts: meson-g12a: enable DVFS on G12A boards") Cc: Neil Armstrong Signed-off-by: Anand Moon Reviewed-by: Martin Blumenstingl Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20210919202918.3556-2-linux.amoon@gmail.com Signed-off-by: Sasha Levin --- arch/arm64/boot/dts/amlogic/meson-g12a-sei510.dts | 2 +- arch/arm64/boot/dts/amlogic/meson-g12a-u200.dts | 2 +- arch/arm64/boot/dts/amlogic/meson-g12a-x96-max.dts | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/boot/dts/amlogic/meson-g12a-sei510.dts b/arch/arm64/boot/dts/amlogic/meson-g12a-sei510.dts index c9fa23a565624..b8d9e92197ac8 100644 --- a/arch/arm64/boot/dts/amlogic/meson-g12a-sei510.dts +++ b/arch/arm64/boot/dts/amlogic/meson-g12a-sei510.dts @@ -139,7 +139,7 @@ regulator-min-microvolt = <721000>; regulator-max-microvolt = <1022000>; - vin-supply = <&dc_in>; + pwm-supply = <&dc_in>; pwms = <&pwm_AO_cd 1 1250 0>; pwm-dutycycle-range = <100 0>; diff --git a/arch/arm64/boot/dts/amlogic/meson-g12a-u200.dts b/arch/arm64/boot/dts/amlogic/meson-g12a-u200.dts index 2a324f0136e3f..02ec6eda03b1c 100644 --- a/arch/arm64/boot/dts/amlogic/meson-g12a-u200.dts +++ b/arch/arm64/boot/dts/amlogic/meson-g12a-u200.dts @@ -139,7 +139,7 @@ regulator-min-microvolt = <721000>; regulator-max-microvolt = <1022000>; - vin-supply = <&main_12v>; + pwm-supply = <&main_12v>; pwms = <&pwm_AO_cd 1 1250 0>; pwm-dutycycle-range = <100 0>; diff --git a/arch/arm64/boot/dts/amlogic/meson-g12a-x96-max.dts b/arch/arm64/boot/dts/amlogic/meson-g12a-x96-max.dts index c48125bf9d1e3..5209c44fda01a 100644 --- a/arch/arm64/boot/dts/amlogic/meson-g12a-x96-max.dts +++ b/arch/arm64/boot/dts/amlogic/meson-g12a-x96-max.dts @@ -139,7 +139,7 @@ regulator-min-microvolt = <721000>; regulator-max-microvolt = <1022000>; - vin-supply = <&dc_in>; + pwm-supply = <&dc_in>; pwms = <&pwm_AO_cd 1 1250 0>; pwm-dutycycle-range = <100 0>; From bd72d6ebe77b3737824a9735035131ad36c30eb9 Mon Sep 17 00:00:00 2001 From: Peter Rosin Date: Mon, 20 Sep 2021 22:37:38 +0200 Subject: [PATCH 1587/5344] ARM: dts: at91: tse850: the emac<->phy interface is rmii [ Upstream commit dcdbc335a91a26e022a803e1a6b837266989c032 ] This went unnoticed until commit 7897b071ac3b ("net: macb: convert to phylink") which tickled the problem. The sama5d3 emac has never been capable of rgmii, and it all just happened to work before that commit. Fixes: 21dd0ece34c2 ("ARM: dts: at91: add devicetree for the Axentia TSE-850") Signed-off-by: Peter Rosin Signed-off-by: Nicolas Ferre Link: https://lore.kernel.org/r/ea781f5e-422f-6cbf-3cf4-d5a7bac9392d@axentia.se Signed-off-by: Sasha Levin --- arch/arm/boot/dts/at91-tse850-3.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/at91-tse850-3.dts b/arch/arm/boot/dts/at91-tse850-3.dts index 3ca97b47c69ce..7e5c598e7e68f 100644 --- a/arch/arm/boot/dts/at91-tse850-3.dts +++ b/arch/arm/boot/dts/at91-tse850-3.dts @@ -262,7 +262,7 @@ &macb1 { status = "okay"; - phy-mode = "rgmii"; + phy-mode = "rmii"; #address-cells = <1>; #size-cells = <0>; From a9fae3ac496f1465b2be638bd2dd56cf0c80b9ce Mon Sep 17 00:00:00 2001 From: Tong Zhang Date: Mon, 6 Sep 2021 21:07:02 -0700 Subject: [PATCH 1588/5344] scsi: dc395: Fix error case unwinding [ Upstream commit cbd9a3347c757383f3d2b50cf7cfd03eb479c481 ] dc395x_init_one()->adapter_init() might fail. In this case, the acb is already cleaned up by adapter_init(), no need to do that in adapter_uninit(acb) again. [ 1.252251] dc395x: adapter init failed [ 1.254900] RIP: 0010:adapter_uninit+0x94/0x170 [dc395x] [ 1.260307] Call Trace: [ 1.260442] dc395x_init_one.cold+0x72a/0x9bb [dc395x] Link: https://lore.kernel.org/r/20210907040702.1846409-1-ztong0001@gmail.com Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reviewed-by: Finn Thain Signed-off-by: Tong Zhang Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/dc395x.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/dc395x.c b/drivers/scsi/dc395x.c index 13fbb2eab842e..5fb06930912a0 100644 --- a/drivers/scsi/dc395x.c +++ b/drivers/scsi/dc395x.c @@ -4698,6 +4698,7 @@ static int dc395x_init_one(struct pci_dev *dev, const struct pci_device_id *id) /* initialise the adapter and everything we need */ if (adapter_init(acb, io_port_base, io_port_len, irq)) { dprintkl(KERN_INFO, "adapter init failed\n"); + acb = NULL; goto fail; } From b0adfec5536dc1187b28ba3e2e28e96b9889026b Mon Sep 17 00:00:00 2001 From: Jackie Liu Date: Mon, 13 Sep 2021 14:19:08 +0800 Subject: [PATCH 1589/5344] MIPS: loongson64: make CPU_LOONGSON64 depends on MIPS_FP_SUPPORT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 7f3b3c2bfa9c93ab9b5595543496f570983dc330 ] mach/loongson64 fails to build when the FPU support is disabled: arch/mips/loongson64/cop2-ex.c:45:15: error: implicit declaration of function ‘__is_fpu_owner’; did you mean ‘is_fpu_owner’? [-Werror=implicit-function-declaration] arch/mips/loongson64/cop2-ex.c:98:30: error: ‘struct thread_struct’ has no member named ‘fpu’ arch/mips/loongson64/cop2-ex.c:99:30: error: ‘struct thread_struct’ has no member named ‘fpu’ arch/mips/loongson64/cop2-ex.c:131:43: error: ‘struct thread_struct’ has no member named ‘fpu’ arch/mips/loongson64/cop2-ex.c:137:38: error: ‘struct thread_struct’ has no member named ‘fpu’ arch/mips/loongson64/cop2-ex.c:203:30: error: ‘struct thread_struct’ has no member named ‘fpu’ arch/mips/loongson64/cop2-ex.c:219:30: error: ‘struct thread_struct’ has no member named ‘fpu’ arch/mips/loongson64/cop2-ex.c:283:38: error: ‘struct thread_struct’ has no member named ‘fpu’ arch/mips/loongson64/cop2-ex.c:301:38: error: ‘struct thread_struct’ has no member named ‘fpu’ Fixes: ef2f826c8f2f ("MIPS: Loongson-3: Enable the COP2 usage") Suggested-by: Huacai Chen Reviewed-by: Huacai Chen Reported-by: k2ci robot Signed-off-by: Jackie Liu Signed-off-by: Thomas Bogendoerfer Signed-off-by: Sasha Levin --- arch/mips/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 2bfef67d52c63..041d34975ea2c 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -1393,6 +1393,7 @@ config CPU_LOONGSON3 select WEAK_REORDERING_BEYOND_LLSC select MIPS_PGD_C0_CONTEXT select MIPS_L1_CACHE_SHIFT_6 + select MIPS_FP_SUPPORT select GPIOLIB select SWIOTLB help From 2457915bcca2d6210b0bf41045a056af3e81964f Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Sat, 4 Sep 2021 10:37:41 +0800 Subject: [PATCH 1590/5344] JFS: fix memleak in jfs_mount [ Upstream commit c48a14dca2cb57527dde6b960adbe69953935f10 ] In jfs_mount, when diMount(ipaimap2) fails, it goes to errout35. However, the following code does not free ipaimap2 allocated by diReadSpecial. Fix this by refactoring the error handling code of jfs_mount. To be specific, modify the lable name and free ipaimap2 when the above error ocurrs. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Dongliang Mu Signed-off-by: Dave Kleikamp Signed-off-by: Sasha Levin --- fs/jfs/jfs_mount.c | 51 ++++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 29 deletions(-) diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c index 616de103dccc5..d41733540df91 100644 --- a/fs/jfs/jfs_mount.c +++ b/fs/jfs/jfs_mount.c @@ -80,14 +80,14 @@ int jfs_mount(struct super_block *sb) * (initialize mount inode from the superblock) */ if ((rc = chkSuper(sb))) { - goto errout20; + goto out; } ipaimap = diReadSpecial(sb, AGGREGATE_I, 0); if (ipaimap == NULL) { jfs_err("jfs_mount: Failed to read AGGREGATE_I"); rc = -EIO; - goto errout20; + goto out; } sbi->ipaimap = ipaimap; @@ -98,7 +98,7 @@ int jfs_mount(struct super_block *sb) */ if ((rc = diMount(ipaimap))) { jfs_err("jfs_mount: diMount(ipaimap) failed w/rc = %d", rc); - goto errout21; + goto err_ipaimap; } /* @@ -107,7 +107,7 @@ int jfs_mount(struct super_block *sb) ipbmap = diReadSpecial(sb, BMAP_I, 0); if (ipbmap == NULL) { rc = -EIO; - goto errout22; + goto err_umount_ipaimap; } jfs_info("jfs_mount: ipbmap:0x%p", ipbmap); @@ -119,7 +119,7 @@ int jfs_mount(struct super_block *sb) */ if ((rc = dbMount(ipbmap))) { jfs_err("jfs_mount: dbMount failed w/rc = %d", rc); - goto errout22; + goto err_ipbmap; } /* @@ -138,7 +138,7 @@ int jfs_mount(struct super_block *sb) if (!ipaimap2) { jfs_err("jfs_mount: Failed to read AGGREGATE_I"); rc = -EIO; - goto errout35; + goto err_umount_ipbmap; } sbi->ipaimap2 = ipaimap2; @@ -150,7 +150,7 @@ int jfs_mount(struct super_block *sb) if ((rc = diMount(ipaimap2))) { jfs_err("jfs_mount: diMount(ipaimap2) failed, rc = %d", rc); - goto errout35; + goto err_ipaimap2; } } else /* Secondary aggregate inode table is not valid */ @@ -167,7 +167,7 @@ int jfs_mount(struct super_block *sb) jfs_err("jfs_mount: Failed to read FILESYSTEM_I"); /* open fileset secondary inode allocation map */ rc = -EIO; - goto errout40; + goto err_umount_ipaimap2; } jfs_info("jfs_mount: ipimap:0x%p", ipimap); @@ -177,41 +177,34 @@ int jfs_mount(struct super_block *sb) /* initialize fileset inode allocation map */ if ((rc = diMount(ipimap))) { jfs_err("jfs_mount: diMount failed w/rc = %d", rc); - goto errout41; + goto err_ipimap; } - goto out; + return rc; /* * unwind on error */ - errout41: /* close fileset inode allocation map inode */ +err_ipimap: + /* close fileset inode allocation map inode */ diFreeSpecial(ipimap); - - errout40: /* fileset closed */ - +err_umount_ipaimap2: /* close secondary aggregate inode allocation map */ - if (ipaimap2) { + if (ipaimap2) diUnmount(ipaimap2, 1); +err_ipaimap2: + /* close aggregate inodes */ + if (ipaimap2) diFreeSpecial(ipaimap2); - } - - errout35: - - /* close aggregate block allocation map */ +err_umount_ipbmap: /* close aggregate block allocation map */ dbUnmount(ipbmap, 1); +err_ipbmap: /* close aggregate inodes */ diFreeSpecial(ipbmap); - - errout22: /* close aggregate inode allocation map */ - +err_umount_ipaimap: /* close aggregate inode allocation map */ diUnmount(ipaimap, 1); - - errout21: /* close aggregate inodes */ +err_ipaimap: /* close aggregate inodes */ diFreeSpecial(ipaimap); - errout20: /* aggregate closed */ - - out: - +out: if (rc) jfs_err("Mount JFS Failure: %d", rc); From 5d4d11b440e277efc306c2899dea34122fdc6dbb Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 29 Sep 2021 09:29:33 +0200 Subject: [PATCH 1591/5344] ALSA: hda: Reduce udelay() at SKL+ position reporting [ Upstream commit 46243b85b0ec5d2cee7545e5ce18c015ce91957e ] The position reporting on Intel Skylake and later chips via azx_get_pos_skl() contains a udelay(20) call for the capture streams. A call for this alone doesn't sound too harmful. However, as the pointer PCM ops is one of the hottest path in the PCM operations -- especially for the timer-scheduled operations like PulseAudio -- such a delay hogs CPU usage significantly in the total performance. The code there was taken from the original code in ASoC SST Skylake driver blindly. The udelay() is a workaround for the case where the reported position is behind the period boundary at the timing triggered from interrupts; applications often expect that the full data is available for the whole period when returned (and also that's the definition of the ALSA PCM period). OTOH, HD-audio (legacy) driver has already some workarounds for the delayed position reporting due to its relatively large FIFO, such as the BDL position adjustment and the delayed period-elapsed call in the work. That said, the udelay() is almost superfluous for HD-audio driver unlike SST, and we can drop the udelay(). Though, the current code doesn't guarantee the full period readiness as mentioned in the above, but rather it checks the wallclock and detects the unexpected jump. That's one missing piece, and the drop of udelay() needs a bit more sanity checks for the delayed handling. This patch implements those: the drop of udelay() call in azx_get_pos_skl() and the more proper check of hwptr in azx_position_ok(). The latter change is applied only for the case where the stream is running in the normal mode without no_period_wakeup flag. When no_period_wakeup is set, it essentially ignores the period handling and rather concentrates only on the current position; which implies that we don't need to care about the period boundary at all. Fixes: f87e7f25893d ("ALSA: hda - Improved position reporting on SKL+") Reported-by: Jens Axboe Reviewed-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/20210929072934.6809-2-tiwai@suse.de Signed-off-by: Takashi Iwai Signed-off-by: Sasha Levin --- sound/pci/hda/hda_intel.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index ebb1ee69dd0c3..95d472d433e70 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -671,13 +671,17 @@ static int azx_position_check(struct azx *chip, struct azx_dev *azx_dev) * the update-IRQ timing. The IRQ is issued before actually the * data is processed. So, we need to process it afterwords in a * workqueue. + * + * Returns 1 if OK to proceed, 0 for delay handling, -1 for skipping update */ static int azx_position_ok(struct azx *chip, struct azx_dev *azx_dev) { struct snd_pcm_substream *substream = azx_dev->core.substream; + struct snd_pcm_runtime *runtime = substream->runtime; int stream = substream->stream; u32 wallclk; unsigned int pos; + snd_pcm_uframes_t hwptr, target; wallclk = azx_readl(chip, WALLCLK) - azx_dev->core.start_wallclk; if (wallclk < (azx_dev->core.period_wallclk * 2) / 3) @@ -714,6 +718,24 @@ static int azx_position_ok(struct azx *chip, struct azx_dev *azx_dev) /* NG - it's below the first next period boundary */ return chip->bdl_pos_adj ? 0 : -1; azx_dev->core.start_wallclk += wallclk; + + if (azx_dev->core.no_period_wakeup) + return 1; /* OK, no need to check period boundary */ + + if (runtime->hw_ptr_base != runtime->hw_ptr_interrupt) + return 1; /* OK, already in hwptr updating process */ + + /* check whether the period gets really elapsed */ + pos = bytes_to_frames(runtime, pos); + hwptr = runtime->hw_ptr_base + pos; + if (hwptr < runtime->status->hw_ptr) + hwptr += runtime->buffer_size; + target = runtime->hw_ptr_interrupt + runtime->period_size; + if (hwptr < target) { + /* too early wakeup, process it later */ + return chip->bdl_pos_adj ? 0 : -1; + } + return 1; /* OK, it's fine */ } @@ -907,11 +929,7 @@ static unsigned int azx_get_pos_skl(struct azx *chip, struct azx_dev *azx_dev) if (azx_dev->core.substream->stream == SNDRV_PCM_STREAM_PLAYBACK) return azx_skl_get_dpib_pos(chip, azx_dev); - /* For capture, we need to read posbuf, but it requires a delay - * for the possible boundary overlap; the read of DPIB fetches the - * actual posbuf - */ - udelay(20); + /* read of DPIB fetches the actual posbuf */ azx_skl_get_dpib_pos(chip, azx_dev); return azx_get_pos_posbuf(chip, azx_dev); } From 99a1c2539b6d2acd7c8ed0dfc37921edd78bbc8b Mon Sep 17 00:00:00 2001 From: Andreas Kemnade Date: Fri, 1 Oct 2021 09:34:15 +0200 Subject: [PATCH 1592/5344] arm: dts: omap3-gta04a4: accelerometer irq fix [ Upstream commit 884ea75d79a36faf3731ad9d6b9c29f58697638d ] Fix typo in pinctrl. It did only work because the bootloader seems to have initialized it. Fixes: ee327111953b ("ARM: dts: omap3-gta04: Define and use bma180 irq pin") Signed-off-by: Andreas Kemnade Signed-off-by: Tony Lindgren Signed-off-by: Sasha Levin --- arch/arm/boot/dts/omap3-gta04.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/omap3-gta04.dtsi b/arch/arm/boot/dts/omap3-gta04.dtsi index b6ef1a7ac8a4e..186b2af7743e3 100644 --- a/arch/arm/boot/dts/omap3-gta04.dtsi +++ b/arch/arm/boot/dts/omap3-gta04.dtsi @@ -515,7 +515,7 @@ compatible = "bosch,bma180"; reg = <0x41>; pinctrl-names = "default"; - pintcrl-0 = <&bma180_pins>; + pinctrl-0 = <&bma180_pins>; interrupt-parent = <&gpio4>; interrupts = <19 IRQ_TYPE_LEVEL_HIGH>; /* GPIO_115 */ }; From 22c2eb4b59ede606e00b783c89ebb094d6d2e10b Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 27 Jun 2021 17:54:31 +0200 Subject: [PATCH 1593/5344] soc/tegra: Fix an error handling path in tegra_powergate_power_up() [ Upstream commit 986b5094708e508baa452a23ffe809870934a7df ] If an error occurs after a successful tegra_powergate_enable_clocks() call, it must be undone by a tegra_powergate_disable_clocks() call, as already done in the below and above error handling paths of this function. Update the 'goto' to branch at the correct place of the error handling path. Fixes: a38045121bf4 ("soc/tegra: pmc: Add generic PM domain support") Signed-off-by: Christophe JAILLET Reviewed-by: Jon Hunter Signed-off-by: Thierry Reding Signed-off-by: Sasha Levin --- drivers/soc/tegra/pmc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/soc/tegra/pmc.c b/drivers/soc/tegra/pmc.c index 0447afa970f5e..ab75f41e9c0c9 100644 --- a/drivers/soc/tegra/pmc.c +++ b/drivers/soc/tegra/pmc.c @@ -591,7 +591,7 @@ static int tegra_powergate_power_up(struct tegra_powergate *pg, err = reset_control_deassert(pg->reset); if (err) - goto powergate_off; + goto disable_clks; usleep_range(10, 20); From 3dea83917505bc7c1c22b82d038d20c2529e155f Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Sat, 25 Sep 2021 23:14:32 +0800 Subject: [PATCH 1594/5344] memory: fsl_ifc: fix leak of irq and nand_irq in fsl_ifc_ctrl_probe [ Upstream commit 4ed2f3545c2e5acfbccd7f85fea5b1a82e9862d7 ] The error handling code of fsl_ifc_ctrl_probe is problematic. When fsl_ifc_ctrl_init fails or request_irq of fsl_ifc_ctrl_dev->irq fails, it forgets to free the irq and nand_irq. Meanwhile, if request_irq of fsl_ifc_ctrl_dev->nand_irq fails, it will still free nand_irq even if the request_irq is not successful. Fix this by refactoring the error handling code. Fixes: d2ae2e20fbdd ("driver/memory:Move Freescale IFC driver to a common driver") Signed-off-by: Dongliang Mu Link: https://lore.kernel.org/r/20210925151434.8170-1-mudongliangabcd@gmail.com Signed-off-by: Krzysztof Kozlowski Signed-off-by: Sasha Levin --- drivers/memory/fsl_ifc.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/memory/fsl_ifc.c b/drivers/memory/fsl_ifc.c index 2790258346070..84fa32f288c80 100644 --- a/drivers/memory/fsl_ifc.c +++ b/drivers/memory/fsl_ifc.c @@ -263,7 +263,7 @@ static int fsl_ifc_ctrl_probe(struct platform_device *dev) ret = fsl_ifc_ctrl_init(fsl_ifc_ctrl_dev); if (ret < 0) - goto err; + goto err_unmap_nandirq; init_waitqueue_head(&fsl_ifc_ctrl_dev->nand_wait); @@ -272,7 +272,7 @@ static int fsl_ifc_ctrl_probe(struct platform_device *dev) if (ret != 0) { dev_err(&dev->dev, "failed to install irq (%d)\n", fsl_ifc_ctrl_dev->irq); - goto err_irq; + goto err_unmap_nandirq; } if (fsl_ifc_ctrl_dev->nand_irq) { @@ -281,17 +281,16 @@ static int fsl_ifc_ctrl_probe(struct platform_device *dev) if (ret != 0) { dev_err(&dev->dev, "failed to install irq (%d)\n", fsl_ifc_ctrl_dev->nand_irq); - goto err_nandirq; + goto err_free_irq; } } return 0; -err_nandirq: - free_irq(fsl_ifc_ctrl_dev->nand_irq, fsl_ifc_ctrl_dev); - irq_dispose_mapping(fsl_ifc_ctrl_dev->nand_irq); -err_irq: +err_free_irq: free_irq(fsl_ifc_ctrl_dev->irq, fsl_ifc_ctrl_dev); +err_unmap_nandirq: + irq_dispose_mapping(fsl_ifc_ctrl_dev->nand_irq); irq_dispose_mapping(fsl_ifc_ctrl_dev->irq); err: iounmap(fsl_ifc_ctrl_dev->gregs); From c9a18558833aa0ad206194ba3a1ed078aec5fa43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= Date: Mon, 13 Sep 2021 10:26:33 +0200 Subject: [PATCH 1595/5344] clk: at91: check pmc node status before registering syscore ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit c405f5c15e9f6094f2fa1658e73e56f3058e2122 ] Currently, at91 pmc driver always register the syscore_ops whatever the status of the pmc node that has been found. When set as secure and disabled, the pmc should not be accessed or this will generate abort exceptions. To avoid this, add a check on node availability before registering the syscore operations. Signed-off-by: Clément Léger Link: https://lore.kernel.org/r/20210913082633.110168-1-clement.leger@bootlin.com Acked-by: Nicolas Ferre Reviewed-by: Claudiu Beznea Fixes: b3b02eac33ed ("clk: at91: Add sama5d2 suspend/resume") Signed-off-by: Stephen Boyd Signed-off-by: Sasha Levin --- drivers/clk/at91/pmc.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/clk/at91/pmc.c b/drivers/clk/at91/pmc.c index b71515acdec1f..976ca41e9157e 100644 --- a/drivers/clk/at91/pmc.c +++ b/drivers/clk/at91/pmc.c @@ -275,6 +275,11 @@ static int __init pmc_register_ops(void) np = of_find_matching_node(NULL, sama5d2_pmc_dt_ids); + if (!of_device_is_available(np)) { + of_node_put(np); + return -ENODEV; + } + pmcreg = device_node_to_regmap(np); if (IS_ERR(pmcreg)) return PTR_ERR(pmcreg); From b8dd0315ce36c9e72531d1f77227913c49c7970c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 15 Sep 2021 15:34:35 +0200 Subject: [PATCH 1596/5344] video: fbdev: chipsfb: use memset_io() instead of memset() [ Upstream commit f2719b26ae27282c145202ffd656d5ff1fe737cc ] While investigating a lockup at startup on Powerbook 3400C, it was identified that the fbdev driver generates alignment exception at startup: --- interrupt: 600 at memset+0x60/0xc0 NIP: c0021414 LR: c03fc49c CTR: 00007fff REGS: ca021c10 TRAP: 0600 Tainted: G W (5.14.2-pmac-00727-g12a41fa69492) MSR: 00009032 CR: 44008442 XER: 20000100 DAR: cab80020 DSISR: 00017c07 GPR00: 00000007 ca021cd0 c14412e0 cab80000 00000000 00100000 cab8001c 00000004 GPR08: 00100000 00007fff 00000000 00000000 84008442 00000000 c0006fb4 00000000 GPR16: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00100000 GPR24: 00000000 81800000 00000320 c15fa400 c14d1878 00000000 c14d1800 c094e19c NIP [c0021414] memset+0x60/0xc0 LR [c03fc49c] chipsfb_pci_init+0x160/0x580 --- interrupt: 600 [ca021cd0] [c03fc46c] chipsfb_pci_init+0x130/0x580 (unreliable) [ca021d20] [c03a3a70] pci_device_probe+0xf8/0x1b8 [ca021d50] [c043d584] really_probe.part.0+0xac/0x388 [ca021d70] [c043d914] __driver_probe_device+0xb4/0x170 [ca021d90] [c043da18] driver_probe_device+0x48/0x144 [ca021dc0] [c043e318] __driver_attach+0x11c/0x1c4 [ca021de0] [c043ad30] bus_for_each_dev+0x88/0xf0 [ca021e10] [c043c724] bus_add_driver+0x190/0x22c [ca021e40] [c043ee94] driver_register+0x9c/0x170 [ca021e60] [c0006c28] do_one_initcall+0x54/0x1ec [ca021ed0] [c08246e4] kernel_init_freeable+0x1c0/0x270 [ca021f10] [c0006fdc] kernel_init+0x28/0x11c [ca021f30] [c0017148] ret_from_kernel_thread+0x14/0x1c Instruction dump: 7d4601a4 39490777 7d4701a4 39490888 7d4801a4 39490999 7d4901a4 39290aaa 7d2a01a4 4c00012c 4bfffe88 0fe00000 <4bfffe80> 9421fff0 38210010 48001970 This is due to 'dcbz' instruction being used on non-cached memory. 'dcbz' instruction is used by memset() to zeroize a complete cacheline at once, and memset() is not expected to be used on non cached memory. When performing a 'sparse' check on fbdev driver, it also appears that the use of memset() is unexpected: drivers/video/fbdev/chipsfb.c:334:17: warning: incorrect type in argument 1 (different address spaces) drivers/video/fbdev/chipsfb.c:334:17: expected void * drivers/video/fbdev/chipsfb.c:334:17: got char [noderef] __iomem *screen_base drivers/video/fbdev/chipsfb.c:334:15: warning: memset with byte count of 1048576 Use fb_memset() instead of memset(). fb_memset() is defined as memset_io() for powerpc. Fixes: 8c8709334cec ("[PATCH] ppc32: Remove CONFIG_PMAC_PBOOK") Reported-by: Stan Johnson Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/884a54f1e5cb774c1d9b4db780209bee5d4f6718.1631712563.git.christophe.leroy@csgroup.eu Signed-off-by: Sasha Levin --- drivers/video/fbdev/chipsfb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/fbdev/chipsfb.c b/drivers/video/fbdev/chipsfb.c index f4dc320dcafe2..80fdd3ee0565f 100644 --- a/drivers/video/fbdev/chipsfb.c +++ b/drivers/video/fbdev/chipsfb.c @@ -331,7 +331,7 @@ static const struct fb_var_screeninfo chipsfb_var = { static void init_chips(struct fb_info *p, unsigned long addr) { - memset(p->screen_base, 0, 0x100000); + fb_memset(p->screen_base, 0, 0x100000); p->fix = chipsfb_fix; p->fix.smem_start = addr; From 52739b4bc5ee74ded682c01c5aa17400ec62ab8d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 5 Oct 2021 16:45:16 +0300 Subject: [PATCH 1597/5344] serial: 8250_dw: Drop wrong use of ACPI_PTR() [ Upstream commit ebabb77a2a115b6c5e68f7364b598310b5f61fb2 ] ACPI_PTR() is more harmful than helpful. For example, in this case if CONFIG_ACPI=n, the ID table left unused which is not what we want. Instead of adding ifdeffery here and there, drop ACPI_PTR(). Fixes: 6a7320c4669f ("serial: 8250_dw: Add ACPI 5.0 support") Reported-by: Daniel Palmer Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20211005134516.23218-1-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/tty/serial/8250/8250_dw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/8250/8250_dw.c b/drivers/tty/serial/8250/8250_dw.c index 51a7d3b19b394..381c5117aec1b 100644 --- a/drivers/tty/serial/8250/8250_dw.c +++ b/drivers/tty/serial/8250/8250_dw.c @@ -660,7 +660,7 @@ static struct platform_driver dw8250_platform_driver = { .name = "dw-apb-uart", .pm = &dw8250_pm_ops, .of_match_table = dw8250_of_match, - .acpi_match_table = ACPI_PTR(dw8250_acpi_match), + .acpi_match_table = dw8250_acpi_match, }, .probe = dw8250_probe, .remove = dw8250_remove, From 34c49e3975070cf1c3f7be97e3cd84b4d5f37cb2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 11 Oct 2021 15:37:39 +0300 Subject: [PATCH 1598/5344] usb: gadget: hid: fix error code in do_config() [ Upstream commit 68e7c510fdf4f6167404609da52e1979165649f6 ] Return an error code if usb_get_function() fails. Don't return success. Fixes: 4bc8a33f2407 ("usb: gadget: hid: convert to new interface of f_hid") Acked-by: Felipe Balbi Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/20211011123739.GC15188@kili Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/usb/gadget/legacy/hid.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/legacy/hid.c b/drivers/usb/gadget/legacy/hid.c index 5b27d289443fe..3912cc805f3af 100644 --- a/drivers/usb/gadget/legacy/hid.c +++ b/drivers/usb/gadget/legacy/hid.c @@ -99,8 +99,10 @@ static int do_config(struct usb_configuration *c) list_for_each_entry(e, &hidg_func_list, node) { e->f = usb_get_function(e->fi); - if (IS_ERR(e->f)) + if (IS_ERR(e->f)) { + status = PTR_ERR(e->f); goto put; + } status = usb_add_function(c, e->f); if (status < 0) { usb_put_function(e->f); From 1eb6eeefc0b462bc7136ae48e87251b93c99aa19 Mon Sep 17 00:00:00 2001 From: Jakob Hauser Date: Fri, 8 Oct 2021 10:32:45 +0200 Subject: [PATCH 1599/5344] =?UTF-8?q?power:=20supply:=20rt5033=5Fbattery:?= =?UTF-8?q?=20Change=20voltage=20values=20to=20=C2=B5V?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit bf895295e9a73411889816f1a0c1f4f1a2d9c678 ] Currently the rt5033_battery driver provides voltage values in mV. It should be µV as stated in Documentation/power/power_supply_class.rst. Fixes: b847dd96e659 ("power: rt5033_battery: Add RT5033 Fuel gauge device driver") Cc: Beomho Seo Cc: Chanwoo Choi Signed-off-by: Jakob Hauser Signed-off-by: Sebastian Reichel Signed-off-by: Sasha Levin --- drivers/power/supply/rt5033_battery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/power/supply/rt5033_battery.c b/drivers/power/supply/rt5033_battery.c index 6609f8cb8ca01..ef53891b88bbc 100644 --- a/drivers/power/supply/rt5033_battery.c +++ b/drivers/power/supply/rt5033_battery.c @@ -60,7 +60,7 @@ static int rt5033_battery_get_watt_prop(struct i2c_client *client, regmap_read(battery->regmap, regh, &msb); regmap_read(battery->regmap, regl, &lsb); - ret = ((msb << 4) + (lsb >> 4)) * 1250 / 1000; + ret = ((msb << 4) + (lsb >> 4)) * 1250; return ret; } From adc78c2250a210db50ff6acc0a4e9cd723290a5e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 6 Oct 2021 10:32:43 +0300 Subject: [PATCH 1600/5344] scsi: csiostor: Uninitialized data in csio_ln_vnp_read_cbfn() [ Upstream commit f4875d509a0a78ad294a1a538d534b5ba94e685a ] This variable is just a temporary variable, used to do an endian conversion. The problem is that the last byte is not initialized. After the conversion is completely done, the last byte is discarded so it doesn't cause a problem. But static checkers and the KMSan runtime checker can detect the uninitialized read and will complain about it. Link: https://lore.kernel.org/r/20211006073242.GA8404@kili Fixes: 5036f0a0ecd3 ("[SCSI] csiostor: Fix sparse warnings.") Signed-off-by: Dan Carpenter Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/csiostor/csio_lnode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/csiostor/csio_lnode.c b/drivers/scsi/csiostor/csio_lnode.c index 23cbe4cda760e..c3bf590f5d685 100644 --- a/drivers/scsi/csiostor/csio_lnode.c +++ b/drivers/scsi/csiostor/csio_lnode.c @@ -619,7 +619,7 @@ csio_ln_vnp_read_cbfn(struct csio_hw *hw, struct csio_mb *mbp) struct fc_els_csp *csp; struct fc_els_cssp *clsp; enum fw_retval retval; - __be32 nport_id; + __be32 nport_id = 0; retval = FW_CMD_RETVAL_G(ntohl(rsp->alloc_to_len16)); if (retval != FW_SUCCESS) { From eec5063f0e0d32fbd984258612127640837ffec8 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 12 Oct 2021 10:28:43 +0300 Subject: [PATCH 1601/5344] RDMA/mlx4: Return missed an error if device doesn't support steering [ Upstream commit f4e56ec4452f48b8292dcf0e1c4bdac83506fb8b ] The error flow fixed in this patch is not possible because all kernel users of create QP interface check that device supports steering before set IB_QP_CREATE_NETIF_QP flag. Fixes: c1c98501121e ("IB/mlx4: Add support for steerable IB UD QPs") Link: https://lore.kernel.org/r/91c61f6e60eb0240f8bbc321fda7a1d2986dd03c.1634023677.git.leonro@nvidia.com Reported-by: Dan Carpenter Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe Signed-off-by: Sasha Levin --- drivers/infiniband/hw/mlx4/qp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 17ce928e41bde..bca5358f3ef29 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -1149,8 +1149,10 @@ static int create_qp_common(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, if (dev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED) qp->flags |= MLX4_IB_QP_NETIF; - else + else { + err = -EINVAL; goto err; + } } err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp); From 64284e218e82a9e81524be7284d54fd519316a90 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Mon, 11 Oct 2021 17:29:41 +0200 Subject: [PATCH 1602/5344] staging: ks7010: select CRYPTO_HASH/CRYPTO_MICHAEL_MIC [ Upstream commit 9ca0e55e52c7b2a99f3c2051fc4bd1c63a061519 ] Fix the following build/link errors: ld: drivers/staging/ks7010/ks_hostif.o: in function `michael_mic.constprop.0': ks_hostif.c:(.text+0x95b): undefined reference to `crypto_alloc_shash' ld: ks_hostif.c:(.text+0x97a): undefined reference to `crypto_shash_setkey' ld: ks_hostif.c:(.text+0xa13): undefined reference to `crypto_shash_update' ld: ks_hostif.c:(.text+0xa28): undefined reference to `crypto_shash_update' ld: ks_hostif.c:(.text+0xa48): undefined reference to `crypto_shash_finup' ld: ks_hostif.c:(.text+0xa6d): undefined reference to `crypto_destroy_tfm' Fixes: 8b523f20417d ("staging: ks7010: removed custom Michael MIC implementation.") Fixes: 3e5bc68fa5968 ("staging: ks7010: Fix build error") Fixes: a4961427e7494 ("Revert "staging: ks7010: Fix build error"") Signed-off-by: Vegard Nossum Link: https://lore.kernel.org/r/20211011152941.12847-1-vegard.nossum@oracle.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/staging/ks7010/Kconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/staging/ks7010/Kconfig b/drivers/staging/ks7010/Kconfig index 0987fdc2f70db..8ea6c09286798 100644 --- a/drivers/staging/ks7010/Kconfig +++ b/drivers/staging/ks7010/Kconfig @@ -5,6 +5,9 @@ config KS7010 select WIRELESS_EXT select WEXT_PRIV select FW_LOADER + select CRYPTO + select CRYPTO_HASH + select CRYPTO_MICHAEL_MIC help This is a driver for KeyStream KS7010 based SDIO WIFI cards. It is found on at least later Spectec SDW-821 (FCC-ID "S2Y-WLAN-11G-K" only, From 4bf0160dde2cd70d905d97ff916e1d5b36d5e232 Mon Sep 17 00:00:00 2001 From: Olivier Moysan Date: Fri, 24 Sep 2021 18:02:21 +0200 Subject: [PATCH 1603/5344] ARM: dts: stm32: fix SAI sub nodes register range [ Upstream commit 6f87a74d31277f0896dcf8c0850ec14bde03c423 ] The STM32 SAI subblocks registers offsets are in the range 0x0004 (SAIx_CR1) to 0x0020 (SAIx_DR). The corresponding range length is 0x20 instead of 0x1c. Change reg property accordingly. Fixes: 5afd65c3a060 ("ARM: dts: stm32: add sai support on stm32mp157c") Signed-off-by: Olivier Moysan Signed-off-by: Alexandre Torgue Signed-off-by: Sasha Levin --- arch/arm/boot/dts/stm32mp157c.dtsi | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm/boot/dts/stm32mp157c.dtsi b/arch/arm/boot/dts/stm32mp157c.dtsi index eca469a64a977..a687c024daa92 100644 --- a/arch/arm/boot/dts/stm32mp157c.dtsi +++ b/arch/arm/boot/dts/stm32mp157c.dtsi @@ -773,7 +773,7 @@ #sound-dai-cells = <0>; compatible = "st,stm32-sai-sub-a"; - reg = <0x4 0x1c>; + reg = <0x4 0x20>; clocks = <&rcc SAI1_K>; clock-names = "sai_ck"; dmas = <&dmamux1 87 0x400 0x01>; @@ -783,7 +783,7 @@ sai1b: audio-controller@4400a024 { #sound-dai-cells = <0>; compatible = "st,stm32-sai-sub-b"; - reg = <0x24 0x1c>; + reg = <0x24 0x20>; clocks = <&rcc SAI1_K>; clock-names = "sai_ck"; dmas = <&dmamux1 88 0x400 0x01>; @@ -804,7 +804,7 @@ sai2a: audio-controller@4400b004 { #sound-dai-cells = <0>; compatible = "st,stm32-sai-sub-a"; - reg = <0x4 0x1c>; + reg = <0x4 0x20>; clocks = <&rcc SAI2_K>; clock-names = "sai_ck"; dmas = <&dmamux1 89 0x400 0x01>; @@ -814,7 +814,7 @@ sai2b: audio-controller@4400b024 { #sound-dai-cells = <0>; compatible = "st,stm32-sai-sub-b"; - reg = <0x24 0x1c>; + reg = <0x24 0x20>; clocks = <&rcc SAI2_K>; clock-names = "sai_ck"; dmas = <&dmamux1 90 0x400 0x01>; @@ -835,7 +835,7 @@ sai3a: audio-controller@4400c004 { #sound-dai-cells = <0>; compatible = "st,stm32-sai-sub-a"; - reg = <0x04 0x1c>; + reg = <0x04 0x20>; clocks = <&rcc SAI3_K>; clock-names = "sai_ck"; dmas = <&dmamux1 113 0x400 0x01>; @@ -845,7 +845,7 @@ sai3b: audio-controller@4400c024 { #sound-dai-cells = <0>; compatible = "st,stm32-sai-sub-b"; - reg = <0x24 0x1c>; + reg = <0x24 0x20>; clocks = <&rcc SAI3_K>; clock-names = "sai_ck"; dmas = <&dmamux1 114 0x400 0x01>; @@ -1191,7 +1191,7 @@ sai4a: audio-controller@50027004 { #sound-dai-cells = <0>; compatible = "st,stm32-sai-sub-a"; - reg = <0x04 0x1c>; + reg = <0x04 0x20>; clocks = <&rcc SAI4_K>; clock-names = "sai_ck"; dmas = <&dmamux1 99 0x400 0x01>; @@ -1201,7 +1201,7 @@ sai4b: audio-controller@50027024 { #sound-dai-cells = <0>; compatible = "st,stm32-sai-sub-b"; - reg = <0x24 0x1c>; + reg = <0x24 0x20>; clocks = <&rcc SAI4_K>; clock-names = "sai_ck"; dmas = <&dmamux1 100 0x400 0x01>; From 2e419ef29cc0f084b04116be0c95e32abbbb5596 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Fri, 15 Oct 2021 14:36:06 +0100 Subject: [PATCH 1604/5344] ASoC: cs42l42: Correct some register default values [ Upstream commit d591d4b32aa9552af14a0c7c586a2d3fe9ecc6e0 ] Some registers had wrong default values in cs42l42_reg_defaults[]. Signed-off-by: Richard Fitzgerald Fixes: 2c394ca79604 ("ASoC: Add support for CS42L42 codec") Link: https://lore.kernel.org/r/20211015133619.4698-4-rf@opensource.cirrus.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/codecs/cs42l42.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/cs42l42.c b/sound/soc/codecs/cs42l42.c index 6825e874785f2..710b43205ea05 100644 --- a/sound/soc/codecs/cs42l42.c +++ b/sound/soc/codecs/cs42l42.c @@ -91,7 +91,7 @@ static const struct reg_default cs42l42_reg_defaults[] = { { CS42L42_ASP_RX_INT_MASK, 0x1F }, { CS42L42_ASP_TX_INT_MASK, 0x0F }, { CS42L42_CODEC_INT_MASK, 0x03 }, - { CS42L42_SRCPL_INT_MASK, 0xFF }, + { CS42L42_SRCPL_INT_MASK, 0x7F }, { CS42L42_VPMON_INT_MASK, 0x01 }, { CS42L42_PLL_LOCK_INT_MASK, 0x01 }, { CS42L42_TSRS_PLUG_INT_MASK, 0x0F }, @@ -128,7 +128,7 @@ static const struct reg_default cs42l42_reg_defaults[] = { { CS42L42_MIXER_CHA_VOL, 0x3F }, { CS42L42_MIXER_ADC_VOL, 0x3F }, { CS42L42_MIXER_CHB_VOL, 0x3F }, - { CS42L42_EQ_COEF_IN0, 0x22 }, + { CS42L42_EQ_COEF_IN0, 0x00 }, { CS42L42_EQ_COEF_IN1, 0x00 }, { CS42L42_EQ_COEF_IN2, 0x00 }, { CS42L42_EQ_COEF_IN3, 0x00 }, From aa20f174b12786dec5377477bfa5d6a5ee2038e6 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Fri, 15 Oct 2021 14:36:08 +0100 Subject: [PATCH 1605/5344] ASoC: cs42l42: Defer probe if request_threaded_irq() returns EPROBE_DEFER [ Upstream commit 0306988789d9d91a18ff70bd2bf165d3ae0ef1dd ] The driver can run without an interrupt so if devm_request_threaded_irq() failed, the probe() just carried on. But if this was EPROBE_DEFER the driver would continue without an interrupt instead of deferring to wait for the interrupt to become available. Fixes: 2c394ca79604 ("ASoC: Add support for CS42L42 codec") Signed-off-by: Richard Fitzgerald Link: https://lore.kernel.org/r/20211015133619.4698-6-rf@opensource.cirrus.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/codecs/cs42l42.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/cs42l42.c b/sound/soc/codecs/cs42l42.c index 710b43205ea05..ebee58eca4d51 100644 --- a/sound/soc/codecs/cs42l42.c +++ b/sound/soc/codecs/cs42l42.c @@ -1798,8 +1798,9 @@ static int cs42l42_i2c_probe(struct i2c_client *i2c_client, NULL, cs42l42_irq_thread, IRQF_ONESHOT | IRQF_TRIGGER_LOW, "cs42l42", cs42l42); - - if (ret != 0) + if (ret == -EPROBE_DEFER) + goto err_disable; + else if (ret != 0) dev_err(&i2c_client->dev, "Failed to request IRQ: %d\n", ret); From 604073f55b7d52089f767fce93b2490df414409d Mon Sep 17 00:00:00 2001 From: Vladimir Zapolskiy Date: Thu, 23 Sep 2021 02:35:48 +0300 Subject: [PATCH 1606/5344] phy: qcom-qusb2: Fix a memory leak on probe [ Upstream commit bf7ffcd0069d30e2e7ba2b827f08c89f471cd1f3 ] On success nvmem_cell_read() returns a pointer to a dynamically allocated buffer, and therefore it shall be freed after usage. The issue is reported by kmemleak: # cat /sys/kernel/debug/kmemleak unreferenced object 0xffff3b3803e4b280 (size 128): comm "kworker/u16:1", pid 107, jiffies 4294892861 (age 94.120s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<000000007739afdc>] __kmalloc+0x27c/0x41c [<0000000071c0fbf8>] nvmem_cell_read+0x40/0xe0 [<00000000e803ef1f>] qusb2_phy_init+0x258/0x5bc [<00000000fc81fcfa>] phy_init+0x70/0x110 [<00000000e3d48a57>] dwc3_core_soft_reset+0x4c/0x234 [<0000000027d1dbd4>] dwc3_core_init+0x68/0x990 [<000000001965faf9>] dwc3_probe+0x4f4/0x730 [<000000002f7617ca>] platform_probe+0x74/0xf0 [<00000000a2576cac>] really_probe+0xc4/0x470 [<00000000bc77f2c5>] __driver_probe_device+0x11c/0x190 [<00000000130db71f>] driver_probe_device+0x48/0x110 [<0000000019f36c2b>] __device_attach_driver+0xa4/0x140 [<00000000e5812ff7>] bus_for_each_drv+0x84/0xe0 [<00000000f4bac574>] __device_attach+0xe4/0x1c0 [<00000000d3beb631>] device_initial_probe+0x20/0x30 [<000000008019b9db>] bus_probe_device+0xa4/0xb0 Fixes: ca04d9d3e1b1 ("phy: qcom-qusb2: New driver for QUSB2 PHY on Qcom chips") Signed-off-by: Vladimir Zapolskiy Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20210922233548.2150244-1-vladimir.zapolskiy@linaro.org Signed-off-by: Vinod Koul Signed-off-by: Sasha Levin --- drivers/phy/qualcomm/phy-qcom-qusb2.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/phy/qualcomm/phy-qcom-qusb2.c b/drivers/phy/qualcomm/phy-qcom-qusb2.c index bf94a52d30871..946e9b05f0ae6 100644 --- a/drivers/phy/qualcomm/phy-qcom-qusb2.c +++ b/drivers/phy/qualcomm/phy-qcom-qusb2.c @@ -432,7 +432,7 @@ static void qusb2_phy_set_tune2_param(struct qusb2_phy *qphy) { struct device *dev = &qphy->phy->dev; const struct qusb2_phy_cfg *cfg = qphy->cfg; - u8 *val; + u8 *val, hstx_trim; /* efuse register is optional */ if (!qphy->cell) @@ -446,7 +446,13 @@ static void qusb2_phy_set_tune2_param(struct qusb2_phy *qphy) * set while configuring the phy. */ val = nvmem_cell_read(qphy->cell, NULL); - if (IS_ERR(val) || !val[0]) { + if (IS_ERR(val)) { + dev_dbg(dev, "failed to read a valid hs-tx trim value\n"); + return; + } + hstx_trim = val[0]; + kfree(val); + if (!hstx_trim) { dev_dbg(dev, "failed to read a valid hs-tx trim value\n"); return; } @@ -454,12 +460,10 @@ static void qusb2_phy_set_tune2_param(struct qusb2_phy *qphy) /* Fused TUNE1/2 value is the higher nibble only */ if (cfg->update_tune1_with_efuse) qusb2_write_mask(qphy->base, cfg->regs[QUSB2PHY_PORT_TUNE1], - val[0] << HSTX_TRIM_SHIFT, - HSTX_TRIM_MASK); + hstx_trim << HSTX_TRIM_SHIFT, HSTX_TRIM_MASK); else qusb2_write_mask(qphy->base, cfg->regs[QUSB2PHY_PORT_TUNE2], - val[0] << HSTX_TRIM_SHIFT, - HSTX_TRIM_MASK); + hstx_trim << HSTX_TRIM_SHIFT, HSTX_TRIM_MASK); } static int qusb2_phy_set_mode(struct phy *phy, From c34483a9bbb66ae08d5f06891691e33aea2505a6 Mon Sep 17 00:00:00 2001 From: Anssi Hannula Date: Tue, 26 Oct 2021 13:27:41 +0300 Subject: [PATCH 1607/5344] serial: xilinx_uartps: Fix race condition causing stuck TX [ Upstream commit 88b20f84f0fe47409342669caf3e58a3fc64c316 ] xilinx_uartps .start_tx() clears TXEMPTY when enabling TXEMPTY to avoid any previous TXEVENT event asserting the UART interrupt. This clear operation is done immediately after filling the TX FIFO. However, if the bytes inserted by cdns_uart_handle_tx() are consumed by the UART before the TXEMPTY is cleared, the clear operation eats the new TXEMPTY event as well, causing cdns_uart_isr() to never receive the TXEMPTY event. If there are bytes still queued in circbuf, TX will get stuck as they will never get transferred to FIFO (unless new bytes are queued to circbuf in which case .start_tx() is called again). While the racy missed TXEMPTY occurs fairly often with short data sequences (e.g. write 1 byte), in those cases circbuf is usually empty so no action on TXEMPTY would have been needed anyway. On the other hand, longer data sequences make the race much more unlikely as UART takes longer to consume the TX FIFO. Therefore it is rare for this race to cause visible issues in general. Fix the race by clearing the TXEMPTY bit in ISR *before* filling the FIFO. The TXEMPTY bit in ISR will only get asserted at the exact moment the TX FIFO *becomes* empty, so clearing the bit before filling FIFO does not cause an extra immediate assertion even if the FIFO is initially empty. This is hard to reproduce directly on a normal system, but inserting e.g. udelay(200) after cdns_uart_handle_tx(port), setting 4000000 baud, and then running "dd if=/dev/zero bs=128 of=/dev/ttyPS0 count=50" reliably reproduces the issue on my ZynqMP test system unless this fix is applied. Fixes: 85baf542d54e ("tty: xuartps: support 64 byte FIFO size") Signed-off-by: Anssi Hannula Link: https://lore.kernel.org/r/20211026102741.2910441-1-anssi.hannula@bitwise.fi Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/tty/serial/xilinx_uartps.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/tty/serial/xilinx_uartps.c b/drivers/tty/serial/xilinx_uartps.c index 9359c80fbb9f5..a1409251fbcc3 100644 --- a/drivers/tty/serial/xilinx_uartps.c +++ b/drivers/tty/serial/xilinx_uartps.c @@ -595,9 +595,10 @@ static void cdns_uart_start_tx(struct uart_port *port) if (uart_circ_empty(&port->state->xmit)) return; + writel(CDNS_UART_IXR_TXEMPTY, port->membase + CDNS_UART_ISR); + cdns_uart_handle_tx(port); - writel(CDNS_UART_IXR_TXEMPTY, port->membase + CDNS_UART_ISR); /* Enable the TX Empty interrupt */ writel(CDNS_UART_IXR_TXEMPTY, port->membase + CDNS_UART_IER); } From 1b6acece7393f0a8a2d7c498b2e2b9974f571c2e Mon Sep 17 00:00:00 2001 From: Andrej Shadura Date: Tue, 19 Oct 2021 17:29:16 +0200 Subject: [PATCH 1608/5344] HID: u2fzero: clarify error check and length calculations [ Upstream commit b7abf78b7a6c4a29a6e0ba0bb883fe44a2f3d693 ] The previous commit fixed handling of incomplete packets but broke error handling: offsetof returns an unsigned value (size_t), but when compared against the signed return value, the return value is interpreted as if it were unsigned, so negative return values are never less than the offset. To make the code easier to read, calculate the minimal packet length once and separately, and assign it to a signed int variable to eliminate unsigned math and the need for type casts. It then becomes immediately obvious how the actual data length is calculated and why the return value cannot be less than the minimal length. Fixes: 22d65765f211 ("HID: u2fzero: ignore incomplete packets without data") Fixes: 42337b9d4d95 ("HID: add driver for U2F Zero built-in LED and RNG") Signed-off-by: Andrej Shadura Signed-off-by: Jiri Kosina Signed-off-by: Sasha Levin --- drivers/hid/hid-u2fzero.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/hid/hid-u2fzero.c b/drivers/hid/hid-u2fzero.c index d70cd3d7f583b..94f78ffb76d04 100644 --- a/drivers/hid/hid-u2fzero.c +++ b/drivers/hid/hid-u2fzero.c @@ -191,6 +191,8 @@ static int u2fzero_rng_read(struct hwrng *rng, void *data, struct u2f_hid_msg resp; int ret; size_t actual_length; + /* valid packets must have a correct header */ + int min_length = offsetof(struct u2f_hid_msg, init.data); if (!dev->present) { hid_dbg(dev->hdev, "device not present"); @@ -200,12 +202,12 @@ static int u2fzero_rng_read(struct hwrng *rng, void *data, ret = u2fzero_recv(dev, &req, &resp); /* ignore errors or packets without data */ - if (ret < offsetof(struct u2f_hid_msg, init.data)) + if (ret < min_length) return 0; /* only take the minimum amount of data it is safe to take */ - actual_length = min3((size_t)ret - offsetof(struct u2f_hid_msg, - init.data), U2F_HID_MSG_LEN(resp), max); + actual_length = min3((size_t)ret - min_length, + U2F_HID_MSG_LEN(resp), max); memcpy(data, resp.init.data, actual_length); From 0fd9a1ae2a25395de5f4acf7ce967b3a56118afe Mon Sep 17 00:00:00 2001 From: Andrej Shadura Date: Tue, 19 Oct 2021 17:29:17 +0200 Subject: [PATCH 1609/5344] HID: u2fzero: properly handle timeouts in usb_submit_urb [ Upstream commit 43775e62c4b784f44a159e13ba80e6146a42d502 ] The wait_for_completion_timeout function returns 0 if timed out or a positive value if completed. Hence, "less than zero" comparison always misses timeouts and doesn't kill the URB as it should, leading to re-sending it while it is active. Fixes: 42337b9d4d95 ("HID: add driver for U2F Zero built-in LED and RNG") Signed-off-by: Andrej Shadura Signed-off-by: Jiri Kosina Signed-off-by: Sasha Levin --- drivers/hid/hid-u2fzero.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/hid-u2fzero.c b/drivers/hid/hid-u2fzero.c index 94f78ffb76d04..67ae2b18e33ac 100644 --- a/drivers/hid/hid-u2fzero.c +++ b/drivers/hid/hid-u2fzero.c @@ -132,7 +132,7 @@ static int u2fzero_recv(struct u2fzero_device *dev, ret = (wait_for_completion_timeout( &ctx.done, msecs_to_jiffies(USB_CTRL_SET_TIMEOUT))); - if (ret < 0) { + if (ret == 0) { usb_kill_urb(dev->urb); hid_err(hdev, "urb submission timed out"); } else { From 109941fde08b952c11f5ca70192a9401075112c1 Mon Sep 17 00:00:00 2001 From: Bixuan Cui Date: Thu, 28 Oct 2021 15:28:22 +0800 Subject: [PATCH 1610/5344] powerpc/44x/fsp2: add missing of_node_put [ Upstream commit 290fe8aa69ef5c51c778c0bb33f8ef0181c769f5 ] Early exits from for_each_compatible_node() should decrement the node reference counter. Reported by Coccinelle: ./arch/powerpc/platforms/44x/fsp2.c:206:1-25: WARNING: Function "for_each_compatible_node" should have of_node_put() before return around line 218. Fixes: 7813043e1bbc ("powerpc/44x/fsp2: Add irq error handlers") Signed-off-by: Bixuan Cui Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/1635406102-88719-1-git-send-email-cuibixuan@linux.alibaba.com Signed-off-by: Sasha Levin --- arch/powerpc/platforms/44x/fsp2.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/platforms/44x/fsp2.c b/arch/powerpc/platforms/44x/fsp2.c index b299e43f5ef94..823397c802def 100644 --- a/arch/powerpc/platforms/44x/fsp2.c +++ b/arch/powerpc/platforms/44x/fsp2.c @@ -208,6 +208,7 @@ static void node_irq_request(const char *compat, irq_handler_t errirq_handler) if (irq == NO_IRQ) { pr_err("device tree node %pOFn is missing a interrupt", np); + of_node_put(np); return; } @@ -215,6 +216,7 @@ static void node_irq_request(const char *compat, irq_handler_t errirq_handler) if (rc) { pr_err("fsp_of_probe: request_irq failed: np=%pOF rc=%d", np, rc); + of_node_put(np); return; } } From d8513a10d252f88148b5d345ddefacffd41aa238 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 29 Oct 2021 11:58:16 +0200 Subject: [PATCH 1611/5344] mips: cm: Convert to bitfield API to fix out-of-bounds access [ Upstream commit 18b8f5b6fc53d097cadb94a93d8d6566ba88e389 ] mips_cm_error_report() extracts the cause and other cause from the error register using shifts. This works fine for the former, as it is stored in the top bits, and the shift will thus remove all non-related bits. However, the latter is stored in the bottom bits, hence thus needs masking to get rid of non-related bits. Without such masking, using it as an index into the cm2_causes[] array will lead to an out-of-bounds access, probably causing a crash. Fix this by using FIELD_GET() instead. Bite the bullet and convert all MIPS CM handling to the bitfield API, to improve readability and safety. Fixes: 3885c2b463f6a236 ("MIPS: CM: Add support for reporting CM cache errors") Signed-off-by: Geert Uytterhoeven Reviewed-by: Jiaxun Yang Signed-off-by: Thomas Bogendoerfer Signed-off-by: Sasha Levin --- arch/mips/include/asm/mips-cm.h | 12 ++++++------ arch/mips/kernel/mips-cm.c | 21 ++++++++++----------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/arch/mips/include/asm/mips-cm.h b/arch/mips/include/asm/mips-cm.h index aeae2effa123d..23c67c0871b17 100644 --- a/arch/mips/include/asm/mips-cm.h +++ b/arch/mips/include/asm/mips-cm.h @@ -11,6 +11,7 @@ #ifndef __MIPS_ASM_MIPS_CM_H__ #define __MIPS_ASM_MIPS_CM_H__ +#include #include #include @@ -153,8 +154,8 @@ GCR_ACCESSOR_RO(32, 0x030, rev) #define CM_GCR_REV_MINOR GENMASK(7, 0) #define CM_ENCODE_REV(major, minor) \ - (((major) << __ffs(CM_GCR_REV_MAJOR)) | \ - ((minor) << __ffs(CM_GCR_REV_MINOR))) + (FIELD_PREP(CM_GCR_REV_MAJOR, major) | \ + FIELD_PREP(CM_GCR_REV_MINOR, minor)) #define CM_REV_CM2 CM_ENCODE_REV(6, 0) #define CM_REV_CM2_5 CM_ENCODE_REV(7, 0) @@ -362,10 +363,10 @@ static inline int mips_cm_revision(void) static inline unsigned int mips_cm_max_vp_width(void) { extern int smp_num_siblings; - uint32_t cfg; if (mips_cm_revision() >= CM_REV_CM3) - return read_gcr_sys_config2() & CM_GCR_SYS_CONFIG2_MAXVPW; + return FIELD_GET(CM_GCR_SYS_CONFIG2_MAXVPW, + read_gcr_sys_config2()); if (mips_cm_present()) { /* @@ -373,8 +374,7 @@ static inline unsigned int mips_cm_max_vp_width(void) * number of VP(E)s, and if that ever changes then this will * need revisiting. */ - cfg = read_gcr_cl_config() & CM_GCR_Cx_CONFIG_PVPE; - return (cfg >> __ffs(CM_GCR_Cx_CONFIG_PVPE)) + 1; + return FIELD_GET(CM_GCR_Cx_CONFIG_PVPE, read_gcr_cl_config()) + 1; } if (IS_ENABLED(CONFIG_SMP)) diff --git a/arch/mips/kernel/mips-cm.c b/arch/mips/kernel/mips-cm.c index a9eab83d9148d..611ef512c0b81 100644 --- a/arch/mips/kernel/mips-cm.c +++ b/arch/mips/kernel/mips-cm.c @@ -179,8 +179,7 @@ static void mips_cm_probe_l2sync(void) phys_addr_t addr; /* L2-only sync was introduced with CM major revision 6 */ - major_rev = (read_gcr_rev() & CM_GCR_REV_MAJOR) >> - __ffs(CM_GCR_REV_MAJOR); + major_rev = FIELD_GET(CM_GCR_REV_MAJOR, read_gcr_rev()); if (major_rev < 6) return; @@ -263,13 +262,13 @@ void mips_cm_lock_other(unsigned int cluster, unsigned int core, preempt_disable(); if (cm_rev >= CM_REV_CM3) { - val = core << __ffs(CM3_GCR_Cx_OTHER_CORE); - val |= vp << __ffs(CM3_GCR_Cx_OTHER_VP); + val = FIELD_PREP(CM3_GCR_Cx_OTHER_CORE, core) | + FIELD_PREP(CM3_GCR_Cx_OTHER_VP, vp); if (cm_rev >= CM_REV_CM3_5) { val |= CM_GCR_Cx_OTHER_CLUSTER_EN; - val |= cluster << __ffs(CM_GCR_Cx_OTHER_CLUSTER); - val |= block << __ffs(CM_GCR_Cx_OTHER_BLOCK); + val |= FIELD_PREP(CM_GCR_Cx_OTHER_CLUSTER, cluster); + val |= FIELD_PREP(CM_GCR_Cx_OTHER_BLOCK, block); } else { WARN_ON(cluster != 0); WARN_ON(block != CM_GCR_Cx_OTHER_BLOCK_LOCAL); @@ -299,7 +298,7 @@ void mips_cm_lock_other(unsigned int cluster, unsigned int core, spin_lock_irqsave(&per_cpu(cm_core_lock, curr_core), per_cpu(cm_core_lock_flags, curr_core)); - val = core << __ffs(CM_GCR_Cx_OTHER_CORENUM); + val = FIELD_PREP(CM_GCR_Cx_OTHER_CORENUM, core); } write_gcr_cl_other(val); @@ -343,8 +342,8 @@ void mips_cm_error_report(void) cm_other = read_gcr_error_mult(); if (revision < CM_REV_CM3) { /* CM2 */ - cause = cm_error >> __ffs(CM_GCR_ERROR_CAUSE_ERRTYPE); - ocause = cm_other >> __ffs(CM_GCR_ERROR_MULT_ERR2ND); + cause = FIELD_GET(CM_GCR_ERROR_CAUSE_ERRTYPE, cm_error); + ocause = FIELD_GET(CM_GCR_ERROR_MULT_ERR2ND, cm_other); if (!cause) return; @@ -386,8 +385,8 @@ void mips_cm_error_report(void) ulong core_id_bits, vp_id_bits, cmd_bits, cmd_group_bits; ulong cm3_cca_bits, mcp_bits, cm3_tr_bits, sched_bit; - cause = cm_error >> __ffs64(CM3_GCR_ERROR_CAUSE_ERRTYPE); - ocause = cm_other >> __ffs(CM_GCR_ERROR_MULT_ERR2ND); + cause = FIELD_GET(CM3_GCR_ERROR_CAUSE_ERRTYPE, cm_error); + ocause = FIELD_GET(CM_GCR_ERROR_MULT_ERR2ND, cm_other); if (!cause) return; From 37f00de847e880475774ef95e82a4dccf516ba0f Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 31 Oct 2021 16:25:22 +0100 Subject: [PATCH 1612/5344] power: supply: bq27xxx: Fix kernel crash on IRQ handler register error [ Upstream commit cdf10ffe8f626d8a2edc354abf063df0078b2d71 ] When registering the IRQ handler fails, do not just return the error code, this will free the devm_kzalloc()-ed data struct while leaving the queued work queued and the registered power_supply registered with both of them now pointing to free-ed memory, resulting in various kernel crashes soon afterwards. Instead properly tear-down things on IRQ handler register errors. Fixes: 703df6c09795 ("power: bq27xxx_battery: Reorganize I2C into a module") Cc: Andrew F. Davis Signed-off-by: Hans de Goede Reviewed-by: Andy Shevchenko Signed-off-by: Sebastian Reichel Signed-off-by: Sasha Levin --- drivers/power/supply/bq27xxx_battery_i2c.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/power/supply/bq27xxx_battery_i2c.c b/drivers/power/supply/bq27xxx_battery_i2c.c index 2677c38a8a424..34229c1f43e31 100644 --- a/drivers/power/supply/bq27xxx_battery_i2c.c +++ b/drivers/power/supply/bq27xxx_battery_i2c.c @@ -195,7 +195,8 @@ static int bq27xxx_battery_i2c_probe(struct i2c_client *client, dev_err(&client->dev, "Unable to register IRQ %d error %d\n", client->irq, ret); - return ret; + bq27xxx_battery_teardown(di); + goto err_failed; } } From f22c2e3c1a94e6d0b35de695978f0604ae098b99 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sun, 4 Oct 2020 07:24:22 -0700 Subject: [PATCH 1613/5344] apparmor: fix error check [ Upstream commit d108370c644b153382632b3e5511ade575c91c86 ] clang static analysis reports this representative problem: label.c:1463:16: warning: Assigned value is garbage or undefined label->hname = name; ^ ~~~~ In aa_update_label_name(), this the problem block of code if (aa_label_acntsxprint(&name, ...) == -1) return res; On failure, aa_label_acntsxprint() has a more complicated return that just -1. So check for a negative return. It was also noted that the aa_label_acntsxprint() main comment refers to a nonexistent parameter, so clean up the comment. Fixes: f1bd904175e8 ("apparmor: add the base fns() for domain labels") Signed-off-by: Tom Rix Reviewed-by: Nick Desaulniers Signed-off-by: John Johansen Signed-off-by: Sasha Levin --- security/apparmor/label.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/security/apparmor/label.c b/security/apparmor/label.c index 5f324d63ceaa3..747a734a08246 100644 --- a/security/apparmor/label.c +++ b/security/apparmor/label.c @@ -1459,7 +1459,7 @@ bool aa_update_label_name(struct aa_ns *ns, struct aa_label *label, gfp_t gfp) if (label->hname || labels_ns(label) != ns) return res; - if (aa_label_acntsxprint(&name, ns, label, FLAGS_NONE, gfp) == -1) + if (aa_label_acntsxprint(&name, ns, label, FLAGS_NONE, gfp) < 0) return res; ls = labels_set(label); @@ -1709,7 +1709,7 @@ int aa_label_asxprint(char **strp, struct aa_ns *ns, struct aa_label *label, /** * aa_label_acntsxprint - allocate a __counted string buffer and print label - * @strp: buffer to write to. (MAY BE NULL if @size == 0) + * @strp: buffer to write to. * @ns: namespace profile is being viewed from * @label: label to view (NOT NULL) * @flags: flags controlling what label info is printed From bf5329154db33acd8b07b8ede173add9055dd378 Mon Sep 17 00:00:00 2001 From: Arnaud Pouliquen Date: Mon, 12 Jul 2021 14:39:12 +0200 Subject: [PATCH 1614/5344] rpmsg: Fix rpmsg_create_ept return when RPMSG config is not defined [ Upstream commit 537d3af1bee8ad1415fda9b622d1ea6d1ae76dfa ] According to the description of the rpmsg_create_ept in rpmsg_core.c the function should return NULL on error. Fixes: 2c8a57088045 ("rpmsg: Provide function stubs for API") Signed-off-by: Arnaud Pouliquen Reviewed-by: Mathieu Poirier Link: https://lore.kernel.org/r/20210712123912.10672-1-arnaud.pouliquen@foss.st.com Signed-off-by: Bjorn Andersson Signed-off-by: Sasha Levin --- include/linux/rpmsg.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/rpmsg.h b/include/linux/rpmsg.h index 9fe156d1c018e..a68972b097b72 100644 --- a/include/linux/rpmsg.h +++ b/include/linux/rpmsg.h @@ -177,7 +177,7 @@ static inline struct rpmsg_endpoint *rpmsg_create_ept(struct rpmsg_device *rpdev /* This shouldn't be possible */ WARN_ON(1); - return ERR_PTR(-ENXIO); + return NULL; } static inline int rpmsg_send(struct rpmsg_endpoint *ept, void *data, int len) From c5a9e89f520f93f1b1b422b8105a3fa358f9f313 Mon Sep 17 00:00:00 2001 From: Baptiste Lepers Date: Mon, 6 Sep 2021 11:59:24 +1000 Subject: [PATCH 1615/5344] pnfs/flexfiles: Fix misplaced barrier in nfs4_ff_layout_prepare_ds [ Upstream commit a2915fa06227b056a8f9b0d79b61dca08ad5cfc6 ] _nfs4_pnfs_v3/v4_ds_connect do some work smp_wmb ds->ds_clp = clp; And nfs4_ff_layout_prepare_ds currently does smp_rmb if(ds->ds_clp) ... This patch places the smp_rmb after the if. This ensures that following reads only happen once nfs4_ff_layout_prepare_ds has checked that data has been properly initialized. Fixes: d67ae825a59d6 ("pnfs/flexfiles: Add the FlexFile Layout Driver") Signed-off-by: Baptiste Lepers Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin --- fs/nfs/flexfilelayout/flexfilelayoutdev.c | 4 ++-- fs/nfs/pnfs_nfs.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 3eda40a320a53..1f12297109b41 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -378,10 +378,10 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, goto noconnect; ds = mirror->mirror_ds->ds; + if (READ_ONCE(ds->ds_clp)) + goto out; /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */ smp_rmb(); - if (ds->ds_clp) - goto out; /* FIXME: For now we assume the server sent only one version of NFS * to use for the DS. diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 249cf9037dbd7..aff44a7b98f86 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -641,7 +641,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, } smp_wmb(); - ds->ds_clp = clp; + WRITE_ONCE(ds->ds_clp, clp); dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr); out: return status; @@ -714,7 +714,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, } smp_wmb(); - ds->ds_clp = clp; + WRITE_ONCE(ds->ds_clp, clp); dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr); out: return status; From 1f76ac9e937b3f05e519639bff726a5a9e852b88 Mon Sep 17 00:00:00 2001 From: "Alex Xu (Hello71)" Date: Thu, 7 Oct 2021 02:37:06 -0400 Subject: [PATCH 1616/5344] drm/plane-helper: fix uninitialized variable reference [ Upstream commit 7be28bd73f23e53d6e7f5fe891ba9503fc0c7210 ] drivers/gpu/drm/drm_plane_helper.c: In function 'drm_primary_helper_update': drivers/gpu/drm/drm_plane_helper.c:113:32: error: 'visible' is used uninitialized [-Werror=uninitialized] 113 | struct drm_plane_state plane_state = { | ^~~~~~~~~~~ drivers/gpu/drm/drm_plane_helper.c:178:14: note: 'visible' was declared here 178 | bool visible; | ^~~~~~~ cc1: all warnings being treated as errors visible is an output, not an input. in practice this use might turn out OK but it's still UB. Fixes: df86af9133b4 ("drm/plane-helper: Add drm_plane_helper_check_state()") Reviewed-by: Simon Ser Signed-off-by: Alex Xu (Hello71) Signed-off-by: Simon Ser Link: https://patchwork.freedesktop.org/patch/msgid/20211007063706.305984-1-alex_y_xu@yahoo.ca Signed-off-by: Sasha Levin --- drivers/gpu/drm/drm_plane_helper.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/drm_plane_helper.c b/drivers/gpu/drm/drm_plane_helper.c index 3aae7ea522f23..c3f2292dc93d5 100644 --- a/drivers/gpu/drm/drm_plane_helper.c +++ b/drivers/gpu/drm/drm_plane_helper.c @@ -123,7 +123,6 @@ static int drm_plane_helper_check_update(struct drm_plane *plane, .crtc_w = drm_rect_width(dst), .crtc_h = drm_rect_height(dst), .rotation = rotation, - .visible = *visible, }; struct drm_crtc_state crtc_state = { .crtc = crtc, From fe5f13af4e242211cf9481f781577ec1f9264897 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Tue, 5 Oct 2021 20:09:42 +0200 Subject: [PATCH 1617/5344] PCI: aardvark: Don't spam about PIO Response Status MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 464de7e7fff767e87429cd7be09c4f2cb50a6ccb ] Use dev_dbg() instead of dev_err() in advk_pcie_check_pio_status(). For example CRS is not an error status, it just says that the request should be retried. Link: https://lore.kernel.org/r/20211005180952.6812-4-kabel@kernel.org Fixes: 8c39d710363c1 ("PCI: aardvark: Add Aardvark PCI host controller driver") Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Signed-off-by: Sasha Levin --- drivers/pci/controller/pci-aardvark.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 7f4c83bde3fe0..785f7ad4ac9c0 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -539,7 +539,7 @@ static int advk_pcie_check_pio_status(struct advk_pcie *pcie, bool allow_crs, u3 else str_posted = "Posted"; - dev_err(dev, "%s PIO Response Status: %s, %#x @ %#x\n", + dev_dbg(dev, "%s PIO Response Status: %s, %#x @ %#x\n", str_posted, strcomp_status, reg, advk_readl(pcie, PIO_ADDR_LS)); return -EFAULT; From f19fbb90e5c2fb0a95ebc5684a789715835a8a90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Tue, 5 Oct 2021 20:09:43 +0200 Subject: [PATCH 1618/5344] PCI: aardvark: Fix preserving PCI_EXP_RTCTL_CRSSVE flag on emulated bridge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit d419052bc6c60fa4ab2b5a51d5f1e55a66e2b4ff ] Commit 43f5c77bcbd2 ("PCI: aardvark: Fix reporting CRS value") started using CRSSVE flag for handling CRS responses. PCI_EXP_RTCTL_CRSSVE flag is stored only in emulated config space buffer and there is handler for PCI_EXP_RTCTL register. So every read operation from config space automatically clears CRSSVE flag as it is not defined in PCI_EXP_RTCTL read handler. Fix this by reading current CRSSVE bit flag from emulated space buffer and appending it to PCI_EXP_RTCTL read response. Link: https://lore.kernel.org/r/20211005180952.6812-5-kabel@kernel.org Fixes: 43f5c77bcbd2 ("PCI: aardvark: Fix reporting CRS value") Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marek Behún Signed-off-by: Sasha Levin --- drivers/pci/controller/pci-aardvark.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 785f7ad4ac9c0..45794ba643d40 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -580,6 +580,7 @@ advk_pci_bridge_emul_pcie_conf_read(struct pci_bridge_emul *bridge, case PCI_EXP_RTCTL: { u32 val = advk_readl(pcie, PCIE_ISR0_MASK_REG); *value = (val & PCIE_MSG_PM_PME_MASK) ? 0 : PCI_EXP_RTCTL_PMEIE; + *value |= le16_to_cpu(bridge->pcie_conf.rootctl) & PCI_EXP_RTCTL_CRSSVE; *value |= PCI_EXP_RTCAP_CRSVIS << 16; return PCI_BRIDGE_EMUL_HANDLED; } From 1919d495c07cbe4c15b002ff84b90e66d89a4d42 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 8 Oct 2021 15:46:52 +0800 Subject: [PATCH 1619/5344] opp: Fix return in _opp_add_static_v2() [ Upstream commit 27ff8187f13ecfec8a26fb1928e906f46f326cc5 ] Fix sparse warning: drivers/opp/of.c:924 _opp_add_static_v2() warn: passing zero to 'ERR_PTR' For duplicate OPPs 'ret' be set to zero. Fixes: deac8703da5f ("PM / OPP: _of_add_opp_table_v2(): increment count only if OPP is added") Signed-off-by: YueHaibing Signed-off-by: Viresh Kumar Signed-off-by: Sasha Levin --- drivers/opp/of.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 30cc407c8f93f..ba30694508153 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -639,7 +639,7 @@ static struct dev_pm_opp *_opp_add_static_v2(struct opp_table *opp_table, free_opp: _opp_free(new_opp); - return ERR_PTR(ret); + return ret ? ERR_PTR(ret) : NULL; } /* Initializes OPP tables based on new bindings */ From 50d0bd1c8e85a4ac84af93ea097ba1b24009a135 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 4 Oct 2021 15:44:16 -0400 Subject: [PATCH 1620/5344] NFS: Fix deadlocks in nfs_scan_commit_list() [ Upstream commit 64a93dbf25d3a1368bb58ddf0f61d0a92d7479e3 ] Partially revert commit 2ce209c42c01 ("NFS: Wait for requests that are locked on the commit list"), since it can lead to deadlocks between commit requests and nfs_join_page_group(). For now we should assume that any locked requests on the commit list are either about to be removed and committed by another task, or the writes they describe are about to be retransmitted. In either case, we should not need to worry. Fixes: 2ce209c42c01 ("NFS: Wait for requests that are locked on the commit list") Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin --- fs/nfs/write.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 613c3ef23e07b..30d8e7bc1cef3 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1050,25 +1050,11 @@ nfs_scan_commit_list(struct list_head *src, struct list_head *dst, struct nfs_page *req, *tmp; int ret = 0; -restart: list_for_each_entry_safe(req, tmp, src, wb_list) { kref_get(&req->wb_kref); if (!nfs_lock_request(req)) { - int status; - - /* Prevent deadlock with nfs_lock_and_join_requests */ - if (!list_empty(dst)) { - nfs_release_request(req); - continue; - } - /* Ensure we make progress to prevent livelock */ - mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); - status = nfs_wait_on_request(req); nfs_release_request(req); - mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); - if (status < 0) - break; - goto restart; + continue; } nfs_request_remove_commit_list(req, cinfo); clear_bit(PG_COMMIT_TO_DS, &req->wb_flags); @@ -1935,6 +1921,7 @@ static int __nfs_commit_inode(struct inode *inode, int how, int may_wait = how & FLUSH_SYNC; int ret, nscan; + how &= ~FLUSH_SYNC; nfs_init_cinfo_from_inode(&cinfo, inode); nfs_commit_begin(cinfo.mds); for (;;) { From e1f22bd74bb8f635a9b2880e9af8dac5187c87cd Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Tue, 9 Mar 2021 00:00:20 -0800 Subject: [PATCH 1621/5344] fs: orangefs: fix error return code of orangefs_revalidate_lookup() [ Upstream commit 4c2b46c824a78fc8190d8eafaaea5a9078fe7479 ] When op_alloc() returns NULL to new_op, no error return code of orangefs_revalidate_lookup() is assigned. To fix this bug, ret is assigned with -ENOMEM in this case. Fixes: 8bb8aefd5afb ("OrangeFS: Change almost all instances of the string PVFS2 to OrangeFS.") Reported-by: TOTE Robot Signed-off-by: Jia-Ju Bai Signed-off-by: Mike Marshall Signed-off-by: Sasha Levin --- fs/orangefs/dcache.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/orangefs/dcache.c b/fs/orangefs/dcache.c index fe484cf93e5cd..8bbe9486e3a62 100644 --- a/fs/orangefs/dcache.c +++ b/fs/orangefs/dcache.c @@ -26,8 +26,10 @@ static int orangefs_revalidate_lookup(struct dentry *dentry) gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: attempting lookup.\n", __func__); new_op = op_alloc(ORANGEFS_VFS_OP_LOOKUP); - if (!new_op) + if (!new_op) { + ret = -ENOMEM; goto out_put_parent; + } new_op->upcall.req.lookup.sym_follow = ORANGEFS_LOOKUP_LINK_NO_FOLLOW; new_op->upcall.req.lookup.parent_refn = parent->refn; From dd1f9bf2e083636afa3ab9f8c87164b11a83981f Mon Sep 17 00:00:00 2001 From: Evgeny Novikov Date: Fri, 9 Jul 2021 17:45:29 +0300 Subject: [PATCH 1622/5344] mtd: spi-nor: hisi-sfc: Remove excessive clk_disable_unprepare() [ Upstream commit 78e4d342187625585932bb437ec26e1060f7fc6f ] hisi_spi_nor_probe() invokes clk_disable_unprepare() on all paths after successful call of clk_prepare_enable(). Besides, the clock is enabled by hispi_spi_nor_prep() and disabled by hispi_spi_nor_unprep(). So at remove time it is not possible to have the clock enabled. The patch removes excessive clk_disable_unprepare() from hisi_spi_nor_remove(). Found by Linux Driver Verification project (linuxtesting.org). Fixes: e523f11141bd ("mtd: spi-nor: add hisilicon spi-nor flash controller driver") Signed-off-by: Evgeny Novikov Signed-off-by: Tudor Ambarus Reviewed-by: Pratyush Yadav Link: https://lore.kernel.org/r/20210709144529.31379-1-novikov@ispras.ru Signed-off-by: Sasha Levin --- drivers/mtd/spi-nor/hisi-sfc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/mtd/spi-nor/hisi-sfc.c b/drivers/mtd/spi-nor/hisi-sfc.c index 8fcc48056a8bc..569cfd473c87b 100644 --- a/drivers/mtd/spi-nor/hisi-sfc.c +++ b/drivers/mtd/spi-nor/hisi-sfc.c @@ -474,7 +474,6 @@ static int hisi_spi_nor_remove(struct platform_device *pdev) hisi_spi_nor_unregister_all(host); mutex_destroy(&host->lock); - clk_disable_unprepare(host->clk); return 0; } From df4990efddcf71d551b86a64ea4da0ba9c62e553 Mon Sep 17 00:00:00 2001 From: Zev Weiss Date: Thu, 14 Oct 2021 13:39:52 -0700 Subject: [PATCH 1623/5344] mtd: core: don't remove debugfs directory if device is in use [ Upstream commit c13de2386c78e890d4ae6f01a85eefd0b293fb08 ] Previously, if del_mtd_device() failed with -EBUSY due to a non-zero usecount, a subsequent call to attempt the deletion again would try to remove a debugfs directory that had already been removed and panic. With this change the second call can instead proceed safely. Fixes: e8e3edb95ce6 ("mtd: create per-device and module-scope debugfs entries") Signed-off-by: Zev Weiss Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20211014203953.5424-1-zev@bewilderbeest.net Signed-off-by: Sasha Levin --- drivers/mtd/mtdcore.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 32a76b8feaa5d..ac5d3b6db9b84 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -727,8 +727,6 @@ int del_mtd_device(struct mtd_info *mtd) mutex_lock(&mtd_table_mutex); - debugfs_remove_recursive(mtd->dbg.dfs_dir); - if (idr_find(&mtd_idr, mtd->index) != mtd) { ret = -ENODEV; goto out_error; @@ -744,6 +742,8 @@ int del_mtd_device(struct mtd_info *mtd) mtd->index, mtd->name, mtd->usecount); ret = -EBUSY; } else { + debugfs_remove_recursive(mtd->dbg.dfs_dir); + /* Try to remove the NVMEM provider */ if (mtd->nvmem) nvmem_unregister(mtd->nvmem); From 19c09a623dfa047e68fdad1e5e7150940f0cd1dd Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Thu, 7 Oct 2021 14:12:28 +0300 Subject: [PATCH 1624/5344] dmaengine: at_xdmac: fix AT_XDMAC_CC_PERID() macro [ Upstream commit 320c88a3104dc955f928a1eecebd551ff89530c0 ] AT_XDMAC_CC_PERID() should be used to setup bits 24..30 of XDMAC_CC register. Using it without parenthesis around 0x7f & (i) will lead to setting all the time zero for bits 24..30 of XDMAC_CC as the << operator has higher precedence over bitwise &. Thus, add paranthesis around 0x7f & (i). Fixes: 15a03850ab8f ("dmaengine: at_xdmac: fix macro typo") Signed-off-by: Claudiu Beznea Reviewed-by: Tudor Ambarus Link: https://lore.kernel.org/r/20211007111230.2331837-3-claudiu.beznea@microchip.com Signed-off-by: Vinod Koul Signed-off-by: Sasha Levin --- drivers/dma/at_xdmac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c index b58ac720d9a12..6f1e97ba3e786 100644 --- a/drivers/dma/at_xdmac.c +++ b/drivers/dma/at_xdmac.c @@ -145,7 +145,7 @@ #define AT_XDMAC_CC_WRIP (0x1 << 23) /* Write in Progress (read only) */ #define AT_XDMAC_CC_WRIP_DONE (0x0 << 23) #define AT_XDMAC_CC_WRIP_IN_PROGRESS (0x1 << 23) -#define AT_XDMAC_CC_PERID(i) (0x7f & (i) << 24) /* Channel Peripheral Identifier */ +#define AT_XDMAC_CC_PERID(i) ((0x7f & (i)) << 24) /* Channel Peripheral Identifier */ #define AT_XDMAC_CDS_MSP 0x2C /* Channel Data Stride Memory Set Pattern */ #define AT_XDMAC_CSUS 0x30 /* Channel Source Microblock Stride */ #define AT_XDMAC_CDUS 0x34 /* Channel Destination Microblock Stride */ From 5da8f745d5df18d8b4309df32027b97f1fa52c6c Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 19 Oct 2021 16:45:02 +0200 Subject: [PATCH 1625/5344] auxdisplay: img-ascii-lcd: Fix lock-up when displaying empty string [ Upstream commit afcb5a811ff3ab3969f09666535eb6018a160358 ] While writing an empty string to a device attribute is a no-op, and thus does not need explicit safeguards, the user can still write a single newline to an attribute file: echo > .../message If that happens, img_ascii_lcd_display() trims the newline, yielding an empty string, and causing an infinite loop in img_ascii_lcd_scroll(). Fix this by adding a check for empty strings. Clear the display in case one is encountered. Fixes: 0cad855fbd083ee5 ("auxdisplay: img-ascii-lcd: driver for simple ASCII LCD displays") Signed-off-by: Geert Uytterhoeven Signed-off-by: Miguel Ojeda Signed-off-by: Sasha Levin --- drivers/auxdisplay/img-ascii-lcd.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/auxdisplay/img-ascii-lcd.c b/drivers/auxdisplay/img-ascii-lcd.c index efb928e25aef3..9556d6827f005 100644 --- a/drivers/auxdisplay/img-ascii-lcd.c +++ b/drivers/auxdisplay/img-ascii-lcd.c @@ -280,6 +280,16 @@ static int img_ascii_lcd_display(struct img_ascii_lcd_ctx *ctx, if (msg[count - 1] == '\n') count--; + if (!count) { + /* clear the LCD */ + devm_kfree(&ctx->pdev->dev, ctx->message); + ctx->message = NULL; + ctx->message_len = 0; + memset(ctx->curr, ' ', ctx->cfg->num_chars); + ctx->cfg->update(ctx); + return 0; + } + new_msg = devm_kmalloc(&ctx->pdev->dev, count + 1, GFP_KERNEL); if (!new_msg) return -ENOMEM; From d87820c02066b775c8c48d71407c796bd15fca42 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 19 Oct 2021 16:45:08 +0200 Subject: [PATCH 1626/5344] auxdisplay: ht16k33: Connect backlight to fbdev [ Upstream commit 80f9eb70fd9276938f0a131f76d438021bfd8b34 ] Currently /sys/class/graphics/fb0/bl_curve is not accessible (-ENODEV), as the driver does not connect the backlight to the frame buffer device. Fix this moving backlight initialization up, and filling in fb_info.bl_dev. Fixes: 8992da44c6805d53 ("auxdisplay: ht16k33: Driver for LED controller") Signed-off-by: Geert Uytterhoeven Reviewed-by: Robin van der Gracht Signed-off-by: Miguel Ojeda Signed-off-by: Sasha Levin --- drivers/auxdisplay/ht16k33.c | 56 ++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/drivers/auxdisplay/ht16k33.c b/drivers/auxdisplay/ht16k33.c index 33b887b389061..1b0e8232517dd 100644 --- a/drivers/auxdisplay/ht16k33.c +++ b/drivers/auxdisplay/ht16k33.c @@ -418,6 +418,33 @@ static int ht16k33_probe(struct i2c_client *client, if (err) return err; + /* Backlight */ + memset(&bl_props, 0, sizeof(struct backlight_properties)); + bl_props.type = BACKLIGHT_RAW; + bl_props.max_brightness = MAX_BRIGHTNESS; + + bl = devm_backlight_device_register(&client->dev, DRIVER_NAME"-bl", + &client->dev, priv, + &ht16k33_bl_ops, &bl_props); + if (IS_ERR(bl)) { + dev_err(&client->dev, "failed to register backlight\n"); + return PTR_ERR(bl); + } + + err = of_property_read_u32(node, "default-brightness-level", + &dft_brightness); + if (err) { + dft_brightness = MAX_BRIGHTNESS; + } else if (dft_brightness > MAX_BRIGHTNESS) { + dev_warn(&client->dev, + "invalid default brightness level: %u, using %u\n", + dft_brightness, MAX_BRIGHTNESS); + dft_brightness = MAX_BRIGHTNESS; + } + + bl->props.brightness = dft_brightness; + ht16k33_bl_update_status(bl); + /* Framebuffer (2 bytes per column) */ BUILD_BUG_ON(PAGE_SIZE < HT16K33_FB_SIZE); fbdev->buffer = (unsigned char *) get_zeroed_page(GFP_KERNEL); @@ -450,6 +477,7 @@ static int ht16k33_probe(struct i2c_client *client, fbdev->info->screen_size = HT16K33_FB_SIZE; fbdev->info->fix = ht16k33_fb_fix; fbdev->info->var = ht16k33_fb_var; + fbdev->info->bl_dev = bl; fbdev->info->pseudo_palette = NULL; fbdev->info->flags = FBINFO_FLAG_DEFAULT; fbdev->info->par = priv; @@ -462,34 +490,6 @@ static int ht16k33_probe(struct i2c_client *client, if (err) goto err_fbdev_unregister; - /* Backlight */ - memset(&bl_props, 0, sizeof(struct backlight_properties)); - bl_props.type = BACKLIGHT_RAW; - bl_props.max_brightness = MAX_BRIGHTNESS; - - bl = devm_backlight_device_register(&client->dev, DRIVER_NAME"-bl", - &client->dev, priv, - &ht16k33_bl_ops, &bl_props); - if (IS_ERR(bl)) { - dev_err(&client->dev, "failed to register backlight\n"); - err = PTR_ERR(bl); - goto err_fbdev_unregister; - } - - err = of_property_read_u32(node, "default-brightness-level", - &dft_brightness); - if (err) { - dft_brightness = MAX_BRIGHTNESS; - } else if (dft_brightness > MAX_BRIGHTNESS) { - dev_warn(&client->dev, - "invalid default brightness level: %u, using %u\n", - dft_brightness, MAX_BRIGHTNESS); - dft_brightness = MAX_BRIGHTNESS; - } - - bl->props.brightness = dft_brightness; - ht16k33_bl_update_status(bl); - ht16k33_fb_queue(priv); return 0; From 97a4182d4ba702ec75ff4b176821bea46a5ca690 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 19 Oct 2021 16:45:09 +0200 Subject: [PATCH 1627/5344] auxdisplay: ht16k33: Fix frame buffer device blanking [ Upstream commit 840fe258332544aa7321921e1723d37b772af7a9 ] As the ht16k33 frame buffer sub-driver does not register an fb_ops.fb_blank() handler, blanking does not work: $ echo 1 > /sys/class/graphics/fb0/blank sh: write error: Invalid argument Fix this by providing a handler that always returns zero, to make sure blank events will be sent to the actual device handling the backlight. Reported-by: Robin van der Gracht Suggested-by: Robin van der Gracht Fixes: 8992da44c6805d53 ("auxdisplay: ht16k33: Driver for LED controller") Signed-off-by: Geert Uytterhoeven Signed-off-by: Miguel Ojeda Signed-off-by: Sasha Levin --- drivers/auxdisplay/ht16k33.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/auxdisplay/ht16k33.c b/drivers/auxdisplay/ht16k33.c index 1b0e8232517dd..59109b410c67f 100644 --- a/drivers/auxdisplay/ht16k33.c +++ b/drivers/auxdisplay/ht16k33.c @@ -219,6 +219,15 @@ static const struct backlight_ops ht16k33_bl_ops = { .check_fb = ht16k33_bl_check_fb, }; +/* + * Blank events will be passed to the actual device handling the backlight when + * we return zero here. + */ +static int ht16k33_blank(int blank, struct fb_info *info) +{ + return 0; +} + static int ht16k33_mmap(struct fb_info *info, struct vm_area_struct *vma) { struct ht16k33_priv *priv = info->par; @@ -231,6 +240,7 @@ static struct fb_ops ht16k33_fb_ops = { .owner = THIS_MODULE, .fb_read = fb_sys_read, .fb_write = fb_sys_write, + .fb_blank = ht16k33_blank, .fb_fillrect = sys_fillrect, .fb_copyarea = sys_copyarea, .fb_imageblit = sys_imageblit, From 54cc2f6fc5c3c164b394757285c9b523c72aee99 Mon Sep 17 00:00:00 2001 From: Robert-Ionut Alexa Date: Fri, 23 Apr 2021 12:01:51 +0300 Subject: [PATCH 1628/5344] soc: fsl: dpaa2-console: free buffer before returning from dpaa2_console_read [ Upstream commit 8120bd469f5525da229953c1197f2b826c0109f4 ] Free the kbuf buffer before returning from the dpaa2_console_read() function. The variable no longer goes out of scope, leaking the storage it points to. Fixes: c93349d8c170 ("soc: fsl: add DPAA2 console support") Signed-off-by: Robert-Ionut Alexa Signed-off-by: Ioana Ciornei Signed-off-by: Li Yang Signed-off-by: Sasha Levin --- drivers/soc/fsl/dpaa2-console.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/soc/fsl/dpaa2-console.c b/drivers/soc/fsl/dpaa2-console.c index 27243f706f376..53917410f2bdb 100644 --- a/drivers/soc/fsl/dpaa2-console.c +++ b/drivers/soc/fsl/dpaa2-console.c @@ -231,6 +231,7 @@ static ssize_t dpaa2_console_read(struct file *fp, char __user *buf, cd->cur_ptr += bytes; written += bytes; + kfree(kbuf); return written; err_free_buf: From 04dabca8ff09d4749fd4b8c0a962a2ddfb846581 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Sat, 23 Oct 2021 15:41:01 +0200 Subject: [PATCH 1629/5344] dmaengine: dmaengine_desc_callback_valid(): Check for `callback_result` [ Upstream commit e7e1e880b114ca640a2f280b0d5d38aed98f98c6 ] Before the `callback_result` callback was introduced drivers coded their invocation to the callback in a similar way to: if (cb->callback) { spin_unlock(&dma->lock); cb->callback(cb->callback_param); spin_lock(&dma->lock); } With the introduction of `callback_result` two helpers where introduced to transparently handle both types of callbacks. And drivers where updated to look like this: if (dmaengine_desc_callback_valid(cb)) { spin_unlock(&dma->lock); dmaengine_desc_callback_invoke(cb, ...); spin_lock(&dma->lock); } dmaengine_desc_callback_invoke() correctly handles both `callback_result` and `callback`. But we forgot to update the dmaengine_desc_callback_valid() function to check for `callback_result`. As a result DMA descriptors that use the `callback_result` rather than `callback` don't have their callback invoked by drivers that follow the pattern above. Fix this by checking for both `callback` and `callback_result` in dmaengine_desc_callback_valid(). Fixes: f067025bc676 ("dmaengine: add support to provide error result from a DMA transation") Signed-off-by: Lars-Peter Clausen Acked-by: Dave Jiang Link: https://lore.kernel.org/r/20211023134101.28042-1-lars@metafoo.de Signed-off-by: Vinod Koul Signed-off-by: Sasha Levin --- drivers/dma/dmaengine.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/dmaengine.h b/drivers/dma/dmaengine.h index 501c0b063f852..302f13efd35d9 100644 --- a/drivers/dma/dmaengine.h +++ b/drivers/dma/dmaengine.h @@ -168,7 +168,7 @@ dmaengine_desc_get_callback_invoke(struct dma_async_tx_descriptor *tx, static inline bool dmaengine_desc_callback_valid(struct dmaengine_desc_callback *cb) { - return (cb->callback) ? true : false; + return cb->callback || cb->callback_result; } #endif From 35133075492a03b0ad9417d92688bc1463c9b5c3 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 20 Oct 2021 12:43:52 -0500 Subject: [PATCH 1630/5344] signal/sh: Use force_sig(SIGKILL) instead of do_group_exit(SIGKILL) [ Upstream commit ce0ee4e6ac99606f3945f4d47775544edc3f7985 ] Today the sh code allocates memory the first time a process uses the fpu. If that memory allocation fails, kill the affected task with force_sig(SIGKILL) rather than do_group_exit(SIGKILL). Calling do_group_exit from an exception handler can potentially lead to dead locks as do_group_exit is not designed to be called from interrupt context. Instead use force_sig(SIGKILL) to kill the userspace process. Sending signals in general and force_sig in particular has been tested from interrupt context so there should be no problems. Cc: Yoshinori Sato Cc: Rich Felker Cc: linux-sh@vger.kernel.org Fixes: 0ea820cf9bf5 ("sh: Move over to dynamically allocated FPU context.") Link: https://lkml.kernel.org/r/20211020174406.17889-6-ebiederm@xmission.com Signed-off-by: Eric W. Biederman Signed-off-by: Sasha Levin --- arch/sh/kernel/cpu/fpu.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/sh/kernel/cpu/fpu.c b/arch/sh/kernel/cpu/fpu.c index ae354a2931e7e..fd6db0ab19288 100644 --- a/arch/sh/kernel/cpu/fpu.c +++ b/arch/sh/kernel/cpu/fpu.c @@ -62,18 +62,20 @@ void fpu_state_restore(struct pt_regs *regs) } if (!tsk_used_math(tsk)) { - local_irq_enable(); + int ret; /* * does a slab alloc which can sleep */ - if (init_fpu(tsk)) { + local_irq_enable(); + ret = init_fpu(tsk); + local_irq_disable(); + if (ret) { /* * ran out of memory! */ - do_group_exit(SIGKILL); + force_sig(SIGKILL); return; } - local_irq_disable(); } grab_fpu(regs); From 8e6dd5a9c4a9a58132f6b531c0a730f536ec28e9 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 2 Oct 2021 17:02:23 -0700 Subject: [PATCH 1631/5344] m68k: set a default value for MEMORY_RESERVE [ Upstream commit 1aaa557b2db95c9506ed0981bc34505c32d6b62b ] 'make randconfig' can produce a .config file with "CONFIG_MEMORY_RESERVE=" (no value) since it has no default. When a subsequent 'make all' is done, kconfig restarts the config and prompts for a value for MEMORY_RESERVE. This breaks scripting/automation where there is no interactive user input. Add a default value for MEMORY_RESERVE. (Any integer value will work here for kconfig.) Fixes a kconfig warning: .config:214:warning: symbol value '' invalid for MEMORY_RESERVE * Restart config... Memory reservation (MiB) (MEMORY_RESERVE) [] (NEW) Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") # from beginning of git history Signed-off-by: Randy Dunlap Reviewed-by: Geert Uytterhoeven Cc: Greg Ungerer Cc: linux-m68k@lists.linux-m68k.org Signed-off-by: Greg Ungerer Signed-off-by: Sasha Levin --- arch/m68k/Kconfig.machine | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/m68k/Kconfig.machine b/arch/m68k/Kconfig.machine index 1bbe0dd0c4fe5..b88a980f56f8a 100644 --- a/arch/m68k/Kconfig.machine +++ b/arch/m68k/Kconfig.machine @@ -190,6 +190,7 @@ config INIT_LCD config MEMORY_RESERVE int "Memory reservation (MiB)" depends on (UCSIMM || UCDIMM) + default 0 help Reserve certain memory regions on 68x328 based boards. From 15458600cbacf2d7ea53cc2ac2cd7f9f4bf83bfd Mon Sep 17 00:00:00 2001 From: Ahmad Fatoum Date: Mon, 9 Aug 2021 18:20:31 +0200 Subject: [PATCH 1632/5344] watchdog: f71808e_wdt: fix inaccurate report in WDIOC_GETTIMEOUT [ Upstream commit 164483c735190775f29d0dcbac0363adc51a068d ] The fintek watchdog timer can configure timeouts of second granularity only up to 255 seconds. Beyond that, the timeout needs to be configured with minute granularity. WDIOC_GETTIMEOUT should report the actual timeout configured, not just echo back the timeout configured by the user. Do so. Fixes: 96cb4eb019ce ("watchdog: f71808e_wdt: new watchdog driver for Fintek F71808E and F71882FG") Suggested-by: Guenter Roeck Reviewed-by: Guenter Roeck Signed-off-by: Ahmad Fatoum Link: https://lore.kernel.org/r/5e17960fe8cc0e3cb2ba53de4730b75d9a0f33d5.1628525954.git-series.a.fatoum@pengutronix.de Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck Signed-off-by: Sasha Levin --- drivers/watchdog/f71808e_wdt.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/watchdog/f71808e_wdt.c b/drivers/watchdog/f71808e_wdt.c index 893cef70c1599..aa57498009c34 100644 --- a/drivers/watchdog/f71808e_wdt.c +++ b/drivers/watchdog/f71808e_wdt.c @@ -228,15 +228,17 @@ static int watchdog_set_timeout(int timeout) mutex_lock(&watchdog.lock); - watchdog.timeout = timeout; if (timeout > 0xff) { watchdog.timer_val = DIV_ROUND_UP(timeout, 60); watchdog.minutes_mode = true; + timeout = watchdog.timer_val * 60; } else { watchdog.timer_val = timeout; watchdog.minutes_mode = false; } + watchdog.timeout = timeout; + mutex_unlock(&watchdog.lock); return 0; From 8f1cfe932f81a2c8602ee2c9b4b6fc57f6cc34d8 Mon Sep 17 00:00:00 2001 From: Jackie Liu Date: Tue, 7 Sep 2021 10:49:04 +0800 Subject: [PATCH 1633/5344] ar7: fix kernel builds for compiler test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 28b7ee33a2122569ac065cad578bf23f50cc65c3 ] TI AR7 Watchdog Timer is only build for 32bit. Avoid error like: In file included from drivers/watchdog/ar7_wdt.c:29: ./arch/mips/include/asm/mach-ar7/ar7.h: In function ‘ar7_is_titan’: ./arch/mips/include/asm/mach-ar7/ar7.h:111:24: error: implicit declaration of function ‘KSEG1ADDR’; did you mean ‘CKSEG1ADDR’? [-Werror=implicit-function-declaration] 111 | return (readl((void *)KSEG1ADDR(AR7_REGS_GPIO + 0x24)) & 0xffff) == | ^~~~~~~~~ | CKSEG1ADDR Fixes: da2a68b3eb47 ("watchdog: Enable COMPILE_TEST where possible") Signed-off-by: Jackie Liu Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20210907024904.4127611-1-liu.yun@linux.dev Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck Signed-off-by: Sasha Levin --- drivers/watchdog/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index 1aa42e879e633..3fabd4177b0ee 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -1682,7 +1682,7 @@ config SIBYTE_WDOG config AR7_WDT tristate "TI AR7 Watchdog Timer" - depends on AR7 || (MIPS && COMPILE_TEST) + depends on AR7 || (MIPS && 32BIT && COMPILE_TEST) help Hardware driver for the TI AR7 Watchdog Timer. From 84edb337ffcb9b3732999a042d1fa6391e3371e0 Mon Sep 17 00:00:00 2001 From: Quinn Tran Date: Tue, 26 Oct 2021 04:54:01 -0700 Subject: [PATCH 1634/5344] scsi: qla2xxx: Fix gnl list corruption [ Upstream commit c98c5daaa24b583cba1369b7d167f93c6ae7299c ] Current code does list element deletion and addition in and out of lock protection. This patch moves deletion behind lock. list_add double add: new=ffff9130b5eb89f8, prev=ffff9130b5eb89f8, next=ffff9130c6a715f0. ------------[ cut here ]------------ kernel BUG at lib/list_debug.c:31! invalid opcode: 0000 [#1] SMP PTI CPU: 1 PID: 182395 Comm: kworker/1:37 Kdump: loaded Tainted: G W OE --------- - - 4.18.0-193.el8.x86_64 #1 Hardware name: HP ProLiant DL160 Gen8, BIOS J03 02/10/2014 Workqueue: qla2xxx_wq qla2x00_iocb_work_fn [qla2xxx] RIP: 0010:__list_add_valid+0x41/0x50 Code: 85 94 00 00 00 48 39 c7 74 0b 48 39 d7 74 06 b8 01 00 00 00 c3 48 89 f2 4c 89 c1 48 89 fe 48 c7 c7 60 83 ad 97 e8 4d bd ce ff <0f> 0b 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 48 8b 07 48 8b 57 08 RSP: 0018:ffffaba306f47d68 EFLAGS: 00010046 RAX: 0000000000000058 RBX: ffff9130b5eb8800 RCX: 0000000000000006 RDX: 0000000000000000 RSI: 0000000000000096 RDI: ffff9130b7456a00 RBP: ffff9130c6a70a58 R08: 000000000008d7be R09: 0000000000000001 R10: 0000000000000000 R11: 0000000000000001 R12: ffff9130c6a715f0 R13: ffff9130b5eb8824 R14: ffff9130b5eb89f8 R15: ffff9130b5eb89f8 FS: 0000000000000000(0000) GS:ffff9130b7440000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007efcaaef11a0 CR3: 000000005200a002 CR4: 00000000000606e0 Call Trace: qla24xx_async_gnl+0x113/0x3c0 [qla2xxx] ? qla2x00_iocb_work_fn+0x53/0x80 [qla2xxx] ? process_one_work+0x1a7/0x3b0 ? worker_thread+0x30/0x390 ? create_worker+0x1a0/0x1a0 ? kthread+0x112/0x130 Link: https://lore.kernel.org/r/20211026115412.27691-3-njavali@marvell.com Fixes: 726b85487067 ("qla2xxx: Add framework for async fabric discovery") Reviewed-by: Himanshu Madhani Signed-off-by: Quinn Tran Signed-off-by: Nilesh Javali Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/qla2xxx/qla_init.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c index c9391eb1e3e93..da54574d57d1f 100644 --- a/drivers/scsi/qla2xxx/qla_init.c +++ b/drivers/scsi/qla2xxx/qla_init.c @@ -978,8 +978,6 @@ static void qla24xx_async_gnl_sp_done(srb_t *sp, int res) sp->name, res, sp->u.iocb_cmd.u.mbx.in_mb[1], sp->u.iocb_cmd.u.mbx.in_mb[2]); - if (res == QLA_FUNCTION_TIMEOUT) - return; sp->fcport->flags &= ~(FCF_ASYNC_SENT|FCF_ASYNC_ACTIVE); memset(&ea, 0, sizeof(ea)); @@ -1017,8 +1015,8 @@ static void qla24xx_async_gnl_sp_done(srb_t *sp, int res) spin_unlock_irqrestore(&vha->hw->tgt.sess_lock, flags); list_for_each_entry_safe(fcport, tf, &h, gnl_entry) { - list_del_init(&fcport->gnl_entry); spin_lock_irqsave(&vha->hw->tgt.sess_lock, flags); + list_del_init(&fcport->gnl_entry); fcport->flags &= ~(FCF_ASYNC_SENT | FCF_ASYNC_ACTIVE); spin_unlock_irqrestore(&vha->hw->tgt.sess_lock, flags); ea.fcport = fcport; From 6c915fd2222ea72bea31717def0f3bc811c02183 Mon Sep 17 00:00:00 2001 From: Quinn Tran Date: Tue, 26 Oct 2021 04:54:02 -0700 Subject: [PATCH 1635/5344] scsi: qla2xxx: Turn off target reset during issue_lip [ Upstream commit 0b7a9fd934a68ebfc1019811b7bdc1742072ad7b ] When user uses issue_lip to do link bounce, driver sends additional target reset to remote device before resetting the link. The target reset would affect other paths with active I/Os. This patch will remove the unnecessary target reset. Link: https://lore.kernel.org/r/20211026115412.27691-4-njavali@marvell.com Fixes: 5854771e314e ("[SCSI] qla2xxx: Add ISPFX00 specific bus reset routine") Reviewed-by: Himanshu Madhani Signed-off-by: Quinn Tran Signed-off-by: Nilesh Javali Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/qla2xxx/qla_gbl.h | 2 -- drivers/scsi/qla2xxx/qla_mr.c | 23 ----------------------- drivers/scsi/qla2xxx/qla_os.c | 27 ++------------------------- 3 files changed, 2 insertions(+), 50 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_gbl.h b/drivers/scsi/qla2xxx/qla_gbl.h index 7aa233771ec86..1a98e37c9be22 100644 --- a/drivers/scsi/qla2xxx/qla_gbl.h +++ b/drivers/scsi/qla2xxx/qla_gbl.h @@ -156,7 +156,6 @@ extern int ql2xasynctmfenable; extern int ql2xgffidenable; extern int ql2xenabledif; extern int ql2xenablehba_err_chk; -extern int ql2xtargetreset; extern int ql2xdontresethba; extern uint64_t ql2xmaxlun; extern int ql2xmdcapmask; @@ -770,7 +769,6 @@ extern void qlafx00_abort_iocb(srb_t *, struct abort_iocb_entry_fx00 *); extern void qlafx00_fxdisc_iocb(srb_t *, struct fxdisc_entry_fx00 *); extern void qlafx00_timer_routine(scsi_qla_host_t *); extern int qlafx00_rescan_isp(scsi_qla_host_t *); -extern int qlafx00_loop_reset(scsi_qla_host_t *vha); /* qla82xx related functions */ diff --git a/drivers/scsi/qla2xxx/qla_mr.c b/drivers/scsi/qla2xxx/qla_mr.c index 605b59c76c901..badd09c5dd429 100644 --- a/drivers/scsi/qla2xxx/qla_mr.c +++ b/drivers/scsi/qla2xxx/qla_mr.c @@ -740,29 +740,6 @@ qlafx00_lun_reset(fc_port_t *fcport, uint64_t l, int tag) return qla2x00_async_tm_cmd(fcport, TCF_LUN_RESET, l, tag); } -int -qlafx00_loop_reset(scsi_qla_host_t *vha) -{ - int ret; - struct fc_port *fcport; - struct qla_hw_data *ha = vha->hw; - - if (ql2xtargetreset) { - list_for_each_entry(fcport, &vha->vp_fcports, list) { - if (fcport->port_type != FCT_TARGET) - continue; - - ret = ha->isp_ops->target_reset(fcport, 0, 0); - if (ret != QLA_SUCCESS) { - ql_dbg(ql_dbg_taskm, vha, 0x803d, - "Bus Reset failed: Reset=%d " - "d_id=%x.\n", ret, fcport->d_id.b24); - } - } - } - return QLA_SUCCESS; -} - int qlafx00_iospace_config(struct qla_hw_data *ha) { diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 049a68c59c137..c1d4c964b0dd4 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -191,12 +191,6 @@ MODULE_PARM_DESC(ql2xdbwr, " 0 -- Regular doorbell.\n" " 1 -- CAMRAM doorbell (faster).\n"); -int ql2xtargetreset = 1; -module_param(ql2xtargetreset, int, S_IRUGO); -MODULE_PARM_DESC(ql2xtargetreset, - "Enable target reset." - "Default is 1 - use hw defaults."); - int ql2xgffidenable; module_param(ql2xgffidenable, int, S_IRUGO); MODULE_PARM_DESC(ql2xgffidenable, @@ -1638,27 +1632,10 @@ int qla2x00_loop_reset(scsi_qla_host_t *vha) { int ret; - struct fc_port *fcport; struct qla_hw_data *ha = vha->hw; - if (IS_QLAFX00(ha)) { - return qlafx00_loop_reset(vha); - } - - if (ql2xtargetreset == 1 && ha->flags.enable_target_reset) { - list_for_each_entry(fcport, &vha->vp_fcports, list) { - if (fcport->port_type != FCT_TARGET) - continue; - - ret = ha->isp_ops->target_reset(fcport, 0, 0); - if (ret != QLA_SUCCESS) { - ql_dbg(ql_dbg_taskm, vha, 0x802c, - "Bus Reset failed: Reset=%d " - "d_id=%x.\n", ret, fcport->d_id.b24); - } - } - } - + if (IS_QLAFX00(ha)) + return QLA_SUCCESS; if (ha->flags.enable_lip_full_login && !IS_CNA_CAPABLE(ha)) { atomic_set(&vha->loop_state, LOOP_DOWN); From 884192e8c6086ecacd060bfeb8c6ac323a023934 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 19 Aug 2021 22:48:08 +0200 Subject: [PATCH 1636/5344] i2c: xlr: Fix a resource leak in the error handling path of 'xlr_i2c_probe()' [ Upstream commit 7f98960c046ee1136e7096aee168eda03aef8a5d ] A successful 'clk_prepare()' call should be balanced by a corresponding 'clk_unprepare()' call in the error handling path of the probe, as already done in the remove function. More specifically, 'clk_prepare_enable()' is used, but 'clk_disable()' is also already called. So just the unprepare step has still to be done. Update the error handling path accordingly. Fixes: 75d31c2372e4 ("i2c: xlr: add support for Sigma Designs controller variant") Signed-off-by: Christophe JAILLET Signed-off-by: Wolfram Sang Signed-off-by: Sasha Levin --- drivers/i2c/busses/i2c-xlr.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-xlr.c b/drivers/i2c/busses/i2c-xlr.c index 34cd4b3085402..dda6cb848405b 100644 --- a/drivers/i2c/busses/i2c-xlr.c +++ b/drivers/i2c/busses/i2c-xlr.c @@ -433,11 +433,15 @@ static int xlr_i2c_probe(struct platform_device *pdev) i2c_set_adapdata(&priv->adap, priv); ret = i2c_add_numbered_adapter(&priv->adap); if (ret < 0) - return ret; + goto err_unprepare_clk; platform_set_drvdata(pdev, priv); dev_info(&priv->adap.dev, "Added I2C Bus.\n"); return 0; + +err_unprepare_clk: + clk_unprepare(clk); + return ret; } static int xlr_i2c_remove(struct platform_device *pdev) From 670681b80489db76218d3e6236257f8fa12f951d Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 8 Oct 2021 15:44:17 +0800 Subject: [PATCH 1637/5344] xen-pciback: Fix return in pm_ctrl_init() [ Upstream commit 4745ea2628bb43a7ec34b71763b5a56407b33990 ] Return NULL instead of passing to ERR_PTR while err is zero, this fix smatch warnings: drivers/xen/xen-pciback/conf_space_capability.c:163 pm_ctrl_init() warn: passing zero to 'ERR_PTR' Fixes: a92336a1176b ("xen/pciback: Drop two backends, squash and cleanup some code.") Signed-off-by: YueHaibing Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20211008074417.8260-1-yuehaibing@huawei.com Signed-off-by: Boris Ostrovsky Signed-off-by: Sasha Levin --- drivers/xen/xen-pciback/conf_space_capability.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/xen-pciback/conf_space_capability.c b/drivers/xen/xen-pciback/conf_space_capability.c index e5694133ebe57..42f0f64fcba47 100644 --- a/drivers/xen/xen-pciback/conf_space_capability.c +++ b/drivers/xen/xen-pciback/conf_space_capability.c @@ -160,7 +160,7 @@ static void *pm_ctrl_init(struct pci_dev *dev, int offset) } out: - return ERR_PTR(err); + return err ? ERR_PTR(err) : NULL; } static const struct config_field caplist_pm[] = { From 47be49cf147cb50091cd1ba919d76508c9a2d47c Mon Sep 17 00:00:00 2001 From: Maxim Kiselev Date: Mon, 1 Nov 2021 18:23:41 +0300 Subject: [PATCH 1638/5344] net: davinci_emac: Fix interrupt pacing disable [ Upstream commit d52bcb47bdf971a59a2467975d2405fcfcb2fa19 ] This patch allows to use 0 for `coal->rx_coalesce_usecs` param to disable rx irq coalescing. Previously we could enable rx irq coalescing via ethtool (For ex: `ethtool -C eth0 rx-usecs 2000`) but we couldn't disable it because this part rejects 0 value: if (!coal->rx_coalesce_usecs) return -EINVAL; Fixes: 84da2658a619 ("TI DaVinci EMAC : Implement interrupt pacing functionality.") Signed-off-by: Maxim Kiselev Reviewed-by: Grygorii Strashko Link: https://lore.kernel.org/r/20211101152343.4193233-1-bigunclemax@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/ti/davinci_emac.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c index 6869c5c74b9f7..fac59032bf83a 100644 --- a/drivers/net/ethernet/ti/davinci_emac.c +++ b/drivers/net/ethernet/ti/davinci_emac.c @@ -412,8 +412,20 @@ static int emac_set_coalesce(struct net_device *ndev, u32 int_ctrl, num_interrupts = 0; u32 prescale = 0, addnl_dvdr = 1, coal_intvl = 0; - if (!coal->rx_coalesce_usecs) - return -EINVAL; + if (!coal->rx_coalesce_usecs) { + priv->coal_intvl = 0; + + switch (priv->version) { + case EMAC_VERSION_2: + emac_ctrl_write(EMAC_DM646X_CMINTCTRL, 0); + break; + default: + emac_ctrl_write(EMAC_CTRL_EWINTTCNT, 0); + break; + } + + return 0; + } coal_intvl = coal->rx_coalesce_usecs; From 1581afeb2a43f9a48ed6131ec148ae8fd8534ea3 Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Tue, 2 Nov 2021 10:12:18 +0800 Subject: [PATCH 1639/5344] net: vlan: fix a UAF in vlan_dev_real_dev() [ Upstream commit 563bcbae3ba233c275c244bfce2efe12938f5363 ] The real_dev of a vlan net_device may be freed after unregister_vlan_dev(). Access the real_dev continually by vlan_dev_real_dev() will trigger the UAF problem for the real_dev like following: ================================================================== BUG: KASAN: use-after-free in vlan_dev_real_dev+0xf9/0x120 Call Trace: kasan_report.cold+0x83/0xdf vlan_dev_real_dev+0xf9/0x120 is_eth_port_of_netdev_filter.part.0+0xb1/0x2c0 is_eth_port_of_netdev_filter+0x28/0x40 ib_enum_roce_netdev+0x1a3/0x300 ib_enum_all_roce_netdevs+0xc7/0x140 netdevice_event_work_handler+0x9d/0x210 ... Freed by task 9288: kasan_save_stack+0x1b/0x40 kasan_set_track+0x1c/0x30 kasan_set_free_info+0x20/0x30 __kasan_slab_free+0xfc/0x130 slab_free_freelist_hook+0xdd/0x240 kfree+0xe4/0x690 kvfree+0x42/0x50 device_release+0x9f/0x240 kobject_put+0x1c8/0x530 put_device+0x1b/0x30 free_netdev+0x370/0x540 ppp_destroy_interface+0x313/0x3d0 ... Move the put_device(real_dev) to vlan_dev_free(). Ensure real_dev not be freed before vlan_dev unregistered. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+e4df4e1389e28972e955@syzkaller.appspotmail.com Signed-off-by: Ziyang Xuan Reviewed-by: Jason Gunthorpe Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/8021q/vlan.c | 3 --- net/8021q/vlan_dev.c | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 3f47abf9ef4a6..cd7c0429cddf8 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -116,9 +116,6 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head) } vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); - - /* Get rid of the vlan's reference to real_dev */ - dev_put(real_dev); } int vlan_check_real_dev(struct net_device *real_dev, diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 2a78da4072de9..415a29d42cdf0 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -790,6 +790,9 @@ static void vlan_dev_free(struct net_device *dev) free_percpu(vlan->vlan_pcpu_stats); vlan->vlan_pcpu_stats = NULL; + + /* Get rid of the vlan's reference to real_dev */ + dev_put(vlan->real_dev); } void vlan_setup(struct net_device *dev) From 8d182294419b44d3eaad832cbebfd286fe501b1c Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 31 Oct 2021 16:31:35 +0100 Subject: [PATCH 1640/5344] ACPI: PMIC: Fix intel_pmic_regs_handler() read accesses [ Upstream commit 009a789443fe4c8e6b1ecb7c16b4865c026184cd ] The handling of PMIC register reads through writing 0 to address 4 of the OpRegion is wrong. Instead of returning the read value through the value64, which is a no-op for function == ACPI_WRITE calls, store the value and then on a subsequent function == ACPI_READ with address == 3 (the address for the value field of the OpRegion) return the stored value. This has been tested on a Xiaomi Mi Pad 2 and makes the ACPI battery dev there mostly functional (unfortunately there are still other issues). Here are the SET() / GET() functions of the PMIC ACPI device, which use this OpRegion, which clearly show the new behavior to be correct: OperationRegion (REGS, 0x8F, Zero, 0x50) Field (REGS, ByteAcc, NoLock, Preserve) { CLNT, 8, SA, 8, OFF, 8, VAL, 8, RWM, 8 } Method (GET, 3, Serialized) { If ((AVBE == One)) { CLNT = Arg0 SA = Arg1 OFF = Arg2 RWM = Zero If ((AVBG == One)) { GPRW = Zero } } Return (VAL) /* \_SB_.PCI0.I2C7.PMI5.VAL_ */ } Method (SET, 4, Serialized) { If ((AVBE == One)) { CLNT = Arg0 SA = Arg1 OFF = Arg2 VAL = Arg3 RWM = One If ((AVBG == One)) { GPRW = One } } } Fixes: 0afa877a5650 ("ACPI / PMIC: intel: add REGS operation region support") Signed-off-by: Hans de Goede Signed-off-by: Rafael J. Wysocki Signed-off-by: Sasha Levin --- drivers/acpi/pmic/intel_pmic.c | 51 +++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/drivers/acpi/pmic/intel_pmic.c b/drivers/acpi/pmic/intel_pmic.c index 452041398b347..36d5a5d50b2ff 100644 --- a/drivers/acpi/pmic/intel_pmic.c +++ b/drivers/acpi/pmic/intel_pmic.c @@ -211,31 +211,36 @@ static acpi_status intel_pmic_regs_handler(u32 function, void *handler_context, void *region_context) { struct intel_pmic_opregion *opregion = region_context; - int result = 0; + int result = -EINVAL; + + if (function == ACPI_WRITE) { + switch (address) { + case 0: + return AE_OK; + case 1: + opregion->ctx.addr |= (*value64 & 0xff) << 8; + return AE_OK; + case 2: + opregion->ctx.addr |= *value64 & 0xff; + return AE_OK; + case 3: + opregion->ctx.val = *value64 & 0xff; + return AE_OK; + case 4: + if (*value64) { + result = regmap_write(opregion->regmap, opregion->ctx.addr, + opregion->ctx.val); + } else { + result = regmap_read(opregion->regmap, opregion->ctx.addr, + &opregion->ctx.val); + } + opregion->ctx.addr = 0; + } + } - switch (address) { - case 0: - return AE_OK; - case 1: - opregion->ctx.addr |= (*value64 & 0xff) << 8; - return AE_OK; - case 2: - opregion->ctx.addr |= *value64 & 0xff; + if (function == ACPI_READ && address == 3) { + *value64 = opregion->ctx.val; return AE_OK; - case 3: - opregion->ctx.val = *value64 & 0xff; - return AE_OK; - case 4: - if (*value64) { - result = regmap_write(opregion->regmap, opregion->ctx.addr, - opregion->ctx.val); - } else { - result = regmap_read(opregion->regmap, opregion->ctx.addr, - &opregion->ctx.val); - if (result == 0) - *value64 = opregion->ctx.val; - } - memset(&opregion->ctx, 0x00, sizeof(opregion->ctx)); } if (result < 0) { From ca195f4300016cb478ab0d2ca36513896e101300 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 5 Nov 2021 13:45:03 -0700 Subject: [PATCH 1641/5344] mm/zsmalloc.c: close race window between zs_pool_dec_isolated() and zs_unregister_migration() [ Upstream commit afe8605ca45424629fdddfd85984b442c763dc47 ] There is one possible race window between zs_pool_dec_isolated() and zs_unregister_migration() because wait_for_isolated_drain() checks the isolated count without holding class->lock and there is no order inside zs_pool_dec_isolated(). Thus the below race window could be possible: zs_pool_dec_isolated zs_unregister_migration check pool->destroying != 0 pool->destroying = true; smp_mb(); wait_for_isolated_drain() wait for pool->isolated_pages == 0 atomic_long_dec(&pool->isolated_pages); atomic_long_read(&pool->isolated_pages) == 0 Since we observe the pool->destroying (false) before atomic_long_dec() for pool->isolated_pages, waking pool->migration_wait up is missed. Fix this by ensure checking pool->destroying happens after the atomic_long_dec(&pool->isolated_pages). Link: https://lkml.kernel.org/r/20210708115027.7557-1-linmiaohe@huawei.com Fixes: 701d678599d0 ("mm/zsmalloc.c: fix race condition in zs_destroy_pool") Signed-off-by: Miaohe Lin Cc: Minchan Kim Cc: Sergey Senozhatsky Cc: Henry Burns Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- mm/zsmalloc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 443b3b1c95818..490e5f3ae614a 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1835,10 +1835,11 @@ static inline void zs_pool_dec_isolated(struct zs_pool *pool) VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0); atomic_long_dec(&pool->isolated_pages); /* - * There's no possibility of racing, since wait_for_isolated_drain() - * checks the isolated count under &class->lock after enqueuing - * on migration_wait. + * Checking pool->destroying must happen after atomic_long_dec() + * for pool->isolated_pages above. Paired with the smp_mb() in + * zs_unregister_migration(). */ + smp_mb__after_atomic(); if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying) wake_up_all(&pool->migration_wait); } From 14970a02aaff6eb943d17d964ed96fe6fd11edd1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 5 Nov 2021 13:45:12 -0700 Subject: [PATCH 1642/5344] zram: off by one in read_block_state() [ Upstream commit a88e03cf3d190cf46bc4063a9b7efe87590de5f4 ] snprintf() returns the number of bytes it would have printed if there were space. But it does not count the NUL terminator. So that means that if "count == copied" then this has already overflowed by one character. This bug likely isn't super harmful in real life. Link: https://lkml.kernel.org/r/20210916130404.GA25094@kili Fixes: c0265342bff4 ("zram: introduce zram memory tracking") Signed-off-by: Dan Carpenter Cc: Minchan Kim Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- drivers/block/zram/zram_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 719c6b7741afa..1cc9b67e9bcaa 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -901,7 +901,7 @@ static ssize_t read_block_state(struct file *file, char __user *buf, zram_test_flag(zram, index, ZRAM_HUGE) ? 'h' : '.', zram_test_flag(zram, index, ZRAM_IDLE) ? 'i' : '.'); - if (count < copied) { + if (count <= copied) { zram_slot_unlock(zram, index); break; } From c7bd63b100291bfcbedf11713ea3c42a6db2d642 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 5 Nov 2021 22:37:33 -0700 Subject: [PATCH 1643/5344] perf bpf: Add missing free to bpf_event__print_bpf_prog_info() [ Upstream commit 88c42f4d6cb249eb68524282f8d4cc32f9059984 ] If btf__new() is called then there needs to be a corresponding btf__free(). Fixes: f8dfeae009effc0b ("perf bpf: Show more BPF program info in print_bpf_prog_info()") Signed-off-by: Ian Rogers Cc: Alexander Shishkin Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Daniel Borkmann Cc: Jiri Olsa Cc: John Fastabend Cc: KP Singh Cc: Mark Rutland Cc: Martin KaFai Lau Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Song Liu Cc: Stephane Eranian Cc: Tiezhu Yang Cc: Yonghong Song Cc: bpf@vger.kernel.org Cc: netdev@vger.kernel.org Link: http://lore.kernel.org/lkml/20211106053733.3580931-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Sasha Levin --- tools/perf/util/bpf-event.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c index f7ed5d122e229..c766813d56be0 100644 --- a/tools/perf/util/bpf-event.c +++ b/tools/perf/util/bpf-event.c @@ -467,7 +467,7 @@ void bpf_event__print_bpf_prog_info(struct bpf_prog_info *info, synthesize_bpf_prog_name(name, KSYM_NAME_LEN, info, btf, 0); fprintf(fp, "# bpf_prog_info %u: %s addr 0x%llx size %u\n", info->id, name, prog_addrs[0], prog_lens[0]); - return; + goto out; } fprintf(fp, "# bpf_prog_info %u:\n", info->id); @@ -477,4 +477,6 @@ void bpf_event__print_bpf_prog_info(struct bpf_prog_info *info, fprintf(fp, "# \tsub_prog %u: %s addr 0x%llx size %u\n", i, name, prog_addrs[i], prog_lens[i]); } +out: + btf__free(btf); } From 4ae92d654350a939bdb34c8a153642ef2a826817 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Nov 2021 14:42:14 -0700 Subject: [PATCH 1644/5344] llc: fix out-of-bound array index in llc_sk_dev_hash() [ Upstream commit 8ac9dfd58b138f7e82098a4e0a0d46858b12215b ] Both ifindex and LLC_SK_DEV_HASH_ENTRIES are signed. This means that (ifindex % LLC_SK_DEV_HASH_ENTRIES) is negative if @ifindex is negative. We could simply make LLC_SK_DEV_HASH_ENTRIES unsigned. In this patch I chose to use hash_32() to get more entropy from @ifindex, like llc_sk_laddr_hashfn(). UBSAN: array-index-out-of-bounds in ./include/net/llc.h:75:26 index -43 is out of range for type 'hlist_head [64]' CPU: 1 PID: 20999 Comm: syz-executor.3 Not tainted 5.15.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 ubsan_epilogue+0xb/0x5a lib/ubsan.c:151 __ubsan_handle_out_of_bounds.cold+0x62/0x6c lib/ubsan.c:291 llc_sk_dev_hash include/net/llc.h:75 [inline] llc_sap_add_socket+0x49c/0x520 net/llc/llc_conn.c:697 llc_ui_bind+0x680/0xd70 net/llc/af_llc.c:404 __sys_bind+0x1e9/0x250 net/socket.c:1693 __do_sys_bind net/socket.c:1704 [inline] __se_sys_bind net/socket.c:1702 [inline] __x64_sys_bind+0x6f/0xb0 net/socket.c:1702 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7fa503407ae9 Fixes: 6d2e3ea28446 ("llc: use a device based hash table to speed up multicast delivery") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/net/llc.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/net/llc.h b/include/net/llc.h index df282d9b40170..9c10b121b49b0 100644 --- a/include/net/llc.h +++ b/include/net/llc.h @@ -72,7 +72,9 @@ struct llc_sap { static inline struct hlist_head *llc_sk_dev_hash(struct llc_sap *sap, int ifindex) { - return &sap->sk_dev_hash[ifindex % LLC_SK_DEV_HASH_ENTRIES]; + u32 bucket = hash_32(ifindex, LLC_SK_DEV_HASH_BITS); + + return &sap->sk_dev_hash[bucket]; } static inline From 8c64c0cc4156c4b00e2dd06f8aba5740c90fbbca Mon Sep 17 00:00:00 2001 From: Chengfeng Ye Date: Fri, 5 Nov 2021 06:36:36 -0700 Subject: [PATCH 1645/5344] nfc: pn533: Fix double free when pn533_fill_fragment_skbs() fails [ Upstream commit 9fec40f850658e00a14a7dd9e06f7fbc7e59cc4a ] skb is already freed by dev_kfree_skb in pn533_fill_fragment_skbs, but follow error handler branch when pn533_fill_fragment_skbs() fails, skb is freed again, results in double free issue. Fix this by not free skb in error path of pn533_fill_fragment_skbs. Fixes: 963a82e07d4e ("NFC: pn533: Split large Tx frames in chunks") Fixes: 93ad42020c2d ("NFC: pn533: Target mode Tx fragmentation support") Signed-off-by: Chengfeng Ye Reviewed-by: Dan Carpenter Reviewed-by: Krzysztof Kozlowski Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/nfc/pn533/pn533.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nfc/pn533/pn533.c b/drivers/nfc/pn533/pn533.c index 3ea38ce86cc9f..807b7b37d9dce 100644 --- a/drivers/nfc/pn533/pn533.c +++ b/drivers/nfc/pn533/pn533.c @@ -2072,7 +2072,7 @@ static int pn533_fill_fragment_skbs(struct pn533 *dev, struct sk_buff *skb) frag = pn533_alloc_skb(dev, frag_size); if (!frag) { skb_queue_purge(&dev->fragment_skb); - break; + return -ENOMEM; } if (!dev->tgt_mode) { @@ -2143,7 +2143,7 @@ static int pn533_transceive(struct nfc_dev *nfc_dev, /* jumbo frame ? */ if (skb->len > PN533_CMD_DATAEXCH_DATA_MAXLEN) { rc = pn533_fill_fragment_skbs(dev, skb); - if (rc <= 0) + if (rc < 0) goto error; skb = skb_dequeue(&dev->fragment_skb); @@ -2215,7 +2215,7 @@ static int pn533_tm_send(struct nfc_dev *nfc_dev, struct sk_buff *skb) /* let's split in multiple chunks if size's too big */ if (skb->len > PN533_CMD_DATAEXCH_DATA_MAXLEN) { rc = pn533_fill_fragment_skbs(dev, skb); - if (rc <= 0) + if (rc < 0) goto error; /* get the first skb */ From b05cc85d3432d7aed2b0ea483034ecb0fcc2466f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 5 Nov 2021 08:54:03 +0100 Subject: [PATCH 1646/5344] arm64: pgtable: make __pte_to_phys/__phys_to_pte_val inline functions [ Upstream commit c7c386fbc20262c1d911c615c65db6a58667d92c ] gcc warns about undefined behavior the vmalloc code when building with CONFIG_ARM64_PA_BITS_52, when the 'idx++' in the argument to __phys_to_pte_val() is evaluated twice: mm/vmalloc.c: In function 'vmap_pfn_apply': mm/vmalloc.c:2800:58: error: operation on 'data->idx' may be undefined [-Werror=sequence-point] 2800 | *pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot)); | ~~~~~~~~~^~ arch/arm64/include/asm/pgtable-types.h:25:37: note: in definition of macro '__pte' 25 | #define __pte(x) ((pte_t) { (x) } ) | ^ arch/arm64/include/asm/pgtable.h:80:15: note: in expansion of macro '__phys_to_pte_val' 80 | __pte(__phys_to_pte_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) | ^~~~~~~~~~~~~~~~~ mm/vmalloc.c:2800:30: note: in expansion of macro 'pfn_pte' 2800 | *pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot)); | ^~~~~~~ I have no idea why this never showed up earlier, but the safest workaround appears to be changing those macros into inline functions so the arguments get evaluated only once. Cc: Matthew Wilcox Fixes: 75387b92635e ("arm64: handle 52-bit physical addresses in page table entries") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20211105075414.2553155-1-arnd@kernel.org Signed-off-by: Will Deacon Signed-off-by: Sasha Levin --- arch/arm64/include/asm/pgtable.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 74dffb48b3970..c7b54b0fc20b6 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -54,9 +54,15 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; * page table entry, taking care of 52-bit addresses. */ #ifdef CONFIG_ARM64_PA_BITS_52 -#define __pte_to_phys(pte) \ - ((pte_val(pte) & PTE_ADDR_LOW) | ((pte_val(pte) & PTE_ADDR_HIGH) << 36)) -#define __phys_to_pte_val(phys) (((phys) | ((phys) >> 36)) & PTE_ADDR_MASK) +static inline phys_addr_t __pte_to_phys(pte_t pte) +{ + return (pte_val(pte) & PTE_ADDR_LOW) | + ((pte_val(pte) & PTE_ADDR_HIGH) << 36); +} +static inline pteval_t __phys_to_pte_val(phys_addr_t phys) +{ + return (phys | (phys >> 36)) & PTE_ADDR_MASK; +} #else #define __pte_to_phys(pte) (pte_val(pte) & PTE_ADDR_MASK) #define __phys_to_pte_val(phys) (phys) From 1970c82b2c53d723465b7a072960952647145806 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 3 Nov 2021 13:47:35 -0700 Subject: [PATCH 1647/5344] bpf: sockmap, strparser, and tls are reusing qdisc_skb_cb and colliding [ Upstream commit e0dc3b93bd7bcff8c3813d1df43e0908499c7cf0 ] Strparser is reusing the qdisc_skb_cb struct to stash the skb message handling progress, e.g. offset and length of the skb. First this is poorly named and inherits a struct from qdisc that doesn't reflect the actual usage of cb[] at this layer. But, more importantly strparser is using the following to access its metadata. (struct _strp_msg *)((void *)skb->cb + offsetof(struct qdisc_skb_cb, data)) Where _strp_msg is defined as: struct _strp_msg { struct strp_msg strp; /* 0 8 */ int accum_len; /* 8 4 */ /* size: 12, cachelines: 1, members: 2 */ /* last cacheline: 12 bytes */ }; So we use 12 bytes of ->data[] in struct. However in BPF code running parser and verdict the user has read capabilities into the data[] array as well. Its not too problematic, but we should not be exposing internal state to BPF program. If its really needed then we can use the probe_read() APIs which allow reading kernel memory. And I don't believe cb[] layer poses any API breakage by moving this around because programs can't depend on cb[] across layers. In order to fix another issue with a ctx rewrite we need to stash a temp variable somewhere. To make this work cleanly this patch builds a cb struct for sk_skb types called sk_skb_cb struct. Then we can use this consistently in the strparser, sockmap space. Additionally we can start allowing ->cb[] write access after this. Fixes: 604326b41a6fb ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Tested-by: Jussi Maki Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20211103204736.248403-5-john.fastabend@gmail.com Signed-off-by: Sasha Levin --- include/net/strparser.h | 16 +++++++++++++++- net/core/filter.c | 21 +++++++++++++++++++++ net/strparser/strparser.c | 10 +--------- 3 files changed, 37 insertions(+), 10 deletions(-) diff --git a/include/net/strparser.h b/include/net/strparser.h index 1d20b98493a10..bec1439bd3be6 100644 --- a/include/net/strparser.h +++ b/include/net/strparser.h @@ -54,10 +54,24 @@ struct strp_msg { int offset; }; +struct _strp_msg { + /* Internal cb structure. struct strp_msg must be first for passing + * to upper layer. + */ + struct strp_msg strp; + int accum_len; +}; + +struct sk_skb_cb { +#define SK_SKB_CB_PRIV_LEN 20 + unsigned char data[SK_SKB_CB_PRIV_LEN]; + struct _strp_msg strp; +}; + static inline struct strp_msg *strp_msg(struct sk_buff *skb) { return (struct strp_msg *)((void *)skb->cb + - offsetof(struct qdisc_skb_cb, data)); + offsetof(struct sk_skb_cb, strp)); } /* Structure for an attached lower socket */ diff --git a/net/core/filter.c b/net/core/filter.c index d62cac6379170..702a6235d5535 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9315,6 +9315,27 @@ static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, off); break; + case offsetof(struct __sk_buff, cb[0]) ... + offsetofend(struct __sk_buff, cb[4]) - 1: + BUILD_BUG_ON(sizeof_field(struct sk_skb_cb, data) < 20); + BUILD_BUG_ON((offsetof(struct sk_buff, cb) + + offsetof(struct sk_skb_cb, data)) % + sizeof(__u64)); + + prog->cb_access = 1; + off = si->off; + off -= offsetof(struct __sk_buff, cb[0]); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct sk_skb_cb, data); + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg, + si->src_reg, off); + else + *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg, + si->src_reg, off); + break; + + default: return bpf_convert_ctx_access(type, si, insn_buf, prog, target_size); diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index b3815c1e8f2ea..cd9954c4ad808 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -27,18 +27,10 @@ static struct workqueue_struct *strp_wq; -struct _strp_msg { - /* Internal cb structure. struct strp_msg must be first for passing - * to upper layer. - */ - struct strp_msg strp; - int accum_len; -}; - static inline struct _strp_msg *_strp_msg(struct sk_buff *skb) { return (struct _strp_msg *)((void *)skb->cb + - offsetof(struct qdisc_skb_cb, data)); + offsetof(struct sk_skb_cb, strp)); } /* Lower lock held */ From 732636be501257ba9b8de5823c1dc60b3f6aad54 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 8 Nov 2021 10:08:15 -0800 Subject: [PATCH 1648/5344] net/sched: sch_taprio: fix undefined behavior in ktime_mono_to_any [ Upstream commit 6dc25401cba4d428328eade8ceae717633fdd702 ] 1) if q->tk_offset == TK_OFFS_MAX, then get_tcp_tstamp() calls ktime_mono_to_any() with out-of-bound value. 2) if q->tk_offset is changed in taprio_parse_clockid(), taprio_get_time() might also call ktime_mono_to_any() with out-of-bound value as sysbot found: UBSAN: array-index-out-of-bounds in kernel/time/timekeeping.c:908:27 index 3 is out of range for type 'ktime_t *[3]' CPU: 1 PID: 25668 Comm: kworker/u4:0 Not tainted 5.15.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: bat_events batadv_iv_send_outstanding_bat_ogm_packet Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 ubsan_epilogue+0xb/0x5a lib/ubsan.c:151 __ubsan_handle_out_of_bounds.cold+0x62/0x6c lib/ubsan.c:291 ktime_mono_to_any+0x1d4/0x1e0 kernel/time/timekeeping.c:908 get_tcp_tstamp net/sched/sch_taprio.c:322 [inline] get_packet_txtime net/sched/sch_taprio.c:353 [inline] taprio_enqueue_one+0x5b0/0x1460 net/sched/sch_taprio.c:420 taprio_enqueue+0x3b1/0x730 net/sched/sch_taprio.c:485 dev_qdisc_enqueue+0x40/0x300 net/core/dev.c:3785 __dev_xmit_skb net/core/dev.c:3869 [inline] __dev_queue_xmit+0x1f6e/0x3630 net/core/dev.c:4194 batadv_send_skb_packet+0x4a9/0x5f0 net/batman-adv/send.c:108 batadv_iv_ogm_send_to_if net/batman-adv/bat_iv_ogm.c:393 [inline] batadv_iv_ogm_emit net/batman-adv/bat_iv_ogm.c:421 [inline] batadv_iv_send_outstanding_bat_ogm_packet+0x6d7/0x8e0 net/batman-adv/bat_iv_ogm.c:1701 process_one_work+0x9b2/0x1690 kernel/workqueue.c:2298 worker_thread+0x658/0x11f0 kernel/workqueue.c:2445 kthread+0x405/0x4f0 kernel/kthread.c:327 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 Fixes: 7ede7b03484b ("taprio: make clock reference conversions easier") Fixes: 54002066100b ("taprio: Adjust timestamps for TCP packets") Signed-off-by: Eric Dumazet Cc: Vedang Patel Reported-by: syzbot Reviewed-by: Vinicius Costa Gomes Link: https://lore.kernel.org/r/20211108180815.1822479-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/sched/sch_taprio.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index e14a66ce4884d..b268e61304515 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -94,18 +94,22 @@ static ktime_t sched_base_time(const struct sched_gate_list *sched) return ns_to_ktime(sched->base_time); } -static ktime_t taprio_get_time(struct taprio_sched *q) +static ktime_t taprio_mono_to_any(const struct taprio_sched *q, ktime_t mono) { - ktime_t mono = ktime_get(); + /* This pairs with WRITE_ONCE() in taprio_parse_clockid() */ + enum tk_offsets tk_offset = READ_ONCE(q->tk_offset); - switch (q->tk_offset) { + switch (tk_offset) { case TK_OFFS_MAX: return mono; default: - return ktime_mono_to_any(mono, q->tk_offset); + return ktime_mono_to_any(mono, tk_offset); } +} - return KTIME_MAX; +static ktime_t taprio_get_time(const struct taprio_sched *q) +{ + return taprio_mono_to_any(q, ktime_get()); } static void taprio_free_sched_cb(struct rcu_head *head) @@ -321,7 +325,7 @@ static ktime_t get_tcp_tstamp(struct taprio_sched *q, struct sk_buff *skb) return 0; } - return ktime_mono_to_any(skb->skb_mstamp_ns, q->tk_offset); + return taprio_mono_to_any(q, skb->skb_mstamp_ns); } /* There are a few scenarios where we will have to modify the txtime from @@ -1342,6 +1346,7 @@ static int taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb, } } else if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { int clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]); + enum tk_offsets tk_offset; /* We only support static clockids and we don't allow * for it to be modified after the first init. @@ -1356,22 +1361,24 @@ static int taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb, switch (clockid) { case CLOCK_REALTIME: - q->tk_offset = TK_OFFS_REAL; + tk_offset = TK_OFFS_REAL; break; case CLOCK_MONOTONIC: - q->tk_offset = TK_OFFS_MAX; + tk_offset = TK_OFFS_MAX; break; case CLOCK_BOOTTIME: - q->tk_offset = TK_OFFS_BOOT; + tk_offset = TK_OFFS_BOOT; break; case CLOCK_TAI: - q->tk_offset = TK_OFFS_TAI; + tk_offset = TK_OFFS_TAI; break; default: NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); err = -EINVAL; goto out; } + /* This pairs with READ_ONCE() in taprio_mono_to_any */ + WRITE_ONCE(q->tk_offset, tk_offset); q->clockid = clockid; } else { From c92e16d01e007644b66ca1923528593b9a40aac6 Mon Sep 17 00:00:00 2001 From: Guangbin Huang Date: Wed, 10 Nov 2021 21:42:56 +0800 Subject: [PATCH 1649/5344] net: hns3: allow configure ETS bandwidth of all TCs [ Upstream commit 688db0c7a4a69ddc8b8143a1cac01eb20082a3aa ] Currently, driver only allow configuring ETS bandwidth of TCs according to the max TC number queried from firmware. However, the hardware actually supports 8 TCs and users may need to configure ETS bandwidth of all TCs, so remove the restriction. Fixes: 330baff5423b ("net: hns3: add ETS TC weight setting in SSU module") Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c | 2 +- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 9 +-------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c index 9076605403a74..bb22d91f6e53e 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c @@ -124,7 +124,7 @@ static int hclge_ets_validate(struct hclge_dev *hdev, struct ieee_ets *ets, if (ret) return ret; - for (i = 0; i < hdev->tc_max; i++) { + for (i = 0; i < HNAE3_MAX_TC; i++) { switch (ets->tc_tsa[i]) { case IEEE_8021QAZ_TSA_STRICT: if (hdev->tm_info.tc_info[i].tc_sch_mode != diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c index d98f0e2ec7aa3..8448607742a6b 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c @@ -974,7 +974,6 @@ static int hclge_tm_pri_tc_base_dwrr_cfg(struct hclge_dev *hdev) static int hclge_tm_ets_tc_dwrr_cfg(struct hclge_dev *hdev) { -#define DEFAULT_TC_WEIGHT 1 #define DEFAULT_TC_OFFSET 14 struct hclge_ets_tc_weight_cmd *ets_weight; @@ -987,13 +986,7 @@ static int hclge_tm_ets_tc_dwrr_cfg(struct hclge_dev *hdev) for (i = 0; i < HNAE3_MAX_TC; i++) { struct hclge_pg_info *pg_info; - ets_weight->tc_weight[i] = DEFAULT_TC_WEIGHT; - - if (!(hdev->hw_tc_map & BIT(i))) - continue; - - pg_info = - &hdev->tm_info.pg_info[hdev->tm_info.tc_info[i].pgid]; + pg_info = &hdev->tm_info.pg_info[hdev->tm_info.tc_info[i].pgid]; ets_weight->tc_weight[i] = pg_info->tc_dwrr[i]; } From 1b8e785881f6469b9e769d293c402fd9e7f064b3 Mon Sep 17 00:00:00 2001 From: Eiichi Tsukata Date: Tue, 9 Nov 2021 00:15:02 +0000 Subject: [PATCH 1650/5344] vsock: prevent unnecessary refcnt inc for nonblocking connect [ Upstream commit c7cd82b90599fa10915f41e3dd9098a77d0aa7b6 ] Currently vosck_connect() increments sock refcount for nonblocking socket each time it's called, which can lead to memory leak if it's called multiple times because connect timeout function decrements sock refcount only once. Fixes it by making vsock_connect() return -EALREADY immediately when sock state is already SS_CONNECTING. Fixes: d021c344051a ("VSOCK: Introduce VM Sockets") Reviewed-by: Stefano Garzarella Signed-off-by: Eiichi Tsukata Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/vmw_vsock/af_vsock.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index d4104144bab1b..bc8055f4571bc 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1151,6 +1151,8 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, * non-blocking call. */ err = -EALREADY; + if (flags & O_NONBLOCK) + goto out; break; default: if ((sk->sk_state == TCP_LISTEN) || From bd841bdb7b8f3c6537608ee82a91db4e4c1bfbab Mon Sep 17 00:00:00 2001 From: Rahul Lakkireddy Date: Thu, 11 Nov 2021 15:55:16 +0530 Subject: [PATCH 1651/5344] cxgb4: fix eeprom len when diagnostics not implemented [ Upstream commit 4ca110bf8d9b31a60f8f8ff6706ea147d38ad97c ] Ensure diagnostics monitoring support is implemented for the SFF 8472 compliant port module and set the correct length for ethtool port module eeprom read. Fixes: f56ec6766dcf ("cxgb4: Add support for ethtool i2c dump") Signed-off-by: Manoj Malviya Signed-off-by: Rahul Lakkireddy Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c | 7 +++++-- drivers/net/ethernet/chelsio/cxgb4/t4_hw.h | 2 ++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c index f537be9cb3155..5ba30b8eb1e11 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c @@ -1466,12 +1466,15 @@ static int cxgb4_get_module_info(struct net_device *dev, if (ret) return ret; - if (!sff8472_comp || (sff_diag_type & 4)) { + if (!sff8472_comp || (sff_diag_type & SFP_DIAG_ADDRMODE)) { modinfo->type = ETH_MODULE_SFF_8079; modinfo->eeprom_len = ETH_MODULE_SFF_8079_LEN; } else { modinfo->type = ETH_MODULE_SFF_8472; - modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN; + if (sff_diag_type & SFP_DIAG_IMPLEMENTED) + modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN; + else + modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN / 2; } break; diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.h b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.h index 002fc62ea7262..63bc956d20376 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.h +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.h @@ -293,6 +293,8 @@ enum { #define I2C_PAGE_SIZE 0x100 #define SFP_DIAG_TYPE_ADDR 0x5c #define SFP_DIAG_TYPE_LEN 0x1 +#define SFP_DIAG_ADDRMODE BIT(2) +#define SFP_DIAG_IMPLEMENTED BIT(6) #define SFF_8472_COMP_ADDR 0x5e #define SFF_8472_COMP_LEN 0x1 #define SFF_REV_ADDR 0x1 From 64c52cf6f09a4a39a49ee65db15317e3eef32f48 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 11 Nov 2021 06:57:17 -0500 Subject: [PATCH 1652/5344] selftests/net: udpgso_bench_rx: fix port argument [ Upstream commit d336509cb9d03970911878bb77f0497f64fda061 ] The below commit added optional support for passing a bind address. It configures the sockaddr bind arguments before parsing options and reconfigures on options -b and -4. This broke support for passing port (-p) on its own. Configure sockaddr after parsing all arguments. Fixes: 3327a9c46352 ("selftests: add functionals test for UDP GRO") Reported-by: Eric Dumazet Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- tools/testing/selftests/net/udpgso_bench_rx.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/udpgso_bench_rx.c b/tools/testing/selftests/net/udpgso_bench_rx.c index 76a24052f4b47..6a193425c367f 100644 --- a/tools/testing/selftests/net/udpgso_bench_rx.c +++ b/tools/testing/selftests/net/udpgso_bench_rx.c @@ -293,19 +293,17 @@ static void usage(const char *filepath) static void parse_opts(int argc, char **argv) { + const char *bind_addr = NULL; int c; - /* bind to any by default */ - setup_sockaddr(PF_INET6, "::", &cfg_bind_addr); while ((c = getopt(argc, argv, "4b:C:Gl:n:p:rR:S:tv")) != -1) { switch (c) { case '4': cfg_family = PF_INET; cfg_alen = sizeof(struct sockaddr_in); - setup_sockaddr(PF_INET, "0.0.0.0", &cfg_bind_addr); break; case 'b': - setup_sockaddr(cfg_family, optarg, &cfg_bind_addr); + bind_addr = optarg; break; case 'C': cfg_connect_timeout_ms = strtoul(optarg, NULL, 0); @@ -341,6 +339,11 @@ static void parse_opts(int argc, char **argv) } } + if (!bind_addr) + bind_addr = cfg_family == PF_INET6 ? "::" : "0.0.0.0"; + + setup_sockaddr(cfg_family, bind_addr, &cfg_bind_addr); + if (optind != argc) usage(argv[0]); From 51c947c87e7f60cc4aaf95c7fffe4104fbdbdeb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Thu, 4 Nov 2021 17:28:28 +0100 Subject: [PATCH 1653/5344] ARM: 9155/1: fix early early_iounmap() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 0d08e7bf0d0d1a29aff7b16ef516f7415eb1aa05 upstream. Currently __set_fixmap() bails out with a warning when called in early boot from early_iounmap(). Fix it, and while at it, make the comment a bit easier to understand. Cc: Fixes: b089c31c519c ("ARM: 8667/3: Fix memory attribute inconsistencies when using fixmap") Acked-by: Ard Biesheuvel Signed-off-by: Michał Mirosław Signed-off-by: Russell King (Oracle) Signed-off-by: Greg Kroah-Hartman --- arch/arm/mm/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index ee943ac325560..538d5da741b09 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -415,9 +415,9 @@ void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot) FIXADDR_END); BUG_ON(idx >= __end_of_fixed_addresses); - /* we only support device mappings until pgprot_kernel has been set */ + /* We support only device mappings before pgprot_kernel is set. */ if (WARN_ON(pgprot_val(prot) != pgprot_val(FIXMAP_PAGE_IO) && - pgprot_val(pgprot_kernel) == 0)) + pgprot_val(prot) && pgprot_val(pgprot_kernel) == 0)) return; if (pgprot_val(prot)) From 49c9cc1e87bc5577f4fb43fba0fd564f13f2339e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 6 Nov 2021 19:42:29 +0100 Subject: [PATCH 1654/5344] ARM: 9156/1: drop cc-option fallbacks for architecture selection commit 418ace9992a7647c446ed3186df40cf165b67298 upstream. Naresh and Antonio ran into a build failure with latest Debian armhf compilers, with lots of output like tmp/ccY3nOAs.s:2215: Error: selected processor does not support `cpsid i' in ARM mode As it turns out, $(cc-option) fails early here when the FPU is not selected before CPU architecture is selected, as the compiler option check runs before enabling -msoft-float, which causes a problem when testing a target architecture level without an FPU: cc1: error: '-mfloat-abi=hard': selected architecture lacks an FPU Passing e.g. -march=armv6k+fp in place of -march=armv6k would avoid this issue, but the fallback logic is already broken because all supported compilers (gcc-5 and higher) are much more recent than these options, and building with -march=armv5t as a fallback no longer works. The best way forward that I see is to just remove all the checks, which also has the nice side-effect of slightly improving the startup time for 'make'. The -mtune=marvell-f option was apparently never supported by any mainline compiler, and the custom Codesourcery gcc build that did support is now too old to build kernels, so just use -mtune=xscale unconditionally for those. This should be safe to apply on all stable kernels, and will be required in order to keep building them with gcc-11 and higher. Link: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=996419 Reported-by: Antonio Terceiro Reported-by: Naresh Kamboju Reported-by: Sebastian Andrzej Siewior Tested-by: Sebastian Reichel Tested-by: Klaus Kudielka Cc: Matthias Klose Cc: stable@vger.kernel.org Signed-off-by: Arnd Bergmann Signed-off-by: Russell King (Oracle) Signed-off-by: Greg Kroah-Hartman --- arch/arm/Makefile | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/arm/Makefile b/arch/arm/Makefile index 1fc32b611f8a4..4f098edfbf202 100644 --- a/arch/arm/Makefile +++ b/arch/arm/Makefile @@ -66,15 +66,15 @@ KBUILD_CFLAGS += $(call cc-option,-fno-ipa-sra) # Note that GCC does not numerically define an architecture version # macro, but instead defines a whole series of macros which makes # testing for a specific architecture or later rather impossible. -arch-$(CONFIG_CPU_32v7M) =-D__LINUX_ARM_ARCH__=7 -march=armv7-m -Wa,-march=armv7-m -arch-$(CONFIG_CPU_32v7) =-D__LINUX_ARM_ARCH__=7 $(call cc-option,-march=armv7-a,-march=armv5t -Wa$(comma)-march=armv7-a) -arch-$(CONFIG_CPU_32v6) =-D__LINUX_ARM_ARCH__=6 $(call cc-option,-march=armv6,-march=armv5t -Wa$(comma)-march=armv6) +arch-$(CONFIG_CPU_32v7M) =-D__LINUX_ARM_ARCH__=7 -march=armv7-m +arch-$(CONFIG_CPU_32v7) =-D__LINUX_ARM_ARCH__=7 -march=armv7-a +arch-$(CONFIG_CPU_32v6) =-D__LINUX_ARM_ARCH__=6 -march=armv6 # Only override the compiler option if ARMv6. The ARMv6K extensions are # always available in ARMv7 ifeq ($(CONFIG_CPU_32v6),y) -arch-$(CONFIG_CPU_32v6K) =-D__LINUX_ARM_ARCH__=6 $(call cc-option,-march=armv6k,-march=armv5t -Wa$(comma)-march=armv6k) +arch-$(CONFIG_CPU_32v6K) =-D__LINUX_ARM_ARCH__=6 -march=armv6k endif -arch-$(CONFIG_CPU_32v5) =-D__LINUX_ARM_ARCH__=5 $(call cc-option,-march=armv5te,-march=armv4t) +arch-$(CONFIG_CPU_32v5) =-D__LINUX_ARM_ARCH__=5 -march=armv5te arch-$(CONFIG_CPU_32v4T) =-D__LINUX_ARM_ARCH__=4 -march=armv4t arch-$(CONFIG_CPU_32v4) =-D__LINUX_ARM_ARCH__=4 -march=armv4 arch-$(CONFIG_CPU_32v3) =-D__LINUX_ARM_ARCH__=3 -march=armv3m @@ -88,7 +88,7 @@ tune-$(CONFIG_CPU_ARM720T) =-mtune=arm7tdmi tune-$(CONFIG_CPU_ARM740T) =-mtune=arm7tdmi tune-$(CONFIG_CPU_ARM9TDMI) =-mtune=arm9tdmi tune-$(CONFIG_CPU_ARM940T) =-mtune=arm9tdmi -tune-$(CONFIG_CPU_ARM946E) =$(call cc-option,-mtune=arm9e,-mtune=arm9tdmi) +tune-$(CONFIG_CPU_ARM946E) =-mtune=arm9e tune-$(CONFIG_CPU_ARM920T) =-mtune=arm9tdmi tune-$(CONFIG_CPU_ARM922T) =-mtune=arm9tdmi tune-$(CONFIG_CPU_ARM925T) =-mtune=arm9tdmi @@ -96,11 +96,11 @@ tune-$(CONFIG_CPU_ARM926T) =-mtune=arm9tdmi tune-$(CONFIG_CPU_FA526) =-mtune=arm9tdmi tune-$(CONFIG_CPU_SA110) =-mtune=strongarm110 tune-$(CONFIG_CPU_SA1100) =-mtune=strongarm1100 -tune-$(CONFIG_CPU_XSCALE) =$(call cc-option,-mtune=xscale,-mtune=strongarm110) -Wa,-mcpu=xscale -tune-$(CONFIG_CPU_XSC3) =$(call cc-option,-mtune=xscale,-mtune=strongarm110) -Wa,-mcpu=xscale -tune-$(CONFIG_CPU_FEROCEON) =$(call cc-option,-mtune=marvell-f,-mtune=xscale) -tune-$(CONFIG_CPU_V6) =$(call cc-option,-mtune=arm1136j-s,-mtune=strongarm) -tune-$(CONFIG_CPU_V6K) =$(call cc-option,-mtune=arm1136j-s,-mtune=strongarm) +tune-$(CONFIG_CPU_XSCALE) =-mtune=xscale +tune-$(CONFIG_CPU_XSC3) =-mtune=xscale +tune-$(CONFIG_CPU_FEROCEON) =-mtune=xscale +tune-$(CONFIG_CPU_V6) =-mtune=arm1136j-s +tune-$(CONFIG_CPU_V6K) =-mtune=arm1136j-s # Evaluate tune cc-option calls now tune-y := $(tune-y) From 497cc9d92c911b1c07239d955a5ac2414c9cd785 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 4 Nov 2021 20:19:00 +0100 Subject: [PATCH 1655/5344] parisc: Fix backtrace to always include init funtion names commit 279917e27edc293eb645a25428c6ab3f3bca3f86 upstream. I noticed that sometimes at kernel startup the backtraces did not included the function names of init functions. Their address were not resolved to function names and instead only the address was printed. Debugging shows that the culprit is is_ksym_addr() which is called by the backtrace functions to check if an address belongs to a function in the kernel. The problem occurs only for CONFIG_KALLSYMS_ALL=y. When looking at is_ksym_addr() one can see that for CONFIG_KALLSYMS_ALL=y the function only tries to resolve the address via is_kernel() function, which checks like this: if (addr >= _stext && addr <= _end) return 1; On parisc the init functions are located before _stext, so this check fails. Other platforms seem to have all functions (including init functions) behind _stext. The following patch moves the _stext symbol at the beginning of the kernel and thus includes the init section. This fixes the check and does not seem to have any negative side effects on where the kernel mapping happens in the map_pages() function in arch/parisc/mm/init.c. Signed-off-by: Helge Deller Cc: stable@kernel.org # 5.4+ Signed-off-by: Greg Kroah-Hartman --- arch/parisc/kernel/vmlinux.lds.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S index 99cd24f2ea01b..164483b37d854 100644 --- a/arch/parisc/kernel/vmlinux.lds.S +++ b/arch/parisc/kernel/vmlinux.lds.S @@ -56,6 +56,8 @@ SECTIONS { . = KERNEL_BINARY_TEXT_START; + _stext = .; /* start of kernel text, includes init code & data */ + __init_begin = .; HEAD_TEXT_SECTION MLONGCALL_DISCARD(INIT_TEXT_SECTION(8)) @@ -79,7 +81,6 @@ SECTIONS /* freed after init ends here */ _text = .; /* Text and read-only data */ - _stext = .; MLONGCALL_KEEP(INIT_TEXT_SECTION(8)) .text ALIGN(PAGE_SIZE) : { TEXT_TEXT From 3f4b5f4c7a2e95c09cc9ac34fb9dc953541fed4e Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sun, 31 Oct 2021 21:58:12 +0100 Subject: [PATCH 1656/5344] parisc: Fix set_fixmap() on PA1.x CPUs commit 6e866a462867b60841202e900f10936a0478608c upstream. Fix a kernel crash which happens on PA1.x CPUs while initializing the FTRACE/KPROBE breakpoints. The PTE table entries for the fixmap area were not created correctly. Signed-off-by: Helge Deller Fixes: ccfbc68d41c2 ("parisc: add set_fixmap()/clear_fixmap()") Cc: stable@vger.kernel.org # v5.2+ Signed-off-by: Greg Kroah-Hartman --- arch/parisc/mm/fixmap.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/parisc/mm/fixmap.c b/arch/parisc/mm/fixmap.c index 474cd241c1509..02e19a32e6c5f 100644 --- a/arch/parisc/mm/fixmap.c +++ b/arch/parisc/mm/fixmap.c @@ -18,12 +18,9 @@ void notrace set_fixmap(enum fixed_addresses idx, phys_addr_t phys) pte_t *pte; if (pmd_none(*pmd)) - pmd = pmd_alloc(NULL, pgd, vaddr); - - pte = pte_offset_kernel(pmd, vaddr); - if (pte_none(*pte)) pte = pte_alloc_kernel(pmd, vaddr); + pte = pte_offset_kernel(pmd, vaddr); set_pte_at(&init_mm, vaddr, pte, __mk_pte(phys, PAGE_KERNEL_RWX)); flush_tlb_kernel_range(vaddr, vaddr + PAGE_SIZE); } From 9ffc23ca90cb766a016563660f1cb68104a1c72c Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Fri, 5 Nov 2021 17:47:48 +0800 Subject: [PATCH 1657/5344] irqchip/sifive-plic: Fixup EOI failed when masked commit 69ea463021be0d159ab30f96195fb0dd18ee2272 upstream. When using "devm_request_threaded_irq(,,,,IRQF_ONESHOT,,)" in a driver, only the first interrupt is handled, and following interrupts are never delivered (initially reported in [1]). That's because the RISC-V PLIC cannot EOI masked interrupts, as explained in the description of Interrupt Completion in the PLIC spec [2]: The PLIC signals it has completed executing an interrupt handler by writing the interrupt ID it received from the claim to the claim/complete register. The PLIC does not check whether the completion ID is the same as the last claim ID for that target. If the completion ID does not match an interrupt source that *is currently enabled* for the target, the completion is silently ignored. Re-enable the interrupt before completion if it has been masked during the handling, and remask it afterwards. [1] http://lists.infradead.org/pipermail/linux-riscv/2021-July/007441.html [2] https://github.com/riscv/riscv-plic-spec/blob/8bc15a35d07c9edf7b5d23fec9728302595ffc4d/riscv-plic.adoc Fixes: bb0fed1c60cc ("irqchip/sifive-plic: Switch to fasteoi flow") Reported-by: Vincent Pelletier Tested-by: Nikita Shubin Signed-off-by: Guo Ren Cc: stable@vger.kernel.org Cc: Thomas Gleixner Cc: Palmer Dabbelt Cc: Atish Patra Reviewed-by: Anup Patel [maz: amended commit message] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20211105094748.3894453-1-guoren@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/irqchip/irq-sifive-plic.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-sifive-plic.c b/drivers/irqchip/irq-sifive-plic.c index 7d0a12fe2714a..7cd7b140dfe97 100644 --- a/drivers/irqchip/irq-sifive-plic.c +++ b/drivers/irqchip/irq-sifive-plic.c @@ -138,7 +138,13 @@ static void plic_irq_eoi(struct irq_data *d) { struct plic_handler *handler = this_cpu_ptr(&plic_handlers); - writel(d->hwirq, handler->hart_base + CONTEXT_CLAIM); + if (irqd_irq_masked(d)) { + plic_irq_unmask(d); + writel(d->hwirq, handler->hart_base + CONTEXT_CLAIM); + plic_irq_mask(d); + } else { + writel(d->hwirq, handler->hart_base + CONTEXT_CLAIM); + } } static struct irq_chip plic_chip = { From e89e8865663962f004c110ab4320549d4885a427 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 7 Sep 2021 10:24:21 -0700 Subject: [PATCH 1658/5344] f2fs: should use GFP_NOFS for directory inodes commit 92d602bc7177325e7453189a22e0c8764ed3453e upstream. We use inline_dentry which requires to allocate dentry page when adding a link. If we allow to reclaim memory from filesystem, we do down_read(&sbi->cp_rwsem) twice by f2fs_lock_op(). I think this should be okay, but how about stopping the lockdep complaint [1]? f2fs_create() - f2fs_lock_op() - f2fs_do_add_link() - __f2fs_find_entry - f2fs_get_read_data_page() -> kswapd - shrink_node - f2fs_evict_inode - f2fs_lock_op() [1] fs_reclaim ){+.+.}-{0:0} : kswapd0: lock_acquire+0x114/0x394 kswapd0: __fs_reclaim_acquire+0x40/0x50 kswapd0: prepare_alloc_pages+0x94/0x1ec kswapd0: __alloc_pages_nodemask+0x78/0x1b0 kswapd0: pagecache_get_page+0x2e0/0x57c kswapd0: f2fs_get_read_data_page+0xc0/0x394 kswapd0: f2fs_find_data_page+0xa4/0x23c kswapd0: find_in_level+0x1a8/0x36c kswapd0: __f2fs_find_entry+0x70/0x100 kswapd0: f2fs_do_add_link+0x84/0x1ec kswapd0: f2fs_mkdir+0xe4/0x1e4 kswapd0: vfs_mkdir+0x110/0x1c0 kswapd0: do_mkdirat+0xa4/0x160 kswapd0: __arm64_sys_mkdirat+0x24/0x34 kswapd0: el0_svc_common.llvm.17258447499513131576+0xc4/0x1e8 kswapd0: do_el0_svc+0x28/0xa0 kswapd0: el0_svc+0x24/0x38 kswapd0: el0_sync_handler+0x88/0xec kswapd0: el0_sync+0x1c0/0x200 kswapd0: -> #1 ( &sbi->cp_rwsem ){++++}-{3:3} : kswapd0: lock_acquire+0x114/0x394 kswapd0: down_read+0x7c/0x98 kswapd0: f2fs_do_truncate_blocks+0x78/0x3dc kswapd0: f2fs_truncate+0xc8/0x128 kswapd0: f2fs_evict_inode+0x2b8/0x8b8 kswapd0: evict+0xd4/0x2f8 kswapd0: iput+0x1c0/0x258 kswapd0: do_unlinkat+0x170/0x2a0 kswapd0: __arm64_sys_unlinkat+0x4c/0x68 kswapd0: el0_svc_common.llvm.17258447499513131576+0xc4/0x1e8 kswapd0: do_el0_svc+0x28/0xa0 kswapd0: el0_svc+0x24/0x38 kswapd0: el0_sync_handler+0x88/0xec kswapd0: el0_sync+0x1c0/0x200 Cc: stable@vger.kernel.org Fixes: bdbc90fa55af ("f2fs: don't put dentry page in pagecache into highmem") Reviewed-by: Chao Yu Reviewed-by: Stanley Chu Reviewed-by: Light Hsieh Tested-by: Light Hsieh Signed-off-by: Jaegeuk Kim Signed-off-by: Greg Kroah-Hartman --- fs/f2fs/inode.c | 2 +- fs/f2fs/namei.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 502bd491336a8..2383d52b1f424 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -455,7 +455,7 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) inode->i_op = &f2fs_dir_inode_operations; inode->i_fop = &f2fs_dir_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; - inode_nohighmem(inode); + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); } else if (S_ISLNK(inode->i_mode)) { if (file_is_encrypt(inode)) inode->i_op = &f2fs_encrypted_symlink_inode_operations; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 81a18ba18e301..ed95c27e93026 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -679,7 +679,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_op = &f2fs_dir_inode_operations; inode->i_fop = &f2fs_dir_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; - inode_nohighmem(inode); + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); set_inode_flag(inode, FI_INC_LINK); f2fs_lock_op(sbi); From 4ff8cd59649c06b4795859401dc9163c728f68ef Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 11 Oct 2021 14:12:36 +0200 Subject: [PATCH 1659/5344] net, neigh: Enable state migration between NUD_PERMANENT and NTF_USE [ Upstream commit 3dc20f4762c62d3b3f0940644881ed818aa7b2f5 ] Currently, it is not possible to migrate a neighbor entry between NUD_PERMANENT state and NTF_USE flag with a dynamic NUD state from a user space control plane. Similarly, it is not possible to add/remove NTF_EXT_LEARNED flag from an existing neighbor entry in combination with NTF_USE flag. This is due to the latter directly calling into neigh_event_send() without any meta data updates as happening in __neigh_update(). Thus, to enable this use case, extend the latter with a NEIGH_UPDATE_F_USE flag where we break the NUD_PERMANENT state in particular so that a latter neigh_event_send() is able to re-resolve a neighbor entry. Before fix, NUD_PERMANENT -> NUD_* & NTF_USE: # ./ip/ip n replace 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a # ./ip/ip n 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT [...] # ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn # ./ip/ip n 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT [...] As can be seen, despite the admin-triggered replace, the entry remains in the NUD_PERMANENT state. After fix, NUD_PERMANENT -> NUD_* & NTF_USE: # ./ip/ip n replace 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a # ./ip/ip n 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT [...] # ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn # ./ip/ip n 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a extern_learn REACHABLE [...] # ./ip/ip n 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a extern_learn STALE [...] # ./ip/ip n replace 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a # ./ip/ip n 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a PERMANENT [...] After the fix, the admin-triggered replace switches to a dynamic state from the NTF_USE flag which triggered a new neighbor resolution. Likewise, we can transition back from there, if needed, into NUD_PERMANENT. Similar before/after behavior can be observed for below transitions: Before fix, NTF_USE -> NTF_USE | NTF_EXT_LEARNED -> NTF_USE: # ./ip/ip n replace 192.168.178.30 dev enp5s0 use # ./ip/ip n 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE [...] # ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn # ./ip/ip n 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE [...] After fix, NTF_USE -> NTF_USE | NTF_EXT_LEARNED -> NTF_USE: # ./ip/ip n replace 192.168.178.30 dev enp5s0 use # ./ip/ip n 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE [...] # ./ip/ip n replace 192.168.178.30 dev enp5s0 use extern_learn # ./ip/ip n 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a extern_learn REACHABLE [...] # ./ip/ip n replace 192.168.178.30 dev enp5s0 use # ./ip/ip n 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a REACHABLE [..] Signed-off-by: Daniel Borkmann Acked-by: Roopa Prabhu Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/net/neighbour.h | 1 + net/core/neighbour.c | 22 +++++++++++++--------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 4232bc8ce3d7d..b6494e87c897c 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -253,6 +253,7 @@ static inline void *neighbour_priv(const struct neighbour *n) #define NEIGH_UPDATE_F_OVERRIDE 0x00000001 #define NEIGH_UPDATE_F_WEAK_OVERRIDE 0x00000002 #define NEIGH_UPDATE_F_OVERRIDE_ISROUTER 0x00000004 +#define NEIGH_UPDATE_F_USE 0x10000000 #define NEIGH_UPDATE_F_EXT_LEARNED 0x20000000 #define NEIGH_UPDATE_F_ISROUTER 0x40000000 #define NEIGH_UPDATE_F_ADMIN 0x80000000 diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 3a4cf53e38416..02e55041a8813 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1221,7 +1221,7 @@ static void neigh_update_hhs(struct neighbour *neigh) lladdr instead of overriding it if it is different. NEIGH_UPDATE_F_ADMIN means that the change is administrative. - + NEIGH_UPDATE_F_USE means that the entry is user triggered. NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing NTF_ROUTER flag. NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as @@ -1259,6 +1259,12 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, goto out; ext_learn_change = neigh_update_ext_learned(neigh, flags, ¬ify); + if (flags & NEIGH_UPDATE_F_USE) { + new = old & ~NUD_PERMANENT; + neigh->nud_state = new; + err = 0; + goto out; + } if (!(new & NUD_VALID)) { neigh_del_timer(neigh); @@ -1966,22 +1972,20 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (protocol) neigh->protocol = protocol; - if (ndm->ndm_flags & NTF_EXT_LEARNED) flags |= NEIGH_UPDATE_F_EXT_LEARNED; - if (ndm->ndm_flags & NTF_ROUTER) flags |= NEIGH_UPDATE_F_ISROUTER; + if (ndm->ndm_flags & NTF_USE) + flags |= NEIGH_UPDATE_F_USE; - if (ndm->ndm_flags & NTF_USE) { + err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags, + NETLINK_CB(skb).portid, extack); + if (!err && ndm->ndm_flags & NTF_USE) { neigh_event_send(neigh, NULL); err = 0; - } else - err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags, - NETLINK_CB(skb).portid, extack); - + } neigh_release(neigh); - out: return err; } From c5539e3ffb8495f90122562afc784dd1e2f379ad Mon Sep 17 00:00:00 2001 From: Dominique Martinet Date: Tue, 2 Nov 2021 19:47:47 +0900 Subject: [PATCH 1660/5344] 9p/net: fix missing error check in p9_check_errors commit 27eb4c3144f7a5ebef3c9a261d80cb3e1fa784dc upstream. Link: https://lkml.kernel.org/r/99338965-d36c-886e-cd0e-1d8fff2b4746@gmail.com Reported-by: syzbot+06472778c97ed94af66d@syzkaller.appspotmail.com Cc: stable@vger.kernel.org Signed-off-by: Dominique Martinet Signed-off-by: Greg Kroah-Hartman --- net/9p/client.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/9p/client.c b/net/9p/client.c index 1d48afc7033ca..b03fce66eb8de 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -538,6 +538,8 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req) kfree(ename); } else { err = p9pdu_readf(&req->rc, c->proto_version, "d", &ecode); + if (err) + goto out_err; err = -ecode; p9_debug(P9_DEBUG_9P, "<<< RLERROR (%d)\n", -ecode); From 16f87615cb88b28247f0b01b76f1c5ace0c169b1 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 28 Jul 2021 10:38:43 +0200 Subject: [PATCH 1661/5344] ovl: fix deadlock in splice write commit 9b91b6b019fda817eb52f728eb9c79b3579760bc upstream. There's possibility of an ABBA deadlock in case of a splice write to an overlayfs file and a concurrent splice write to a corresponding real file. The call chain for splice to an overlay file: -> do_splice [takes sb_writers on overlay file] -> do_splice_from -> iter_file_splice_write [takes pipe->mutex] -> vfs_iter_write ... -> ovl_write_iter [takes sb_writers on real file] And the call chain for splice to a real file: -> do_splice [takes sb_writers on real file] -> do_splice_from -> iter_file_splice_write [takes pipe->mutex] Syzbot successfully bisected this to commit 82a763e61e2b ("ovl: simplify file splice"). Fix by reverting the write part of the above commit and by adding missing bits from ovl_write_iter() into ovl_splice_write(). Fixes: 82a763e61e2b ("ovl: simplify file splice") Reported-and-tested-by: syzbot+579885d1a9a833336209@syzkaller.appspotmail.com Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/overlayfs/file.c | 47 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 2ba8b328f4d0c..af733eec4e424 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -299,6 +299,51 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) return ret; } +/* + * Calling iter_file_splice_write() directly from overlay's f_op may deadlock + * due to lock order inversion between pipe->mutex in iter_file_splice_write() + * and file_start_write(real.file) in ovl_write_iter(). + * + * So do everything ovl_write_iter() does and call iter_file_splice_write() on + * the real file. + */ +static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) +{ + struct fd real; + const struct cred *old_cred; + struct inode *inode = file_inode(out); + struct inode *realinode = ovl_inode_real(inode); + ssize_t ret; + + inode_lock(inode); + /* Update mode */ + ovl_copyattr(realinode, inode); + ret = file_remove_privs(out); + if (ret) + goto out_unlock; + + ret = ovl_real_fdget(out, &real); + if (ret) + goto out_unlock; + + old_cred = ovl_override_creds(inode->i_sb); + file_start_write(real.file); + + ret = iter_file_splice_write(pipe, real.file, ppos, len, flags); + + file_end_write(real.file); + /* Update size */ + ovl_copyattr(realinode, inode); + revert_creds(old_cred); + fdput(real); + +out_unlock: + inode_unlock(inode); + + return ret; +} + static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct fd real; @@ -660,7 +705,7 @@ const struct file_operations ovl_file_operations = { .unlocked_ioctl = ovl_ioctl, .compat_ioctl = ovl_compat_ioctl, .splice_read = generic_file_splice_read, - .splice_write = iter_file_splice_write, + .splice_write = ovl_splice_write, .copy_file_range = ovl_copy_file_range, .remap_file_range = ovl_remap_file_range, From 07872ab30227f393cc62b495bec91a3ca2d4655f Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Mon, 15 Nov 2021 16:36:00 +0530 Subject: [PATCH 1662/5344] powerpc/lib: Add helper to check if offset is within conditional branch range upstream commit 4549c3ea3160fa8b3f37dfe2f957657bb265eda9 Add a helper to check if a given offset is within the branch range for a powerpc conditional branch instruction, and update some sites to use the new helper. Signed-off-by: Naveen N. Rao Reviewed-by: Christophe Leroy Acked-by: Song Liu Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/442b69a34ced32ca346a0d9a855f3f6cfdbbbd41.1633464148.git.naveen.n.rao@linux.vnet.ibm.com Signed-off-by: Naveen N. Rao Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/include/asm/code-patching.h | 1 + arch/powerpc/lib/code-patching.c | 7 ++++++- arch/powerpc/net/bpf_jit.h | 7 +------ 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h index be0f7257b13c8..96ef54bc7d968 100644 --- a/arch/powerpc/include/asm/code-patching.h +++ b/arch/powerpc/include/asm/code-patching.h @@ -22,6 +22,7 @@ #define BRANCH_ABSOLUTE 0x2 bool is_offset_in_branch_range(long offset); +bool is_offset_in_cond_branch_range(long offset); unsigned int create_branch(const unsigned int *addr, unsigned long target, int flags); unsigned int create_cond_branch(const unsigned int *addr, diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 3345f039a8769..a05f289e613ed 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -221,6 +221,11 @@ bool is_offset_in_branch_range(long offset) return (offset >= -0x2000000 && offset <= 0x1fffffc && !(offset & 0x3)); } +bool is_offset_in_cond_branch_range(long offset) +{ + return offset >= -0x8000 && offset <= 0x7fff && !(offset & 0x3); +} + /* * Helper to check if a given instruction is a conditional branch * Derived from the conditional checks in analyse_instr() @@ -274,7 +279,7 @@ unsigned int create_cond_branch(const unsigned int *addr, offset = offset - (unsigned long)addr; /* Check we can represent the target in the instruction format */ - if (offset < -0x8000 || offset > 0x7FFF || offset & 0x3) + if (!is_offset_in_cond_branch_range(offset)) return 0; /* Mask out the flags and target, so they don't step on each other. */ diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h index 55d4377ccfae3..bc06bb7ff5c8d 100644 --- a/arch/powerpc/net/bpf_jit.h +++ b/arch/powerpc/net/bpf_jit.h @@ -225,11 +225,6 @@ #define PPC_FUNC_ADDR(d,i) do { PPC_LI32(d, i); } while(0) #endif -static inline bool is_nearbranch(int offset) -{ - return (offset < 32768) && (offset >= -32768); -} - /* * The fly in the ointment of code size changing from pass to pass is * avoided by padding the short branch case with a NOP. If code size differs @@ -238,7 +233,7 @@ static inline bool is_nearbranch(int offset) * state. */ #define PPC_BCC(cond, dest) do { \ - if (is_nearbranch((dest) - (ctx->idx * 4))) { \ + if (is_offset_in_cond_branch_range((long)(dest) - (ctx->idx * 4))) { \ PPC_BCC_SHORT(cond, dest); \ PPC_NOP(); \ } else { \ From b1a06e6fbc1d16228b120ba9814d9204e55892aa Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Mon, 15 Nov 2021 16:36:01 +0530 Subject: [PATCH 1663/5344] powerpc/bpf: Validate branch ranges upstream commit 3832ba4e283d7052b783dab8311df7e3590fed93 Add checks to ensure that we never emit branch instructions with truncated branch offsets. Suggested-by: Michael Ellerman Signed-off-by: Naveen N. Rao Tested-by: Johan Almbladh Reviewed-by: Christophe Leroy Acked-by: Song Liu Acked-by: Johan Almbladh Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/71d33a6b7603ec1013c9734dd8bdd4ff5e929142.1633464148.git.naveen.n.rao@linux.vnet.ibm.com [include header, drop ppc32 changes] Signed-off-by: Naveen N. Rao Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/net/bpf_jit.h | 26 ++++++++++++++++++++------ arch/powerpc/net/bpf_jit_comp64.c | 8 ++++++-- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h index bc06bb7ff5c8d..6d2a268528dfc 100644 --- a/arch/powerpc/net/bpf_jit.h +++ b/arch/powerpc/net/bpf_jit.h @@ -11,6 +11,7 @@ #ifndef __ASSEMBLY__ #include +#include #ifdef PPC64_ELF_ABI_v1 #define FUNCTION_DESCR_SIZE 24 @@ -180,13 +181,26 @@ #define PPC_NEG(d, a) EMIT(PPC_INST_NEG | ___PPC_RT(d) | ___PPC_RA(a)) /* Long jump; (unconditional 'branch') */ -#define PPC_JMP(dest) EMIT(PPC_INST_BRANCH | \ - (((dest) - (ctx->idx * 4)) & 0x03fffffc)) +#define PPC_JMP(dest) \ + do { \ + long offset = (long)(dest) - (ctx->idx * 4); \ + if (!is_offset_in_branch_range(offset)) { \ + pr_err_ratelimited("Branch offset 0x%lx (@%u) out of range\n", offset, ctx->idx); \ + return -ERANGE; \ + } \ + EMIT(PPC_INST_BRANCH | (offset & 0x03fffffc)); \ + } while (0) /* "cond" here covers BO:BI fields. */ -#define PPC_BCC_SHORT(cond, dest) EMIT(PPC_INST_BRANCH_COND | \ - (((cond) & 0x3ff) << 16) | \ - (((dest) - (ctx->idx * 4)) & \ - 0xfffc)) +#define PPC_BCC_SHORT(cond, dest) \ + do { \ + long offset = (long)(dest) - (ctx->idx * 4); \ + if (!is_offset_in_cond_branch_range(offset)) { \ + pr_err_ratelimited("Conditional branch offset 0x%lx (@%u) out of range\n", offset, ctx->idx); \ + return -ERANGE; \ + } \ + EMIT(PPC_INST_BRANCH_COND | (((cond) & 0x3ff) << 16) | (offset & 0xfffc)); \ + } while (0) + /* Sign-extended 32-bit immediate load */ #define PPC_LI32(d, i) do { \ if ((int)(uintptr_t)(i) >= -32768 && \ diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 81e6279c9874f..ec1d01654f40f 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -224,7 +224,7 @@ static void bpf_jit_emit_func_call_rel(u32 *image, struct codegen_context *ctx, PPC_BLRL(); } -static void bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 out) +static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 out) { /* * By now, the eBPF program has already setup parameters in r3, r4 and r5 @@ -285,7 +285,9 @@ static void bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 bpf_jit_emit_common_epilogue(image, ctx); PPC_BCTR(); + /* out: */ + return 0; } /* Assemble the body code between the prologue & epilogue */ @@ -1001,7 +1003,9 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, */ case BPF_JMP | BPF_TAIL_CALL: ctx->seen |= SEEN_TAILCALL; - bpf_jit_emit_tail_call(image, ctx, addrs[i + 1]); + ret = bpf_jit_emit_tail_call(image, ctx, addrs[i + 1]); + if (ret < 0) + return ret; break; default: From f0d538839c4826b4f8bb3ccdb10477caf225c3f6 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Mon, 15 Nov 2021 16:36:02 +0530 Subject: [PATCH 1664/5344] powerpc/bpf: Fix BPF_SUB when imm == 0x80000000 upstream commit 5855c4c1f415ca3ba1046e77c0b3d3dfc96c9025 We aren't handling subtraction involving an immediate value of 0x80000000 properly. Fix the same. Fixes: 156d0e290e969c ("powerpc/ebpf/jit: Implement JIT compiler for extended BPF") Signed-off-by: Naveen N. Rao Reviewed-by: Christophe Leroy [mpe: Fold in fix from Naveen to use imm <= 32768] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/fc4b1276eb10761fd7ce0814c8dd089da2815251.1633464148.git.naveen.n.rao@linux.vnet.ibm.com [adjust macros to account for commits 0654186510a40e and 3a181237916310] Signed-off-by: Naveen N. Rao Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/net/bpf_jit_comp64.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index ec1d01654f40f..b5a0c3a3ce6e5 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -349,18 +349,25 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, PPC_SUB(dst_reg, dst_reg, src_reg); goto bpf_alu32_trunc; case BPF_ALU | BPF_ADD | BPF_K: /* (u32) dst += (u32) imm */ - case BPF_ALU | BPF_SUB | BPF_K: /* (u32) dst -= (u32) imm */ case BPF_ALU64 | BPF_ADD | BPF_K: /* dst += imm */ + if (!imm) { + goto bpf_alu32_trunc; + } else if (imm >= -32768 && imm < 32768) { + PPC_ADDI(dst_reg, dst_reg, IMM_L(imm)); + } else { + PPC_LI32(b2p[TMP_REG_1], imm); + PPC_ADD(dst_reg, dst_reg, b2p[TMP_REG_1]); + } + goto bpf_alu32_trunc; + case BPF_ALU | BPF_SUB | BPF_K: /* (u32) dst -= (u32) imm */ case BPF_ALU64 | BPF_SUB | BPF_K: /* dst -= imm */ - if (BPF_OP(code) == BPF_SUB) - imm = -imm; - if (imm) { - if (imm >= -32768 && imm < 32768) - PPC_ADDI(dst_reg, dst_reg, IMM_L(imm)); - else { - PPC_LI32(b2p[TMP_REG_1], imm); - PPC_ADD(dst_reg, dst_reg, b2p[TMP_REG_1]); - } + if (!imm) { + goto bpf_alu32_trunc; + } else if (imm > -32768 && imm <= 32768) { + PPC_ADDI(dst_reg, dst_reg, IMM_L(-imm)); + } else { + PPC_LI32(b2p[TMP_REG_1], imm); + PPC_SUB(dst_reg, dst_reg, b2p[TMP_REG_1]); } goto bpf_alu32_trunc; case BPF_ALU | BPF_MUL | BPF_X: /* (u32) dst *= (u32) src */ From 40619ae5daa1532154c1e4910c23ac9a3ef3caa7 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Mon, 15 Nov 2021 16:36:03 +0530 Subject: [PATCH 1665/5344] powerpc/security: Add a helper to query stf_barrier type upstream commit 030905920f32e91a52794937f67434ac0b3ea41a Add a helper to return the stf_barrier type for the current processor. Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/3bd5d7f96ea1547991ac2ce3137dc2b220bae285.1633464148.git.naveen.n.rao@linux.vnet.ibm.com Signed-off-by: Naveen N. Rao Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/include/asm/security_features.h | 5 +++++ arch/powerpc/kernel/security.c | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/arch/powerpc/include/asm/security_features.h b/arch/powerpc/include/asm/security_features.h index e9e3f85134e54..316a9c8d1929e 100644 --- a/arch/powerpc/include/asm/security_features.h +++ b/arch/powerpc/include/asm/security_features.h @@ -39,6 +39,11 @@ static inline bool security_ftr_enabled(u64 feature) return !!(powerpc_security_features & feature); } +#ifdef CONFIG_PPC_BOOK3S_64 +enum stf_barrier_type stf_barrier_type_get(void); +#else +static inline enum stf_barrier_type stf_barrier_type_get(void) { return STF_BARRIER_NONE; } +#endif // Features indicating support for Spectre/Meltdown mitigations diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c index 1740a66cea84d..ff022e7256934 100644 --- a/arch/powerpc/kernel/security.c +++ b/arch/powerpc/kernel/security.c @@ -256,6 +256,11 @@ static int __init handle_no_stf_barrier(char *p) early_param("no_stf_barrier", handle_no_stf_barrier); +enum stf_barrier_type stf_barrier_type_get(void) +{ + return stf_enabled_flush_types; +} + /* This is the generic flag used by other architectures */ static int __init handle_ssbd(char *p) { From 771bac3be9621f784a9b379cfef620a9601c5b88 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Mon, 15 Nov 2021 16:36:04 +0530 Subject: [PATCH 1666/5344] powerpc/bpf: Emit stf barrier instruction sequences for BPF_NOSPEC upstream commit b7540d62509453263604a155bf2d5f0ed450cba2 Emit similar instruction sequences to commit a048a07d7f4535 ("powerpc/64s: Add support for a store forwarding barrier at kernel entry/exit") when encountering BPF_NOSPEC. Mitigations are enabled depending on what the firmware advertises. In particular, we do not gate these mitigations based on current settings, just like in x86. Due to this, we don't need to take any action if mitigations are enabled or disabled at runtime. Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/956570cbc191cd41f8274bed48ee757a86dac62a.1633464148.git.naveen.n.rao@linux.vnet.ibm.com [adjust macros to account for commits 0654186510a40e, 3a181237916310 and ef909ba954145e. adjust security feature checks to account for commit 84ed26fd00c514] Signed-off-by: Naveen N. Rao Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/net/bpf_jit64.h | 8 ++--- arch/powerpc/net/bpf_jit_comp64.c | 56 ++++++++++++++++++++++++++++--- 2 files changed, 56 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/net/bpf_jit64.h b/arch/powerpc/net/bpf_jit64.h index cf3a7e337f025..68ddf3502ab03 100644 --- a/arch/powerpc/net/bpf_jit64.h +++ b/arch/powerpc/net/bpf_jit64.h @@ -16,18 +16,18 @@ * with our redzone usage. * * [ prev sp ] <------------- - * [ nv gpr save area ] 6*8 | + * [ nv gpr save area ] 5*8 | * [ tail_call_cnt ] 8 | - * [ local_tmp_var ] 8 | + * [ local_tmp_var ] 16 | * fp (r31) --> [ ebpf stack space ] upto 512 | * [ frame header ] 32/112 | * sp (r1) ---> [ stack pointer ] -------------- */ /* for gpr non volatile registers BPG_REG_6 to 10 */ -#define BPF_PPC_STACK_SAVE (6*8) +#define BPF_PPC_STACK_SAVE (5*8) /* for bpf JIT code internal usage */ -#define BPF_PPC_STACK_LOCALS 16 +#define BPF_PPC_STACK_LOCALS 24 /* stack frame excluding BPF stack, ensure this is quadword aligned */ #define BPF_PPC_STACKFRAME (STACK_FRAME_MIN_SIZE + \ BPF_PPC_STACK_LOCALS + BPF_PPC_STACK_SAVE) diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index b5a0c3a3ce6e5..687cd352648aa 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "bpf_jit64.h" @@ -56,9 +57,9 @@ static inline bool bpf_has_stack_frame(struct codegen_context *ctx) * [ prev sp ] <------------- * [ ... ] | * sp (r1) ---> [ stack pointer ] -------------- - * [ nv gpr save area ] 6*8 + * [ nv gpr save area ] 5*8 * [ tail_call_cnt ] 8 - * [ local_tmp_var ] 8 + * [ local_tmp_var ] 16 * [ unused red zone ] 208 bytes protected */ static int bpf_jit_stack_local(struct codegen_context *ctx) @@ -66,12 +67,12 @@ static int bpf_jit_stack_local(struct codegen_context *ctx) if (bpf_has_stack_frame(ctx)) return STACK_FRAME_MIN_SIZE + ctx->stack_size; else - return -(BPF_PPC_STACK_SAVE + 16); + return -(BPF_PPC_STACK_SAVE + 24); } static int bpf_jit_stack_tailcallcnt(struct codegen_context *ctx) { - return bpf_jit_stack_local(ctx) + 8; + return bpf_jit_stack_local(ctx) + 16; } static int bpf_jit_stack_offsetof(struct codegen_context *ctx, int reg) @@ -290,11 +291,34 @@ static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 o return 0; } +/* + * We spill into the redzone always, even if the bpf program has its own stackframe. + * Offsets hardcoded based on BPF_PPC_STACK_SAVE -- see bpf_jit_stack_local() + */ +void bpf_stf_barrier(void); + +asm ( +" .global bpf_stf_barrier ;" +" bpf_stf_barrier: ;" +" std 21,-64(1) ;" +" std 22,-56(1) ;" +" sync ;" +" ld 21,-64(1) ;" +" ld 22,-56(1) ;" +" ori 31,31,0 ;" +" .rept 14 ;" +" b 1f ;" +" 1: ;" +" .endr ;" +" blr ;" +); + /* Assemble the body code between the prologue & epilogue */ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *ctx, u32 *addrs, bool extra_pass) { + enum stf_barrier_type stf_barrier = stf_barrier_type_get(); const struct bpf_insn *insn = fp->insnsi; int flen = fp->len; int i, ret; @@ -663,6 +687,30 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, * BPF_ST NOSPEC (speculation barrier) */ case BPF_ST | BPF_NOSPEC: + if (!security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) || + (!security_ftr_enabled(SEC_FTR_L1D_FLUSH_PR) && + (!security_ftr_enabled(SEC_FTR_L1D_FLUSH_HV) || !cpu_has_feature(CPU_FTR_HVMODE)))) + break; + + switch (stf_barrier) { + case STF_BARRIER_EIEIO: + EMIT(0x7c0006ac | 0x02000000); + break; + case STF_BARRIER_SYNC_ORI: + EMIT(PPC_INST_SYNC); + PPC_LD(b2p[TMP_REG_1], 13, 0); + PPC_ORI(31, 31, 0); + break; + case STF_BARRIER_FALLBACK: + EMIT(PPC_INST_MFLR | ___PPC_RT(b2p[TMP_REG_1])); + PPC_LI64(12, dereference_kernel_function_descriptor(bpf_stf_barrier)); + PPC_MTCTR(12); + EMIT(PPC_INST_BCTR | 0x1); + PPC_MTLR(b2p[TMP_REG_1]); + break; + case STF_BARRIER_NONE: + break; + } break; /* From a0772ea6586c95025307dac81d752560315ca7a4 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Fri, 5 Nov 2021 13:38:02 -0700 Subject: [PATCH 1667/5344] mm, oom: pagefault_out_of_memory: don't force global OOM for dying tasks commit 0b28179a6138a5edd9d82ad2687c05b3773c387b upstream. Patch series "memcg: prohibit unconditional exceeding the limit of dying tasks", v3. Memory cgroup charging allows killed or exiting tasks to exceed the hard limit. It can be misused and allowed to trigger global OOM from inside a memcg-limited container. On the other hand if memcg fails allocation, called from inside #PF handler it triggers global OOM from inside pagefault_out_of_memory(). To prevent these problems this patchset: (a) removes execution of out_of_memory() from pagefault_out_of_memory(), becasue nobody can explain why it is necessary. (b) allow memcg to fail allocation of dying/killed tasks. This patch (of 3): Any allocation failure during the #PF path will return with VM_FAULT_OOM which in turn results in pagefault_out_of_memory which in turn executes out_out_memory() and can kill a random task. An allocation might fail when the current task is the oom victim and there are no memory reserves left. The OOM killer is already handled at the page allocator level for the global OOM and at the charging level for the memcg one. Both have much more information about the scope of allocation/charge request. This means that either the OOM killer has been invoked properly and didn't lead to the allocation success or it has been skipped because it couldn't have been invoked. In both cases triggering it from here is pointless and even harmful. It makes much more sense to let the killed task die rather than to wake up an eternally hungry oom-killer and send him to choose a fatter victim for breakfast. Link: https://lkml.kernel.org/r/0828a149-786e-7c06-b70a-52d086818ea3@virtuozzo.com Signed-off-by: Vasily Averin Suggested-by: Michal Hocko Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Mel Gorman Cc: Roman Gushchin Cc: Shakeel Butt Cc: Tetsuo Handa Cc: Uladzislau Rezki Cc: Vladimir Davydov Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/oom_kill.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 651b3a338810d..5eda823a42a0e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1131,6 +1131,9 @@ void pagefault_out_of_memory(void) if (mem_cgroup_oom_synchronize(true)) return; + if (fatal_signal_pending(current)) + return; + if (!mutex_trylock(&oom_lock)) return; out_of_memory(&oc); From 0e42c3cb3fae9949d01905f3359934ee7a5b2a2e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 5 Nov 2021 13:38:06 -0700 Subject: [PATCH 1668/5344] mm, oom: do not trigger out_of_memory from the #PF commit 60e2793d440a3ec95abb5d6d4fc034a4b480472d upstream. Any allocation failure during the #PF path will return with VM_FAULT_OOM which in turn results in pagefault_out_of_memory. This can happen for 2 different reasons. a) Memcg is out of memory and we rely on mem_cgroup_oom_synchronize to perform the memcg OOM handling or b) normal allocation fails. The latter is quite problematic because allocation paths already trigger out_of_memory and the page allocator tries really hard to not fail allocations. Anyway, if the OOM killer has been already invoked there is no reason to invoke it again from the #PF path. Especially when the OOM condition might be gone by that time and we have no way to find out other than allocate. Moreover if the allocation failed and the OOM killer hasn't been invoked then we are unlikely to do the right thing from the #PF context because we have already lost the allocation context and restictions and therefore might oom kill a task from a different NUMA domain. This all suggests that there is no legitimate reason to trigger out_of_memory from pagefault_out_of_memory so drop it. Just to be sure that no #PF path returns with VM_FAULT_OOM without allocation print a warning that this is happening before we restart the #PF. [VvS: #PF allocation can hit into limit of cgroup v1 kmem controller. This is a local problem related to memcg, however, it causes unnecessary global OOM kills that are repeated over and over again and escalate into a real disaster. This has been broken since kmem accounting has been introduced for cgroup v1 (3.8). There was no kmem specific reclaim for the separate limit so the only way to handle kmem hard limit was to return with ENOMEM. In upstream the problem will be fixed by removing the outdated kmem limit, however stable and LTS kernels cannot do it and are still affected. This patch fixes the problem and should be backported into stable/LTS.] Link: https://lkml.kernel.org/r/f5fd8dd8-0ad4-c524-5f65-920b01972a42@virtuozzo.com Signed-off-by: Michal Hocko Signed-off-by: Vasily Averin Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Mel Gorman Cc: Roman Gushchin Cc: Shakeel Butt Cc: Tetsuo Handa Cc: Uladzislau Rezki Cc: Vladimir Davydov Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/oom_kill.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 5eda823a42a0e..2c51b271b7a41 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1114,19 +1114,15 @@ bool out_of_memory(struct oom_control *oc) } /* - * The pagefault handler calls here because it is out of memory, so kill a - * memory-hogging task. If oom_lock is held by somebody else, a parallel oom - * killing is already in progress so do nothing. + * The pagefault handler calls here because some allocation has failed. We have + * to take care of the memcg OOM here because this is the only safe context without + * any locks held but let the oom killer triggered from the allocation context care + * about the global OOM. */ void pagefault_out_of_memory(void) { - struct oom_control oc = { - .zonelist = NULL, - .nodemask = NULL, - .memcg = NULL, - .gfp_mask = 0, - .order = 0, - }; + static DEFINE_RATELIMIT_STATE(pfoom_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); if (mem_cgroup_oom_synchronize(true)) return; @@ -1134,8 +1130,6 @@ void pagefault_out_of_memory(void) if (fatal_signal_pending(current)) return; - if (!mutex_trylock(&oom_lock)) - return; - out_of_memory(&oc); - mutex_unlock(&oom_lock); + if (__ratelimit(&pfoom_rs)) + pr_warn("Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF\n"); } From 528e28fa172c9dc9d28b4d40c7950cd8552f62f1 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Tue, 21 Sep 2021 19:35:06 +0200 Subject: [PATCH 1669/5344] video: backlight: Drop maximum brightness override for brightness zero MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 33a5471f8da976bf271a1ebbd6b9d163cb0cb6aa upstream. The note in c2adda27d202f ("video: backlight: Add of_find_backlight helper in backlight.c") says that gpio-backlight uses brightness as power state. This has been fixed since in ec665b756e6f7 ("backlight: gpio-backlight: Correct initial power state handling") and other backlight drivers do not require this workaround. Drop the workaround. This fixes the case where e.g. pwm-backlight can perfectly well be set to brightness 0 on boot in DT, which without this patch leads to the display brightness to be max instead of off. Fixes: c2adda27d202f ("video: backlight: Add of_find_backlight helper in backlight.c") Cc: # 5.4+ Cc: # 4.19.x: ec665b756e6f7: backlight: gpio-backlight: Correct initial power state handling Signed-off-by: Marek Vasut Acked-by: Noralf Trønnes Reviewed-by: Daniel Thompson Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman --- drivers/video/backlight/backlight.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c index cac3e35d76308..813a264a1b119 100644 --- a/drivers/video/backlight/backlight.c +++ b/drivers/video/backlight/backlight.c @@ -630,12 +630,6 @@ struct backlight_device *of_find_backlight(struct device *dev) of_node_put(np); if (!bd) return ERR_PTR(-EPROBE_DEFER); - /* - * Note: gpio_backlight uses brightness as - * power state during probe - */ - if (!bd->props.brightness) - bd->props.brightness = bd->props.max_brightness; } } From a8742c1243897ddb683c613f5e2bc92c786b9576 Mon Sep 17 00:00:00 2001 From: Vineeth Vijayan Date: Fri, 5 Nov 2021 16:44:51 +0100 Subject: [PATCH 1670/5344] s390/cio: check the subchannel validity for dev_busid commit a4751f157c194431fae9e9c493f456df8272b871 upstream. Check the validity of subchanel before reading other fields in the schib. Fixes: d3683c055212 ("s390/cio: add dev_busid sysfs entry for each subchannel") CC: Reported-by: Cornelia Huck Signed-off-by: Vineeth Vijayan Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/20211105154451.847288-1-vneethv@linux.ibm.com Signed-off-by: Vasily Gorbik Signed-off-by: Greg Kroah-Hartman --- drivers/s390/cio/css.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c index 7950ac59b1744..ad447437a27c5 100644 --- a/drivers/s390/cio/css.c +++ b/drivers/s390/cio/css.c @@ -433,8 +433,8 @@ static ssize_t dev_busid_show(struct device *dev, struct subchannel *sch = to_subchannel(dev); struct pmcw *pmcw = &sch->schib.pmcw; - if ((pmcw->st == SUBCHANNEL_TYPE_IO || - pmcw->st == SUBCHANNEL_TYPE_MSG) && pmcw->dnv) + if ((pmcw->st == SUBCHANNEL_TYPE_IO && pmcw->dnv) || + (pmcw->st == SUBCHANNEL_TYPE_MSG && pmcw->w)) return sysfs_emit(buf, "0.%x.%04x\n", sch->schid.ssid, pmcw->dev); else From 88ecc69128ca3d49878628e71f1dc53f1d907db1 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Tue, 2 Nov 2021 10:55:30 +0100 Subject: [PATCH 1671/5344] s390/tape: fix timer initialization in tape_std_assign() commit 213fca9e23b59581c573d558aa477556f00b8198 upstream. commit 9c6c273aa424 ("timer: Remove init_timer_on_stack() in favor of timer_setup_on_stack()") changed the timer setup from init_timer_on_stack(() to timer_setup(), but missed to change the mod_timer() call. And while at it, use msecs_to_jiffies() instead of the open coded timeout calculation. Cc: stable@vger.kernel.org Fixes: 9c6c273aa424 ("timer: Remove init_timer_on_stack() in favor of timer_setup_on_stack()") Signed-off-by: Sven Schnelle Reviewed-by: Vasily Gorbik Signed-off-by: Vasily Gorbik Signed-off-by: Greg Kroah-Hartman --- drivers/s390/char/tape_std.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/s390/char/tape_std.c b/drivers/s390/char/tape_std.c index 1f5fab617b679..f7e75d9fedf61 100644 --- a/drivers/s390/char/tape_std.c +++ b/drivers/s390/char/tape_std.c @@ -53,7 +53,6 @@ int tape_std_assign(struct tape_device *device) { int rc; - struct timer_list timeout; struct tape_request *request; request = tape_alloc_request(2, 11); @@ -70,7 +69,7 @@ tape_std_assign(struct tape_device *device) * So we set up a timeout for this call. */ timer_setup(&request->timer, tape_std_assign_timeout, 0); - mod_timer(&timeout, jiffies + 2 * HZ); + mod_timer(&request->timer, jiffies + msecs_to_jiffies(2000)); rc = tape_do_io_interruptible(device, request); From 4799e4cc3144abd651d640696e0530ae8ef7938d Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Wed, 8 Sep 2021 17:36:23 +0200 Subject: [PATCH 1672/5344] s390/cio: make ccw_device_dma_* more robust commit ad9a14517263a16af040598c7920c09ca9670a31 upstream. Since commit 48720ba56891 ("virtio/s390: use DMA memory for ccw I/O and classic notifiers") we were supposed to make sure that virtio_ccw_release_dev() completes before the ccw device and the attached dma pool are torn down, but unfortunately we did not. Before that commit it used to be OK to delay cleaning up the memory allocated by virtio-ccw indefinitely (which isn't really intuitive for guys used to destruction happens in reverse construction order), but now we trigger a BUG_ON if the genpool is destroyed before all memory allocated from it is deallocated. Which brings down the guest. We can observe this problem, when unregister_virtio_device() does not give up the last reference to the virtio_device (e.g. because a virtio-scsi attached scsi disk got removed without previously unmounting its previously mounted partition). To make sure that the genpool is only destroyed after all the necessary freeing is done let us take a reference on the ccw device on each ccw_device_dma_zalloc() and give it up on each ccw_device_dma_free(). Actually there are multiple approaches to fixing the problem at hand that can work. The upside of this one is that it is the safest one while remaining simple. We don't crash the guest even if the driver does not pair allocations and frees. The downside is the reference counting overhead, that the reference counting for ccw devices becomes more complex, in a sense that we need to pair the calls to the aforementioned functions for it to be correct, and that if we happen to leak, we leak more than necessary (the whole ccw device instead of just the genpool). Some alternatives to this approach are taking a reference in virtio_ccw_online() and giving it up in virtio_ccw_release_dev() or making sure virtio_ccw_release_dev() completes its work before virtio_ccw_remove() returns. The downside of these approaches is that these are less safe against programming errors. Cc: # v5.3 Signed-off-by: Halil Pasic Fixes: 48720ba56891 ("virtio/s390: use DMA memory for ccw I/O and classic notifiers") Reported-by: bfu@redhat.com Reviewed-by: Vineeth Vijayan Acked-by: Cornelia Huck Signed-off-by: Vasily Gorbik Signed-off-by: Greg Kroah-Hartman --- drivers/s390/cio/device_ops.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/s390/cio/device_ops.c b/drivers/s390/cio/device_ops.c index ccecf6b9504e8..5a411595347b5 100644 --- a/drivers/s390/cio/device_ops.c +++ b/drivers/s390/cio/device_ops.c @@ -717,13 +717,23 @@ EXPORT_SYMBOL_GPL(ccw_device_get_schid); */ void *ccw_device_dma_zalloc(struct ccw_device *cdev, size_t size) { - return cio_gp_dma_zalloc(cdev->private->dma_pool, &cdev->dev, size); + void *addr; + + if (!get_device(&cdev->dev)) + return NULL; + addr = cio_gp_dma_zalloc(cdev->private->dma_pool, &cdev->dev, size); + if (IS_ERR_OR_NULL(addr)) + put_device(&cdev->dev); + return addr; } EXPORT_SYMBOL(ccw_device_dma_zalloc); void ccw_device_dma_free(struct ccw_device *cdev, void *cpu_addr, size_t size) { + if (!cpu_addr) + return; cio_gp_dma_free(cdev->private->dma_pool, cpu_addr, size); + put_device(&cdev->dev); } EXPORT_SYMBOL(ccw_device_dma_free); From f62054bf962d3f40772cb26c3fb73bfd33ae384b Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Thu, 28 Oct 2021 22:27:16 +0530 Subject: [PATCH 1673/5344] powerpc/powernv/prd: Unregister OPAL_MSG_PRD2 notifier during module unload commit 52862ab33c5d97490f3fa345d6529829e6d6637b upstream. Commit 587164cd, introduced new opal message type (OPAL_MSG_PRD2) and added opal notifier. But I missed to unregister the notifier during module unload path. This results in below call trace if you try to unload and load opal_prd module. Also add new notifier_block for OPAL_MSG_PRD2 message. Sample calltrace (modprobe -r opal_prd; modprobe opal_prd) BUG: Unable to handle kernel data access on read at 0xc0080000192200e0 Faulting instruction address: 0xc00000000018d1cc Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA PowerNV CPU: 66 PID: 7446 Comm: modprobe Kdump: loaded Tainted: G E 5.14.0prd #759 NIP: c00000000018d1cc LR: c00000000018d2a8 CTR: c0000000000cde10 REGS: c0000003c4c0f0a0 TRAP: 0300 Tainted: G E (5.14.0prd) MSR: 9000000002009033 CR: 24224824 XER: 20040000 CFAR: c00000000018d2a4 DAR: c0080000192200e0 DSISR: 40000000 IRQMASK: 1 ... NIP notifier_chain_register+0x2c/0xc0 LR atomic_notifier_chain_register+0x48/0x80 Call Trace: 0xc000000002090610 (unreliable) atomic_notifier_chain_register+0x58/0x80 opal_message_notifier_register+0x7c/0x1e0 opal_prd_probe+0x84/0x150 [opal_prd] platform_probe+0x78/0x130 really_probe+0x110/0x5d0 __driver_probe_device+0x17c/0x230 driver_probe_device+0x60/0x130 __driver_attach+0xfc/0x220 bus_for_each_dev+0xa8/0x130 driver_attach+0x34/0x50 bus_add_driver+0x1b0/0x300 driver_register+0x98/0x1a0 __platform_driver_register+0x38/0x50 opal_prd_driver_init+0x34/0x50 [opal_prd] do_one_initcall+0x60/0x2d0 do_init_module+0x7c/0x320 load_module+0x3394/0x3650 __do_sys_finit_module+0xd4/0x160 system_call_exception+0x140/0x290 system_call_common+0xf4/0x258 Fixes: 587164cd593c ("powerpc/powernv: Add new opal message type") Cc: stable@vger.kernel.org # v5.4+ Signed-off-by: Vasant Hegde Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211028165716.41300-1-hegdevasant@linux.vnet.ibm.com Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/platforms/powernv/opal-prd.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c index 45f4223a790f2..6ebc906e07b0a 100644 --- a/arch/powerpc/platforms/powernv/opal-prd.c +++ b/arch/powerpc/platforms/powernv/opal-prd.c @@ -372,6 +372,12 @@ static struct notifier_block opal_prd_event_nb = { .priority = 0, }; +static struct notifier_block opal_prd_event_nb2 = { + .notifier_call = opal_prd_msg_notifier, + .next = NULL, + .priority = 0, +}; + static int opal_prd_probe(struct platform_device *pdev) { int rc; @@ -393,9 +399,10 @@ static int opal_prd_probe(struct platform_device *pdev) return rc; } - rc = opal_message_notifier_register(OPAL_MSG_PRD2, &opal_prd_event_nb); + rc = opal_message_notifier_register(OPAL_MSG_PRD2, &opal_prd_event_nb2); if (rc) { pr_err("Couldn't register PRD2 event notifier\n"); + opal_message_notifier_unregister(OPAL_MSG_PRD, &opal_prd_event_nb); return rc; } @@ -404,6 +411,8 @@ static int opal_prd_probe(struct platform_device *pdev) pr_err("failed to register miscdev\n"); opal_message_notifier_unregister(OPAL_MSG_PRD, &opal_prd_event_nb); + opal_message_notifier_unregister(OPAL_MSG_PRD2, + &opal_prd_event_nb2); return rc; } @@ -414,6 +423,7 @@ static int opal_prd_remove(struct platform_device *pdev) { misc_deregister(&opal_prd_dev); opal_message_notifier_unregister(OPAL_MSG_PRD, &opal_prd_event_nb); + opal_message_notifier_unregister(OPAL_MSG_PRD2, &opal_prd_event_nb2); return 0; } From 411e40c8cc3fdb791ba51684d3a735373917fcfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Tue, 5 Oct 2021 20:09:40 +0200 Subject: [PATCH 1674/5344] PCI: Add PCI_EXP_DEVCTL_PAYLOAD_* macros MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 460275f124fb072dca218a6b43b6370eebbab20d upstream. Define a macro PCI_EXP_DEVCTL_PAYLOAD_* for every possible Max Payload Size in linux/pci_regs.h, in the same style as PCI_EXP_DEVCTL_READRQ_*. Link: https://lore.kernel.org/r/20211005180952.6812-2-kabel@kernel.org Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marek Behún Reviewed-by: Bjorn Helgaas Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/pci_regs.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 29d6e93fd15e3..b485d8b0d5a79 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -502,6 +502,12 @@ #define PCI_EXP_DEVCTL_URRE 0x0008 /* Unsupported Request Reporting En. */ #define PCI_EXP_DEVCTL_RELAX_EN 0x0010 /* Enable relaxed ordering */ #define PCI_EXP_DEVCTL_PAYLOAD 0x00e0 /* Max_Payload_Size */ +#define PCI_EXP_DEVCTL_PAYLOAD_128B 0x0000 /* 128 Bytes */ +#define PCI_EXP_DEVCTL_PAYLOAD_256B 0x0020 /* 256 Bytes */ +#define PCI_EXP_DEVCTL_PAYLOAD_512B 0x0040 /* 512 Bytes */ +#define PCI_EXP_DEVCTL_PAYLOAD_1024B 0x0060 /* 1024 Bytes */ +#define PCI_EXP_DEVCTL_PAYLOAD_2048B 0x0080 /* 2048 Bytes */ +#define PCI_EXP_DEVCTL_PAYLOAD_4096B 0x00a0 /* 4096 Bytes */ #define PCI_EXP_DEVCTL_EXT_TAG 0x0100 /* Extended Tag Field Enable */ #define PCI_EXP_DEVCTL_PHANTOM 0x0200 /* Phantom Functions Enable */ #define PCI_EXP_DEVCTL_AUX_PME 0x0400 /* Auxiliary Power PM Enable */ From 9177643cdfc3dcd942b579fc64ba6c7ec08cbae1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 12 Jul 2021 09:52:59 -0400 Subject: [PATCH 1675/5344] SUNRPC: Partial revert of commit 6f9f17287e78 commit ea7a1019d8baf8503ecd6e3ec8436dec283569e6 upstream. The premise of commit 6f9f17287e78 ("SUNRPC: Mitigate cond_resched() in xprt_transmit()") was that cond_resched() is expensive and unnecessary when there has been just a single send. The point of cond_resched() is to ensure that tasks that should pre-empt this one get a chance to do so when it is safe to do so. The code prior to commit 6f9f17287e78 failed to take into account that it was keeping a rpc_task pinned for longer than it needed to, and so rather than doing a full revert, let's just move the cond_resched. Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- net/sunrpc/xprt.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 3653898f465ff..93b6afd28405e 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1538,15 +1538,14 @@ xprt_transmit(struct rpc_task *task) { struct rpc_rqst *next, *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; - int counter, status; + int status; spin_lock(&xprt->queue_lock); - counter = 0; - while (!list_empty(&xprt->xmit_queue)) { - if (++counter == 20) + for (;;) { + next = list_first_entry_or_null(&xprt->xmit_queue, + struct rpc_rqst, rq_xmit); + if (!next) break; - next = list_first_entry(&xprt->xmit_queue, - struct rpc_rqst, rq_xmit); xprt_pin_rqst(next); spin_unlock(&xprt->queue_lock); status = xprt_request_transmit(next, task); @@ -1554,13 +1553,16 @@ xprt_transmit(struct rpc_task *task) status = 0; spin_lock(&xprt->queue_lock); xprt_unpin_rqst(next); - if (status == 0) { - if (!xprt_request_data_received(task) || - test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) - continue; - } else if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) - task->tk_status = status; - break; + if (status < 0) { + if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) + task->tk_status = status; + break; + } + /* Was @task transmitted, and has it received a reply? */ + if (xprt_request_data_received(task) && + !test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) + break; + cond_resched_lock(&xprt->queue_lock); } spin_unlock(&xprt->queue_lock); } From 0f8b9e3002ff653655b07370598473b60bc4b729 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 20 Oct 2021 11:59:07 +0300 Subject: [PATCH 1676/5344] ath10k: fix invalid dma_addr_t token assignment commit 937e79c67740d1d84736730d679f3cb2552f990e upstream. Using a kernel pointer in place of a dma_addr_t token can lead to undefined behavior if that makes it into cache management functions. The compiler caught one such attempt in a cast: drivers/net/wireless/ath/ath10k/mac.c: In function 'ath10k_add_interface': drivers/net/wireless/ath/ath10k/mac.c:5586:47: error: cast from pointer to integer of different size [-Werror=pointer-to-int-cast] 5586 | arvif->beacon_paddr = (dma_addr_t)arvif->beacon_buf; | ^ Looking through how this gets used down the way, I'm fairly sure that beacon_paddr is never accessed again for ATH10K_DEV_TYPE_HL devices, and if it was accessed, that would be a bug. Change the assignment to use a known-invalid address token instead, which avoids the warning and makes it easier to catch bugs if it does end up getting used. Fixes: e263bdab9c0e ("ath10k: high latency fixes for beacon buffer") Signed-off-by: Arnd Bergmann Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20211014075153.3655910-1-arnd@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/ath/ath10k/mac.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index 9daaacf789d60..3026eb54a7f23 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -5258,7 +5258,15 @@ static int ath10k_add_interface(struct ieee80211_hw *hw, if (ar->bus_param.dev_type == ATH10K_DEV_TYPE_HL) { arvif->beacon_buf = kmalloc(IEEE80211_MAX_FRAME_LEN, GFP_KERNEL); - arvif->beacon_paddr = (dma_addr_t)arvif->beacon_buf; + + /* Using a kernel pointer in place of a dma_addr_t + * token can lead to undefined behavior if that + * makes it into cache management functions. Use a + * known-invalid address token instead, which + * avoids the warning and makes it easier to catch + * bugs if it does end up getting used. + */ + arvif->beacon_paddr = DMA_MAPPING_ERROR; } else { arvif->beacon_buf = dma_alloc_coherent(ar->dev, From c5859b4e826e2a84da97a650e75f2d8850193d8e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 1 Nov 2021 16:01:18 -0700 Subject: [PATCH 1677/5344] selftests/bpf: Fix also no-alu32 strobemeta selftest commit a20eac0af02810669e187cb623bc904908c423af upstream. Previous fix aded bpf_clamp_umax() helper use to re-validate boundaries. While that works correctly, it introduces more branches, which blows up past 1 million instructions in no-alu32 variant of strobemeta selftests. Switching len variable from u32 to u64 also fixes the issue and reduces the number of validated instructions, so use that instead. Fix this patch and bpf_clamp_umax() removed, both alu32 and no-alu32 selftests pass. Fixes: 0133c20480b1 ("selftests/bpf: Fix strobemeta selftest regression") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211101230118.1273019-1-andrii@kernel.org Signed-off-by: Greg Kroah-Hartman --- tools/testing/selftests/bpf/progs/strobemeta.h | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/strobemeta.h b/tools/testing/selftests/bpf/progs/strobemeta.h index 938765d87528a..5ba8d39c4d544 100644 --- a/tools/testing/selftests/bpf/progs/strobemeta.h +++ b/tools/testing/selftests/bpf/progs/strobemeta.h @@ -10,14 +10,6 @@ #include #include "bpf_helpers.h" -#define bpf_clamp_umax(VAR, UMAX) \ - asm volatile ( \ - "if %0 <= %[max] goto +1\n" \ - "%0 = %[max]\n" \ - : "+r"(VAR) \ - : [max]"i"(UMAX) \ - ) - typedef uint32_t pid_t; struct task_struct {}; @@ -357,7 +349,7 @@ static __always_inline uint64_t read_str_var(struct strobemeta_cfg *cfg, void *payload) { void *location; - uint32_t len; + uint64_t len; data->str_lens[idx] = 0; location = calc_location(&cfg->str_locs[idx], tls_base); @@ -389,7 +381,7 @@ static __always_inline void *read_map_var(struct strobemeta_cfg *cfg, struct strobe_map_descr* descr = &data->map_descrs[idx]; struct strobe_map_raw map; void *location; - uint32_t len; + uint64_t len; int i; descr->tag_len = 0; /* presume no tag is set */ @@ -412,7 +404,6 @@ static __always_inline void *read_map_var(struct strobemeta_cfg *cfg, len = bpf_probe_read_str(payload, STROBE_MAX_STR_LEN, map.tag); if (len <= STROBE_MAX_STR_LEN) { - bpf_clamp_umax(len, STROBE_MAX_STR_LEN); descr->tag_len = len; payload += len; } @@ -430,7 +421,6 @@ static __always_inline void *read_map_var(struct strobemeta_cfg *cfg, len = bpf_probe_read_str(payload, STROBE_MAX_STR_LEN, map.entries[i].key); if (len <= STROBE_MAX_STR_LEN) { - bpf_clamp_umax(len, STROBE_MAX_STR_LEN); descr->key_lens[i] = len; payload += len; } @@ -438,7 +428,6 @@ static __always_inline void *read_map_var(struct strobemeta_cfg *cfg, len = bpf_probe_read_str(payload, STROBE_MAX_STR_LEN, map.entries[i].val); if (len <= STROBE_MAX_STR_LEN) { - bpf_clamp_umax(len, STROBE_MAX_STR_LEN); descr->val_lens[i] = len; payload += len; } From 72379885152c460ebe1af1404f576da1f83065a8 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 17 Nov 2021 09:48:51 +0100 Subject: [PATCH 1678/5344] Linux 5.4.160 Link: https://lore.kernel.org/r/20211115165313.549179499@linuxfoundation.org Tested-by: Shuah Khan Tested-by: Hulk Robot Tested-by: Jon Hunter Tested-by: Sudip Mukherjee Link: https://lore.kernel.org/r/20211116142514.833707661@linuxfoundation.org Tested-by: Linux Kernel Functional Testing Tested-by: Guenter Roeck Tested-by: Jon Hunter Tested-by: Shuah Khan Tested-by: Florian Fainelli Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 602b5167dacd7..e938662dab289 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 PATCHLEVEL = 4 -SUBLEVEL = 159 +SUBLEVEL = 160 EXTRAVERSION = NAME = Kleptomaniac Octopus From 15ab307b0575136fb3cbc0179cc16fdd8752f92c Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Tue, 2 Mar 2021 15:24:58 +0300 Subject: [PATCH 1679/5344] soc/tegra: pmc: Fix imbalanced clock disabling in error code path commit 19221e3083020bd9537624caa0ee0145ed92ba36 upstream. The tegra_powergate_power_up() has a typo in the error code path where it will try to disable clocks twice, fix it. In practice that error never happens, so this is a minor correction. Tested-by: Peter Geis # Ouya T30 Tested-by: Nicolas Chauvet # PAZ00 T20 and TK1 T124 Tested-by: Matt Merhar # Ouya T30 Signed-off-by: Dmitry Osipenko Signed-off-by: Thierry Reding Cc: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- drivers/soc/tegra/pmc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/soc/tegra/pmc.c b/drivers/soc/tegra/pmc.c index ab75f41e9c0c9..d11320a4dfcf9 100644 --- a/drivers/soc/tegra/pmc.c +++ b/drivers/soc/tegra/pmc.c @@ -579,7 +579,7 @@ static int tegra_powergate_power_up(struct tegra_powergate *pg, err = tegra_powergate_enable_clocks(pg); if (err) - goto disable_clks; + goto powergate_off; usleep_range(10, 20); From f5dd6893108f30f3514e3c3f1e7282aa33c5b525 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 11 Aug 2020 16:39:35 +0300 Subject: [PATCH 1680/5344] scsi: ufs: Fix interrupt error message for shared interrupts commit 6337f58cec030b34ced435b3d9d7d29d63c96e36 upstream. The interrupt might be shared, in which case it is not an error for the interrupt handler to be called when the interrupt status is zero, so don't print the message unless there was enabled interrupt status. Link: https://lore.kernel.org/r/20200811133936.19171-1-adrian.hunter@intel.com Fixes: 9333d7757348 ("scsi: ufs: Fix irq return code") Reviewed-by: Avri Altman Signed-off-by: Adrian Hunter Signed-off-by: Martin K. Petersen Signed-off-by: Orson Zhai Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/ufs/ufshcd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 24396f4d5f2d3..a5d4ee6b750be 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -5661,7 +5661,7 @@ static irqreturn_t ufshcd_intr(int irq, void *__hba) intr_status = ufshcd_readl(hba, REG_INTERRUPT_STATUS); } - if (retval == IRQ_NONE) { + if (enabled_intr_status && retval == IRQ_NONE) { dev_err(hba->dev, "%s: Unhandled interrupt 0x%08x\n", __func__, intr_status); ufshcd_dump_regs(hba, 0, UFSHCI_REG_SPACE_SIZE, "host_regs: "); From 83b5cc5df5cf89dffb0e41d4f282ea77b0b55785 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Fri, 22 Oct 2021 00:58:23 +0200 Subject: [PATCH 1681/5344] MIPS: Fix assembly error from MIPSr2 code used within MIPS_ISA_ARCH_LEVEL commit a923a2676e60683aee46aa4b93c30aff240ac20d upstream. Fix assembly errors like: {standard input}: Assembler messages: {standard input}:287: Error: opcode not supported on this processor: mips3 (mips3) `dins $10,$7,32,32' {standard input}:680: Error: opcode not supported on this processor: mips3 (mips3) `dins $10,$7,32,32' {standard input}:1274: Error: opcode not supported on this processor: mips3 (mips3) `dins $12,$9,32,32' {standard input}:2175: Error: opcode not supported on this processor: mips3 (mips3) `dins $10,$7,32,32' make[1]: *** [scripts/Makefile.build:277: mm/highmem.o] Error 1 with code produced from `__cmpxchg64' for MIPS64r2 CPU configurations using CONFIG_32BIT and CONFIG_PHYS_ADDR_T_64BIT. This is due to MIPS_ISA_ARCH_LEVEL downgrading the assembly architecture to `r4000' i.e. MIPS III for MIPS64r2 configurations, while there is a block of code containing a DINS MIPS64r2 instruction conditionalized on MIPS_ISA_REV >= 2 within the scope of the downgrade. The assembly architecture override code pattern has been put there for LL/SC instructions, so that code compiles for configurations that select a processor to build for that does not support these instructions while still providing run-time support for processors that do, dynamically switched by non-constant `cpu_has_llsc'. It went in with linux-mips.org commit aac8aa7717a2 ("Enable a suitable ISA for the assembler around ll/sc so that code builds even for processors that don't support the instructions. Plus minor formatting fixes.") back in 2005. Fix the problem by wrapping these instructions along with the adjacent SYNC instructions only, following the practice established with commit cfd54de3b0e4 ("MIPS: Avoid move psuedo-instruction whilst using MIPS_ISA_LEVEL") and commit 378ed6f0e3c5 ("MIPS: Avoid using .set mips0 to restore ISA"). Strictly speaking the SYNC instructions do not have to be wrapped as they are only used as a Loongson3 erratum workaround, so they will be enabled in the assembler by default, but do this so as to keep code consistent with other places. Reported-by: kernel test robot Signed-off-by: Maciej W. Rozycki Fixes: c7e2d71dda7a ("MIPS: Fix set_pte() for Netlogic XLR using cmpxchg64()") Cc: stable@vger.kernel.org # v5.1+ Signed-off-by: Thomas Bogendoerfer Signed-off-by: Greg Kroah-Hartman --- arch/mips/include/asm/cmpxchg.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/mips/include/asm/cmpxchg.h b/arch/mips/include/asm/cmpxchg.h index f6136871561dc..9182ce828f540 100644 --- a/arch/mips/include/asm/cmpxchg.h +++ b/arch/mips/include/asm/cmpxchg.h @@ -239,6 +239,7 @@ static inline unsigned long __cmpxchg64(volatile void *ptr, " .set " MIPS_ISA_ARCH_LEVEL " \n" /* Load 64 bits from ptr */ "1: lld %L0, %3 # __cmpxchg64 \n" + " .set pop \n" /* * Split the 64 bit value we loaded into the 2 registers that hold the * ret variable. @@ -266,6 +267,8 @@ static inline unsigned long __cmpxchg64(volatile void *ptr, " or %L1, %L1, $at \n" " .set at \n" # endif + " .set push \n" + " .set " MIPS_ISA_ARCH_LEVEL " \n" /* Attempt to store new at ptr */ " scd %L1, %2 \n" /* If we failed, loop! */ From fb65a4c9003390b1c461a7216a9d8f462aabcb51 Mon Sep 17 00:00:00 2001 From: Shaoying Xu Date: Thu, 2 Sep 2021 16:44:12 +0000 Subject: [PATCH 1682/5344] ext4: fix lazy initialization next schedule time computation in more granular unit commit 39fec6889d15a658c3a3ebb06fd69d3584ddffd3 upstream. Ext4 file system has default lazy inode table initialization setup once it is mounted. However, it has issue on computing the next schedule time that makes the timeout same amount in jiffies but different real time in secs if with various HZ values. Therefore, fix by measuring the current time in a more granular unit nanoseconds and make the next schedule time independent of the HZ value. Fixes: bfff68738f1c ("ext4: add support for lazy inode table initialization") Signed-off-by: Shaoying Xu Cc: stable@vger.kernel.org Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20210902164412.9994-2-shaoyi@amazon.com Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/super.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1211ae203face..f68dfef5939f4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3071,8 +3071,8 @@ static int ext4_run_li_request(struct ext4_li_request *elr) struct ext4_group_desc *gdp = NULL; ext4_group_t group, ngroups; struct super_block *sb; - unsigned long timeout = 0; int ret = 0; + u64 start_time; sb = elr->lr_super; ngroups = EXT4_SB(sb)->s_groups_count; @@ -3092,13 +3092,12 @@ static int ext4_run_li_request(struct ext4_li_request *elr) ret = 1; if (!ret) { - timeout = jiffies; + start_time = ktime_get_real_ns(); ret = ext4_init_inode_table(sb, group, elr->lr_timeout ? 0 : 1); if (elr->lr_timeout == 0) { - timeout = (jiffies - timeout) * - elr->lr_sbi->s_li_wait_mult; - elr->lr_timeout = timeout; + elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) * + elr->lr_sbi->s_li_wait_mult); } elr->lr_next_sched = jiffies + elr->lr_timeout; elr->lr_next_group = group + 1; From 3947f8c1acb08494e04b3b70591ab1067b431f55 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 7 Jan 2021 10:53:16 -0800 Subject: [PATCH 1683/5344] scsi: ufs: Fix tm request when non-fatal error happens commit eeb1b55b6e25c5f7265ff45cd050f3bc2cc423a4 upstream. When non-fatal error like line-reset happens, ufshcd_err_handler() starts to abort tasks by ufshcd_try_to_abort_task(). When it tries to issue a task management request, we hit two warnings: WARNING: CPU: 7 PID: 7 at block/blk-core.c:630 blk_get_request+0x68/0x70 WARNING: CPU: 4 PID: 157 at block/blk-mq-tag.c:82 blk_mq_get_tag+0x438/0x46c After fixing the above warnings we hit another tm_cmd timeout which may be caused by unstable controller state: __ufshcd_issue_tm_cmd: task management cmd 0x80 timed-out Then, ufshcd_err_handler() enters full reset, and kernel gets stuck. It turned out ufshcd_print_trs() printed too many messages on console which requires CPU locks. Likewise hba->silence_err_logs, we need to avoid too verbose messages. This is actually not an error case. Link: https://lore.kernel.org/r/20210107185316.788815-3-jaegeuk@kernel.org Fixes: 69a6c269c097 ("scsi: ufs: Use blk_{get,put}_request() to allocate and free TMFs") Reviewed-by: Can Guo Signed-off-by: Jaegeuk Kim Signed-off-by: Martin K. Petersen [Zhai: remove an item of debug print not available in v5.4] Signed-off-by: Orson Zhai Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/ufs/ufshcd.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index a5d4ee6b750be..29c7a76d2c658 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -4748,7 +4748,8 @@ ufshcd_transfer_rsp_status(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) break; } /* end of switch */ - if ((host_byte(result) != DID_OK) && !hba->silence_err_logs) + if ((host_byte(result) != DID_OK) && + (host_byte(result) != DID_REQUEUE) && !hba->silence_err_logs) ufshcd_print_trs(hba, 1 << lrbp->task_tag, true); return result; } @@ -5661,9 +5662,12 @@ static irqreturn_t ufshcd_intr(int irq, void *__hba) intr_status = ufshcd_readl(hba, REG_INTERRUPT_STATUS); } - if (enabled_intr_status && retval == IRQ_NONE) { - dev_err(hba->dev, "%s: Unhandled interrupt 0x%08x\n", - __func__, intr_status); + if (enabled_intr_status && retval == IRQ_NONE && + !ufshcd_eh_in_progress(hba)) { + dev_err(hba->dev, "%s: Unhandled interrupt 0x%08x (-, 0x%08x)\n", + __func__, + intr_status, + enabled_intr_status); ufshcd_dump_regs(hba, 0, UFSHCI_REG_SPACE_SIZE, "host_regs: "); } @@ -5705,7 +5709,10 @@ static int __ufshcd_issue_tm_cmd(struct ufs_hba *hba, /* * blk_get_request() is used here only to get a free tag. */ - req = blk_get_request(q, REQ_OP_DRV_OUT, BLK_MQ_REQ_RESERVED); + req = blk_get_request(q, REQ_OP_DRV_OUT, 0); + if (IS_ERR(req)) + return PTR_ERR(req); + req->end_io_data = &wait; ufshcd_hold(hba, false); From d7a2464eb1c654a24522c151d12da0e8d12073d1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 12 May 2021 21:51:10 -0700 Subject: [PATCH 1684/5344] fortify: Explicitly disable Clang support commit a52f8a59aef46b59753e583bf4b28fccb069ce64 upstream. Clang has never correctly compiled the FORTIFY_SOURCE defenses due to a couple bugs: Eliding inlines with matching __builtin_* names https://bugs.llvm.org/show_bug.cgi?id=50322 Incorrect __builtin_constant_p() of some globals https://bugs.llvm.org/show_bug.cgi?id=41459 In the process of making improvements to the FORTIFY_SOURCE defenses, the first (silent) bug (coincidentally) becomes worked around, but exposes the latter which breaks the build. As such, Clang must not be used with CONFIG_FORTIFY_SOURCE until at least latter bug is fixed (in Clang 13), and the fortify routines have been rearranged. Update the Kconfig to reflect the reality of the current situation. Signed-off-by: Kees Cook Acked-by: Nick Desaulniers Link: https://lore.kernel.org/lkml/CAKwvOd=A+ueGV2ihdy5GtgR2fQbcXjjAtVxv3=cPjffpebZB7A@mail.gmail.com Cc: Nathan Chancellor Signed-off-by: Greg Kroah-Hartman --- security/Kconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/security/Kconfig b/security/Kconfig index 2a1a2d3962281..52e5109f2c1b6 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -191,6 +191,9 @@ config HARDENED_USERCOPY_PAGESPAN config FORTIFY_SOURCE bool "Harden common str/mem functions against buffer overflows" depends on ARCH_HAS_FORTIFY_SOURCE + # https://bugs.llvm.org/show_bug.cgi?id=50322 + # https://bugs.llvm.org/show_bug.cgi?id=41459 + depends on !CC_IS_CLANG help Detect overflows of buffers in common string and memory functions where the compiler can determine and validate the buffer sizes. From ee2469a6bdc9446fbb03ece5f4d234fc79c1a2be Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Sat, 13 Nov 2021 20:41:17 +0100 Subject: [PATCH 1685/5344] parisc/entry: fix trace test in syscall exit path commit 3ec18fc7831e7d79e2d536dd1f3bc0d3ba425e8a upstream. commit 8779e05ba8aa ("parisc: Fix ptrace check on syscall return") fixed testing of TI_FLAGS. This uncovered a bug in the test mask. syscall_restore_rfi is only used when the kernel needs to exit to usespace with single or block stepping and the recovery counter enabled. The test however used _TIF_SYSCALL_TRACE_MASK, which includes a lot of bits that shouldn't be tested here. Fix this by using TIF_SINGLESTEP and TIF_BLOCKSTEP directly. I encountered this bug by enabling syscall tracepoints. Both in qemu and on real hardware. As soon as i enabled the tracepoint (sys_exit_read, but i guess it doesn't really matter which one), i got random page faults in userspace almost immediately. Signed-off-by: Sven Schnelle Signed-off-by: Helge Deller Signed-off-by: Greg Kroah-Hartman --- arch/parisc/kernel/entry.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S index 7d14e9ae18fb1..2f64f112934b6 100644 --- a/arch/parisc/kernel/entry.S +++ b/arch/parisc/kernel/entry.S @@ -1842,7 +1842,7 @@ syscall_restore: /* Are we being ptraced? */ LDREG TI_FLAGS-THREAD_SZ_ALGN-FRAME_SIZE(%r30),%r19 - ldi _TIF_SYSCALL_TRACE_MASK,%r2 + ldi _TIF_SINGLESTEP|_TIF_BLOCKSTEP,%r2 and,COND(=) %r19,%r2,%r0 b,n syscall_restore_rfi From 20c6131c966321a6f76f00c2b9b157a7d254f02a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 9 Nov 2021 14:53:57 +0100 Subject: [PATCH 1686/5344] PCI/MSI: Destroy sysfs before freeing entries commit 3735459037114d31e5acd9894fad9aed104231a0 upstream. free_msi_irqs() frees the MSI entries before destroying the sysfs entries which are exposing them. Nothing prevents a concurrent free while a sysfs file is read and accesses the possibly freed entry. Move the sysfs release ahead of freeing the entries. Fixes: 1c51b50c2995 ("PCI/MSI: Export MSI mode using attributes, not kobjects") Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/87sfw5305m.ffs@tglx Signed-off-by: Greg Kroah-Hartman --- drivers/pci/msi.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 971080232afc7..8a78f92697446 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -395,18 +395,6 @@ static void free_msi_irqs(struct pci_dev *dev) for (i = 0; i < entry->nvec_used; i++) BUG_ON(irq_has_action(entry->irq + i)); - pci_msi_teardown_msi_irqs(dev); - - list_for_each_entry_safe(entry, tmp, msi_list, list) { - if (entry->msi_attrib.is_msix) { - if (list_is_last(&entry->list, msi_list)) - iounmap(entry->mask_base); - } - - list_del(&entry->list); - free_msi_entry(entry); - } - if (dev->msi_irq_groups) { sysfs_remove_groups(&dev->dev.kobj, dev->msi_irq_groups); msi_attrs = dev->msi_irq_groups[0]->attrs; @@ -422,6 +410,18 @@ static void free_msi_irqs(struct pci_dev *dev) kfree(dev->msi_irq_groups); dev->msi_irq_groups = NULL; } + + pci_msi_teardown_msi_irqs(dev); + + list_for_each_entry_safe(entry, tmp, msi_list, list) { + if (entry->msi_attrib.is_msix) { + if (list_is_last(&entry->list, msi_list)) + iounmap(entry->mask_base); + } + + list_del(&entry->list); + free_msi_entry(entry); + } } static void pci_intx_for_msi(struct pci_dev *dev, int enable) From 8dffb800352e75438aab86b9c05582382b222804 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 4 Nov 2021 18:01:29 +0000 Subject: [PATCH 1687/5344] PCI/MSI: Deal with devices lying about their MSI mask capability commit 2226667a145db2e1f314d7f57fd644fe69863ab9 upstream. It appears that some devices are lying about their mask capability, pretending that they don't have it, while they actually do. The net result is that now that we don't enable MSIs on such endpoint. Add a new per-device flag to deal with this. Further patches will make use of it, sadly. Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20211104180130.3825416-2-maz@kernel.org Cc: Bjorn Helgaas Signed-off-by: Greg Kroah-Hartman --- drivers/pci/msi.c | 3 +++ include/linux/pci.h | 2 ++ 2 files changed, 5 insertions(+) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 8a78f92697446..f3df6a851a631 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -591,6 +591,9 @@ msi_setup_entry(struct pci_dev *dev, int nvec, struct irq_affinity *affd) goto out; pci_read_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, &control); + /* Lies, damned lies, and MSIs */ + if (dev->dev_flags & PCI_DEV_FLAGS_HAS_MSI_MASKING) + control |= PCI_MSI_FLAGS_MASKBIT; entry->msi_attrib.is_msix = 0; entry->msi_attrib.is_64 = !!(control & PCI_MSI_FLAGS_64BIT); diff --git a/include/linux/pci.h b/include/linux/pci.h index 65399271f9202..6a3e436954993 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -208,6 +208,8 @@ enum pci_dev_flags { PCI_DEV_FLAGS_NO_FLR_RESET = (__force pci_dev_flags_t) (1 << 10), /* Don't use Relaxed Ordering for TLPs directed at this device */ PCI_DEV_FLAGS_NO_RELAXED_ORDERING = (__force pci_dev_flags_t) (1 << 11), + /* Device does honor MSI masking despite saying otherwise */ + PCI_DEV_FLAGS_HAS_MSI_MASKING = (__force pci_dev_flags_t) (1 << 12), }; enum pci_irq_reroute_variant { From bd676ca998b7e829ecf0ec53663026ed7898c4f0 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 4 Nov 2021 18:01:30 +0000 Subject: [PATCH 1688/5344] PCI: Add MSI masking quirk for Nvidia ION AHCI commit f21082fb20dbfb3e42b769b59ef21c2a7f2c7c1f upstream. The ION AHCI device pretends that MSI masking isn't a thing, while it actually implements it and needs MSIs to be unmasked to work. Add a quirk to that effect. Reported-by: Rui Salvaterra Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Rui Salvaterra Reviewed-by: Thomas Gleixner Cc: Bjorn Helgaas Link: https://lore.kernel.org/r/CALjTZvbzYfBuLB+H=fj2J+9=DxjQ2Uqcy0if_PvmJ-nU-qEgkg@mail.gmail.com Link: https://lore.kernel.org/r/20211104180130.3825416-3-maz@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/pci/quirks.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index fb7c5518447da..cf3986d4413f7 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -5777,3 +5777,9 @@ static void apex_pci_fixup_class(struct pci_dev *pdev) } DECLARE_PCI_FIXUP_CLASS_HEADER(0x1ac1, 0x089a, PCI_CLASS_NOT_DEFINED, 8, apex_pci_fixup_class); + +static void nvidia_ion_ahci_fixup(struct pci_dev *pdev) +{ + pdev->dev_flags |= PCI_DEV_FLAGS_HAS_MSI_MASKING; +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NVIDIA, 0x0ab8, nvidia_ion_ahci_fixup); From f74cb2451b35f8b8a8ed1f86391c588ea3b43806 Mon Sep 17 00:00:00 2001 From: Yue Hu Date: Tue, 16 Nov 2021 09:10:34 +0800 Subject: [PATCH 1689/5344] erofs: remove the occupied parameter from z_erofs_pagevec_enqueue() commit 7dea3de7d384f4c8156e8bd93112ba6db1eb276c upstream. No any behavior to variable occupied in z_erofs_attach_page() which is only caller to z_erofs_pagevec_enqueue(). Link: https://lore.kernel.org/r/20210419102623.2015-1-zbestahu@gmail.com Signed-off-by: Yue Hu Reviewed-by: Gao Xiang Signed-off-by: Gao Xiang Signed-off-by: Gao Xiang Signed-off-by: Greg Kroah-Hartman --- fs/erofs/zdata.c | 4 +--- fs/erofs/zpvec.h | 5 +---- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index fad80c97d2476..f784feaeb819a 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -292,7 +292,6 @@ static int z_erofs_attach_page(struct z_erofs_collector *clt, enum z_erofs_page_type type) { int ret; - bool occupied; /* give priority for inplaceio */ if (clt->mode >= COLLECT_PRIMARY && @@ -300,8 +299,7 @@ static int z_erofs_attach_page(struct z_erofs_collector *clt, z_erofs_try_inplace_io(clt, page)) return 0; - ret = z_erofs_pagevec_enqueue(&clt->vector, - page, type, &occupied); + ret = z_erofs_pagevec_enqueue(&clt->vector, page, type); clt->cl->vcnt += (unsigned int)ret; return ret ? 0 : -EAGAIN; diff --git a/fs/erofs/zpvec.h b/fs/erofs/zpvec.h index 58556903aa945..a38c526103675 100644 --- a/fs/erofs/zpvec.h +++ b/fs/erofs/zpvec.h @@ -107,10 +107,8 @@ static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor, static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor, struct page *page, - enum z_erofs_page_type type, - bool *occupied) + enum z_erofs_page_type type) { - *occupied = false; if (!ctor->next && type) if (ctor->index + 1 == ctor->nr) return false; @@ -125,7 +123,6 @@ static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor, /* should remind that collector->next never equal to 1, 2 */ if (type == (uintptr_t)ctor->next) { ctor->next = page; - *occupied = true; } ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, page, type); return true; From c0402569ed6eee1e88f1a50fae2714888079b4e1 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 16 Nov 2021 09:10:35 +0800 Subject: [PATCH 1690/5344] erofs: fix unsafe pagevec reuse of hooked pclusters commit 86432a6dca9bed79111990851df5756d3eb5f57c upstream. There are pclusters in runtime marked with Z_EROFS_PCLUSTER_TAIL before actual I/O submission. Thus, the decompression chain can be extended if the following pcluster chain hooks such tail pcluster. As the related comment mentioned, if some page is made of a hooked pcluster and another followed pcluster, it can be reused for in-place I/O (since I/O should be submitted anyway): _______________________________________________________________ | tail (partial) page | head (partial) page | |_____PRIMARY_HOOKED___|____________PRIMARY_FOLLOWED____________| However, it's by no means safe to reuse as pagevec since if such PRIMARY_HOOKED pclusters finally move into bypass chain without I/O submission. It's somewhat hard to reproduce with LZ4 and I just found it (general protection fault) by ro_fsstressing a LZMA image for long time. I'm going to actively clean up related code together with multi-page folio adaption in the next few months. Let's address it directly for easier backporting for now. Call trace for reference: z_erofs_decompress_pcluster+0x10a/0x8a0 [erofs] z_erofs_decompress_queue.isra.36+0x3c/0x60 [erofs] z_erofs_runqueue+0x5f3/0x840 [erofs] z_erofs_readahead+0x1e8/0x320 [erofs] read_pages+0x91/0x270 page_cache_ra_unbounded+0x18b/0x240 filemap_get_pages+0x10a/0x5f0 filemap_read+0xa9/0x330 new_sync_read+0x11b/0x1a0 vfs_read+0xf1/0x190 Link: https://lore.kernel.org/r/20211103182006.4040-1-xiang@kernel.org Fixes: 3883a79abd02 ("staging: erofs: introduce VLE decompression support") Cc: # 4.19+ Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Signed-off-by: Greg Kroah-Hartman --- fs/erofs/zdata.c | 13 +++++++------ fs/erofs/zpvec.h | 13 ++++++++++--- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index f784feaeb819a..fdd18c2508115 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -288,8 +288,8 @@ static inline bool z_erofs_try_inplace_io(struct z_erofs_collector *clt, /* callers must be with collection lock held */ static int z_erofs_attach_page(struct z_erofs_collector *clt, - struct page *page, - enum z_erofs_page_type type) + struct page *page, enum z_erofs_page_type type, + bool pvec_safereuse) { int ret; @@ -299,9 +299,9 @@ static int z_erofs_attach_page(struct z_erofs_collector *clt, z_erofs_try_inplace_io(clt, page)) return 0; - ret = z_erofs_pagevec_enqueue(&clt->vector, page, type); + ret = z_erofs_pagevec_enqueue(&clt->vector, page, type, + pvec_safereuse); clt->cl->vcnt += (unsigned int)ret; - return ret ? 0 : -EAGAIN; } @@ -652,14 +652,15 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, tight &= (clt->mode >= COLLECT_PRIMARY_FOLLOWED); retry: - err = z_erofs_attach_page(clt, page, page_type); + err = z_erofs_attach_page(clt, page, page_type, + clt->mode >= COLLECT_PRIMARY_FOLLOWED); /* should allocate an additional staging page for pagevec */ if (err == -EAGAIN) { struct page *const newpage = __stagingpage_alloc(pagepool, GFP_NOFS); err = z_erofs_attach_page(clt, newpage, - Z_EROFS_PAGE_TYPE_EXCLUSIVE); + Z_EROFS_PAGE_TYPE_EXCLUSIVE, true); if (!err) goto retry; } diff --git a/fs/erofs/zpvec.h b/fs/erofs/zpvec.h index a38c526103675..6a20b2c3a24cd 100644 --- a/fs/erofs/zpvec.h +++ b/fs/erofs/zpvec.h @@ -107,11 +107,18 @@ static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor, static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor, struct page *page, - enum z_erofs_page_type type) + enum z_erofs_page_type type, + bool pvec_safereuse) { - if (!ctor->next && type) - if (ctor->index + 1 == ctor->nr) + if (!ctor->next) { + /* some pages cannot be reused as pvec safely without I/O */ + if (type == Z_EROFS_PAGE_TYPE_EXCLUSIVE && !pvec_safereuse) + type = Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED; + + if (type != Z_EROFS_PAGE_TYPE_EXCLUSIVE && + ctor->index + 1 == ctor->nr) return false; + } if (ctor->index >= ctor->nr) z_erofs_pagevec_ctor_pagedown(ctor, false); From 6b540f2349405ce755a4d4e8b59a30ec4146d5f4 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 21 Nov 2021 13:38:51 +0100 Subject: [PATCH 1691/5344] Linux 5.4.161 Link: https://lore.kernel.org/r/20211119171444.640508836@linuxfoundation.org Tested-by: Florian Fainelli Tested-by: Fox Chen Tested-by: Shuah Khan Tested-by: Rudi Heitbaum Tested-by: Guenter Roeck Tested-By: Scott Bruce Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e938662dab289..f552556966f1d 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 PATCHLEVEL = 4 -SUBLEVEL = 160 +SUBLEVEL = 161 EXTRAVERSION = NAME = Kleptomaniac Octopus From 1ba396ff5dee6435eb373536691554dbc08b8566 Mon Sep 17 00:00:00 2001 From: Amit Kumar Mahapatra Date: Mon, 14 Jun 2021 17:25:10 +0200 Subject: [PATCH 1692/5344] arm64: zynqmp: Do not duplicate flash partition label property [ Upstream commit 167721a5909f867f8c18c8e78ea58e705ad9bbd4 ] In kernel 5.4, support has been added for reading MTD devices via the nvmem API. For this the mtd devices are registered as read-only NVMEM providers under sysfs with the same name as the flash partition label property. So if flash partition label property of multiple flash devices are identical then the second mtd device fails to get registered as a NVMEM provider. This patch fixes the issue by having different label property for different flashes. Signed-off-by: Amit Kumar Mahapatra Signed-off-by: Michal Simek Link: https://lore.kernel.org/r/6c4b9b9232b93d9e316a63c086540fd5bf6b8687.1623684253.git.michal.simek@xilinx.com Signed-off-by: Sasha Levin --- arch/arm64/boot/dts/xilinx/zynqmp-zc1751-xm016-dc2.dts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/xilinx/zynqmp-zc1751-xm016-dc2.dts b/arch/arm64/boot/dts/xilinx/zynqmp-zc1751-xm016-dc2.dts index 2421ec71a201c..41a66787247b6 100644 --- a/arch/arm64/boot/dts/xilinx/zynqmp-zc1751-xm016-dc2.dts +++ b/arch/arm64/boot/dts/xilinx/zynqmp-zc1751-xm016-dc2.dts @@ -131,7 +131,7 @@ reg = <0>; partition@0 { - label = "data"; + label = "spi0-data"; reg = <0x0 0x100000>; }; }; @@ -149,7 +149,7 @@ reg = <0>; partition@0 { - label = "data"; + label = "spi1-data"; reg = <0x0 0x84000>; }; }; From 4ba26dee16d194b882b6a2add43d10b006d6afab Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Fri, 6 Aug 2021 10:58:29 +0200 Subject: [PATCH 1693/5344] arm64: zynqmp: Fix serial compatible string [ Upstream commit 812fa2f0e9d33564bd0131a69750e0d165f4c82a ] Based on commit 65a2c14d4f00 ("dt-bindings: serial: convert Cadence UART bindings to YAML") compatible string should look like differently that's why fix it to be aligned with dt binding. Signed-off-by: Michal Simek Reviewed-by: Laurent Pinchart Link: https://lore.kernel.org/r/89b36e0a6187cc6b05b27a035efdf79173bd4486.1628240307.git.michal.simek@xilinx.com Signed-off-by: Sasha Levin --- arch/arm64/boot/dts/xilinx/zynqmp.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/xilinx/zynqmp.dtsi b/arch/arm64/boot/dts/xilinx/zynqmp.dtsi index a2645262f8623..b92549fb32400 100644 --- a/arch/arm64/boot/dts/xilinx/zynqmp.dtsi +++ b/arch/arm64/boot/dts/xilinx/zynqmp.dtsi @@ -582,7 +582,7 @@ }; uart0: serial@ff000000 { - compatible = "cdns,uart-r1p12", "xlnx,xuartps"; + compatible = "xlnx,zynqmp-uart", "cdns,uart-r1p12"; status = "disabled"; interrupt-parent = <&gic>; interrupts = <0 21 4>; @@ -591,7 +591,7 @@ }; uart1: serial@ff010000 { - compatible = "cdns,uart-r1p12", "xlnx,xuartps"; + compatible = "xlnx,zynqmp-uart", "cdns,uart-r1p12"; status = "disabled"; interrupt-parent = <&gic>; interrupts = <0 22 4>; From af09f36f4409fd8db5352322cf4d9aa132268fcc Mon Sep 17 00:00:00 2001 From: Matthew Hagan Date: Sun, 29 Aug 2021 22:37:48 +0000 Subject: [PATCH 1694/5344] ARM: dts: NSP: Fix mpcore, mmc node names [ Upstream commit 15a563d008ef9d04df525f0c476cd7d7127bb883 ] Running dtbs_check yielded the issues with bcm-nsp.dtsi. Firstly this patch fixes the following message by appending "-bus" to the mpcore node name: mpcore@19000000: $nodename:0: 'mpcore@19000000' does not match '^([a-z][a-z0-9\\-]+-bus|bus|soc|axi|ahb|apb)(@[0-9a-f]+)?$' Secondly mmc node name. The label name can remain as is. sdhci@21000: $nodename:0: 'sdhci@21000' does not match '^mmc(@.*)?$' Signed-off-by: Matthew Hagan Signed-off-by: Florian Fainelli Signed-off-by: Sasha Levin --- arch/arm/boot/dts/bcm-nsp.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/bcm-nsp.dtsi b/arch/arm/boot/dts/bcm-nsp.dtsi index 43ff85d31dc12..5a1352fd90d16 100644 --- a/arch/arm/boot/dts/bcm-nsp.dtsi +++ b/arch/arm/boot/dts/bcm-nsp.dtsi @@ -77,7 +77,7 @@ interrupt-affinity = <&cpu0>, <&cpu1>; }; - mpcore@19000000 { + mpcore-bus@19000000 { compatible = "simple-bus"; ranges = <0x00000000 0x19000000 0x00023000>; #address-cells = <1>; @@ -217,7 +217,7 @@ #dma-cells = <1>; }; - sdio: sdhci@21000 { + sdio: mmc@21000 { compatible = "brcm,sdhci-iproc-cygnus"; reg = <0x21000 0x100>; interrupts = ; From 29cf5177a9871c4cf64ff5459dde44182b39c563 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 10 Sep 2021 16:31:46 -0700 Subject: [PATCH 1695/5344] scsi: lpfc: Fix list_add() corruption in lpfc_drain_txq() [ Upstream commit 99154581b05c8fb22607afb7c3d66c1bace6aa5d ] When parsing the txq list in lpfc_drain_txq(), the driver attempts to pass the requests to the adapter. If such an attempt fails, a local "fail_msg" string is set and a log message output. The job is then added to a completions list for cancellation. Processing of any further jobs from the txq list continues, but since "fail_msg" remains set, jobs are added to the completions list regardless of whether a wqe was passed to the adapter. If successfully added to txcmplq, jobs are added to both lists resulting in list corruption. Fix by clearing the fail_msg string after adding a job to the completions list. This stops the subsequent jobs from being added to the completions list unless they had an appropriate failure. Link: https://lore.kernel.org/r/20210910233159.115896-2-jsmart2021@gmail.com Co-developed-by: Justin Tee Signed-off-by: Justin Tee Signed-off-by: James Smart Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/lpfc/lpfc_sli.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index 4a7ceaa34341c..51bab0979527b 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -19692,6 +19692,7 @@ lpfc_drain_txq(struct lpfc_hba *phba) fail_msg, piocbq->iotag, piocbq->sli4_xritag); list_add_tail(&piocbq->list, &completions); + fail_msg = NULL; } spin_unlock_irqrestore(&pring->ring_lock, iflags); } From adb0b142045e14fcd6cd99707acd8d42b015d9ad Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Mon, 30 Aug 2021 18:51:13 +0200 Subject: [PATCH 1696/5344] arm64: dts: hisilicon: fix arm,sp805 compatible string [ Upstream commit 894d4f1f77d0e88f1f81af2e1e37333c1c41b631 ] According to Documentation/devicetree/bindings/watchdog/arm,sp805.yaml the compatible is: compatible = "arm,sp805", "arm,primecell"; The current compatible string doesn't exist at all. Fix it. Signed-off-by: Michael Walle Signed-off-by: Wei Xu Signed-off-by: Sasha Levin --- arch/arm64/boot/dts/hisilicon/hi3660.dtsi | 4 ++-- arch/arm64/boot/dts/hisilicon/hi6220.dtsi | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/boot/dts/hisilicon/hi3660.dtsi b/arch/arm64/boot/dts/hisilicon/hi3660.dtsi index 253cc345f143a..0c88b72094774 100644 --- a/arch/arm64/boot/dts/hisilicon/hi3660.dtsi +++ b/arch/arm64/boot/dts/hisilicon/hi3660.dtsi @@ -1086,7 +1086,7 @@ }; watchdog0: watchdog@e8a06000 { - compatible = "arm,sp805-wdt", "arm,primecell"; + compatible = "arm,sp805", "arm,primecell"; reg = <0x0 0xe8a06000 0x0 0x1000>; interrupts = ; clocks = <&crg_ctrl HI3660_OSC32K>; @@ -1094,7 +1094,7 @@ }; watchdog1: watchdog@e8a07000 { - compatible = "arm,sp805-wdt", "arm,primecell"; + compatible = "arm,sp805", "arm,primecell"; reg = <0x0 0xe8a07000 0x0 0x1000>; interrupts = ; clocks = <&crg_ctrl HI3660_OSC32K>; diff --git a/arch/arm64/boot/dts/hisilicon/hi6220.dtsi b/arch/arm64/boot/dts/hisilicon/hi6220.dtsi index 108e2a4227f66..568faaba7ace9 100644 --- a/arch/arm64/boot/dts/hisilicon/hi6220.dtsi +++ b/arch/arm64/boot/dts/hisilicon/hi6220.dtsi @@ -839,7 +839,7 @@ }; watchdog0: watchdog@f8005000 { - compatible = "arm,sp805-wdt", "arm,primecell"; + compatible = "arm,sp805", "arm,primecell"; reg = <0x0 0xf8005000 0x0 0x1000>; interrupts = ; clocks = <&ao_ctrl HI6220_WDT0_PCLK>; From b1aa149c2e641ee7ae9a9f48e7bb44a6d23c9872 Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Wed, 15 Sep 2021 05:32:42 -0700 Subject: [PATCH 1697/5344] RDMA/bnxt_re: Check if the vlan is valid before reporting [ Upstream commit 6bda39149d4b8920fdb8744090653aca3daa792d ] When VF is configured with default vlan, HW strips the vlan from the packet and driver receives it in Rx completion. VLAN needs to be reported for UD work completion only if the vlan is configured on the host. Add a check for valid vlan in the UD receive path. Link: https://lore.kernel.org/r/1631709163-2287-12-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe Signed-off-by: Sasha Levin --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index a96f9142fe08e..dd006b177b544 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -3081,8 +3081,11 @@ static void bnxt_re_process_res_ud_wc(struct bnxt_re_qp *qp, struct ib_wc *wc, struct bnxt_qplib_cqe *cqe) { + struct bnxt_re_dev *rdev; + u16 vlan_id = 0; u8 nw_type; + rdev = qp->rdev; wc->opcode = IB_WC_RECV; wc->status = __rc_to_ib_wc_status(cqe->status); @@ -3094,9 +3097,12 @@ static void bnxt_re_process_res_ud_wc(struct bnxt_re_qp *qp, memcpy(wc->smac, cqe->smac, ETH_ALEN); wc->wc_flags |= IB_WC_WITH_SMAC; if (cqe->flags & CQ_RES_UD_FLAGS_META_FORMAT_VLAN) { - wc->vlan_id = (cqe->cfa_meta & 0xFFF); - if (wc->vlan_id < 0x1000) - wc->wc_flags |= IB_WC_WITH_VLAN; + vlan_id = (cqe->cfa_meta & 0xFFF); + } + /* Mark only if vlan_id is non zero */ + if (vlan_id && bnxt_re_check_if_vlan_valid(rdev, vlan_id)) { + wc->vlan_id = vlan_id; + wc->wc_flags |= IB_WC_WITH_VLAN; } nw_type = (cqe->flags & CQ_RES_UD_FLAGS_ROCE_IP_VER_MASK) >> CQ_RES_UD_FLAGS_ROCE_IP_VER_SFT; From 8eee0bc499c28a1f8288f906acfacd0830f5e867 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Wed, 15 Sep 2021 11:49:25 +0800 Subject: [PATCH 1698/5344] usb: musb: tusb6010: check return value after calling platform_get_resource() [ Upstream commit 14651496a3de6807a17c310f63c894ea0c5d858e ] It will cause null-ptr-deref if platform_get_resource() returns NULL, we need check the return value. Signed-off-by: Yang Yingliang Link: https://lore.kernel.org/r/20210915034925.2399823-1-yangyingliang@huawei.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/usb/musb/tusb6010.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/usb/musb/tusb6010.c b/drivers/usb/musb/tusb6010.c index 4ecfbf6bb1fa8..902507da8aa85 100644 --- a/drivers/usb/musb/tusb6010.c +++ b/drivers/usb/musb/tusb6010.c @@ -1103,6 +1103,11 @@ static int tusb_musb_init(struct musb *musb) /* dma address for async dma */ mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!mem) { + pr_debug("no async dma resource?\n"); + ret = -ENODEV; + goto done; + } musb->async = mem->start; /* dma address for sync dma */ From 3b760e229c09e96d3405aa41d3add9578c7cf8f3 Mon Sep 17 00:00:00 2001 From: Sven Peter Date: Tue, 14 Sep 2021 16:02:35 +0200 Subject: [PATCH 1699/5344] usb: typec: tipd: Remove WARN_ON in tps6598x_block_read [ Upstream commit b7a0a63f3fed57d413bb857de164ea9c3984bc4e ] Calling tps6598x_block_read with a higher than allowed len can be handled by just returning an error. There's no need to crash systems with panic-on-warn enabled. Reviewed-by: Heikki Krogerus Signed-off-by: Sven Peter Link: https://lore.kernel.org/r/20210914140235.65955-3-sven@svenpeter.dev Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/usb/typec/tps6598x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/typec/tps6598x.c b/drivers/usb/typec/tps6598x.c index a38d1409f15b7..67bebee693301 100644 --- a/drivers/usb/typec/tps6598x.c +++ b/drivers/usb/typec/tps6598x.c @@ -109,7 +109,7 @@ tps6598x_block_read(struct tps6598x *tps, u8 reg, void *val, size_t len) u8 data[TPS_MAX_LEN + 1]; int ret; - if (WARN_ON(len + 1 > sizeof(data))) + if (len + 1 > sizeof(data)) return -EINVAL; if (!tps->i2c_protocol) From be7ffc8d5bf30a8b9d3e1020d026c6633f640d48 Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Wed, 1 Sep 2021 20:31:21 +0200 Subject: [PATCH 1700/5344] arm64: dts: qcom: msm8998: Fix CPU/L2 idle state latency and residency [ Upstream commit 3f1dcaff642e75c1d2ad03f783fa8a3b1f56dd50 ] The entry/exit latency and minimum residency in state for the idle states of MSM8998 were ..bad: first of all, for all of them the timings were written for CPU sleep but the min-residency-us param was miscalculated (supposedly, while porting this from downstream); Then, the power collapse states are setting PC on both the CPU cluster *and* the L2 cache, which have different timings: in the specific case of L2 the times are higher so these ones should be taken into account instead of the CPU ones. This parameter misconfiguration was not giving particular issues because on MSM8998 there was no CPU scaling at all, so cluster/L2 power collapse was rarely (if ever) hit. When CPU scaling is enabled, though, the wrong timings will produce SoC unstability shown to the user as random, apparently error-less, sudden reboots and/or lockups. This set of parameters are stabilizing the SoC when CPU scaling is ON and when power collapse is frequently hit. Signed-off-by: AngeloGioacchino Del Regno Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20210901183123.1087392-3-angelogioacchino.delregno@somainline.org Signed-off-by: Sasha Levin --- arch/arm64/boot/dts/qcom/msm8998.dtsi | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/arm64/boot/dts/qcom/msm8998.dtsi b/arch/arm64/boot/dts/qcom/msm8998.dtsi index ccd535edbf4e1..dcb79003ca0e6 100644 --- a/arch/arm64/boot/dts/qcom/msm8998.dtsi +++ b/arch/arm64/boot/dts/qcom/msm8998.dtsi @@ -246,38 +246,42 @@ LITTLE_CPU_SLEEP_0: cpu-sleep-0-0 { compatible = "arm,idle-state"; idle-state-name = "little-retention"; + /* CPU Retention (C2D), L2 Active */ arm,psci-suspend-param = <0x00000002>; entry-latency-us = <81>; exit-latency-us = <86>; - min-residency-us = <200>; + min-residency-us = <504>; }; LITTLE_CPU_SLEEP_1: cpu-sleep-0-1 { compatible = "arm,idle-state"; idle-state-name = "little-power-collapse"; + /* CPU + L2 Power Collapse (C3, D4) */ arm,psci-suspend-param = <0x40000003>; - entry-latency-us = <273>; - exit-latency-us = <612>; - min-residency-us = <1000>; + entry-latency-us = <814>; + exit-latency-us = <4562>; + min-residency-us = <9183>; local-timer-stop; }; BIG_CPU_SLEEP_0: cpu-sleep-1-0 { compatible = "arm,idle-state"; idle-state-name = "big-retention"; + /* CPU Retention (C2D), L2 Active */ arm,psci-suspend-param = <0x00000002>; entry-latency-us = <79>; exit-latency-us = <82>; - min-residency-us = <200>; + min-residency-us = <1302>; }; BIG_CPU_SLEEP_1: cpu-sleep-1-1 { compatible = "arm,idle-state"; idle-state-name = "big-power-collapse"; + /* CPU + L2 Power Collapse (C3, D4) */ arm,psci-suspend-param = <0x40000003>; - entry-latency-us = <336>; - exit-latency-us = <525>; - min-residency-us = <1000>; + entry-latency-us = <724>; + exit-latency-us = <2027>; + min-residency-us = <9419>; local-timer-stop; }; }; From 6b8aec010e804398d176c6470d622090282242b2 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Thu, 26 Aug 2021 14:35:28 +0200 Subject: [PATCH 1701/5344] arm64: dts: freescale: fix arm,sp805 compatible string [ Upstream commit 99a7cacc66cae92db40139b57689be2af75fc6b8 ] According to Documentation/devicetree/bindings/watchdog/arm,sp805.yaml the compatible is: compatible = "arm,sp805", "arm,primecell"; The current compatible string doesn't exist at all. Fix it. Signed-off-by: Michael Walle Signed-off-by: Shawn Guo Signed-off-by: Sasha Levin --- arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi | 16 ++++++++-------- arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi | 16 ++++++++-------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi index 407ebdb35cd2e..6b1b728de9e9c 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi @@ -637,56 +637,56 @@ }; cluster1_core0_watchdog: wdt@c000000 { - compatible = "arm,sp805-wdt", "arm,primecell"; + compatible = "arm,sp805", "arm,primecell"; reg = <0x0 0xc000000 0x0 0x1000>; clocks = <&clockgen 4 3>, <&clockgen 4 3>; clock-names = "wdog_clk", "apb_pclk"; }; cluster1_core1_watchdog: wdt@c010000 { - compatible = "arm,sp805-wdt", "arm,primecell"; + compatible = "arm,sp805", "arm,primecell"; reg = <0x0 0xc010000 0x0 0x1000>; clocks = <&clockgen 4 3>, <&clockgen 4 3>; clock-names = "wdog_clk", "apb_pclk"; }; cluster1_core2_watchdog: wdt@c020000 { - compatible = "arm,sp805-wdt", "arm,primecell"; + compatible = "arm,sp805", "arm,primecell"; reg = <0x0 0xc020000 0x0 0x1000>; clocks = <&clockgen 4 3>, <&clockgen 4 3>; clock-names = "wdog_clk", "apb_pclk"; }; cluster1_core3_watchdog: wdt@c030000 { - compatible = "arm,sp805-wdt", "arm,primecell"; + compatible = "arm,sp805", "arm,primecell"; reg = <0x0 0xc030000 0x0 0x1000>; clocks = <&clockgen 4 3>, <&clockgen 4 3>; clock-names = "wdog_clk", "apb_pclk"; }; cluster2_core0_watchdog: wdt@c100000 { - compatible = "arm,sp805-wdt", "arm,primecell"; + compatible = "arm,sp805", "arm,primecell"; reg = <0x0 0xc100000 0x0 0x1000>; clocks = <&clockgen 4 3>, <&clockgen 4 3>; clock-names = "wdog_clk", "apb_pclk"; }; cluster2_core1_watchdog: wdt@c110000 { - compatible = "arm,sp805-wdt", "arm,primecell"; + compatible = "arm,sp805", "arm,primecell"; reg = <0x0 0xc110000 0x0 0x1000>; clocks = <&clockgen 4 3>, <&clockgen 4 3>; clock-names = "wdog_clk", "apb_pclk"; }; cluster2_core2_watchdog: wdt@c120000 { - compatible = "arm,sp805-wdt", "arm,primecell"; + compatible = "arm,sp805", "arm,primecell"; reg = <0x0 0xc120000 0x0 0x1000>; clocks = <&clockgen 4 3>, <&clockgen 4 3>; clock-names = "wdog_clk", "apb_pclk"; }; cluster2_core3_watchdog: wdt@c130000 { - compatible = "arm,sp805-wdt", "arm,primecell"; + compatible = "arm,sp805", "arm,primecell"; reg = <0x0 0xc130000 0x0 0x1000>; clocks = <&clockgen 4 3>, <&clockgen 4 3>; clock-names = "wdog_clk", "apb_pclk"; diff --git a/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi index 82f0fe6acbfb7..4bf4a22faa61a 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi @@ -227,56 +227,56 @@ }; cluster1_core0_watchdog: wdt@c000000 { - compatible = "arm,sp805-wdt", "arm,primecell"; + compatible = "arm,sp805", "arm,primecell"; reg = <0x0 0xc000000 0x0 0x1000>; clocks = <&clockgen 4 3>, <&clockgen 4 3>; clock-names = "wdog_clk", "apb_pclk"; }; cluster1_core1_watchdog: wdt@c010000 { - compatible = "arm,sp805-wdt", "arm,primecell"; + compatible = "arm,sp805", "arm,primecell"; reg = <0x0 0xc010000 0x0 0x1000>; clocks = <&clockgen 4 3>, <&clockgen 4 3>; clock-names = "wdog_clk", "apb_pclk"; }; cluster2_core0_watchdog: wdt@c100000 { - compatible = "arm,sp805-wdt", "arm,primecell"; + compatible = "arm,sp805", "arm,primecell"; reg = <0x0 0xc100000 0x0 0x1000>; clocks = <&clockgen 4 3>, <&clockgen 4 3>; clock-names = "wdog_clk", "apb_pclk"; }; cluster2_core1_watchdog: wdt@c110000 { - compatible = "arm,sp805-wdt", "arm,primecell"; + compatible = "arm,sp805", "arm,primecell"; reg = <0x0 0xc110000 0x0 0x1000>; clocks = <&clockgen 4 3>, <&clockgen 4 3>; clock-names = "wdog_clk", "apb_pclk"; }; cluster3_core0_watchdog: wdt@c200000 { - compatible = "arm,sp805-wdt", "arm,primecell"; + compatible = "arm,sp805", "arm,primecell"; reg = <0x0 0xc200000 0x0 0x1000>; clocks = <&clockgen 4 3>, <&clockgen 4 3>; clock-names = "wdog_clk", "apb_pclk"; }; cluster3_core1_watchdog: wdt@c210000 { - compatible = "arm,sp805-wdt", "arm,primecell"; + compatible = "arm,sp805", "arm,primecell"; reg = <0x0 0xc210000 0x0 0x1000>; clocks = <&clockgen 4 3>, <&clockgen 4 3>; clock-names = "wdog_clk", "apb_pclk"; }; cluster4_core0_watchdog: wdt@c300000 { - compatible = "arm,sp805-wdt", "arm,primecell"; + compatible = "arm,sp805", "arm,primecell"; reg = <0x0 0xc300000 0x0 0x1000>; clocks = <&clockgen 4 3>, <&clockgen 4 3>; clock-names = "wdog_clk", "apb_pclk"; }; cluster4_core1_watchdog: wdt@c310000 { - compatible = "arm,sp805-wdt", "arm,primecell"; + compatible = "arm,sp805", "arm,primecell"; reg = <0x0 0xc310000 0x0 0x1000>; clocks = <&clockgen 4 3>, <&clockgen 4 3>; clock-names = "wdog_clk", "apb_pclk"; From ee661437049e24cd5d6317a592b27335162f645d Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Fri, 24 Sep 2021 14:24:17 -0500 Subject: [PATCH 1702/5344] ASoC: SOF: Intel: hda-dai: fix potential locking issue [ Upstream commit a20f3b10de61add5e14b6ce4df982f4df2a4cbbc ] The initial hdac_stream code was adapted a third time with the same locking issues. Move the spin_lock outside the loops and make sure the fields are protected on read/write. Signed-off-by: Pierre-Louis Bossart Acked-by: Mark Brown Link: https://lore.kernel.org/r/20210924192417.169243-5-pierre-louis.bossart@linux.intel.com Signed-off-by: Takashi Iwai Signed-off-by: Sasha Levin --- sound/soc/sof/intel/hda-dai.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sound/soc/sof/intel/hda-dai.c b/sound/soc/sof/intel/hda-dai.c index 3f645200d3a5c..b3cdd10c83ae1 100644 --- a/sound/soc/sof/intel/hda-dai.c +++ b/sound/soc/sof/intel/hda-dai.c @@ -67,6 +67,7 @@ static struct hdac_ext_stream * return NULL; } + spin_lock_irq(&bus->reg_lock); list_for_each_entry(stream, &bus->stream_list, list) { struct hdac_ext_stream *hstream = stream_to_hdac_ext_stream(stream); @@ -106,12 +107,12 @@ static struct hdac_ext_stream * * is updated in snd_hdac_ext_stream_decouple(). */ if (!res->decoupled) - snd_hdac_ext_stream_decouple(bus, res, true); - spin_lock_irq(&bus->reg_lock); + snd_hdac_ext_stream_decouple_locked(bus, res, true); + res->link_locked = 1; res->link_substream = substream; - spin_unlock_irq(&bus->reg_lock); } + spin_unlock_irq(&bus->reg_lock); return res; } From bf4c03154ff50e930b33ebd478bb585c28267615 Mon Sep 17 00:00:00 2001 From: Stefan Riedmueller Date: Mon, 27 Sep 2021 09:28:56 +0200 Subject: [PATCH 1703/5344] clk: imx: imx6ul: Move csi_sel mux to correct base register [ Upstream commit 2f9d61869640f732599ec36b984c2b5c46067519 ] The csi_sel mux register is located in the CCM register base and not the CCM_ANALOG register base. So move it to the correct position in code. Otherwise changing the parent of the csi clock can lead to a complete system failure due to the CCM_ANALOG_PLL_SYS_TOG register being falsely modified. Also remove the SET_RATE_PARENT flag since one possible supply for the csi_sel mux is the system PLL which we don't want to modify. Signed-off-by: Stefan Riedmueller Reviewed-by: Abel Vesa Link: https://lore.kernel.org/r/20210927072857.3940880-1-s.riedmueller@phytec.de Signed-off-by: Abel Vesa Signed-off-by: Sasha Levin --- drivers/clk/imx/clk-imx6ul.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clk/imx/clk-imx6ul.c b/drivers/clk/imx/clk-imx6ul.c index bc931988fe7b2..f3ac5a524f4ed 100644 --- a/drivers/clk/imx/clk-imx6ul.c +++ b/drivers/clk/imx/clk-imx6ul.c @@ -161,7 +161,6 @@ static void __init imx6ul_clocks_init(struct device_node *ccm_node) hws[IMX6UL_PLL5_BYPASS] = imx_clk_hw_mux_flags("pll5_bypass", base + 0xa0, 16, 1, pll5_bypass_sels, ARRAY_SIZE(pll5_bypass_sels), CLK_SET_RATE_PARENT); hws[IMX6UL_PLL6_BYPASS] = imx_clk_hw_mux_flags("pll6_bypass", base + 0xe0, 16, 1, pll6_bypass_sels, ARRAY_SIZE(pll6_bypass_sels), CLK_SET_RATE_PARENT); hws[IMX6UL_PLL7_BYPASS] = imx_clk_hw_mux_flags("pll7_bypass", base + 0x20, 16, 1, pll7_bypass_sels, ARRAY_SIZE(pll7_bypass_sels), CLK_SET_RATE_PARENT); - hws[IMX6UL_CLK_CSI_SEL] = imx_clk_hw_mux_flags("csi_sel", base + 0x3c, 9, 2, csi_sels, ARRAY_SIZE(csi_sels), CLK_SET_RATE_PARENT); /* Do not bypass PLLs initially */ clk_set_parent(hws[IMX6UL_PLL1_BYPASS]->clk, hws[IMX6UL_CLK_PLL1]->clk); @@ -270,6 +269,7 @@ static void __init imx6ul_clocks_init(struct device_node *ccm_node) hws[IMX6UL_CLK_ECSPI_SEL] = imx_clk_hw_mux("ecspi_sel", base + 0x38, 18, 1, ecspi_sels, ARRAY_SIZE(ecspi_sels)); hws[IMX6UL_CLK_LCDIF_PRE_SEL] = imx_clk_hw_mux_flags("lcdif_pre_sel", base + 0x38, 15, 3, lcdif_pre_sels, ARRAY_SIZE(lcdif_pre_sels), CLK_SET_RATE_PARENT); hws[IMX6UL_CLK_LCDIF_SEL] = imx_clk_hw_mux("lcdif_sel", base + 0x38, 9, 3, lcdif_sels, ARRAY_SIZE(lcdif_sels)); + hws[IMX6UL_CLK_CSI_SEL] = imx_clk_hw_mux("csi_sel", base + 0x3c, 9, 2, csi_sels, ARRAY_SIZE(csi_sels)); hws[IMX6UL_CLK_LDB_DI0_DIV_SEL] = imx_clk_hw_mux("ldb_di0", base + 0x20, 10, 1, ldb_di0_div_sels, ARRAY_SIZE(ldb_di0_div_sels)); hws[IMX6UL_CLK_LDB_DI1_DIV_SEL] = imx_clk_hw_mux("ldb_di1", base + 0x20, 11, 1, ldb_di1_div_sels, ARRAY_SIZE(ldb_di1_div_sels)); From 92a0974c31044248197c2f653ab79708c60b0d37 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 2 Oct 2021 23:14:57 +0200 Subject: [PATCH 1704/5344] ASoC: nau8824: Add DMI quirk mechanism for active-high jack-detect [ Upstream commit 92d3360108f1839ca40451bad20ff67dd24a1964 ] Add a quirk mechanism to allow specifying that active-high jack-detection should be used on platforms where this info is not available in devicetree. And add an entry for the Cyberbook T116 tablet to the DMI table, so that jack-detection will work properly on this tablet. Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20211002211459.110124-2-hdegoede@redhat.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/codecs/nau8824.c | 40 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/sound/soc/codecs/nau8824.c b/sound/soc/codecs/nau8824.c index 15bd8335f6678..c8ccfa2fff848 100644 --- a/sound/soc/codecs/nau8824.c +++ b/sound/soc/codecs/nau8824.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -27,6 +28,12 @@ #include "nau8824.h" +#define NAU8824_JD_ACTIVE_HIGH BIT(0) + +static int nau8824_quirk; +static int quirk_override = -1; +module_param_named(quirk, quirk_override, uint, 0444); +MODULE_PARM_DESC(quirk, "Board-specific quirk override"); static int nau8824_config_sysclk(struct nau8824 *nau8824, int clk_id, unsigned int freq); @@ -1875,6 +1882,34 @@ static int nau8824_read_device_properties(struct device *dev, return 0; } +/* Please keep this list alphabetically sorted */ +static const struct dmi_system_id nau8824_quirk_table[] = { + { + /* Cyberbook T116 rugged tablet */ + .matches = { + DMI_EXACT_MATCH(DMI_BOARD_VENDOR, "Default string"), + DMI_EXACT_MATCH(DMI_BOARD_NAME, "Cherry Trail CR"), + DMI_EXACT_MATCH(DMI_PRODUCT_SKU, "20170531"), + }, + .driver_data = (void *)(NAU8824_JD_ACTIVE_HIGH), + }, + {} +}; + +static void nau8824_check_quirks(void) +{ + const struct dmi_system_id *dmi_id; + + if (quirk_override != -1) { + nau8824_quirk = quirk_override; + return; + } + + dmi_id = dmi_first_match(nau8824_quirk_table); + if (dmi_id) + nau8824_quirk = (unsigned long)dmi_id->driver_data; +} + static int nau8824_i2c_probe(struct i2c_client *i2c, const struct i2c_device_id *id) { @@ -1899,6 +1934,11 @@ static int nau8824_i2c_probe(struct i2c_client *i2c, nau8824->irq = i2c->irq; sema_init(&nau8824->jd_sem, 1); + nau8824_check_quirks(); + + if (nau8824_quirk & NAU8824_JD_ACTIVE_HIGH) + nau8824->jkdet_polarity = 0; + nau8824_print_device_properties(nau8824); ret = regmap_read(nau8824->regmap, NAU8824_REG_I2C_DEVICE_ID, &value); From 9374cce564e7b4149f01dce5cc45aed104e1131f Mon Sep 17 00:00:00 2001 From: Guo Zhi Date: Wed, 29 Sep 2021 20:25:37 +0800 Subject: [PATCH 1705/5344] scsi: advansys: Fix kernel pointer leak [ Upstream commit d4996c6eac4c81b8872043e9391563f67f13e406 ] Pointers should be printed with %p or %px rather than cast to 'unsigned long' and printed with %lx. Change %lx to %p to print the hashed pointer. Link: https://lore.kernel.org/r/20210929122538.1158235-1-qtxuning1999@sjtu.edu.cn Signed-off-by: Guo Zhi Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/advansys.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/advansys.c b/drivers/scsi/advansys.c index a242a62caaa16..7b3e52ff5f516 100644 --- a/drivers/scsi/advansys.c +++ b/drivers/scsi/advansys.c @@ -3366,8 +3366,8 @@ static void asc_prt_adv_board_info(struct seq_file *m, struct Scsi_Host *shost) shost->host_no); seq_printf(m, - " iop_base 0x%lx, cable_detect: %X, err_code %u\n", - (unsigned long)v->iop_base, + " iop_base 0x%p, cable_detect: %X, err_code %u\n", + v->iop_base, AdvReadWordRegister(iop_base,IOPW_SCSI_CFG1) & CABLE_DETECT, v->err_code); From d0364d14b2e7f97df6fd9ac84248138b28f84870 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 17 Sep 2021 11:22:13 -0700 Subject: [PATCH 1706/5344] firmware_loader: fix pre-allocated buf built-in firmware use [ Upstream commit f7a07f7b96033df7709042ff38e998720a3f7119 ] The firmware_loader can be used with a pre-allocated buffer through the use of the API calls: o request_firmware_into_buf() o request_partial_firmware_into_buf() If the firmware was built-in and present, our current check for if the built-in firmware fits into the pre-allocated buffer does not return any errors, and we proceed to tell the caller that everything worked fine. It's a lie and no firmware would end up being copied into the pre-allocated buffer. So if the caller trust the result it may end up writing a bunch of 0's to a device! Fix this by making the function that checks for the pre-allocated buffer return non-void. Since the typical use case is when no pre-allocated buffer is provided make this return successfully for that case. If the built-in firmware does *not* fit into the pre-allocated buffer size return a failure as we should have been doing before. I'm not aware of users of the built-in firmware using the API calls with a pre-allocated buffer, as such I doubt this fixes any real life issue. But you never know... perhaps some oddball private tree might use it. In so far as upstream is concerned this just fixes our code for correctness. Signed-off-by: Luis Chamberlain Link: https://lore.kernel.org/r/20210917182226.3532898-2-mcgrof@kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/base/firmware_loader/main.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/base/firmware_loader/main.c b/drivers/base/firmware_loader/main.c index 249349f64bfe9..4f6b76bd957ef 100644 --- a/drivers/base/firmware_loader/main.c +++ b/drivers/base/firmware_loader/main.c @@ -98,12 +98,15 @@ static struct firmware_cache fw_cache; extern struct builtin_fw __start_builtin_fw[]; extern struct builtin_fw __end_builtin_fw[]; -static void fw_copy_to_prealloc_buf(struct firmware *fw, +static bool fw_copy_to_prealloc_buf(struct firmware *fw, void *buf, size_t size) { - if (!buf || size < fw->size) - return; + if (!buf) + return true; + if (size < fw->size) + return false; memcpy(buf, fw->data, fw->size); + return true; } static bool fw_get_builtin_firmware(struct firmware *fw, const char *name, @@ -115,9 +118,7 @@ static bool fw_get_builtin_firmware(struct firmware *fw, const char *name, if (strcmp(name, b_fw->name) == 0) { fw->size = b_fw->size; fw->data = b_fw->data; - fw_copy_to_prealloc_buf(fw, buf, size); - - return true; + return fw_copy_to_prealloc_buf(fw, buf, size); } } From 31b0368bc4a7da4075c1bb26924df87577f25693 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Thu, 7 Oct 2021 15:08:30 +0300 Subject: [PATCH 1707/5344] ARM: dts: omap: fix gpmc,mux-add-data type [ Upstream commit 51b9e22ffd3c4c56cbb7caae9750f70e55ffa603 ] gpmc,mux-add-data is not boolean. Fixes the below errors flagged by dtbs_check. "ethernet@4,0:gpmc,mux-add-data: True is not of type 'array'" Signed-off-by: Roger Quadros Signed-off-by: Tony Lindgren Signed-off-by: Sasha Levin --- arch/arm/boot/dts/omap-gpmc-smsc9221.dtsi | 2 +- arch/arm/boot/dts/omap3-overo-tobiduo-common.dtsi | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/omap-gpmc-smsc9221.dtsi b/arch/arm/boot/dts/omap-gpmc-smsc9221.dtsi index 7f6aefd134514..e7534fe9c53cf 100644 --- a/arch/arm/boot/dts/omap-gpmc-smsc9221.dtsi +++ b/arch/arm/boot/dts/omap-gpmc-smsc9221.dtsi @@ -29,7 +29,7 @@ compatible = "smsc,lan9221","smsc,lan9115"; bank-width = <2>; - gpmc,mux-add-data; + gpmc,mux-add-data = <0>; gpmc,cs-on-ns = <0>; gpmc,cs-rd-off-ns = <42>; gpmc,cs-wr-off-ns = <36>; diff --git a/arch/arm/boot/dts/omap3-overo-tobiduo-common.dtsi b/arch/arm/boot/dts/omap3-overo-tobiduo-common.dtsi index e5da3bc6f1050..218a10c0d8159 100644 --- a/arch/arm/boot/dts/omap3-overo-tobiduo-common.dtsi +++ b/arch/arm/boot/dts/omap3-overo-tobiduo-common.dtsi @@ -22,7 +22,7 @@ compatible = "smsc,lan9221","smsc,lan9115"; bank-width = <2>; - gpmc,mux-add-data; + gpmc,mux-add-data = <0>; gpmc,cs-on-ns = <0>; gpmc,cs-rd-off-ns = <42>; gpmc,cs-wr-off-ns = <36>; From a425309877354ecef3d2d2a2ca5dab837cda2c26 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Mon, 11 Oct 2021 21:49:20 +0800 Subject: [PATCH 1708/5344] usb: host: ohci-tmio: check return value after calling platform_get_resource() [ Upstream commit 9eff2b2e59fda25051ab36cd1cb5014661df657b ] It will cause null-ptr-deref if platform_get_resource() returns NULL, we need check the return value. Acked-by: Alan Stern Signed-off-by: Yang Yingliang Link: https://lore.kernel.org/r/20211011134920.118477-1-yangyingliang@huawei.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/usb/host/ohci-tmio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/host/ohci-tmio.c b/drivers/usb/host/ohci-tmio.c index fed43c6dd85cc..b611c8b09a89f 100644 --- a/drivers/usb/host/ohci-tmio.c +++ b/drivers/usb/host/ohci-tmio.c @@ -199,7 +199,7 @@ static int ohci_hcd_tmio_drv_probe(struct platform_device *dev) if (usb_disabled()) return -ENODEV; - if (!cell) + if (!cell || !regs || !config || !sram) return -EINVAL; if (irq < 0) From 2944a515e2da1ecd4b5f49b10f886f5e96ccf6de Mon Sep 17 00:00:00 2001 From: Li Yang Date: Tue, 12 Oct 2021 18:58:22 -0500 Subject: [PATCH 1709/5344] ARM: dts: ls1021a: move thermal-zones node out of soc/ [ Upstream commit 1ee1500ef717eefb5d9bdaf97905cb81b4e69aa4 ] This fixes dtbs-check error from simple-bus schema: soc: thermal-zones: {'type': 'object'} is not allowed for {'cpu-thermal': ..... } From schema: /home/leo/.local/lib/python3.8/site-packages/dtschema/schemas/simple-bus.yaml Signed-off-by: Li Yang Signed-off-by: Shawn Guo Signed-off-by: Sasha Levin --- arch/arm/boot/dts/ls1021a.dtsi | 66 +++++++++++++++++----------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/arch/arm/boot/dts/ls1021a.dtsi b/arch/arm/boot/dts/ls1021a.dtsi index c62fcca7b4263..aeb8a40b6b601 100644 --- a/arch/arm/boot/dts/ls1021a.dtsi +++ b/arch/arm/boot/dts/ls1021a.dtsi @@ -311,39 +311,6 @@ #thermal-sensor-cells = <1>; }; - thermal-zones { - cpu_thermal: cpu-thermal { - polling-delay-passive = <1000>; - polling-delay = <5000>; - - thermal-sensors = <&tmu 0>; - - trips { - cpu_alert: cpu-alert { - temperature = <85000>; - hysteresis = <2000>; - type = "passive"; - }; - cpu_crit: cpu-crit { - temperature = <95000>; - hysteresis = <2000>; - type = "critical"; - }; - }; - - cooling-maps { - map0 { - trip = <&cpu_alert>; - cooling-device = - <&cpu0 THERMAL_NO_LIMIT - THERMAL_NO_LIMIT>, - <&cpu1 THERMAL_NO_LIMIT - THERMAL_NO_LIMIT>; - }; - }; - }; - }; - dspi0: spi@2100000 { compatible = "fsl,ls1021a-v1.0-dspi"; #address-cells = <1>; @@ -984,4 +951,37 @@ }; }; + + thermal-zones { + cpu_thermal: cpu-thermal { + polling-delay-passive = <1000>; + polling-delay = <5000>; + + thermal-sensors = <&tmu 0>; + + trips { + cpu_alert: cpu-alert { + temperature = <85000>; + hysteresis = <2000>; + type = "passive"; + }; + cpu_crit: cpu-crit { + temperature = <95000>; + hysteresis = <2000>; + type = "critical"; + }; + }; + + cooling-maps { + map0 { + trip = <&cpu_alert>; + cooling-device = + <&cpu0 THERMAL_NO_LIMIT + THERMAL_NO_LIMIT>, + <&cpu1 THERMAL_NO_LIMIT + THERMAL_NO_LIMIT>; + }; + }; + }; + }; }; From 14fdb8891f090df13ae48586b33f47cc478f373f Mon Sep 17 00:00:00 2001 From: Li Yang Date: Tue, 12 Oct 2021 18:58:23 -0500 Subject: [PATCH 1710/5344] ARM: dts: ls1021a-tsn: use generic "jedec,spi-nor" compatible for flash [ Upstream commit 05e63b48b20fa70726be505a7660d1a07bc1cffb ] We cannot list all the possible chips used in different board revisions, just use the generic "jedec,spi-nor" compatible instead. This also fixes dtbs_check error: ['jedec,spi-nor', 's25fl256s1', 's25fl512s'] is too long Signed-off-by: Li Yang Reviewed-by: Kuldeep Singh Signed-off-by: Shawn Guo Signed-off-by: Sasha Levin --- arch/arm/boot/dts/ls1021a-tsn.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/ls1021a-tsn.dts b/arch/arm/boot/dts/ls1021a-tsn.dts index 5b7689094b70e..7235ce2a32936 100644 --- a/arch/arm/boot/dts/ls1021a-tsn.dts +++ b/arch/arm/boot/dts/ls1021a-tsn.dts @@ -247,7 +247,7 @@ flash@0 { /* Rev. A uses 64MB flash, Rev. B & C use 32MB flash */ - compatible = "jedec,spi-nor", "s25fl256s1", "s25fl512s"; + compatible = "jedec,spi-nor"; spi-max-frequency = <20000000>; #address-cells = <1>; #size-cells = <1>; From 1eb1edb6c5c757417611fd40a34e82cfaea77c00 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 15 Oct 2021 23:26:02 -0700 Subject: [PATCH 1711/5344] ALSA: ISA: not for M68K [ Upstream commit 3c05f1477e62ea5a0a8797ba6a545b1dc751fb31 ] On m68k, compiling drivers under SND_ISA causes build errors: ../sound/core/isadma.c: In function 'snd_dma_program': ../sound/core/isadma.c:33:17: error: implicit declaration of function 'claim_dma_lock' [-Werror=implicit-function-declaration] 33 | flags = claim_dma_lock(); | ^~~~~~~~~~~~~~ ../sound/core/isadma.c:41:9: error: implicit declaration of function 'release_dma_lock' [-Werror=implicit-function-declaration] 41 | release_dma_lock(flags); | ^~~~~~~~~~~~~~~~ ../sound/isa/sb/sb16_main.c: In function 'snd_sb16_playback_prepare': ../sound/isa/sb/sb16_main.c:253:72: error: 'DMA_AUTOINIT' undeclared (first use in this function) 253 | snd_dma_program(dma, runtime->dma_addr, size, DMA_MODE_WRITE | DMA_AUTOINIT); | ^~~~~~~~~~~~ ../sound/isa/sb/sb16_main.c:253:72: note: each undeclared identifier is reported only once for each function it appears in ../sound/isa/sb/sb16_main.c: In function 'snd_sb16_capture_prepare': ../sound/isa/sb/sb16_main.c:322:71: error: 'DMA_AUTOINIT' undeclared (first use in this function) 322 | snd_dma_program(dma, runtime->dma_addr, size, DMA_MODE_READ | DMA_AUTOINIT); | ^~~~~~~~~~~~ and more... Signed-off-by: Randy Dunlap Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: alsa-devel@alsa-project.org Cc: linux-m68k@lists.linux-m68k.org Cc: Geert Uytterhoeven Link: https://lore.kernel.org/r/20211016062602.3588-1-rdunlap@infradead.org Signed-off-by: Takashi Iwai Signed-off-by: Sasha Levin --- sound/core/Makefile | 2 ++ sound/isa/Kconfig | 2 +- sound/pci/Kconfig | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/sound/core/Makefile b/sound/core/Makefile index ee4a4a6b99ba7..d123587c0fd8f 100644 --- a/sound/core/Makefile +++ b/sound/core/Makefile @@ -9,7 +9,9 @@ ifneq ($(CONFIG_SND_PROC_FS),) snd-y += info.o snd-$(CONFIG_SND_OSSEMUL) += info_oss.o endif +ifneq ($(CONFIG_M68K),y) snd-$(CONFIG_ISA_DMA_API) += isadma.o +endif snd-$(CONFIG_SND_OSSEMUL) += sound_oss.o snd-$(CONFIG_SND_VMASTER) += vmaster.o snd-$(CONFIG_SND_JACK) += ctljack.o jack.o diff --git a/sound/isa/Kconfig b/sound/isa/Kconfig index b690ed937cbe8..df2e45c8814e9 100644 --- a/sound/isa/Kconfig +++ b/sound/isa/Kconfig @@ -22,7 +22,7 @@ config SND_SB16_DSP menuconfig SND_ISA bool "ISA sound devices" depends on ISA || COMPILE_TEST - depends on ISA_DMA_API + depends on ISA_DMA_API && !M68K default y help Support for sound devices connected via the ISA bus. diff --git a/sound/pci/Kconfig b/sound/pci/Kconfig index 7630f808d087c..6edde2f145025 100644 --- a/sound/pci/Kconfig +++ b/sound/pci/Kconfig @@ -279,6 +279,7 @@ config SND_CS46XX_NEW_DSP config SND_CS5530 tristate "CS5530 Audio" depends on ISA_DMA_API && (X86_32 || COMPILE_TEST) + depends on !M68K select SND_SB16_DSP help Say Y here to include support for audio on Cyrix/NatSemi CS5530 chips. From 5bd187d0b3290e52c29ea6e6d92b6edb526ff88d Mon Sep 17 00:00:00 2001 From: Guanghui Feng Date: Mon, 11 Oct 2021 22:08:24 +0800 Subject: [PATCH 1712/5344] tty: tty_buffer: Fix the softlockup issue in flush_to_ldisc [ Upstream commit 3968ddcf05fb4b9409cd1859feb06a5b0550a1c1 ] When running ltp testcase(ltp/testcases/kernel/pty/pty04.c) with arm64, there is a soft lockup, which look like this one: Workqueue: events_unbound flush_to_ldisc Call trace: dump_backtrace+0x0/0x1ec show_stack+0x24/0x30 dump_stack+0xd0/0x128 panic+0x15c/0x374 watchdog_timer_fn+0x2b8/0x304 __run_hrtimer+0x88/0x2c0 __hrtimer_run_queues+0xa4/0x120 hrtimer_interrupt+0xfc/0x270 arch_timer_handler_phys+0x40/0x50 handle_percpu_devid_irq+0x94/0x220 __handle_domain_irq+0x88/0xf0 gic_handle_irq+0x84/0xfc el1_irq+0xc8/0x180 slip_unesc+0x80/0x214 [slip] tty_ldisc_receive_buf+0x64/0x80 tty_port_default_receive_buf+0x50/0x90 flush_to_ldisc+0xbc/0x110 process_one_work+0x1d4/0x4b0 worker_thread+0x180/0x430 kthread+0x11c/0x120 In the testcase pty04, The first process call the write syscall to send data to the pty master. At the same time, the workqueue will do the flush_to_ldisc to pop data in a loop until there is no more data left. When the sender and workqueue running in different core, the sender sends data fastly in full time which will result in workqueue doing work in loop for a long time and occuring softlockup in flush_to_ldisc with kernel configured without preempt. So I add need_resched check and cond_resched in the flush_to_ldisc loop to avoid it. Signed-off-by: Guanghui Feng Link: https://lore.kernel.org/r/1633961304-24759-1-git-send-email-guanghuifeng@linux.alibaba.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/tty/tty_buffer.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c index ec145a59f1993..bb148dbfbb88f 100644 --- a/drivers/tty/tty_buffer.c +++ b/drivers/tty/tty_buffer.c @@ -534,6 +534,9 @@ static void flush_to_ldisc(struct work_struct *work) if (!count) break; head->read += count; + + if (need_resched()) + cond_resched(); } mutex_unlock(&buf->lock); From 7fcddaf9cce052929ba0c707b7e3667c1802471e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 12 Oct 2021 15:23:12 -0700 Subject: [PATCH 1713/5344] MIPS: sni: Fix the build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit c91cf42f61dc77b289784ea7b15a8531defa41c0 ] This patch fixes the following gcc 10 build error: arch/mips/sni/time.c: In function ‘a20r_set_periodic’: arch/mips/sni/time.c:15:26: error: unsigned conversion from ‘int’ to ‘u8’ {aka ‘volatile unsigned char’} changes value from ‘576’ to ‘64’ [-Werror=overflow] 15 | #define SNI_COUNTER0_DIV ((SNI_CLOCK_TICK_RATE / SNI_COUNTER2_DIV) / HZ) | ^ arch/mips/sni/time.c:21:45: note: in expansion of macro ‘SNI_COUNTER0_DIV’ 21 | *(volatile u8 *)(A20R_PT_CLOCK_BASE + 0) = SNI_COUNTER0_DIV; | ^~~~~~~~~~~~~~~~ Cc: linux-mips@vger.kernel.org Signed-off-by: Bart Van Assche Signed-off-by: Thomas Bogendoerfer Signed-off-by: Sasha Levin --- arch/mips/sni/time.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/sni/time.c b/arch/mips/sni/time.c index dbace1f3e1a97..745ceb945fc50 100644 --- a/arch/mips/sni/time.c +++ b/arch/mips/sni/time.c @@ -18,14 +18,14 @@ static int a20r_set_periodic(struct clock_event_device *evt) { *(volatile u8 *)(A20R_PT_CLOCK_BASE + 12) = 0x34; wmb(); - *(volatile u8 *)(A20R_PT_CLOCK_BASE + 0) = SNI_COUNTER0_DIV; + *(volatile u8 *)(A20R_PT_CLOCK_BASE + 0) = SNI_COUNTER0_DIV & 0xff; wmb(); *(volatile u8 *)(A20R_PT_CLOCK_BASE + 0) = SNI_COUNTER0_DIV >> 8; wmb(); *(volatile u8 *)(A20R_PT_CLOCK_BASE + 12) = 0xb4; wmb(); - *(volatile u8 *)(A20R_PT_CLOCK_BASE + 8) = SNI_COUNTER2_DIV; + *(volatile u8 *)(A20R_PT_CLOCK_BASE + 8) = SNI_COUNTER2_DIV & 0xff; wmb(); *(volatile u8 *)(A20R_PT_CLOCK_BASE + 8) = SNI_COUNTER2_DIV >> 8; wmb(); From cffb06fe451256663982ece8fe2eba865cbd7409 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Wed, 29 Sep 2021 21:04:19 -0500 Subject: [PATCH 1714/5344] scsi: target: Fix ordered tag handling [ Upstream commit ed1227e080990ffec5bf39006ec8a57358e6689a ] This patch fixes the following bugs: 1. If there are multiple ordered cmds queued and multiple simple cmds completing, target_restart_delayed_cmds() could be called on different CPUs and each instance could start a ordered cmd. They could then run in different orders than they were queued. 2. target_restart_delayed_cmds() and target_handle_task_attr() can race where: 1. target_handle_task_attr() has passed the simple_cmds == 0 check. 2. transport_complete_task_attr() then decrements simple_cmds to 0. 3. transport_complete_task_attr() runs target_restart_delayed_cmds() and it does not see any cmds on the delayed_cmd_list. 4. target_handle_task_attr() adds the cmd to the delayed_cmd_list. The cmd will then end up timing out. 3. If we are sent > 1 ordered cmds and simple_cmds == 0, we can execute them out of order, because target_handle_task_attr() will hit that simple_cmds check first and return false for all ordered cmds sent. 4. We run target_restart_delayed_cmds() after every cmd completion, so if there is more than 1 simple cmd running, we start executing ordered cmds after that first cmd instead of waiting for all of them to complete. 5. Ordered cmds are not supposed to start until HEAD OF QUEUE and all older cmds have completed, and not just simple. 6. It's not a bug but it doesn't make sense to take the delayed_cmd_lock for every cmd completion when ordered cmds are almost never used. Just replacing that lock with an atomic increases IOPs by up to 10% when completions are spread over multiple CPUs and there are multiple sessions/ mqs/thread accessing the same device. This patch moves the queued delayed handling to a per device work to serialze the cmd executions for each device and adds a new counter to track HEAD_OF_QUEUE and SIMPLE cmds. We can then check the new counter to determine when to run the work on the completion path. Link: https://lore.kernel.org/r/20210930020422.92578-3-michael.christie@oracle.com Signed-off-by: Mike Christie Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/target/target_core_device.c | 2 + drivers/target/target_core_internal.h | 1 + drivers/target/target_core_transport.c | 76 ++++++++++++++++++-------- include/target/target_core_base.h | 6 +- 4 files changed, 61 insertions(+), 24 deletions(-) diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c index 2d19f0e332b01..20fe287039857 100644 --- a/drivers/target/target_core_device.c +++ b/drivers/target/target_core_device.c @@ -758,6 +758,8 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name) INIT_LIST_HEAD(&dev->t10_alua.lba_map_list); spin_lock_init(&dev->t10_alua.lba_map_lock); + INIT_WORK(&dev->delayed_cmd_work, target_do_delayed_work); + dev->t10_wwn.t10_dev = dev; dev->t10_alua.t10_dev = dev; diff --git a/drivers/target/target_core_internal.h b/drivers/target/target_core_internal.h index e7b3c6e5d5744..e4f072a680d41 100644 --- a/drivers/target/target_core_internal.h +++ b/drivers/target/target_core_internal.h @@ -150,6 +150,7 @@ int transport_dump_vpd_ident(struct t10_vpd *, unsigned char *, int); void transport_clear_lun_ref(struct se_lun *); sense_reason_t target_cmd_size_check(struct se_cmd *cmd, unsigned int size); void target_qf_do_work(struct work_struct *work); +void target_do_delayed_work(struct work_struct *work); bool target_check_wce(struct se_device *dev); bool target_check_fua(struct se_device *dev); void __target_execute_cmd(struct se_cmd *, bool); diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index 5cf9e7677926f..f52fe40002259 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -2021,32 +2021,35 @@ static bool target_handle_task_attr(struct se_cmd *cmd) */ switch (cmd->sam_task_attr) { case TCM_HEAD_TAG: + atomic_inc_mb(&dev->non_ordered); pr_debug("Added HEAD_OF_QUEUE for CDB: 0x%02x\n", cmd->t_task_cdb[0]); return false; case TCM_ORDERED_TAG: - atomic_inc_mb(&dev->dev_ordered_sync); + atomic_inc_mb(&dev->delayed_cmd_count); pr_debug("Added ORDERED for CDB: 0x%02x to ordered list\n", cmd->t_task_cdb[0]); - - /* - * Execute an ORDERED command if no other older commands - * exist that need to be completed first. - */ - if (!atomic_read(&dev->simple_cmds)) - return false; break; default: /* * For SIMPLE and UNTAGGED Task Attribute commands */ - atomic_inc_mb(&dev->simple_cmds); + atomic_inc_mb(&dev->non_ordered); + + if (atomic_read(&dev->delayed_cmd_count) == 0) + return false; break; } - if (atomic_read(&dev->dev_ordered_sync) == 0) - return false; + if (cmd->sam_task_attr != TCM_ORDERED_TAG) { + atomic_inc_mb(&dev->delayed_cmd_count); + /* + * We will account for this when we dequeue from the delayed + * list. + */ + atomic_dec_mb(&dev->non_ordered); + } spin_lock(&dev->delayed_cmd_lock); list_add_tail(&cmd->se_delayed_node, &dev->delayed_cmd_list); @@ -2054,6 +2057,12 @@ static bool target_handle_task_attr(struct se_cmd *cmd) pr_debug("Added CDB: 0x%02x Task Attr: 0x%02x to delayed CMD listn", cmd->t_task_cdb[0], cmd->sam_task_attr); + /* + * We may have no non ordered cmds when this function started or we + * could have raced with the last simple/head cmd completing, so kick + * the delayed handler here. + */ + schedule_work(&dev->delayed_cmd_work); return true; } @@ -2091,29 +2100,48 @@ EXPORT_SYMBOL(target_execute_cmd); * Process all commands up to the last received ORDERED task attribute which * requires another blocking boundary */ -static void target_restart_delayed_cmds(struct se_device *dev) +void target_do_delayed_work(struct work_struct *work) { - for (;;) { + struct se_device *dev = container_of(work, struct se_device, + delayed_cmd_work); + + spin_lock(&dev->delayed_cmd_lock); + while (!dev->ordered_sync_in_progress) { struct se_cmd *cmd; - spin_lock(&dev->delayed_cmd_lock); - if (list_empty(&dev->delayed_cmd_list)) { - spin_unlock(&dev->delayed_cmd_lock); + if (list_empty(&dev->delayed_cmd_list)) break; - } cmd = list_entry(dev->delayed_cmd_list.next, struct se_cmd, se_delayed_node); + + if (cmd->sam_task_attr == TCM_ORDERED_TAG) { + /* + * Check if we started with: + * [ordered] [simple] [ordered] + * and we are now at the last ordered so we have to wait + * for the simple cmd. + */ + if (atomic_read(&dev->non_ordered) > 0) + break; + + dev->ordered_sync_in_progress = true; + } + list_del(&cmd->se_delayed_node); + atomic_dec_mb(&dev->delayed_cmd_count); spin_unlock(&dev->delayed_cmd_lock); + if (cmd->sam_task_attr != TCM_ORDERED_TAG) + atomic_inc_mb(&dev->non_ordered); + cmd->transport_state |= CMD_T_SENT; __target_execute_cmd(cmd, true); - if (cmd->sam_task_attr == TCM_ORDERED_TAG) - break; + spin_lock(&dev->delayed_cmd_lock); } + spin_unlock(&dev->delayed_cmd_lock); } /* @@ -2131,14 +2159,17 @@ static void transport_complete_task_attr(struct se_cmd *cmd) goto restart; if (cmd->sam_task_attr == TCM_SIMPLE_TAG) { - atomic_dec_mb(&dev->simple_cmds); + atomic_dec_mb(&dev->non_ordered); dev->dev_cur_ordered_id++; } else if (cmd->sam_task_attr == TCM_HEAD_TAG) { + atomic_dec_mb(&dev->non_ordered); dev->dev_cur_ordered_id++; pr_debug("Incremented dev_cur_ordered_id: %u for HEAD_OF_QUEUE\n", dev->dev_cur_ordered_id); } else if (cmd->sam_task_attr == TCM_ORDERED_TAG) { - atomic_dec_mb(&dev->dev_ordered_sync); + spin_lock(&dev->delayed_cmd_lock); + dev->ordered_sync_in_progress = false; + spin_unlock(&dev->delayed_cmd_lock); dev->dev_cur_ordered_id++; pr_debug("Incremented dev_cur_ordered_id: %u for ORDERED\n", @@ -2147,7 +2178,8 @@ static void transport_complete_task_attr(struct se_cmd *cmd) cmd->se_cmd_flags &= ~SCF_TASK_ATTR_SET; restart: - target_restart_delayed_cmds(dev); + if (atomic_read(&dev->delayed_cmd_count) > 0) + schedule_work(&dev->delayed_cmd_work); } static void transport_complete_qf(struct se_cmd *cmd) diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index 7c9716fe731e2..59d7ebb8bbaf4 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -781,8 +781,9 @@ struct se_device { atomic_long_t read_bytes; atomic_long_t write_bytes; /* Active commands on this virtual SE device */ - atomic_t simple_cmds; - atomic_t dev_ordered_sync; + atomic_t non_ordered; + bool ordered_sync_in_progress; + atomic_t delayed_cmd_count; atomic_t dev_qf_count; u32 export_count; spinlock_t delayed_cmd_lock; @@ -804,6 +805,7 @@ struct se_device { struct list_head dev_sep_list; struct list_head dev_tmr_list; struct work_struct qf_work_queue; + struct work_struct delayed_cmd_work; struct list_head delayed_cmd_list; struct list_head state_list; struct list_head qf_cmd_list; From 99fcf84f845f2446046df5398c6fa55e057bd1f7 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Wed, 29 Sep 2021 21:04:20 -0500 Subject: [PATCH 1715/5344] scsi: target: Fix alua_tg_pt_gps_count tracking [ Upstream commit 1283c0d1a32bb924324481586b5d6e8e76f676ba ] We can't free the tg_pt_gp in core_alua_set_tg_pt_gp_id() because it's still accessed via configfs. Its release must go through the normal configfs/refcount process. The max alua_tg_pt_gps_count check should probably have been done in core_alua_allocate_tg_pt_gp(), but with the current code userspace could have created 0x0000ffff + 1 groups, but only set the id for 0x0000ffff. Then it could have deleted a group with an ID set, and then set the ID for that extra group and it would work ok. It's unlikely, but just in case this patch continues to allow that type of behavior, and just fixes the kfree() while in use bug. Link: https://lore.kernel.org/r/20210930020422.92578-4-michael.christie@oracle.com Signed-off-by: Mike Christie Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/target/target_core_alua.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/target/target_core_alua.c b/drivers/target/target_core_alua.c index 385e4cf9cfa63..0fc3135d3e4f6 100644 --- a/drivers/target/target_core_alua.c +++ b/drivers/target/target_core_alua.c @@ -1702,7 +1702,6 @@ int core_alua_set_tg_pt_gp_id( pr_err("Maximum ALUA alua_tg_pt_gps_count:" " 0x0000ffff reached\n"); spin_unlock(&dev->t10_alua.tg_pt_gps_lock); - kmem_cache_free(t10_alua_tg_pt_gp_cache, tg_pt_gp); return -ENOSPC; } again: From db895bf7bb47a56725bb1501ec31a6ac7c01fda9 Mon Sep 17 00:00:00 2001 From: Teng Qi Date: Mon, 11 Oct 2021 19:40:03 +0800 Subject: [PATCH 1716/5344] iio: imu: st_lsm6dsx: Avoid potential array overflow in st_lsm6dsx_set_odr() [ Upstream commit 94be878c882d8d784ff44c639bf55f3b029f85af ] The length of hw->settings->odr_table is 2 and ref_sensor->id is an enum variable whose value is between 0 and 5. However, the value ST_LSM6DSX_ID_MAX (i.e. 5) is not caught properly in switch (sensor->id) { If ref_sensor->id is ST_LSM6DSX_ID_MAX, an array overflow will ocurrs in function st_lsm6dsx_check_odr(): odr_table = &sensor->hw->settings->odr_table[sensor->id]; and in function st_lsm6dsx_set_odr(): reg = &hw->settings->odr_table[ref_sensor->id].reg; To avoid this array overflow, handle ST_LSM6DSX_ID_GYRO explicitly and return -EINVAL for the default case. The enum value ST_LSM6DSX_ID_MAX is only present as an easy way to check the limit and as such is never used, however this is not locally obvious. Reported-by: TOTE Robot Signed-off-by: Teng Qi Acked-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/20211011114003.976221-1-starmiku1207184332@gmail.com Signed-off-by: Jonathan Cameron Signed-off-by: Sasha Levin --- drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c b/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c index 057a4b0100106..8850da8e25d69 100644 --- a/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c +++ b/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c @@ -1015,6 +1015,8 @@ static int st_lsm6dsx_set_odr(struct st_lsm6dsx_sensor *sensor, u16 req_odr) int err; switch (sensor->id) { + case ST_LSM6DSX_ID_GYRO: + break; case ST_LSM6DSX_ID_EXT0: case ST_LSM6DSX_ID_EXT1: case ST_LSM6DSX_ID_EXT2: @@ -1040,8 +1042,8 @@ static int st_lsm6dsx_set_odr(struct st_lsm6dsx_sensor *sensor, u16 req_odr) } break; } - default: - break; + default: /* should never occur */ + return -EINVAL; } if (req_odr > 0) { From 0a0f68ab5c88280e8d118b3872ccdd3c051d8d57 Mon Sep 17 00:00:00 2001 From: Anatolij Gustschin Date: Thu, 14 Oct 2021 00:05:31 +0200 Subject: [PATCH 1717/5344] powerpc/5200: dts: fix memory node unit name [ Upstream commit aed2886a5e9ffc8269a4220bff1e9e030d3d2eb1 ] Fixes build warnings: Warning (unit_address_vs_reg): /memory: node has a reg or ranges property, but no unit name Signed-off-by: Anatolij Gustschin Reviewed-by: Rob Herring Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211013220532.24759-4-agust@denx.de Signed-off-by: Sasha Levin --- arch/powerpc/boot/dts/charon.dts | 2 +- arch/powerpc/boot/dts/digsy_mtc.dts | 2 +- arch/powerpc/boot/dts/lite5200.dts | 2 +- arch/powerpc/boot/dts/lite5200b.dts | 2 +- arch/powerpc/boot/dts/media5200.dts | 2 +- arch/powerpc/boot/dts/mpc5200b.dtsi | 2 +- arch/powerpc/boot/dts/o2d.dts | 2 +- arch/powerpc/boot/dts/o2d.dtsi | 2 +- arch/powerpc/boot/dts/o2dnt2.dts | 2 +- arch/powerpc/boot/dts/o3dnt.dts | 2 +- arch/powerpc/boot/dts/pcm032.dts | 2 +- arch/powerpc/boot/dts/tqm5200.dts | 2 +- 12 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/boot/dts/charon.dts b/arch/powerpc/boot/dts/charon.dts index 408b486b13dff..cd589539f313f 100644 --- a/arch/powerpc/boot/dts/charon.dts +++ b/arch/powerpc/boot/dts/charon.dts @@ -35,7 +35,7 @@ }; }; - memory { + memory@0 { device_type = "memory"; reg = <0x00000000 0x08000000>; // 128MB }; diff --git a/arch/powerpc/boot/dts/digsy_mtc.dts b/arch/powerpc/boot/dts/digsy_mtc.dts index 0e5e9d3acf79f..19a14e62e65f4 100644 --- a/arch/powerpc/boot/dts/digsy_mtc.dts +++ b/arch/powerpc/boot/dts/digsy_mtc.dts @@ -16,7 +16,7 @@ model = "intercontrol,digsy-mtc"; compatible = "intercontrol,digsy-mtc"; - memory { + memory@0 { reg = <0x00000000 0x02000000>; // 32MB }; diff --git a/arch/powerpc/boot/dts/lite5200.dts b/arch/powerpc/boot/dts/lite5200.dts index cb2782dd6132c..e7b194775d783 100644 --- a/arch/powerpc/boot/dts/lite5200.dts +++ b/arch/powerpc/boot/dts/lite5200.dts @@ -32,7 +32,7 @@ }; }; - memory { + memory@0 { device_type = "memory"; reg = <0x00000000 0x04000000>; // 64MB }; diff --git a/arch/powerpc/boot/dts/lite5200b.dts b/arch/powerpc/boot/dts/lite5200b.dts index 2b86c81f90485..547cbe726ff23 100644 --- a/arch/powerpc/boot/dts/lite5200b.dts +++ b/arch/powerpc/boot/dts/lite5200b.dts @@ -31,7 +31,7 @@ led4 { gpios = <&gpio_simple 2 1>; }; }; - memory { + memory@0 { reg = <0x00000000 0x10000000>; // 256MB }; diff --git a/arch/powerpc/boot/dts/media5200.dts b/arch/powerpc/boot/dts/media5200.dts index 61cae9dcddef4..f3188018faceb 100644 --- a/arch/powerpc/boot/dts/media5200.dts +++ b/arch/powerpc/boot/dts/media5200.dts @@ -32,7 +32,7 @@ }; }; - memory { + memory@0 { reg = <0x00000000 0x08000000>; // 128MB RAM }; diff --git a/arch/powerpc/boot/dts/mpc5200b.dtsi b/arch/powerpc/boot/dts/mpc5200b.dtsi index 648fe31795f49..8b796f3b11da7 100644 --- a/arch/powerpc/boot/dts/mpc5200b.dtsi +++ b/arch/powerpc/boot/dts/mpc5200b.dtsi @@ -33,7 +33,7 @@ }; }; - memory: memory { + memory: memory@0 { device_type = "memory"; reg = <0x00000000 0x04000000>; // 64MB }; diff --git a/arch/powerpc/boot/dts/o2d.dts b/arch/powerpc/boot/dts/o2d.dts index 24a46f65e5299..e0a8d3034417f 100644 --- a/arch/powerpc/boot/dts/o2d.dts +++ b/arch/powerpc/boot/dts/o2d.dts @@ -12,7 +12,7 @@ model = "ifm,o2d"; compatible = "ifm,o2d"; - memory { + memory@0 { reg = <0x00000000 0x08000000>; // 128MB }; diff --git a/arch/powerpc/boot/dts/o2d.dtsi b/arch/powerpc/boot/dts/o2d.dtsi index 6661955a2be47..b55a9e5bd828c 100644 --- a/arch/powerpc/boot/dts/o2d.dtsi +++ b/arch/powerpc/boot/dts/o2d.dtsi @@ -19,7 +19,7 @@ model = "ifm,o2d"; compatible = "ifm,o2d"; - memory { + memory@0 { reg = <0x00000000 0x04000000>; // 64MB }; diff --git a/arch/powerpc/boot/dts/o2dnt2.dts b/arch/powerpc/boot/dts/o2dnt2.dts index eeba7f5507d5d..c2eedbd1f5fcb 100644 --- a/arch/powerpc/boot/dts/o2dnt2.dts +++ b/arch/powerpc/boot/dts/o2dnt2.dts @@ -12,7 +12,7 @@ model = "ifm,o2dnt2"; compatible = "ifm,o2d"; - memory { + memory@0 { reg = <0x00000000 0x08000000>; // 128MB }; diff --git a/arch/powerpc/boot/dts/o3dnt.dts b/arch/powerpc/boot/dts/o3dnt.dts index fd00396b0593e..e4c1bdd412716 100644 --- a/arch/powerpc/boot/dts/o3dnt.dts +++ b/arch/powerpc/boot/dts/o3dnt.dts @@ -12,7 +12,7 @@ model = "ifm,o3dnt"; compatible = "ifm,o2d"; - memory { + memory@0 { reg = <0x00000000 0x04000000>; // 64MB }; diff --git a/arch/powerpc/boot/dts/pcm032.dts b/arch/powerpc/boot/dts/pcm032.dts index c259c6b3ac5ab..5674f978b9830 100644 --- a/arch/powerpc/boot/dts/pcm032.dts +++ b/arch/powerpc/boot/dts/pcm032.dts @@ -22,7 +22,7 @@ model = "phytec,pcm032"; compatible = "phytec,pcm032"; - memory { + memory@0 { reg = <0x00000000 0x08000000>; // 128MB }; diff --git a/arch/powerpc/boot/dts/tqm5200.dts b/arch/powerpc/boot/dts/tqm5200.dts index 9ed0bc78967e1..5bb25a9e40a01 100644 --- a/arch/powerpc/boot/dts/tqm5200.dts +++ b/arch/powerpc/boot/dts/tqm5200.dts @@ -32,7 +32,7 @@ }; }; - memory { + memory@0 { device_type = "memory"; reg = <0x00000000 0x04000000>; // 64MB }; From 1cdc9141cdd8fcb79a76e05f93e6183cafe7cda1 Mon Sep 17 00:00:00 2001 From: Chengfeng Ye Date: Sun, 24 Oct 2021 03:46:11 -0700 Subject: [PATCH 1718/5344] ALSA: gus: fix null pointer dereference on pointer block [ Upstream commit a0d21bb3279476c777434c40d969ea88ca64f9aa ] The pointer block return from snd_gf1_dma_next_block could be null, so there is a potential null pointer dereference issue. Fix this by adding a null check before dereference. Signed-off-by: Chengfeng Ye Link: https://lore.kernel.org/r/20211024104611.9919-1-cyeaa@connect.ust.hk Signed-off-by: Takashi Iwai Signed-off-by: Sasha Levin --- sound/isa/gus/gus_dma.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/isa/gus/gus_dma.c b/sound/isa/gus/gus_dma.c index a1c770d826dda..6d664dd8dde0b 100644 --- a/sound/isa/gus/gus_dma.c +++ b/sound/isa/gus/gus_dma.c @@ -126,6 +126,8 @@ static void snd_gf1_dma_interrupt(struct snd_gus_card * gus) } block = snd_gf1_dma_next_block(gus); spin_unlock(&gus->dma_lock); + if (!block) + return; snd_gf1_dma_program(gus, block->addr, block->buf_addr, block->count, (unsigned short) block->cmd); kfree(block); #if 0 From 329787c772da9dce0a4795b37507d0e58f79422b Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 14 Oct 2021 13:44:24 +1100 Subject: [PATCH 1719/5344] powerpc/dcr: Use cmplwi instead of 3-argument cmpli MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit fef071be57dc43679a32d5b0e6ee176d6f12e9f2 ] In dcr-low.S we use cmpli with three arguments, instead of four arguments as defined in the ISA: cmpli cr0,r3,1024 This appears to be a PPC440-ism, looking at the "PPC440x5 CPU Core User’s Manual" it shows cmpli having no L field, but implied to be 0 due to the core being 32-bit. It mentions that the ISA defines four arguments and recommends using cmplwi. It also corresponds to the old POWER instruction set, which had no L field there, a reserved bit instead. dcr-low.S is only built 32-bit, because it is only built when DCR_NATIVE=y, which is only selected by 40x and 44x. Looking at the generated code (with gcc/gas) we see cmplwi as expected. Although gas is happy with the 3-argument version when building for 32-bit, the LLVM assembler is not and errors out with: arch/powerpc/sysdev/dcr-low.S:27:10: error: invalid operand for instruction cmpli 0,%r3,1024; ... ^ Switch to the cmplwi extended opcode, which avoids any confusion when reading the ISA, fixes the issue with the LLVM assembler, and also means the code could be built 64-bit in future (though that's very unlikely). Reported-by: Nick Desaulniers Reviewed-by: Nick Desaulniers Signed-off-by: Michael Ellerman BugLink: https://github.com/ClangBuiltLinux/linux/issues/1419 Link: https://lore.kernel.org/r/20211014024424.528848-1-mpe@ellerman.id.au Signed-off-by: Sasha Levin --- arch/powerpc/sysdev/dcr-low.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/sysdev/dcr-low.S b/arch/powerpc/sysdev/dcr-low.S index efeeb1b885a17..329b9c4ae5429 100644 --- a/arch/powerpc/sysdev/dcr-low.S +++ b/arch/powerpc/sysdev/dcr-low.S @@ -11,7 +11,7 @@ #include #define DCR_ACCESS_PROLOG(table) \ - cmpli cr0,r3,1024; \ + cmplwi cr0,r3,1024; \ rlwinm r3,r3,4,18,27; \ lis r5,table@h; \ ori r5,r5,table@l; \ From b6687b5d606c613d702bcc123917eefe6af8b6ba Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Tue, 22 Dec 2020 12:54:01 -0800 Subject: [PATCH 1720/5344] sh: check return code of request_irq [ Upstream commit 0e38225c92c7964482a8bb6b3e37fde4319e965c ] request_irq is marked __must_check, but the call in shx3_prepare_cpus has a void return type, so it can't propagate failure to the caller. Follow cues from hexagon and just print an error. Fixes: c7936b9abcf5 ("sh: smp: Hook in to the generic IPI handler for SH-X3 SMP.") Cc: Miguel Ojeda Cc: Paul Mundt Reported-by: Guenter Roeck Signed-off-by: Nick Desaulniers Tested-by: John Paul Adrian Glaubitz Reviewed-by: Miguel Ojeda Signed-off-by: Rich Felker Signed-off-by: Sasha Levin --- arch/sh/kernel/cpu/sh4a/smp-shx3.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/sh/kernel/cpu/sh4a/smp-shx3.c b/arch/sh/kernel/cpu/sh4a/smp-shx3.c index f8a2bec0f260b..1261dc7b84e8b 100644 --- a/arch/sh/kernel/cpu/sh4a/smp-shx3.c +++ b/arch/sh/kernel/cpu/sh4a/smp-shx3.c @@ -73,8 +73,9 @@ static void shx3_prepare_cpus(unsigned int max_cpus) BUILD_BUG_ON(SMP_MSG_NR >= 8); for (i = 0; i < SMP_MSG_NR; i++) - request_irq(104 + i, ipi_interrupt_handler, - IRQF_PERCPU, "IPI", (void *)(long)i); + if (request_irq(104 + i, ipi_interrupt_handler, + IRQF_PERCPU, "IPI", (void *)(long)i)) + pr_err("Failed to request irq %d\n", i); for (i = 0; i < max_cpus; i++) set_cpu_present(i, true); From acd8890c1327a63e94776f783617495e6bffa99b Mon Sep 17 00:00:00 2001 From: Lu Wei Date: Thu, 26 Nov 2020 10:43:11 +0800 Subject: [PATCH 1721/5344] maple: fix wrong return value of maple_bus_init(). [ Upstream commit bde82ee391fa6d3ad054313c4aa7b726d32515ce ] If KMEM_CACHE or maple_alloc_dev failed, the maple_bus_init() will return 0 rather than error, because the retval is not changed after KMEM_CACHE or maple_alloc_dev failed. Fixes: 17be2d2b1c33 ("sh: Add maple bus support for the SEGA Dreamcast.") Reported-by: Hulk Robot Signed-off-by: Lu Wei Acked-by: John Paul Adrian Glaubitz Signed-off-by: Rich Felker Signed-off-by: Sasha Levin --- drivers/sh/maple/maple.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/sh/maple/maple.c b/drivers/sh/maple/maple.c index e5d7fb81ad665..44a931d41a132 100644 --- a/drivers/sh/maple/maple.c +++ b/drivers/sh/maple/maple.c @@ -835,8 +835,10 @@ static int __init maple_bus_init(void) maple_queue_cache = KMEM_CACHE(maple_buffer, SLAB_HWCACHE_ALIGN); - if (!maple_queue_cache) + if (!maple_queue_cache) { + retval = -ENOMEM; goto cleanup_bothirqs; + } INIT_LIST_HEAD(&maple_waitq); INIT_LIST_HEAD(&maple_sentq); @@ -849,6 +851,7 @@ static int __init maple_bus_init(void) if (!mdev[i]) { while (i-- > 0) maple_free_dev(mdev[i]); + retval = -ENOMEM; goto cleanup_cache; } baseunits[i] = mdev[i]; From f372ee4d5b4c2ebb70bd52e2a17367fcd9ee4523 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 21 Sep 2021 22:37:30 +0800 Subject: [PATCH 1722/5344] f2fs: fix up f2fs_lookup tracepoints [ Upstream commit 70a9ac36ffd807ac506ed0b849f3e8ce3c6623f2 ] Fix up a misuse that the filename pointer isn't always valid in the ring buffer, and we should copy the content instead. Fixes: 0c5e36db17f5 ("f2fs: trace f2fs_lookup") Signed-off-by: Gao Xiang Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- include/trace/events/f2fs.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 1796ff99c3e9c..a7613efc271ab 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -793,20 +793,20 @@ TRACE_EVENT(f2fs_lookup_start, TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) - __field(const char *, name) + __string(name, dentry->d_name.name) __field(unsigned int, flags) ), TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->ino = dir->i_ino; - __entry->name = dentry->d_name.name; + __assign_str(name, dentry->d_name.name); __entry->flags = flags; ), TP_printk("dev = (%d,%d), pino = %lu, name:%s, flags:%u", show_dev_ino(__entry), - __entry->name, + __get_str(name), __entry->flags) ); @@ -820,7 +820,7 @@ TRACE_EVENT(f2fs_lookup_end, TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) - __field(const char *, name) + __string(name, dentry->d_name.name) __field(nid_t, cino) __field(int, err) ), @@ -828,14 +828,14 @@ TRACE_EVENT(f2fs_lookup_end, TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->ino = dir->i_ino; - __entry->name = dentry->d_name.name; + __assign_str(name, dentry->d_name.name); __entry->cino = ino; __entry->err = err; ), TP_printk("dev = (%d,%d), pino = %lu, name:%s, ino:%u, err:%d", show_dev_ino(__entry), - __entry->name, + __get_str(name), __entry->cino, __entry->err) ); From 02348bd5b6b90a73acda751908c577439d1a8990 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 4 Oct 2021 17:19:10 -0700 Subject: [PATCH 1723/5344] sh: fix kconfig unmet dependency warning for FRAME_POINTER [ Upstream commit fda1bc533094a7db68b11e7503d2c6c73993d12a ] FRAME_POINTER depends on DEBUG_KERNEL so DWARF_UNWINDER should depend on DEBUG_KERNEL before selecting FRAME_POINTER. WARNING: unmet direct dependencies detected for FRAME_POINTER Depends on [n]: DEBUG_KERNEL [=n] && (M68K || UML || SUPERH [=y]) || ARCH_WANT_FRAME_POINTERS [=n] Selected by [y]: - DWARF_UNWINDER [=y] Fixes: bd353861c735 ("sh: dwarf unwinder support.") Signed-off-by: Randy Dunlap Cc: Matt Fleming Cc: Matt Fleming Cc: Yoshinori Sato Cc: John Paul Adrian Glaubitz Cc: Geert Uytterhoeven Reviewed-by: Geert Uytterhoeven Tested-by: John Paul Adrian Glaubitz Signed-off-by: Rich Felker Signed-off-by: Sasha Levin --- arch/sh/Kconfig.debug | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/sh/Kconfig.debug b/arch/sh/Kconfig.debug index 010b6c33bbba2..71acd3d9b9e83 100644 --- a/arch/sh/Kconfig.debug +++ b/arch/sh/Kconfig.debug @@ -58,6 +58,7 @@ config DUMP_CODE config DWARF_UNWINDER bool "Enable the DWARF unwinder for stacktraces" + depends on DEBUG_KERNEL select FRAME_POINTER depends on SUPERH32 default n From 3ef3ad76360ff286ad6a3ac8508a8a89fdb92fcf Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 4 Oct 2021 17:19:12 -0700 Subject: [PATCH 1724/5344] sh: math-emu: drop unused functions [ Upstream commit e25c252a9b033523c626f039d4b9a304f12f6775 ] Delete ieee_fpe_handler() since it is not used. After that is done, delete denormal_to_double() since it is not used: .../arch/sh/math-emu/math.c:505:12: error: 'ieee_fpe_handler' defined but not used [-Werror=unused-function] 505 | static int ieee_fpe_handler(struct pt_regs *regs) .../arch/sh/math-emu/math.c:477:13: error: 'denormal_to_double' defined but not used [-Werror=unused-function] 477 | static void denormal_to_double(struct sh_fpu_soft_struct *fpu, int n) Fixes: 7caf62de25554da3 ("sh: remove unused do_fpu_error") Signed-off-by: Randy Dunlap Cc: Takashi YOSHII Cc: Yoshinori Sato Cc: John Paul Adrian Glaubitz Reviewed-by: Geert Uytterhoeven Signed-off-by: Rich Felker Signed-off-by: Sasha Levin --- arch/sh/math-emu/math.c | 103 ---------------------------------------- 1 file changed, 103 deletions(-) diff --git a/arch/sh/math-emu/math.c b/arch/sh/math-emu/math.c index e8be0eca0444a..615ba932c398e 100644 --- a/arch/sh/math-emu/math.c +++ b/arch/sh/math-emu/math.c @@ -467,109 +467,6 @@ static int fpu_emulate(u16 code, struct sh_fpu_soft_struct *fregs, struct pt_reg return id_sys(fregs, regs, code); } -/** - * denormal_to_double - Given denormalized float number, - * store double float - * - * @fpu: Pointer to sh_fpu_soft structure - * @n: Index to FP register - */ -static void denormal_to_double(struct sh_fpu_soft_struct *fpu, int n) -{ - unsigned long du, dl; - unsigned long x = fpu->fpul; - int exp = 1023 - 126; - - if (x != 0 && (x & 0x7f800000) == 0) { - du = (x & 0x80000000); - while ((x & 0x00800000) == 0) { - x <<= 1; - exp--; - } - x &= 0x007fffff; - du |= (exp << 20) | (x >> 3); - dl = x << 29; - - fpu->fp_regs[n] = du; - fpu->fp_regs[n+1] = dl; - } -} - -/** - * ieee_fpe_handler - Handle denormalized number exception - * - * @regs: Pointer to register structure - * - * Returns 1 when it's handled (should not cause exception). - */ -static int ieee_fpe_handler(struct pt_regs *regs) -{ - unsigned short insn = *(unsigned short *)regs->pc; - unsigned short finsn; - unsigned long nextpc; - int nib[4] = { - (insn >> 12) & 0xf, - (insn >> 8) & 0xf, - (insn >> 4) & 0xf, - insn & 0xf}; - - if (nib[0] == 0xb || - (nib[0] == 0x4 && nib[2] == 0x0 && nib[3] == 0xb)) /* bsr & jsr */ - regs->pr = regs->pc + 4; - - if (nib[0] == 0xa || nib[0] == 0xb) { /* bra & bsr */ - nextpc = regs->pc + 4 + ((short) ((insn & 0xfff) << 4) >> 3); - finsn = *(unsigned short *) (regs->pc + 2); - } else if (nib[0] == 0x8 && nib[1] == 0xd) { /* bt/s */ - if (regs->sr & 1) - nextpc = regs->pc + 4 + ((char) (insn & 0xff) << 1); - else - nextpc = regs->pc + 4; - finsn = *(unsigned short *) (regs->pc + 2); - } else if (nib[0] == 0x8 && nib[1] == 0xf) { /* bf/s */ - if (regs->sr & 1) - nextpc = regs->pc + 4; - else - nextpc = regs->pc + 4 + ((char) (insn & 0xff) << 1); - finsn = *(unsigned short *) (regs->pc + 2); - } else if (nib[0] == 0x4 && nib[3] == 0xb && - (nib[2] == 0x0 || nib[2] == 0x2)) { /* jmp & jsr */ - nextpc = regs->regs[nib[1]]; - finsn = *(unsigned short *) (regs->pc + 2); - } else if (nib[0] == 0x0 && nib[3] == 0x3 && - (nib[2] == 0x0 || nib[2] == 0x2)) { /* braf & bsrf */ - nextpc = regs->pc + 4 + regs->regs[nib[1]]; - finsn = *(unsigned short *) (regs->pc + 2); - } else if (insn == 0x000b) { /* rts */ - nextpc = regs->pr; - finsn = *(unsigned short *) (regs->pc + 2); - } else { - nextpc = regs->pc + 2; - finsn = insn; - } - - if ((finsn & 0xf1ff) == 0xf0ad) { /* fcnvsd */ - struct task_struct *tsk = current; - - if ((tsk->thread.xstate->softfpu.fpscr & (1 << 17))) { - /* FPU error */ - denormal_to_double (&tsk->thread.xstate->softfpu, - (finsn >> 8) & 0xf); - tsk->thread.xstate->softfpu.fpscr &= - ~(FPSCR_CAUSE_MASK | FPSCR_FLAG_MASK); - task_thread_info(tsk)->status |= TS_USEDFPU; - } else { - force_sig_fault(SIGFPE, FPE_FLTINV, - (void __user *)regs->pc); - } - - regs->pc = nextpc; - return 1; - } - - return 0; -} - /** * fpu_init - Initialize FPU registers * @fpu: Pointer to software emulated FPU registers. From 8c2c01be0f24f25009f0bfb40ab68b701a8f06fa Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 4 Oct 2021 17:19:13 -0700 Subject: [PATCH 1725/5344] sh: define __BIG_ENDIAN for math-emu [ Upstream commit b929926f01f2d14635345d22eafcf60feed1085e ] Fix this by defining both ENDIAN macros in so that they can be utilized in according to the latter's comment: /* Allow sfp-machine to have its own byte order definitions. */ (This is what is done in arch/nds32/include/asm/sfp-machine.h.) This placates these build warnings: In file included from ../arch/sh/math-emu/math.c:23: .../include/math-emu/single.h:50:21: warning: "__BIG_ENDIAN" is not defined, evaluates to 0 [-Wundef] 50 | #if __BYTE_ORDER == __BIG_ENDIAN In file included from ../arch/sh/math-emu/math.c:24: .../include/math-emu/double.h:59:21: warning: "__BIG_ENDIAN" is not defined, evaluates to 0 [-Wundef] 59 | #if __BYTE_ORDER == __BIG_ENDIAN Fixes: 4b565680d163 ("sh: math-emu support") Signed-off-by: Randy Dunlap Cc: Yoshinori Sato Cc: John Paul Adrian Glaubitz Reviewed-by: Geert Uytterhoeven Tested-by: John Paul Adrian Glaubitz Signed-off-by: Rich Felker Signed-off-by: Sasha Levin --- arch/sh/include/asm/sfp-machine.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/sh/include/asm/sfp-machine.h b/arch/sh/include/asm/sfp-machine.h index cbc7cf8c97ce6..2d2423478b71d 100644 --- a/arch/sh/include/asm/sfp-machine.h +++ b/arch/sh/include/asm/sfp-machine.h @@ -13,6 +13,14 @@ #ifndef _SFP_MACHINE_H #define _SFP_MACHINE_H +#ifdef __BIG_ENDIAN__ +#define __BYTE_ORDER __BIG_ENDIAN +#define __LITTLE_ENDIAN 0 +#else +#define __BYTE_ORDER __LITTLE_ENDIAN +#define __BIG_ENDIAN 0 +#endif + #define _FP_W_TYPE_SIZE 32 #define _FP_W_TYPE unsigned long #define _FP_WS_TYPE signed long From a14099c1ce29a56b5cc5412f8af5593dfe4e1ecf Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Fri, 1 Oct 2021 18:20:33 +0100 Subject: [PATCH 1726/5344] clk: ingenic: Fix bugs with divided dividers [ Upstream commit ed84ef1cd7eddf933d4ffce2caa8161d6f947245 ] Two fixes in one: - In the "impose hardware constraints" block, the "logical" divider value (aka. not translated to the hardware) was clamped to fit in the register area, but this totally ignored the fact that the divider value can itself have a fixed divider. - The code that made sure that the divider value returned by the function was a multiple of its own fixed divider could result in a wrong value being calculated, because it was rounded down instead of rounded up. Fixes: 4afe2d1a6ed5 ("clk: ingenic: Allow divider value to be divided") Co-developed-by: Artur Rojek Signed-off-by: Artur Rojek Signed-off-by: Paul Cercueil Link: https://lore.kernel.org/r/20211001172033.122329-1-paul@crapouillou.net Signed-off-by: Stephen Boyd Signed-off-by: Sasha Levin --- drivers/clk/ingenic/cgu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/clk/ingenic/cgu.c b/drivers/clk/ingenic/cgu.c index 7490d4f4d9366..dff759c0f6193 100644 --- a/drivers/clk/ingenic/cgu.c +++ b/drivers/clk/ingenic/cgu.c @@ -426,15 +426,15 @@ ingenic_clk_calc_div(const struct ingenic_cgu_clk_info *clk_info, } /* Impose hardware constraints */ - div = min_t(unsigned, div, 1 << clk_info->div.bits); - div = max_t(unsigned, div, 1); + div = clamp_t(unsigned int, div, clk_info->div.div, + clk_info->div.div << clk_info->div.bits); /* * If the divider value itself must be divided before being written to * the divider register, we must ensure we don't have any bits set that * would be lost as a result of doing so. */ - div /= clk_info->div.div; + div = DIV_ROUND_UP(div, clk_info->div.div); div *= clk_info->div.div; return div; From fe36d12a0d202c05e4ee3d9d2183ae35d5ab4eaa Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Thu, 23 Sep 2021 09:24:49 +0930 Subject: [PATCH 1727/5344] clk/ast2600: Fix soc revision for AHB [ Upstream commit f45c5b1c27293f834682e89003f88b3512329ab4 ] Move the soc revision parsing to the initial probe, saving the driver from parsing the register multiple times. Use this variable to select the correct divisor table for the AHB clock. Before this fix the A2 would have used the A0 table. Fixes: 2d491066ccd4 ("clk: ast2600: Fix AHB clock divider for A1") Signed-off-by: Joel Stanley Link: https://lore.kernel.org/r/20210922235449.213631-1-joel@jms.id.au Reviewed-by: Andrew Jeffery Signed-off-by: Stephen Boyd Signed-off-by: Sasha Levin --- drivers/clk/clk-ast2600.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/clk/clk-ast2600.c b/drivers/clk/clk-ast2600.c index af957179b135e..48122f574cb65 100644 --- a/drivers/clk/clk-ast2600.c +++ b/drivers/clk/clk-ast2600.c @@ -48,6 +48,8 @@ static DEFINE_SPINLOCK(aspeed_g6_clk_lock); static struct clk_hw_onecell_data *aspeed_g6_clk_data; static void __iomem *scu_g6_base; +/* AST2600 revision: A0, A1, A2, etc */ +static u8 soc_rev; /* * Clocks marked with CLK_IS_CRITICAL: @@ -190,9 +192,8 @@ static struct clk_hw *ast2600_calc_pll(const char *name, u32 val) static struct clk_hw *ast2600_calc_apll(const char *name, u32 val) { unsigned int mult, div; - u32 chip_id = readl(scu_g6_base + ASPEED_G6_SILICON_REV); - if (((chip_id & CHIP_REVISION_ID) >> 16) >= 2) { + if (soc_rev >= 2) { if (val & BIT(24)) { /* Pass through mode */ mult = div = 1; @@ -664,7 +665,7 @@ static const u32 ast2600_a1_axi_ahb200_tbl[] = { static void __init aspeed_g6_cc(struct regmap *map) { struct clk_hw *hw; - u32 val, div, divbits, chip_id, axi_div, ahb_div; + u32 val, div, divbits, axi_div, ahb_div; clk_hw_register_fixed_rate(NULL, "clkin", NULL, 0, 25000000); @@ -695,8 +696,7 @@ static void __init aspeed_g6_cc(struct regmap *map) axi_div = 2; divbits = (val >> 11) & 0x3; - regmap_read(map, ASPEED_G6_SILICON_REV, &chip_id); - if (chip_id & BIT(16)) { + if (soc_rev >= 1) { if (!divbits) { ahb_div = ast2600_a1_axi_ahb200_tbl[(val >> 8) & 0x3]; if (val & BIT(16)) @@ -741,6 +741,8 @@ static void __init aspeed_g6_cc_init(struct device_node *np) if (!scu_g6_base) return; + soc_rev = (readl(scu_g6_base + ASPEED_G6_SILICON_REV) & CHIP_REVISION_ID) >> 16; + aspeed_g6_clk_data = kzalloc(struct_size(aspeed_g6_clk_data, hws, ASPEED_G6_NUM_CLKS), GFP_KERNEL); if (!aspeed_g6_clk_data) From f98ecb4db60f38d34bf0fd311adcfe80e34527b2 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Thu, 4 Nov 2021 04:11:55 +0300 Subject: [PATCH 1728/5344] clk: qcom: gcc-msm8996: Drop (again) gcc_aggre1_pnoc_ahb_clk [ Upstream commit 05cf3ec00d460b50088d421fb878a0f83f57e262 ] The gcc_aggre1_pnoc_ahb_clk is crucial for the proper MSM8996/APQ8096 functioning. If it gets disabled, several subsytems will stop working (including eMMC/SDCC and USB). There are no in-kernel users of this clock, so it is much simpler to remove from the kernel. The clock was first removed in the commit 9e60de1cf270 ("clk: qcom: Remove gcc_aggre1_pnoc_ahb_clk from msm8996") by Stephen Boyd, but got added back in the commit b567752144e3 ("clk: qcom: Add some missing gcc clks for msm8996") by Rajendra Nayak. Let's remove it again in hope that nobody adds it back. Reported-by: Vladimir Zapolskiy Cc: Rajendra Nayak Cc: Konrad Dybcio Fixes: b567752144e3 ("clk: qcom: Add some missing gcc clks for msm8996") Signed-off-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20211104011155.2209654-1-dmitry.baryshkov@linaro.org Signed-off-by: Stephen Boyd Signed-off-by: Sasha Levin --- drivers/clk/qcom/gcc-msm8996.c | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/drivers/clk/qcom/gcc-msm8996.c b/drivers/clk/qcom/gcc-msm8996.c index d004cdaa0e39a..c1e1148f0261d 100644 --- a/drivers/clk/qcom/gcc-msm8996.c +++ b/drivers/clk/qcom/gcc-msm8996.c @@ -2937,20 +2937,6 @@ static struct clk_branch gcc_smmu_aggre0_ahb_clk = { }, }; -static struct clk_branch gcc_aggre1_pnoc_ahb_clk = { - .halt_reg = 0x82014, - .clkr = { - .enable_reg = 0x82014, - .enable_mask = BIT(0), - .hw.init = &(struct clk_init_data){ - .name = "gcc_aggre1_pnoc_ahb_clk", - .parent_names = (const char *[]){ "periph_noc_clk_src" }, - .num_parents = 1, - .ops = &clk_branch2_ops, - }, - }, -}; - static struct clk_branch gcc_aggre2_ufs_axi_clk = { .halt_reg = 0x83014, .clkr = { @@ -3453,7 +3439,6 @@ static struct clk_regmap *gcc_msm8996_clocks[] = { [GCC_AGGRE0_CNOC_AHB_CLK] = &gcc_aggre0_cnoc_ahb_clk.clkr, [GCC_SMMU_AGGRE0_AXI_CLK] = &gcc_smmu_aggre0_axi_clk.clkr, [GCC_SMMU_AGGRE0_AHB_CLK] = &gcc_smmu_aggre0_ahb_clk.clkr, - [GCC_AGGRE1_PNOC_AHB_CLK] = &gcc_aggre1_pnoc_ahb_clk.clkr, [GCC_AGGRE2_UFS_AXI_CLK] = &gcc_aggre2_ufs_axi_clk.clkr, [GCC_AGGRE2_USB3_AXI_CLK] = &gcc_aggre2_usb3_axi_clk.clkr, [GCC_QSPI_AHB_CLK] = &gcc_qspi_ahb_clk.clkr, From ee638fce3702fd775e9859cd70ebe79967677d1b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 6 Nov 2021 08:49:11 -0700 Subject: [PATCH 1729/5344] mips: BCM63XX: ensure that CPU_SUPPORTS_32BIT_KERNEL is set [ Upstream commit 5eeaafc8d69373c095e461bdb39e5c9b62228ac5 ] Several header files need info on CONFIG_32BIT or CONFIG_64BIT, but kconfig symbol BCM63XX does not provide that info. This leads to many build errors, e.g.: arch/mips/include/asm/page.h:196:13: error: use of undeclared identifier 'CAC_BASE' return x - PAGE_OFFSET + PHYS_OFFSET; arch/mips/include/asm/mach-generic/spaces.h:91:23: note: expanded from macro 'PAGE_OFFSET' #define PAGE_OFFSET (CAC_BASE + PHYS_OFFSET) arch/mips/include/asm/io.h:134:28: error: use of undeclared identifier 'CAC_BASE' return (void *)(address + PAGE_OFFSET - PHYS_OFFSET); arch/mips/include/asm/mach-generic/spaces.h:91:23: note: expanded from macro 'PAGE_OFFSET' #define PAGE_OFFSET (CAC_BASE + PHYS_OFFSET) arch/mips/include/asm/uaccess.h:82:10: error: use of undeclared identifier '__UA_LIMIT' return (__UA_LIMIT & (addr | (addr + size) | __ua_size(size))) == 0; Selecting the SYS_HAS_CPU_BMIPS* symbols causes SYS_HAS_CPU_BMIPS to be set, which then selects CPU_SUPPORT_32BIT_KERNEL, which causes CONFIG_32BIT to be set. (a bit more indirect than v1 [RFC].) Fixes: e7300d04bd08 ("MIPS: BCM63xx: Add support for the Broadcom BCM63xx family of SOCs.") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Thomas Bogendoerfer Cc: Florian Fainelli Cc: bcm-kernel-feedback-list@broadcom.com Cc: linux-mips@vger.kernel.org Cc: Paul Burton Cc: Maxime Bizon Cc: Ralf Baechle Suggested-by: Florian Fainelli Acked-by: Florian Fainelli Signed-off-by: Thomas Bogendoerfer Signed-off-by: Sasha Levin --- arch/mips/Kconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 041d34975ea2c..9749818eed6d6 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -294,6 +294,9 @@ config BCM63XX select SYS_SUPPORTS_32BIT_KERNEL select SYS_SUPPORTS_BIG_ENDIAN select SYS_HAS_EARLY_PRINTK + select SYS_HAS_CPU_BMIPS32_3300 + select SYS_HAS_CPU_BMIPS4350 + select SYS_HAS_CPU_BMIPS4380 select SWAP_IO_SPACE select GPIOLIB select HAVE_CLK From dd8c11b08dd9301344cd5dcbdb557571b6f50836 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Thu, 4 Nov 2021 17:51:20 +0000 Subject: [PATCH 1730/5344] sched/core: Mitigate race cpus_share_cache()/update_top_cache_domain() [ Upstream commit 42dc938a590c96eeb429e1830123fef2366d9c80 ] Nothing protects the access to the per_cpu variable sd_llc_id. When testing the same CPU (i.e. this_cpu == that_cpu), a race condition exists with update_top_cache_domain(). One scenario being: CPU1 CPU2 ================================================================== per_cpu(sd_llc_id, CPUX) => 0 partition_sched_domains_locked() detach_destroy_domains() cpus_share_cache(CPUX, CPUX) update_top_cache_domain(CPUX) per_cpu(sd_llc_id, CPUX) => 0 per_cpu(sd_llc_id, CPUX) = CPUX per_cpu(sd_llc_id, CPUX) => CPUX return false ttwu_queue_cond() wouldn't catch smp_processor_id() == cpu and the result is a warning triggered from ttwu_queue_wakelist(). Avoid a such race in cpus_share_cache() by always returning true when this_cpu == that_cpu. Fixes: 518cd6234178 ("sched: Only queue remote wakeups when crossing cache boundaries") Reported-by: Jing-Ting Wu Signed-off-by: Vincent Donnefort Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20211104175120.857087-1-vincent.donnefort@arm.com Signed-off-by: Sasha Levin --- kernel/sched/core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3c132771cffcd..0ab4122c97699 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2483,6 +2483,9 @@ void wake_up_if_idle(int cpu) bool cpus_share_cache(int this_cpu, int that_cpu) { + if (this_cpu == that_cpu) + return true; + return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } #endif /* CONFIG_SMP */ From 961c4c4587f351355ce7d433fe770f9485328b3c Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 4 Oct 2020 17:14:05 -0500 Subject: [PATCH 1731/5344] tracing: Save normal string variables [ Upstream commit 63a1e5de3006f4ad713e4d72bcb404d0301e853d ] String variables created as field variables and save variables are already handled properly by having their values copied when set. The same isn't done for normal variables, but needs to be - simply saving a pointer to a string contained in an old event isn't sufficient, since that event's data may quickly become overwritten and therefore a string pointer to it could yield garbage. This change uses the same mechanism as field variables and simply appends the new strings to the existing per-element field_var_str[] array allocated for that purpose. Link: https://lkml.kernel.org/r/1c1a03798b02e67307412a0c719d1bfb69b13007.1601848695.git.zanussi@kernel.org Fixes: 02205a6752f2 (tracing: Add support for 'field variables') Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Sasha Levin --- kernel/trace/trace_events_hist.c | 34 ++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 6c8c9869d71cc..b3af29acf930a 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -149,6 +149,8 @@ struct hist_field { */ unsigned int var_ref_idx; bool read_once; + + unsigned int var_str_idx; }; static u64 hist_field_none(struct hist_field *field, @@ -351,6 +353,7 @@ struct hist_trigger_data { unsigned int n_keys; unsigned int n_fields; unsigned int n_vars; + unsigned int n_var_str; unsigned int key_size; struct tracing_map_sort_key sort_keys[TRACING_MAP_SORT_KEYS_MAX]; unsigned int n_sort_keys; @@ -2305,7 +2308,12 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt) } } - n_str = hist_data->n_field_var_str + hist_data->n_save_var_str; + n_str = hist_data->n_field_var_str + hist_data->n_save_var_str + + hist_data->n_var_str; + if (n_str > SYNTH_FIELDS_MAX) { + hist_elt_data_free(elt_data); + return -EINVAL; + } size = STR_VAR_LEN_MAX; @@ -4599,6 +4607,7 @@ static int create_var_field(struct hist_trigger_data *hist_data, { struct trace_array *tr = hist_data->event_file->tr; unsigned long flags = 0; + int ret; if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX)) return -EINVAL; @@ -4613,7 +4622,12 @@ static int create_var_field(struct hist_trigger_data *hist_data, if (WARN_ON(hist_data->n_vars > TRACING_MAP_VARS_MAX)) return -EINVAL; - return __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags); + ret = __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags); + + if (hist_data->fields[val_idx]->flags & HIST_FIELD_FL_STRING) + hist_data->fields[val_idx]->var_str_idx = hist_data->n_var_str++; + + return ret; } static int create_val_fields(struct hist_trigger_data *hist_data, @@ -5333,6 +5347,22 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, hist_val = hist_field->fn(hist_field, elt, rbe, rec); if (hist_field->flags & HIST_FIELD_FL_VAR) { var_idx = hist_field->var.idx; + + if (hist_field->flags & HIST_FIELD_FL_STRING) { + unsigned int str_start, var_str_idx, idx; + char *str, *val_str; + + str_start = hist_data->n_field_var_str + + hist_data->n_save_var_str; + var_str_idx = hist_field->var_str_idx; + idx = str_start + var_str_idx; + + str = elt_data->field_var_str[idx]; + val_str = (char *)(uintptr_t)hist_val; + strscpy(str, val_str, STR_VAR_LEN_MAX); + + hist_val = (u64)(uintptr_t)str; + } tracing_map_set_var(elt, var_idx, hist_val); continue; } From 35a7e8670ee09b40b718f508ef1848b4f73d04c1 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 13 Nov 2021 01:02:08 +0900 Subject: [PATCH 1732/5344] tracing/histogram: Do not copy the fixed-size char array field over the field size [ Upstream commit 63f84ae6b82bb4dff672f76f30c6fd7b9d3766bc ] Do not copy the fixed-size char array field of the events over the field size. The histogram treats char array as a string and there are 2 types of char array in the event, fixed-size and dynamic string. The dynamic string (__data_loc) field must be null terminated, but the fixed-size char array field may not be null terminated (not a string, but just a data). In that case, histogram can copy the data after the field. This uses the original field size for fixed-size char array field to restrict the histogram not to access over the original field size. Link: https://lkml.kernel.org/r/163673292822.195747.3696966210526410250.stgit@devnote2 Fixes: 02205a6752f2 (tracing: Add support for 'field variables') Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Sasha Levin --- kernel/trace/trace_events_hist.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index b3af29acf930a..e7bc856d91aa0 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2590,9 +2590,10 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, if (!hist_field->type) goto free; - if (field->filter_type == FILTER_STATIC_STRING) + if (field->filter_type == FILTER_STATIC_STRING) { hist_field->fn = hist_field_string; - else if (field->filter_type == FILTER_DYN_STRING) + hist_field->size = field->size; + } else if (field->filter_type == FILTER_DYN_STRING) hist_field->fn = hist_field_dynstring; else hist_field->fn = hist_field_pstring; @@ -3530,7 +3531,7 @@ static inline void __update_field_vars(struct tracing_map_elt *elt, char *str = elt_data->field_var_str[j++]; char *val_str = (char *)(uintptr_t)var_val; - strscpy(str, val_str, STR_VAR_LEN_MAX); + strscpy(str, val_str, val->size); var_val = (u64)(uintptr_t)str; } tracing_map_set_var(elt, var_idx, var_val); @@ -5359,7 +5360,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, str = elt_data->field_var_str[idx]; val_str = (char *)(uintptr_t)hist_val; - strscpy(str, val_str, STR_VAR_LEN_MAX); + strscpy(str, val_str, hist_field->size); hist_val = (u64)(uintptr_t)str; } From a749fb14a3213f5b52e70ad7a8104d8de9c601be Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 7 Nov 2021 08:40:47 +0200 Subject: [PATCH 1733/5344] RDMA/netlink: Add __maybe_unused to static inline in C file commit 83dde7498fefeb920b1def317421262317d178e5 upstream. Like other commits in the tree add __maybe_unused to a static inline in a C file because some clang compilers will complain about unused code: >> drivers/infiniband/core/nldev.c:2543:1: warning: unused function '__chk_RDMA_NL_NLDEV' MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_NLDEV, 5); ^ Fixes: e3bf14bdc17a ("rdma: Autoload netlink client modules") Link: https://lore.kernel.org/r/4a8101919b765e01d7fde6f27fd572c958deeb4a.1636267207.git.leonro@nvidia.com Reported-by: kernel test robot Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe Signed-off-by: Greg Kroah-Hartman --- include/rdma/rdma_netlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index ab22759de7ea0..7b3ef7a3d8d04 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -30,7 +30,7 @@ enum rdma_nl_flags { * constant as well and the compiler checks they are the same. */ #define MODULE_ALIAS_RDMA_NETLINK(_index, _val) \ - static inline void __chk_##_index(void) \ + static inline void __maybe_unused __chk_##_index(void) \ { \ BUILD_BUG_ON(_index != _val); \ } \ From b8d31060d821792237448c526029873bab8b8f18 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 11 Nov 2021 23:45:25 -0800 Subject: [PATCH 1734/5344] perf bpf: Avoid memory leak from perf_env__insert_btf() [ Upstream commit 4924b1f7c46711762fd0e65c135ccfbcfd6ded1f ] perf_env__insert_btf() doesn't insert if a duplicate BTF id is encountered and this causes a memory leak. Modify the function to return a success/error value and then free the memory if insertion didn't happen. v2. Adds a return -1 when the insertion error occurs in perf_env__fetch_btf. This doesn't affect anything as the result is never checked. Fixes: 3792cb2ff43b1b19 ("perf bpf: Save BTF in a rbtree in perf_env") Signed-off-by: Ian Rogers Cc: Alexander Shishkin Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Daniel Borkmann Cc: Jiri Olsa Cc: John Fastabend Cc: KP Singh Cc: Mark Rutland Cc: Martin KaFai Lau Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Song Liu Cc: Stephane Eranian Cc: Tiezhu Yang Cc: Yonghong Song Cc: bpf@vger.kernel.org Cc: netdev@vger.kernel.org Link: http://lore.kernel.org/lkml/20211112074525.121633-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Sasha Levin --- tools/perf/util/bpf-event.c | 6 +++++- tools/perf/util/env.c | 5 ++++- tools/perf/util/env.h | 2 +- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c index c766813d56be0..782c0c8a9a836 100644 --- a/tools/perf/util/bpf-event.c +++ b/tools/perf/util/bpf-event.c @@ -108,7 +108,11 @@ static int perf_env__fetch_btf(struct perf_env *env, node->data_size = data_size; memcpy(node->data, data, data_size); - perf_env__insert_btf(env, node); + if (!perf_env__insert_btf(env, node)) { + /* Insertion failed because of a duplicate. */ + free(node); + return -1; + } return 0; } diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index 0fafcf264d235..ef64e197bc8df 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -69,12 +69,13 @@ struct bpf_prog_info_node *perf_env__find_bpf_prog_info(struct perf_env *env, return node; } -void perf_env__insert_btf(struct perf_env *env, struct btf_node *btf_node) +bool perf_env__insert_btf(struct perf_env *env, struct btf_node *btf_node) { struct rb_node *parent = NULL; __u32 btf_id = btf_node->id; struct btf_node *node; struct rb_node **p; + bool ret = true; down_write(&env->bpf_progs.lock); p = &env->bpf_progs.btfs.rb_node; @@ -88,6 +89,7 @@ void perf_env__insert_btf(struct perf_env *env, struct btf_node *btf_node) p = &(*p)->rb_right; } else { pr_debug("duplicated btf %u\n", btf_id); + ret = false; goto out; } } @@ -97,6 +99,7 @@ void perf_env__insert_btf(struct perf_env *env, struct btf_node *btf_node) env->bpf_progs.btfs_cnt++; out: up_write(&env->bpf_progs.lock); + return ret; } struct btf_node *perf_env__find_btf(struct perf_env *env, __u32 btf_id) diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h index db40906e29373..37028215d4a53 100644 --- a/tools/perf/util/env.h +++ b/tools/perf/util/env.h @@ -117,6 +117,6 @@ void perf_env__insert_bpf_prog_info(struct perf_env *env, struct bpf_prog_info_node *info_node); struct bpf_prog_info_node *perf_env__find_bpf_prog_info(struct perf_env *env, __u32 prog_id); -void perf_env__insert_btf(struct perf_env *env, struct btf_node *btf_node); +bool perf_env__insert_btf(struct perf_env *env, struct btf_node *btf_node); struct btf_node *perf_env__find_btf(struct perf_env *env, __u32 btf_id); #endif /* __PERF_ENV_H */ From dc8381880cf228b922280d81d98535c09646b1e8 Mon Sep 17 00:00:00 2001 From: Sohaib Mohamed Date: Fri, 12 Nov 2021 22:11:33 +0200 Subject: [PATCH 1735/5344] perf bench futex: Fix memory leak of perf_cpu_map__new() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 88e48238d53682281c9de2a0b65d24d3b64542a0 ] ASan reports memory leaks while running: $ sudo ./perf bench futex all The leaks are caused by perf_cpu_map__new not being freed. This patch adds the missing perf_cpu_map__put since it calls cpu_map_delete implicitly. Fixes: 9c3516d1b850ea93 ("libperf: Add perf_cpu_map__new()/perf_cpu_map__read() functions") Signed-off-by: Sohaib Mohamed Cc: Alexander Shishkin Cc: André Almeida Cc: Darren Hart Cc: Davidlohr Bueso Cc: Ian Rogers Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sohaib Mohamed Cc: Thomas Gleixner Link: http://lore.kernel.org/lkml/20211112201134.77892-1-sohaib.amhmd@gmail.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Sasha Levin --- tools/perf/bench/futex-lock-pi.c | 1 + tools/perf/bench/futex-requeue.c | 1 + tools/perf/bench/futex-wake-parallel.c | 1 + tools/perf/bench/futex-wake.c | 1 + 4 files changed, 4 insertions(+) diff --git a/tools/perf/bench/futex-lock-pi.c b/tools/perf/bench/futex-lock-pi.c index 30d97121dc4fb..c54013806ba4a 100644 --- a/tools/perf/bench/futex-lock-pi.c +++ b/tools/perf/bench/futex-lock-pi.c @@ -224,6 +224,7 @@ int bench_futex_lock_pi(int argc, const char **argv) print_summary(); free(worker); + perf_cpu_map__put(cpu); return ret; err: usage_with_options(bench_futex_lock_pi_usage, options); diff --git a/tools/perf/bench/futex-requeue.c b/tools/perf/bench/futex-requeue.c index a00a6891447ab..11b7dbd374864 100644 --- a/tools/perf/bench/futex-requeue.c +++ b/tools/perf/bench/futex-requeue.c @@ -215,6 +215,7 @@ int bench_futex_requeue(int argc, const char **argv) print_summary(); free(worker); + perf_cpu_map__put(cpu); return ret; err: usage_with_options(bench_futex_requeue_usage, options); diff --git a/tools/perf/bench/futex-wake-parallel.c b/tools/perf/bench/futex-wake-parallel.c index a053cf2b70397..c3033d0905db1 100644 --- a/tools/perf/bench/futex-wake-parallel.c +++ b/tools/perf/bench/futex-wake-parallel.c @@ -319,6 +319,7 @@ int bench_futex_wake_parallel(int argc, const char **argv) print_summary(); free(blocked_worker); + perf_cpu_map__put(cpu); return ret; } #endif /* HAVE_PTHREAD_BARRIER */ diff --git a/tools/perf/bench/futex-wake.c b/tools/perf/bench/futex-wake.c index 58906e9499bb0..ad44a78392dc4 100644 --- a/tools/perf/bench/futex-wake.c +++ b/tools/perf/bench/futex-wake.c @@ -209,5 +209,6 @@ int bench_futex_wake(int argc, const char **argv) print_summary(); free(worker); + perf_cpu_map__put(cpu); return ret; } From a872ed4177307855c4e9f1ab241656df32a261b9 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Sun, 14 Nov 2021 01:36:36 +0300 Subject: [PATCH 1736/5344] net: bnx2x: fix variable dereferenced before check [ Upstream commit f8885ac89ce310570e5391fe0bf0ec9c7c9b4fdc ] Smatch says: bnx2x_init_ops.h:640 bnx2x_ilt_client_mem_op() warn: variable dereferenced before check 'ilt' (see line 638) Move ilt_cli variable initialization _after_ ilt validation, because it's unsafe to deref the pointer before validation check. Fixes: 523224a3b3cd ("bnx2x, cnic, bnx2i: use new FW/HSI") Signed-off-by: Pavel Skripkin Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_init_ops.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_init_ops.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_init_ops.h index 1835d2e451c01..fc7fce642666c 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_init_ops.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_init_ops.h @@ -635,11 +635,13 @@ static int bnx2x_ilt_client_mem_op(struct bnx2x *bp, int cli_num, { int i, rc; struct bnx2x_ilt *ilt = BP_ILT(bp); - struct ilt_client_info *ilt_cli = &ilt->clients[cli_num]; + struct ilt_client_info *ilt_cli; if (!ilt || !ilt->lines) return -1; + ilt_cli = &ilt->clients[cli_num]; + if (ilt_cli->flags & (ILT_CLIENT_SKIP_INIT | ILT_CLIENT_SKIP_MEM)) return 0; From 35199b87027df15848a54c786d6710db7abf8e9a Mon Sep 17 00:00:00 2001 From: Nicholas Nunley Date: Fri, 4 Jun 2021 09:48:53 -0700 Subject: [PATCH 1737/5344] iavf: check for null in iavf_fix_features [ Upstream commit 8a4a126f4be88eb8b5f00a165ab58c35edf4ef76 ] If the driver has lost contact with the PF then it enters a disabled state and frees adapter->vf_res. However, ndo_fix_features can still be called on the interface, so we need to check for this condition first. Since we have no information on the features at this time simply leave them unmodified and return. Fixes: c4445aedfe09 ("i40evf: Fix VLAN features") Signed-off-by: Nicholas Nunley Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/iavf/iavf_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index bc46c262b42d8..d6a239ba0240e 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -3425,7 +3425,8 @@ static netdev_features_t iavf_fix_features(struct net_device *netdev, { struct iavf_adapter *adapter = netdev_priv(netdev); - if (!(adapter->vf_res->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_VLAN)) + if (adapter->vf_res && + !(adapter->vf_res->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_VLAN)) features &= ~(NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_CTAG_FILTER); From 154b0707af4a35936f2ec8acdaf4de634c429aed Mon Sep 17 00:00:00 2001 From: Nicholas Nunley Date: Fri, 4 Jun 2021 09:48:54 -0700 Subject: [PATCH 1738/5344] iavf: free q_vectors before queues in iavf_disable_vf [ Upstream commit 89f22f129696ab53cfbc608e0a2184d0fea46ac1 ] iavf_free_queues() clears adapter->num_active_queues, which iavf_free_q_vectors() relies on, so swap the order of these two function calls in iavf_disable_vf(). This resolves a panic encountered when the interface is disabled and then later brought up again after PF communication is restored. Fixes: 65c7006f234c ("i40evf: assign num_active_queues inside i40evf_alloc_queues") Signed-off-by: Nicholas Nunley Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/iavf/iavf_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index d6a239ba0240e..c7e365267bc0f 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -2057,8 +2057,8 @@ static void iavf_disable_vf(struct iavf_adapter *adapter) iavf_free_misc_irq(adapter); iavf_reset_interrupt_capability(adapter); - iavf_free_queues(adapter); iavf_free_q_vectors(adapter); + iavf_free_queues(adapter); memset(adapter->vf_res, 0, IAVF_VIRTCHNL_VF_RESOURCE_SIZE); iavf_shutdown_adminq(&adapter->hw); adapter->netdev->flags &= ~IFF_UP; From eaccb4c2bbd060c8792b54e0f2c2c20f3b1be9e0 Mon Sep 17 00:00:00 2001 From: Piotr Marczak Date: Fri, 4 Jun 2021 09:48:56 -0700 Subject: [PATCH 1739/5344] iavf: Fix failure to exit out from last all-multicast mode [ Upstream commit 8905072a192fffe9389255489db250c73ecab008 ] The driver could only quit allmulti when allmulti and promisc modes are turn on at the same time. If promisc had been off there was no way to turn off allmulti mode. The patch corrects this behavior. Switching allmulti does not depends on promisc state mode anymore Fixes: f42a5c74da99 ("i40e: Add allmulti support for the VF") Signed-off-by: Piotr Marczak Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/iavf/iavf_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index c7e365267bc0f..43afe887cac9e 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -1626,8 +1626,7 @@ static int iavf_process_aq_command(struct iavf_adapter *adapter) iavf_set_promiscuous(adapter, FLAG_VF_MULTICAST_PROMISC); return 0; } - - if ((adapter->aq_required & IAVF_FLAG_AQ_RELEASE_PROMISC) && + if ((adapter->aq_required & IAVF_FLAG_AQ_RELEASE_PROMISC) || (adapter->aq_required & IAVF_FLAG_AQ_RELEASE_ALLMULTI)) { iavf_set_promiscuous(adapter, 0); return 0; From 9b663e2a7e73f8c7360a938a03e3cf98f76e8ee5 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 4 Jun 2021 09:48:57 -0700 Subject: [PATCH 1740/5344] iavf: prevent accidental free of filter structure [ Upstream commit 4f0400803818f2642f066d3eacaf013f23554cc7 ] In iavf_config_clsflower, the filter structure could be accidentally released at the end, if iavf_parse_cls_flower or iavf_handle_tclass ever return a non-zero but positive value. In this case, the function continues through to the end, and will call kfree() on the filter structure even though it has been added to the linked list. This can actually happen because iavf_parse_cls_flower will return a positive IAVF_ERR_CONFIG value instead of the traditional negative error codes. Fix this by ensuring that the kfree() check and error checks are similar. Use the more idiomatic "if (err)" to catch all non-zero error codes. Fixes: 0075fa0fadd0 ("i40evf: Add support to apply cloud filters") Signed-off-by: Jacob Keller Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/iavf/iavf_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 43afe887cac9e..6f6cd013eef3e 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -3033,11 +3033,11 @@ static int iavf_configure_clsflower(struct iavf_adapter *adapter, /* start out with flow type and eth type IPv4 to begin with */ filter->f.flow_type = VIRTCHNL_TCP_V4_FLOW; err = iavf_parse_cls_flower(adapter, cls_flower, filter); - if (err < 0) + if (err) goto err; err = iavf_handle_tclass(adapter, tc, filter); - if (err < 0) + if (err) goto err; /* add filter to the list */ From e9f1a4be2be9fa8b4e9ce35b20c96ada59439c27 Mon Sep 17 00:00:00 2001 From: Mitch Williams Date: Fri, 4 Jun 2021 09:48:58 -0700 Subject: [PATCH 1741/5344] iavf: validate pointers [ Upstream commit 131b0edc4028bb88bb472456b1ddba526cfb7036 ] In some cases, the ethtool get_rxfh handler may be called with a null key or indir parameter. So check these pointers, or you will have a very bad day. Fixes: 43a3d9ba34c9 ("i40evf: Allow PF driver to configure RSS") Signed-off-by: Mitch Williams Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/iavf/iavf_ethtool.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c index 758bef02a2a86..ad1e796e5544a 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c +++ b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c @@ -962,14 +962,13 @@ static int iavf_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, if (hfunc) *hfunc = ETH_RSS_HASH_TOP; - if (!indir) - return 0; - - memcpy(key, adapter->rss_key, adapter->rss_key_size); + if (key) + memcpy(key, adapter->rss_key, adapter->rss_key_size); - /* Each 32 bits pointed by 'indir' is stored with a lut entry */ - for (i = 0; i < adapter->rss_lut_size; i++) - indir[i] = (u32)adapter->rss_lut[i]; + if (indir) + /* Each 32 bits pointed by 'indir' is stored with a lut entry */ + for (i = 0; i < adapter->rss_lut_size; i++) + indir[i] = (u32)adapter->rss_lut[i]; return 0; } From f9b5f3c0bde752a2d4f69c92f58787ac8c7f10ed Mon Sep 17 00:00:00 2001 From: Surabhi Boob Date: Fri, 4 Jun 2021 09:48:59 -0700 Subject: [PATCH 1742/5344] iavf: Fix for the false positive ASQ/ARQ errors while issuing VF reset [ Upstream commit 321421b57a12e933f92b228e0e6d0b2c6541f41d ] While issuing VF Reset from the guest OS, the VF driver prints logs about critical / Overflow error detection. This is not an actual error since the VF_MBX_ARQLEN register is set to all FF's for a short period of time and the VF would catch the bits set if it was reading the register during that spike of time. This patch introduces an additional check to ignore this condition since the VF is in reset. Fixes: 19b73d8efaa4 ("i40evf: Add additional check for reset") Signed-off-by: Surabhi Boob Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/iavf/iavf_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 6f6cd013eef3e..484c2a6f1625d 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -2341,7 +2341,7 @@ static void iavf_adminq_task(struct work_struct *work) /* check for error indications */ val = rd32(hw, hw->aq.arq.len); - if (val == 0xdeadbeef) /* indicates device in reset */ + if (val == 0xdeadbeef || val == 0xffffffff) /* device in reset */ goto freedom; oldval = val; if (val & IAVF_VF_ARQLEN1_ARQVFE_MASK) { From b97e3c842e79e3a1ce08f728a5aed2fb16919911 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 10 Nov 2021 23:28:24 +0000 Subject: [PATCH 1743/5344] MIPS: generic/yamon-dt: fix uninitialized variable error [ Upstream commit 255e51da15baed47531beefd02f222e4dc01f1c1 ] In the case where fw_getenv returns an error when fetching values for ememsizea and memsize then variable phys_memsize is not assigned a variable and will be uninitialized on a zero check of phys_memsize. Fix this by initializing phys_memsize to zero. Cleans up cppcheck error: arch/mips/generic/yamon-dt.c:100:7: error: Uninitialized variable: phys_memsize [uninitvar] Fixes: f41d2430bbd6 ("MIPS: generic/yamon-dt: Support > 256MB of RAM") Signed-off-by: Colin Ian King Signed-off-by: Thomas Bogendoerfer Signed-off-by: Sasha Levin --- arch/mips/generic/yamon-dt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/generic/yamon-dt.c b/arch/mips/generic/yamon-dt.c index a3aa22c77cadc..a07a5edbcda78 100644 --- a/arch/mips/generic/yamon-dt.c +++ b/arch/mips/generic/yamon-dt.c @@ -75,7 +75,7 @@ static unsigned int __init gen_fdt_mem_array( __init int yamon_dt_append_memory(void *fdt, const struct yamon_mem_region *regions) { - unsigned long phys_memsize, memsize; + unsigned long phys_memsize = 0, memsize; __be32 mem_array[2 * MAX_MEM_ARRAY_ENTRIES]; unsigned int mem_entries; int i, err, mem_off; From 5d14ce381a076e501bbd6d8efba90c0b37697034 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 14 Nov 2021 16:42:18 -0800 Subject: [PATCH 1744/5344] mips: bcm63xx: add support for clk_get_parent() [ Upstream commit e8f67482e5a4bc8d0b65d606d08cb60ee123b468 ] BCM63XX selects HAVE_LEGACY_CLK but does not provide/support clk_get_parent(), so add a simple implementation of that function so that callers of it will build without errors. Fixes these build errors: mips-linux-ld: drivers/iio/adc/ingenic-adc.o: in function `jz4770_adc_init_clk_div': ingenic-adc.c:(.text+0xe4): undefined reference to `clk_get_parent' mips-linux-ld: drivers/iio/adc/ingenic-adc.o: in function `jz4725b_adc_init_clk_div': ingenic-adc.c:(.text+0x1b8): undefined reference to `clk_get_parent' Fixes: e7300d04bd08 ("MIPS: BCM63xx: Add support for the Broadcom BCM63xx family of SOCs." ) Signed-off-by: Randy Dunlap Reported-by: kernel test robot Suggested-by: Russell King (Oracle) Cc: Artur Rojek Cc: Paul Cercueil Cc: linux-mips@vger.kernel.org Cc: Jonathan Cameron Cc: Lars-Peter Clausen Cc: linux-iio@vger.kernel.org Cc: Florian Fainelli Cc: Andy Shevchenko Cc: Russell King Cc: bcm-kernel-feedback-list@broadcom.com Cc: Jonas Gorski Reviewed-by: Andy Shevchenko Acked-by: Jonathan Cameron Acked-by: Russell King (Oracle) Acked-by: Florian Fainelli Signed-off-by: Thomas Bogendoerfer Signed-off-by: Sasha Levin --- arch/mips/bcm63xx/clk.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/mips/bcm63xx/clk.c b/arch/mips/bcm63xx/clk.c index 164115944a7fd..aba6e2d6a736c 100644 --- a/arch/mips/bcm63xx/clk.c +++ b/arch/mips/bcm63xx/clk.c @@ -381,6 +381,12 @@ void clk_disable(struct clk *clk) EXPORT_SYMBOL(clk_disable); +struct clk *clk_get_parent(struct clk *clk) +{ + return NULL; +} +EXPORT_SYMBOL(clk_get_parent); + unsigned long clk_get_rate(struct clk *clk) { if (!clk) From 65f47b19cdf39403af399bfdb78ca99a89058b12 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 14 Nov 2021 17:20:51 -0800 Subject: [PATCH 1745/5344] mips: lantiq: add support for clk_get_parent() [ Upstream commit fc1aabb088860d6cf9dd03612b7a6f0de91ccac2 ] Provide a simple implementation of clk_get_parent() in the lantiq subarch so that callers of it will build without errors. Fixes this build error: ERROR: modpost: "clk_get_parent" [drivers/iio/adc/ingenic-adc.ko] undefined! Fixes: 171bb2f19ed6 ("MIPS: Lantiq: Add initial support for Lantiq SoCs") Signed-off-by: Randy Dunlap Suggested-by: Russell King (Oracle) Cc: linux-mips@vger.kernel.org Cc: John Crispin Cc: Thomas Bogendoerfer Cc: Jonathan Cameron Cc: linux-iio@vger.kernel.org Cc: Russell King Cc: Andy Shevchenko Acked-by: Jonathan Cameron Acked-by: John Crispin Signed-off-by: Thomas Bogendoerfer Signed-off-by: Sasha Levin --- arch/mips/lantiq/clk.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/mips/lantiq/clk.c b/arch/mips/lantiq/clk.c index dd819e31fcbbf..4916cccf378fd 100644 --- a/arch/mips/lantiq/clk.c +++ b/arch/mips/lantiq/clk.c @@ -158,6 +158,12 @@ void clk_deactivate(struct clk *clk) } EXPORT_SYMBOL(clk_deactivate); +struct clk *clk_get_parent(struct clk *clk) +{ + return NULL; +} +EXPORT_SYMBOL(clk_get_parent); + static inline u32 get_counter_resolution(void) { u32 res; From bb4fede3e7f1683d3d76eb58364a80f23a6c95ef Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 7 Nov 2021 20:57:07 +0100 Subject: [PATCH 1746/5344] platform/x86: hp_accel: Fix an error handling path in 'lis3lv02d_probe()' [ Upstream commit c961a7d2aa23ae19e0099fbcdf1040fb760eea83 ] If 'led_classdev_register()' fails, some additional resources should be released. Add the missing 'i8042_remove_filter()' and 'lis3lv02d_remove_fs()' calls that are already in the remove function but are missing here. Fixes: a4c724d0723b ("platform: hp_accel: add a i8042 filter to remove HPQ6000 data from kb bus stream") Fixes: 9e0c79782143 ("lis3lv02d: merge with leds hp disk") Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/5a4f218f8f16d2e3a7906b7ca3654ffa946895f8.1636314074.git.christophe.jaillet@wanadoo.fr Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Signed-off-by: Sasha Levin --- drivers/platform/x86/hp_accel.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/platform/x86/hp_accel.c b/drivers/platform/x86/hp_accel.c index 8c0867bda8280..0dfaa1a43b674 100644 --- a/drivers/platform/x86/hp_accel.c +++ b/drivers/platform/x86/hp_accel.c @@ -372,9 +372,11 @@ static int lis3lv02d_add(struct acpi_device *device) INIT_WORK(&hpled_led.work, delayed_set_status_worker); ret = led_classdev_register(NULL, &hpled_led.led_classdev); if (ret) { + i8042_remove_filter(hp_accel_i8042_filter); lis3lv02d_joystick_disable(&lis3_dev); lis3lv02d_poweroff(&lis3_dev); flush_work(&hpled_led.work); + lis3lv02d_remove_fs(&lis3_dev); return ret; } From 6eb8bb686ebf1ad36f1f9b901f4e67b51565983c Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 5 Nov 2021 17:10:48 -0500 Subject: [PATCH 1747/5344] scsi: core: sysfs: Fix hang when device state is set via sysfs [ Upstream commit 4edd8cd4e86dd3047e5294bbefcc0a08f66a430f ] This fixes a regression added with: commit f0f82e2476f6 ("scsi: core: Fix capacity set to zero after offlinining device") The problem is that after iSCSI recovery, iscsid will call into the kernel to set the dev's state to running, and with that patch we now call scsi_rescan_device() with the state_mutex held. If the SCSI error handler thread is just starting to test the device in scsi_send_eh_cmnd() then it's going to try to grab the state_mutex. We are then stuck, because when scsi_rescan_device() tries to send its I/O scsi_queue_rq() calls -> scsi_host_queue_ready() -> scsi_host_in_recovery() which will return true (the host state is still in recovery) and I/O will just be requeued. scsi_send_eh_cmnd() will then never be able to grab the state_mutex to finish error handling. To prevent the deadlock move the rescan-related code to after we drop the state_mutex. This also adds a check for if we are already in the running state. This prevents extra scans and helps the iscsid case where if the transport class has already onlined the device during its recovery process then we don't need userspace to do it again plus possibly block that daemon. Link: https://lore.kernel.org/r/20211105221048.6541-3-michael.christie@oracle.com Fixes: f0f82e2476f6 ("scsi: core: Fix capacity set to zero after offlinining device") Cc: Bart Van Assche Cc: lijinlin Cc: Wu Bo Reviewed-by: Lee Duncan Reviewed-by: Wu Bo Signed-off-by: Mike Christie Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/scsi_sysfs.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c index 12064ce76777d..16432d42a50aa 100644 --- a/drivers/scsi/scsi_sysfs.c +++ b/drivers/scsi/scsi_sysfs.c @@ -776,6 +776,7 @@ store_state_field(struct device *dev, struct device_attribute *attr, int i, ret; struct scsi_device *sdev = to_scsi_device(dev); enum scsi_device_state state = 0; + bool rescan_dev = false; for (i = 0; i < ARRAY_SIZE(sdev_states); i++) { const int len = strlen(sdev_states[i].name); @@ -794,20 +795,27 @@ store_state_field(struct device *dev, struct device_attribute *attr, } mutex_lock(&sdev->state_mutex); - ret = scsi_device_set_state(sdev, state); - /* - * If the device state changes to SDEV_RUNNING, we need to - * run the queue to avoid I/O hang, and rescan the device - * to revalidate it. Running the queue first is necessary - * because another thread may be waiting inside - * blk_mq_freeze_queue_wait() and because that call may be - * waiting for pending I/O to finish. - */ - if (ret == 0 && state == SDEV_RUNNING) { + if (sdev->sdev_state == SDEV_RUNNING && state == SDEV_RUNNING) { + ret = count; + } else { + ret = scsi_device_set_state(sdev, state); + if (ret == 0 && state == SDEV_RUNNING) + rescan_dev = true; + } + mutex_unlock(&sdev->state_mutex); + + if (rescan_dev) { + /* + * If the device state changes to SDEV_RUNNING, we need to + * run the queue to avoid I/O hang, and rescan the device + * to revalidate it. Running the queue first is necessary + * because another thread may be waiting inside + * blk_mq_freeze_queue_wait() and because that call may be + * waiting for pending I/O to finish. + */ blk_mq_run_hw_queues(sdev->request_queue, true); scsi_rescan_device(dev); } - mutex_unlock(&sdev->state_mutex); return ret == 0 ? count : -EINVAL; } From dd49bebbf03dc03310ee703002cebb408012bf6c Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 12 Nov 2021 11:33:11 -0500 Subject: [PATCH 1748/5344] net: sched: act_mirred: drop dst for the direction from egress to ingress [ Upstream commit f799ada6bf2397c351220088b9b0980125c77280 ] Without dropping dst, the packets sent from local mirred/redirected to ingress will may still use the old dst. ip_rcv() will drop it as the old dst is for output and its .input is dst_discard. This patch is to fix by also dropping dst for those packets that are mirred or redirected from egress to ingress in act_mirred. Note that we don't drop it for the direction change from ingress to egress, as on which there might be a user case attaching a metadata dst by act_tunnel_key that would be used later. Fixes: b57dc7c13ea9 ("net/sched: Introduce action ct") Signed-off-by: Xin Long Acked-by: Cong Wang Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/sched/act_mirred.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index e3ff884a48c56..b87d2a1ee0b16 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -218,6 +219,7 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, bool want_ingress; bool is_redirect; bool expects_nh; + bool at_ingress; int m_eaction; int mac_len; bool at_nh; @@ -253,7 +255,8 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, * ingress - that covers the TC S/W datapath. */ is_redirect = tcf_mirred_is_act_redirect(m_eaction); - use_reinsert = skb_at_tc_ingress(skb) && is_redirect && + at_ingress = skb_at_tc_ingress(skb); + use_reinsert = at_ingress && is_redirect && tcf_mirred_can_reinsert(retval); if (!use_reinsert) { skb2 = skb_clone(skb, GFP_ATOMIC); @@ -261,10 +264,12 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, goto out; } + want_ingress = tcf_mirred_act_wants_ingress(m_eaction); + /* All mirred/redirected skbs should clear previous ct info */ nf_reset_ct(skb2); - - want_ingress = tcf_mirred_act_wants_ingress(m_eaction); + if (want_ingress && !at_ingress) /* drop dst for egress -> ingress */ + skb_dst_drop(skb2); expects_nh = want_ingress || !m_mac_header_xmit; at_nh = skb->data == skb_network_header(skb); From b7f43a82ec1ce86091994dd7e22e2e3fc8148a10 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Tue, 16 Nov 2021 18:17:12 +0300 Subject: [PATCH 1749/5344] net: dpaa2-eth: fix use-after-free in dpaa2_eth_remove [ Upstream commit 9b5a333272a48c2f8b30add7a874e46e8b26129c ] Access to netdev after free_netdev() will cause use-after-free bug. Move debug log before free_netdev() call to avoid it. Fixes: 7472dd9f6499 ("staging: fsl-dpaa2/eth: Move print message") Signed-off-by: Pavel Skripkin Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 7af7cc7c8669a..34540e604f748 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -3616,10 +3616,10 @@ static int dpaa2_eth_remove(struct fsl_mc_device *ls_dev) fsl_mc_portal_free(priv->mc_io); - free_netdev(net_dev); - dev_dbg(net_dev->dev.parent, "Removed interface %s\n", net_dev->name); + free_netdev(net_dev); + return 0; } From a2e58f98f959330bce5a2ee953c3a53732530d6d Mon Sep 17 00:00:00 2001 From: Jonathan Davies Date: Tue, 16 Nov 2021 17:42:42 +0000 Subject: [PATCH 1750/5344] net: virtio_net_hdr_to_skb: count transport header in UFO [ Upstream commit cf9acc90c80ecbee00334aa85d92f4e74014bcff ] virtio_net_hdr_to_skb does not set the skb's gso_size and gso_type correctly for UFO packets received via virtio-net that are a little over the GSO size. This can lead to problems elsewhere in the networking stack, e.g. ovs_vport_send dropping over-sized packets if gso_size is not set. This is due to the comparison if (skb->len - p_off > gso_size) not properly accounting for the transport layer header. p_off includes the size of the transport layer header (thlen), so skb->len - p_off is the size of the TCP/UDP payload. gso_size is read from the virtio-net header. For UFO, fragmentation happens at the IP level so does not need to include the UDP header. Hence the calculation could be comparing a TCP/UDP payload length with an IP payload length, causing legitimate virtio-net packets to have lack gso_type/gso_size information. Example: a UDP packet with payload size 1473 has IP payload size 1481. If the guest used UFO, it is not fragmented and the virtio-net header's flags indicate that it is a GSO frame (VIRTIO_NET_HDR_GSO_UDP), with gso_size = 1480 for an MTU of 1500. skb->len will be 1515 and p_off will be 42, so skb->len - p_off = 1473. Hence the comparison fails, and shinfo->gso_size and gso_type are not set as they should be. Instead, add the UDP header length before comparing to gso_size when using UFO. In this way, it is the size of the IP payload that is compared to gso_size. Fixes: 6dd912f82680 ("net: check untrusted gso_size at kernel entry") Signed-off-by: Jonathan Davies Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/linux/virtio_net.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 8ca4a776b4c6a..22dd48c825600 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -138,10 +138,15 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { u16 gso_size = __virtio16_to_cpu(little_endian, hdr->gso_size); + unsigned int nh_off = p_off; struct skb_shared_info *shinfo = skb_shinfo(skb); + /* UFO may not include transport header in gso_size. */ + if (gso_type & SKB_GSO_UDP) + nh_off -= thlen; + /* Too small packets are not really GSO ones. */ - if (skb->len - p_off > gso_size) { + if (skb->len - nh_off > gso_size) { shinfo->gso_size = gso_size; shinfo->gso_type = gso_type; From b2d9810f86db0756b9f4904630aa2aef99fb688e Mon Sep 17 00:00:00 2001 From: Eryk Rybak Date: Thu, 21 Jan 2021 16:17:22 +0000 Subject: [PATCH 1751/5344] i40e: Fix correct max_pkt_size on VF RX queue [ Upstream commit 6afbd7b3c53cb7417189f476e99d431daccb85b0 ] Setting VLAN port increasing RX queue max_pkt_size by 4 bytes to take VLAN tag into account. Trigger the VF reset when setting port VLAN for VF to renegotiate its capabilities and reinitialize. Fixes: ba4e003d29c1 ("i40e: don't hold spinlock while resetting VF") Signed-off-by: Sylwester Dziedziuch Signed-off-by: Aleksandr Loktionov Signed-off-by: Eryk Rybak Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- .../ethernet/intel/i40e/i40e_virtchnl_pf.c | 53 ++++--------------- 1 file changed, 9 insertions(+), 44 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index e561073054865..16641c19b7f73 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -621,14 +621,13 @@ static int i40e_config_vsi_rx_queue(struct i40e_vf *vf, u16 vsi_id, u16 vsi_queue_id, struct virtchnl_rxq_info *info) { + u16 pf_queue_id = i40e_vc_get_pf_queue_id(vf, vsi_id, vsi_queue_id); struct i40e_pf *pf = vf->pf; + struct i40e_vsi *vsi = pf->vsi[vf->lan_vsi_idx]; struct i40e_hw *hw = &pf->hw; struct i40e_hmc_obj_rxq rx_ctx; - u16 pf_queue_id; int ret = 0; - pf_queue_id = i40e_vc_get_pf_queue_id(vf, vsi_id, vsi_queue_id); - /* clear the context structure first */ memset(&rx_ctx, 0, sizeof(struct i40e_hmc_obj_rxq)); @@ -666,6 +665,10 @@ static int i40e_config_vsi_rx_queue(struct i40e_vf *vf, u16 vsi_id, } rx_ctx.rxmax = info->max_pkt_size; + /* if port VLAN is configured increase the max packet size */ + if (vsi->info.pvid) + rx_ctx.rxmax += VLAN_HLEN; + /* enable 32bytes desc always */ rx_ctx.dsize = 1; @@ -4050,34 +4053,6 @@ int i40e_ndo_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac) return ret; } -/** - * i40e_vsi_has_vlans - True if VSI has configured VLANs - * @vsi: pointer to the vsi - * - * Check if a VSI has configured any VLANs. False if we have a port VLAN or if - * we have no configured VLANs. Do not call while holding the - * mac_filter_hash_lock. - */ -static bool i40e_vsi_has_vlans(struct i40e_vsi *vsi) -{ - bool have_vlans; - - /* If we have a port VLAN, then the VSI cannot have any VLANs - * configured, as all MAC/VLAN filters will be assigned to the PVID. - */ - if (vsi->info.pvid) - return false; - - /* Since we don't have a PVID, we know that if the device is in VLAN - * mode it must be because of a VLAN filter configured on this VSI. - */ - spin_lock_bh(&vsi->mac_filter_hash_lock); - have_vlans = i40e_is_vsi_in_vlan(vsi); - spin_unlock_bh(&vsi->mac_filter_hash_lock); - - return have_vlans; -} - /** * i40e_ndo_set_vf_port_vlan * @netdev: network interface device structure @@ -4134,19 +4109,9 @@ int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, int vf_id, /* duplicate request, so just return success */ goto error_pvid; - if (i40e_vsi_has_vlans(vsi)) { - dev_err(&pf->pdev->dev, - "VF %d has already configured VLAN filters and the administrator is requesting a port VLAN override.\nPlease unload and reload the VF driver for this change to take effect.\n", - vf_id); - /* Administrator Error - knock the VF offline until he does - * the right thing by reconfiguring his network correctly - * and then reloading the VF driver. - */ - i40e_vc_disable_vf(vf); - /* During reset the VF got a new VSI, so refresh the pointer. */ - vsi = pf->vsi[vf->lan_vsi_idx]; - } - + i40e_vc_disable_vf(vf); + /* During reset the VF got a new VSI, so refresh a pointer. */ + vsi = pf->vsi[vf->lan_vsi_idx]; /* Locked once because multiple functions below iterate list */ spin_lock_bh(&vsi->mac_filter_hash_lock); From 7c1b2b98f443ae0ca84e011a32d14ea020e3b0f9 Mon Sep 17 00:00:00 2001 From: Michal Maloszewski Date: Wed, 24 Feb 2021 12:07:48 +0000 Subject: [PATCH 1752/5344] i40e: Fix NULL ptr dereference on VSI filter sync [ Upstream commit 37d9e304acd903a445df8208b8a13d707902dea6 ] Remove the reason of null pointer dereference in sync VSI filters. Added new I40E_VSI_RELEASING flag to signalize deleting and releasing of VSI resources to sync this thread with sync filters subtask. Without this patch it is possible to start update the VSI filter list after VSI is removed, that's causing a kernel oops. Fixes: 41c445ff0f48 ("i40e: main driver core") Signed-off-by: Grzegorz Szczurek Signed-off-by: Michal Maloszewski Reviewed-by: Przemyslaw Patynowski Reviewed-by: Witold Fijalkowski Reviewed-by: Jaroslaw Gawin Reviewed-by: Aleksandr Loktionov Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/i40e/i40e.h | 1 + drivers/net/ethernet/intel/i40e/i40e_main.c | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index e571c6116c4b7..e7e61b9a75ecd 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -169,6 +169,7 @@ enum i40e_vsi_state_t { __I40E_VSI_OVERFLOW_PROMISC, __I40E_VSI_REINIT_REQUESTED, __I40E_VSI_DOWN_REQUESTED, + __I40E_VSI_RELEASING, /* This must be last as it determines the size of the BITMAP */ __I40E_VSI_STATE_SIZE__, }; diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 06fcd41495548..5af2d18763aaf 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -2609,7 +2609,8 @@ static void i40e_sync_filters_subtask(struct i40e_pf *pf) for (v = 0; v < pf->num_alloc_vsi; v++) { if (pf->vsi[v] && - (pf->vsi[v]->flags & I40E_VSI_FLAG_FILTER_CHANGED)) { + (pf->vsi[v]->flags & I40E_VSI_FLAG_FILTER_CHANGED) && + !test_bit(__I40E_VSI_RELEASING, pf->vsi[v]->state)) { int ret = i40e_sync_vsi_filters(pf->vsi[v]); if (ret) { @@ -13389,7 +13390,7 @@ int i40e_vsi_release(struct i40e_vsi *vsi) dev_info(&pf->pdev->dev, "Can't remove PF VSI\n"); return -ENODEV; } - + set_bit(__I40E_VSI_RELEASING, vsi->state); uplink_seid = vsi->uplink_seid; if (vsi->type != I40E_VSI_SRIOV) { if (vsi->netdev_registered) { From a0696b5fd3ea6d9c1b0d3779337344c49f875870 Mon Sep 17 00:00:00 2001 From: Eryk Rybak Date: Fri, 23 Apr 2021 13:43:25 +0200 Subject: [PATCH 1753/5344] i40e: Fix changing previously set num_queue_pairs for PFs [ Upstream commit d2a69fefd75683004ffe87166de5635b3267ee07 ] Currently, the i40e_vsi_setup_queue_map is basing the count of queues in TCs on a VSI's alloc_queue_pairs member which is not changed throughout any user's action (for example via ethtool's set_channels callback). This implies that vsi->tc_config.tc_info[n].qcount value that is given to the kernel via netdev_set_tc_queue() that notifies about the count of queues per particular traffic class is constant even if user has changed the total count of queues. This in turn caused the kernel warning after setting the queue count to the lower value than the initial one: $ ethtool -l ens801f0 Channel parameters for ens801f0: Pre-set maximums: RX: 0 TX: 0 Other: 1 Combined: 64 Current hardware settings: RX: 0 TX: 0 Other: 1 Combined: 64 $ ethtool -L ens801f0 combined 40 [dmesg] Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled! Reason was that vsi->alloc_queue_pairs stayed at 64 value which was used to set the qcount on TC0 (by default only TC0 exists so all of the existing queues are assigned to TC0). we update the offset/qcount via netdev_set_tc_queue() back to the old value but then the netif_set_real_num_tx_queues() is using the vsi->num_queue_pairs as a value which got set to 40. Fix it by using vsi->req_queue_pairs as a queue count that will be distributed across TCs. Do it only for non-zero values, which implies that user actually requested the new count of queues. For VSIs other than main, stay with the vsi->alloc_queue_pairs as we only allow manipulating the queue count on main VSI. Fixes: bc6d33c8d93f ("i40e: Fix the number of queues available to be mapped for use") Co-developed-by: Maciej Fijalkowski Signed-off-by: Maciej Fijalkowski Co-developed-by: Przemyslaw Patynowski Signed-off-by: Przemyslaw Patynowski Signed-off-by: Eryk Rybak Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/i40e/i40e_main.c | 35 ++++++++++++++------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 5af2d18763aaf..2895d8912c85f 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -1776,6 +1776,7 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi, bool is_add) { struct i40e_pf *pf = vsi->back; + u16 num_tc_qps = 0; u16 sections = 0; u8 netdev_tc = 0; u16 numtc = 1; @@ -1783,13 +1784,29 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi, u8 offset; u16 qmap; int i; - u16 num_tc_qps = 0; sections = I40E_AQ_VSI_PROP_QUEUE_MAP_VALID; offset = 0; + if (vsi->type == I40E_VSI_MAIN) { + /* This code helps add more queue to the VSI if we have + * more cores than RSS can support, the higher cores will + * be served by ATR or other filters. Furthermore, the + * non-zero req_queue_pairs says that user requested a new + * queue count via ethtool's set_channels, so use this + * value for queues distribution across traffic classes + */ + if (vsi->req_queue_pairs > 0) + vsi->num_queue_pairs = vsi->req_queue_pairs; + else if (pf->flags & I40E_FLAG_MSIX_ENABLED) + vsi->num_queue_pairs = pf->num_lan_msix; + } + /* Number of queues per enabled TC */ - num_tc_qps = vsi->alloc_queue_pairs; + if (vsi->type == I40E_VSI_MAIN) + num_tc_qps = vsi->num_queue_pairs; + else + num_tc_qps = vsi->alloc_queue_pairs; if (enabled_tc && (vsi->back->flags & I40E_FLAG_DCB_ENABLED)) { /* Find numtc from enabled TC bitmap */ for (i = 0, numtc = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) { @@ -1867,16 +1884,10 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi, } ctxt->info.tc_mapping[i] = cpu_to_le16(qmap); } - - /* Set actual Tx/Rx queue pairs */ - vsi->num_queue_pairs = offset; - if ((vsi->type == I40E_VSI_MAIN) && (numtc == 1)) { - if (vsi->req_queue_pairs > 0) - vsi->num_queue_pairs = vsi->req_queue_pairs; - else if (pf->flags & I40E_FLAG_MSIX_ENABLED) - vsi->num_queue_pairs = pf->num_lan_msix; - } - + /* Do not change previously set num_queue_pairs for PFs */ + if ((vsi->type == I40E_VSI_MAIN && numtc != 1) || + vsi->type != I40E_VSI_MAIN) + vsi->num_queue_pairs = offset; /* Scheduler section valid can only be set for ADD VSI */ if (is_add) { sections |= I40E_AQ_VSI_PROP_SCHED_VALID; From 2e535dd0883fd7967ecda9765be1439885198654 Mon Sep 17 00:00:00 2001 From: Eryk Rybak Date: Fri, 23 Apr 2021 13:43:26 +0200 Subject: [PATCH 1754/5344] i40e: Fix ping is lost after configuring ADq on VF [ Upstream commit 9e0a603cb7dce2a19d98116d42de84b6db26d716 ] Properly reconfigure VF VSIs after VF request ADQ. Created new function to update queue mapping and queue pairs per TC with AQ update VSI. This sets proper RSS size on NIC. VFs num_queue_pairs should not be changed during setup of queue maps. Previously, VF main VSI in ADQ had configured too many queues and had wrong RSS size, which lead to packets not being consumed and drops in connectivity. Fixes: bc6d33c8d93f ("i40e: Fix the number of queues available to be mapped for use") Co-developed-by: Przemyslaw Patynowski Signed-off-by: Przemyslaw Patynowski Signed-off-by: Eryk Rybak Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/i40e/i40e.h | 1 + drivers/net/ethernet/intel/i40e/i40e_main.c | 64 ++++++++++++++++++- .../ethernet/intel/i40e/i40e_virtchnl_pf.c | 17 +++-- 3 files changed, 74 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index e7e61b9a75ecd..f8422dbfd54e6 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -1147,6 +1147,7 @@ void i40e_ptp_save_hw_time(struct i40e_pf *pf); void i40e_ptp_restore_hw_time(struct i40e_pf *pf); void i40e_ptp_init(struct i40e_pf *pf); void i40e_ptp_stop(struct i40e_pf *pf); +int i40e_update_adq_vsi_queues(struct i40e_vsi *vsi, int vsi_offset); int i40e_is_vsi_uplink_mode_veb(struct i40e_vsi *vsi); i40e_status i40e_get_partition_bw_setting(struct i40e_pf *pf); i40e_status i40e_set_partition_bw_setting(struct i40e_pf *pf); diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 2895d8912c85f..908b5bfed6f01 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -1787,6 +1787,8 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi, sections = I40E_AQ_VSI_PROP_QUEUE_MAP_VALID; offset = 0; + /* zero out queue mapping, it will get updated on the end of the function */ + memset(ctxt->info.queue_mapping, 0, sizeof(ctxt->info.queue_mapping)); if (vsi->type == I40E_VSI_MAIN) { /* This code helps add more queue to the VSI if we have @@ -1803,10 +1805,12 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi, } /* Number of queues per enabled TC */ - if (vsi->type == I40E_VSI_MAIN) + if (vsi->type == I40E_VSI_MAIN || + (vsi->type == I40E_VSI_SRIOV && vsi->num_queue_pairs != 0)) num_tc_qps = vsi->num_queue_pairs; else num_tc_qps = vsi->alloc_queue_pairs; + if (enabled_tc && (vsi->back->flags & I40E_FLAG_DCB_ENABLED)) { /* Find numtc from enabled TC bitmap */ for (i = 0, numtc = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) { @@ -1884,10 +1888,12 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi, } ctxt->info.tc_mapping[i] = cpu_to_le16(qmap); } - /* Do not change previously set num_queue_pairs for PFs */ + /* Do not change previously set num_queue_pairs for PFs and VFs*/ if ((vsi->type == I40E_VSI_MAIN && numtc != 1) || - vsi->type != I40E_VSI_MAIN) + (vsi->type == I40E_VSI_SRIOV && vsi->num_queue_pairs == 0) || + (vsi->type != I40E_VSI_MAIN && vsi->type != I40E_VSI_SRIOV)) vsi->num_queue_pairs = offset; + /* Scheduler section valid can only be set for ADD VSI */ if (is_add) { sections |= I40E_AQ_VSI_PROP_SCHED_VALID; @@ -5383,6 +5389,58 @@ static void i40e_vsi_update_queue_map(struct i40e_vsi *vsi, sizeof(vsi->info.tc_mapping)); } +/** + * i40e_update_adq_vsi_queues - update queue mapping for ADq VSI + * @vsi: the VSI being reconfigured + * @vsi_offset: offset from main VF VSI + */ +int i40e_update_adq_vsi_queues(struct i40e_vsi *vsi, int vsi_offset) +{ + struct i40e_vsi_context ctxt = {}; + struct i40e_pf *pf; + struct i40e_hw *hw; + int ret; + + if (!vsi) + return I40E_ERR_PARAM; + pf = vsi->back; + hw = &pf->hw; + + ctxt.seid = vsi->seid; + ctxt.pf_num = hw->pf_id; + ctxt.vf_num = vsi->vf_id + hw->func_caps.vf_base_id + vsi_offset; + ctxt.uplink_seid = vsi->uplink_seid; + ctxt.connection_type = I40E_AQ_VSI_CONN_TYPE_NORMAL; + ctxt.flags = I40E_AQ_VSI_TYPE_VF; + ctxt.info = vsi->info; + + i40e_vsi_setup_queue_map(vsi, &ctxt, vsi->tc_config.enabled_tc, + false); + if (vsi->reconfig_rss) { + vsi->rss_size = min_t(int, pf->alloc_rss_size, + vsi->num_queue_pairs); + ret = i40e_vsi_config_rss(vsi); + if (ret) { + dev_info(&pf->pdev->dev, "Failed to reconfig rss for num_queues\n"); + return ret; + } + vsi->reconfig_rss = false; + } + + ret = i40e_aq_update_vsi_params(hw, &ctxt, NULL); + if (ret) { + dev_info(&pf->pdev->dev, "Update vsi config failed, err %s aq_err %s\n", + i40e_stat_str(hw, ret), + i40e_aq_str(hw, hw->aq.asq_last_status)); + return ret; + } + /* update the local VSI info with updated queue map */ + i40e_vsi_update_queue_map(vsi, &ctxt); + vsi->info.valid_sections = 0; + + return ret; +} + /** * i40e_vsi_config_tc - Configure VSI Tx Scheduler for given TC map * @vsi: VSI to be configured diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 16641c19b7f73..6e61aea42a0dd 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -2100,11 +2100,12 @@ static int i40e_vc_config_queues_msg(struct i40e_vf *vf, u8 *msg) struct virtchnl_vsi_queue_config_info *qci = (struct virtchnl_vsi_queue_config_info *)msg; struct virtchnl_queue_pair_info *qpi; - struct i40e_pf *pf = vf->pf; u16 vsi_id, vsi_queue_id = 0; - u16 num_qps_all = 0; + struct i40e_pf *pf = vf->pf; i40e_status aq_ret = 0; int i, j = 0, idx = 0; + struct i40e_vsi *vsi; + u16 num_qps_all = 0; if (!test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states)) { aq_ret = I40E_ERR_PARAM; @@ -2193,9 +2194,15 @@ static int i40e_vc_config_queues_msg(struct i40e_vf *vf, u8 *msg) pf->vsi[vf->lan_vsi_idx]->num_queue_pairs = qci->num_queue_pairs; } else { - for (i = 0; i < vf->num_tc; i++) - pf->vsi[vf->ch[i].vsi_idx]->num_queue_pairs = - vf->ch[i].num_qps; + for (i = 0; i < vf->num_tc; i++) { + vsi = pf->vsi[vf->ch[i].vsi_idx]; + vsi->num_queue_pairs = vf->ch[i].num_qps; + + if (i40e_update_adq_vsi_queues(vsi, i)) { + aq_ret = I40E_ERR_CONFIG; + goto error_param; + } + } } error_param: From cb5f5a475c231233d568a389048e2ee806564d0e Mon Sep 17 00:00:00 2001 From: Jedrzej Jagielski Date: Mon, 21 Jun 2021 08:37:31 +0000 Subject: [PATCH 1755/5344] i40e: Fix creation of first queue by omitting it if is not power of two [ Upstream commit 2e6d218c1ec6fb9cd70693b78134cbc35ae0b5a9 ] Reject TCs creation with proper message if the first queue assignment is not equal to the power of two. The first queue number was checked too late in the second queue iteration, if second queue was configured at all. Now if first queue value is not a power of two, then trying to create qdisc will be rejected. Fixes: 8f88b3034db3 ("i40e: Add infrastructure for queue channel support") Signed-off-by: Grzegorz Szczurek Signed-off-by: Jedrzej Jagielski Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/i40e/i40e_main.c | 59 +++++++-------------- 1 file changed, 19 insertions(+), 40 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 908b5bfed6f01..3045661b71ad8 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -5731,24 +5731,6 @@ static void i40e_remove_queue_channels(struct i40e_vsi *vsi) INIT_LIST_HEAD(&vsi->ch_list); } -/** - * i40e_is_any_channel - channel exist or not - * @vsi: ptr to VSI to which channels are associated with - * - * Returns true or false if channel(s) exist for associated VSI or not - **/ -static bool i40e_is_any_channel(struct i40e_vsi *vsi) -{ - struct i40e_channel *ch, *ch_tmp; - - list_for_each_entry_safe(ch, ch_tmp, &vsi->ch_list, list) { - if (ch->initialized) - return true; - } - - return false; -} - /** * i40e_get_max_queues_for_channel * @vsi: ptr to VSI to which channels are associated with @@ -6256,26 +6238,15 @@ int i40e_create_queue_channel(struct i40e_vsi *vsi, /* By default we are in VEPA mode, if this is the first VF/VMDq * VSI to be added switch to VEB mode. */ - if ((!(pf->flags & I40E_FLAG_VEB_MODE_ENABLED)) || - (!i40e_is_any_channel(vsi))) { - if (!is_power_of_2(vsi->tc_config.tc_info[0].qcount)) { - dev_dbg(&pf->pdev->dev, - "Failed to create channel. Override queues (%u) not power of 2\n", - vsi->tc_config.tc_info[0].qcount); - return -EINVAL; - } - if (!(pf->flags & I40E_FLAG_VEB_MODE_ENABLED)) { - pf->flags |= I40E_FLAG_VEB_MODE_ENABLED; + if (!(pf->flags & I40E_FLAG_VEB_MODE_ENABLED)) { + pf->flags |= I40E_FLAG_VEB_MODE_ENABLED; - if (vsi->type == I40E_VSI_MAIN) { - if (pf->flags & I40E_FLAG_TC_MQPRIO) - i40e_do_reset(pf, I40E_PF_RESET_FLAG, - true); - else - i40e_do_reset_safe(pf, - I40E_PF_RESET_FLAG); - } + if (vsi->type == I40E_VSI_MAIN) { + if (pf->flags & I40E_FLAG_TC_MQPRIO) + i40e_do_reset(pf, I40E_PF_RESET_FLAG, true); + else + i40e_do_reset_safe(pf, I40E_PF_RESET_FLAG); } /* now onwards for main VSI, number of queues will be value * of TC0's queue count @@ -7567,12 +7538,20 @@ static int i40e_setup_tc(struct net_device *netdev, void *type_data) vsi->seid); need_reset = true; goto exit; - } else { - dev_info(&vsi->back->pdev->dev, - "Setup channel (id:%u) utilizing num_queues %d\n", - vsi->seid, vsi->tc_config.tc_info[0].qcount); + } else if (enabled_tc && + (!is_power_of_2(vsi->tc_config.tc_info[0].qcount))) { + netdev_info(netdev, + "Failed to create channel. Override queues (%u) not power of 2\n", + vsi->tc_config.tc_info[0].qcount); + ret = -EINVAL; + need_reset = true; + goto exit; } + dev_info(&vsi->back->pdev->dev, + "Setup channel (id:%u) utilizing num_queues %d\n", + vsi->seid, vsi->tc_config.tc_info[0].qcount); + if (pf->flags & I40E_FLAG_TC_MQPRIO) { if (vsi->mqprio_qopt.max_rate[0]) { u64 max_tx_rate = vsi->mqprio_qopt.max_rate[0]; From 20b33629cd26f552bf641616e0705ddcbdc930be Mon Sep 17 00:00:00 2001 From: Grzegorz Szczurek Date: Fri, 29 Oct 2021 11:26:01 +0200 Subject: [PATCH 1756/5344] i40e: Fix display error code in dmesg [ Upstream commit 5aff430d4e33a0b48a6b3d5beb06f79da23f9916 ] Fix misleading display error in dmesg if tc filter return fail. Only i40e status error code should be converted to string, not linux error code. Otherwise, we return false information about the error. Fixes: 2f4b411a3d67 ("i40e: Enable cloud filters via tc-flower") Signed-off-by: Grzegorz Szczurek Signed-off-by: Mateusz Palczewski Tested-by: Dave Switzer Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/i40e/i40e_main.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 3045661b71ad8..f373640a9adcb 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -8116,9 +8116,8 @@ static int i40e_configure_clsflower(struct i40e_vsi *vsi, err = i40e_add_del_cloud_filter(vsi, filter, true); if (err) { - dev_err(&pf->pdev->dev, - "Failed to add cloud filter, err %s\n", - i40e_stat_str(&pf->hw, err)); + dev_err(&pf->pdev->dev, "Failed to add cloud filter, err %d\n", + err); goto err; } From 817fed8aff4c6a14b7c9a467901d344c6da59f19 Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Mon, 15 Nov 2021 22:56:00 +0800 Subject: [PATCH 1757/5344] NFC: reorganize the functions in nci_request [ Upstream commit 86cdf8e38792545161dbe3350a7eced558ba4d15 ] There is a possible data race as shown below: thread-A in nci_request() | thread-B in nci_close_device() | mutex_lock(&ndev->req_lock); test_bit(NCI_UP, &ndev->flags); | ... | test_and_clear_bit(NCI_UP, &ndev->flags) mutex_lock(&ndev->req_lock); | | This race will allow __nci_request() to be awaked while the device is getting removed. Similar to commit e2cb6b891ad2 ("bluetooth: eliminate the potential race condition when removing the HCI controller"). this patch alters the function sequence in nci_request() to prevent the data races between the nci_close_device(). Signed-off-by: Lin Ma Fixes: 6a2968aaf50c ("NFC: basic NCI protocol implementation") Link: https://lore.kernel.org/r/20211115145600.8320-1-linma@zju.edu.cn Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/nfc/nci/core.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 6a34a0a786eaa..1d0aa9e6044bf 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -144,12 +144,15 @@ inline int nci_request(struct nci_dev *ndev, { int rc; - if (!test_bit(NCI_UP, &ndev->flags)) - return -ENETDOWN; - /* Serialize all requests */ mutex_lock(&ndev->req_lock); - rc = __nci_request(ndev, req, opt, timeout); + /* check the state after obtaing the lock against any races + * from nci_close_device when the device gets removed. + */ + if (test_bit(NCI_UP, &ndev->flags)) + rc = __nci_request(ndev, req, opt, timeout); + else + rc = -ENETDOWN; mutex_unlock(&ndev->req_lock); return rc; From 49865565765cbc4b599654f787f2ef913060d4fd Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Wed, 10 Nov 2021 16:36:04 +0100 Subject: [PATCH 1758/5344] drm/nouveau: hdmigv100.c: fix corrupted HDMI Vendor InfoFrame [ Upstream commit 3cc1ae1fa70ab369e4645e38ce335a19438093ad ] gv100_hdmi_ctrl() writes vendor_infoframe.subpack0_high to 0x6f0110, and then overwrites it with 0. Just drop the overwrite with 0, that's clearly a mistake. Because of this issue the HDMI VIC is 0 instead of 1 in the HDMI Vendor InfoFrame when transmitting 4kp30. Signed-off-by: Hans Verkuil Fixes: 290ffeafcc1a ("drm/nouveau/disp/gv100: initial support") Reviewed-by: Ben Skeggs Signed-off-by: Karol Herbst Link: https://patchwork.freedesktop.org/patch/msgid/3d3bd0f7-c150-2479-9350-35d394ee772d@xs4all.nl Signed-off-by: Sasha Levin --- drivers/gpu/drm/nouveau/nvkm/engine/disp/hdmigv100.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/hdmigv100.c b/drivers/gpu/drm/nouveau/nvkm/engine/disp/hdmigv100.c index 6e3c450eaacef..3ff49344abc77 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/hdmigv100.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/hdmigv100.c @@ -62,7 +62,6 @@ gv100_hdmi_ctrl(struct nvkm_ior *ior, int head, bool enable, u8 max_ac_packet, nvkm_wr32(device, 0x6f0108 + hdmi, vendor_infoframe.header); nvkm_wr32(device, 0x6f010c + hdmi, vendor_infoframe.subpack0_low); nvkm_wr32(device, 0x6f0110 + hdmi, vendor_infoframe.subpack0_high); - nvkm_wr32(device, 0x6f0110 + hdmi, 0x00000000); nvkm_wr32(device, 0x6f0114 + hdmi, 0x00000000); nvkm_wr32(device, 0x6f0118 + hdmi, 0x00000000); nvkm_wr32(device, 0x6f011c + hdmi, 0x00000000); From d014902d65a93d99efebdcb54c56b48a351a1423 Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Tue, 16 Nov 2021 23:26:52 +0800 Subject: [PATCH 1759/5344] NFC: reorder the logic in nfc_{un,}register_device [ Upstream commit 3e3b5dfcd16a3e254aab61bd1e8c417dd4503102 ] There is a potential UAF between the unregistration routine and the NFC netlink operations. The race that cause that UAF can be shown as below: (FREE) | (USE) nfcmrvl_nci_unregister_dev | nfc_genl_dev_up nci_close_device | nci_unregister_device | nfc_get_device nfc_unregister_device | nfc_dev_up rfkill_destory | device_del | rfkill_blocked ... | ... The root cause for this race is concluded below: 1. The rfkill_blocked (USE) in nfc_dev_up is supposed to be placed after the device_is_registered check. 2. Since the netlink operations are possible just after the device_add in nfc_register_device, the nfc_dev_up() can happen anywhere during the rfkill creation process, which leads to data race. This patch reorder these actions to permit 1. Once device_del is finished, the nfc_dev_up cannot dereference the rfkill object. 2. The rfkill_register need to be placed after the device_add of nfc_dev because the parent device need to be created first. So this patch keeps the order but inject device_lock to prevent the data race. Signed-off-by: Lin Ma Fixes: be055b2f89b5 ("NFC: RFKILL support") Reviewed-by: Jakub Kicinski Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20211116152652.19217-1-linma@zju.edu.cn Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/nfc/core.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/net/nfc/core.c b/net/nfc/core.c index c5f9c3ee82f8e..e752692d36802 100644 --- a/net/nfc/core.c +++ b/net/nfc/core.c @@ -94,13 +94,13 @@ int nfc_dev_up(struct nfc_dev *dev) device_lock(&dev->dev); - if (dev->rfkill && rfkill_blocked(dev->rfkill)) { - rc = -ERFKILL; + if (!device_is_registered(&dev->dev)) { + rc = -ENODEV; goto error; } - if (!device_is_registered(&dev->dev)) { - rc = -ENODEV; + if (dev->rfkill && rfkill_blocked(dev->rfkill)) { + rc = -ERFKILL; goto error; } @@ -1118,11 +1118,7 @@ int nfc_register_device(struct nfc_dev *dev) if (rc) pr_err("Could not register llcp device\n"); - rc = nfc_genl_device_added(dev); - if (rc) - pr_debug("The userspace won't be notified that the device %s was added\n", - dev_name(&dev->dev)); - + device_lock(&dev->dev); dev->rfkill = rfkill_alloc(dev_name(&dev->dev), &dev->dev, RFKILL_TYPE_NFC, &nfc_rfkill_ops, dev); if (dev->rfkill) { @@ -1131,6 +1127,12 @@ int nfc_register_device(struct nfc_dev *dev) dev->rfkill = NULL; } } + device_unlock(&dev->dev); + + rc = nfc_genl_device_added(dev); + if (rc) + pr_debug("The userspace won't be notified that the device %s was added\n", + dev_name(&dev->dev)); return 0; } @@ -1147,10 +1149,17 @@ void nfc_unregister_device(struct nfc_dev *dev) pr_debug("dev_name=%s\n", dev_name(&dev->dev)); + rc = nfc_genl_device_removed(dev); + if (rc) + pr_debug("The userspace won't be notified that the device %s " + "was removed\n", dev_name(&dev->dev)); + + device_lock(&dev->dev); if (dev->rfkill) { rfkill_unregister(dev->rfkill); rfkill_destroy(dev->rfkill); } + device_unlock(&dev->dev); if (dev->ops->check_presence) { device_lock(&dev->dev); @@ -1160,11 +1169,6 @@ void nfc_unregister_device(struct nfc_dev *dev) cancel_work_sync(&dev->check_pres_work); } - rc = nfc_genl_device_removed(dev); - if (rc) - pr_debug("The userspace won't be notified that the device %s " - "was removed\n", dev_name(&dev->dev)); - nfc_llcp_unregister_device(dev); mutex_lock(&nfc_devlist_mutex); From 4a8bbd41d0443ce8f30c0d80d63e2ccd6c4cdac2 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 24 Sep 2021 01:10:31 +1000 Subject: [PATCH 1760/5344] KVM: PPC: Book3S HV: Use GLOBAL_TOC for kvmppc_h_set_dabr/xdabr() [ Upstream commit dae581864609d36fb58855fd59880b4941ce9d14 ] kvmppc_h_set_dabr(), and kvmppc_h_set_xdabr() which jumps into it, need to use _GLOBAL_TOC to setup the kernel TOC pointer, because kvmppc_h_set_dabr() uses LOAD_REG_ADDR() to load dawr_force_enable. When called from hcall_try_real_mode() we have the kernel TOC in r2, established near the start of kvmppc_interrupt_hv(), so there is no issue. But they can also be called from kvmppc_pseries_do_hcall() which is module code, so the access ends up happening with the kvm-hv module's r2, which will not point at dawr_force_enable and could even cause a fault. With the current code layout and compilers we haven't observed a fault in practice, the load hits somewhere in kvm-hv.ko and silently returns some bogus value. Note that we we expect p8/p9 guests to use the DAWR, but SLOF uses h_set_dabr() to test if sc1 works correctly, see SLOF's lib/libhvcall/brokensc1.c. Fixes: c1fe190c0672 ("powerpc: Add force enable of DAWR on P9 option") Signed-off-by: Michael Ellerman Reviewed-by: Daniel Axtens Link: https://lore.kernel.org/r/20210923151031.72408-1-mpe@ellerman.id.au Signed-off-by: Sasha Levin --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index f9c7326672b95..c9c6619564ffa 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -2535,7 +2535,7 @@ hcall_real_table: .globl hcall_real_table_end hcall_real_table_end: -_GLOBAL(kvmppc_h_set_xdabr) +_GLOBAL_TOC(kvmppc_h_set_xdabr) EXPORT_SYMBOL_GPL(kvmppc_h_set_xdabr) andi. r0, r5, DABRX_USER | DABRX_KERNEL beq 6f @@ -2545,7 +2545,7 @@ EXPORT_SYMBOL_GPL(kvmppc_h_set_xdabr) 6: li r3, H_PARAMETER blr -_GLOBAL(kvmppc_h_set_dabr) +_GLOBAL_TOC(kvmppc_h_set_dabr) EXPORT_SYMBOL_GPL(kvmppc_h_set_dabr) li r5, DABRX_USER | DABRX_KERNEL 3: From 963d17e6fd058f4e4e115d2d2434ef1d663bf95d Mon Sep 17 00:00:00 2001 From: Alexander Antonov Date: Mon, 15 Nov 2021 12:03:32 +0300 Subject: [PATCH 1761/5344] perf/x86/intel/uncore: Fix filter_tid mask for CHA events on Skylake Server [ Upstream commit e324234e0aa881b7841c7c713306403e12b069ff ] According Uncore Reference Manual: any of the CHA events may be filtered by Thread/Core-ID by using tid modifier in CHA Filter 0 Register. Update skx_cha_hw_config() to follow Uncore Guide. Fixes: cd34cd97b7b4 ("perf/x86/intel/uncore: Add Skylake server uncore support") Signed-off-by: Alexander Antonov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Link: https://lore.kernel.org/r/20211115090334.3789-2-alexander.antonov@linux.intel.com Signed-off-by: Sasha Levin --- arch/x86/events/intel/uncore_snbep.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index e6e2d0d6aa78e..2b1b28be72ff2 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -3516,6 +3516,9 @@ static int skx_cha_hw_config(struct intel_uncore_box *box, struct perf_event *ev struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; struct extra_reg *er; int idx = 0; + /* Any of the CHA events may be filtered by Thread/Core-ID.*/ + if (event->hw.config & SNBEP_CBO_PMON_CTL_TID_EN) + idx = SKX_CHA_MSR_PMON_BOX_FILTER_TID; for (er = skx_uncore_cha_extra_regs; er->msr; er++) { if (er->event != (event->hw.config & er->config_mask)) From d611d79fabf76d11d585423c577b5a9b0cb5d09b Mon Sep 17 00:00:00 2001 From: Alexander Antonov Date: Mon, 15 Nov 2021 12:03:33 +0300 Subject: [PATCH 1762/5344] perf/x86/intel/uncore: Fix IIO event constraints for Skylake Server [ Upstream commit 3866ae319c846a612109c008f43cba80b8c15e86 ] According to the latest uncore document, COMP_BUF_OCCUPANCY (0xd5) event can be collected on 2-3 counters. Update uncore IIO event constraints for Skylake Server. Fixes: cd34cd97b7b4 ("perf/x86/intel/uncore: Add Skylake server uncore support") Signed-off-by: Alexander Antonov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Link: https://lore.kernel.org/r/20211115090334.3789-3-alexander.antonov@linux.intel.com Signed-off-by: Sasha Levin --- arch/x86/events/intel/uncore_snbep.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 2b1b28be72ff2..9625137aaed53 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -3586,6 +3586,7 @@ static struct event_constraint skx_uncore_iio_constraints[] = { UNCORE_EVENT_CONSTRAINT(0xc0, 0xc), UNCORE_EVENT_CONSTRAINT(0xc5, 0xc), UNCORE_EVENT_CONSTRAINT(0xd4, 0xc), + UNCORE_EVENT_CONSTRAINT(0xd5, 0xc), EVENT_CONSTRAINT_END }; From 65b9d1f65d705517c4386f9b5a69d2753445cc28 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 16 Nov 2021 11:06:38 +0100 Subject: [PATCH 1763/5344] s390/kexec: fix return code handling [ Upstream commit 20c76e242e7025bd355619ba67beb243ba1a1e95 ] kexec_file_add_ipl_report ignores that ipl_report_finish may fail and can return an error pointer instead of a valid pointer. Fix this and simplify by returning NULL in case of an error and let the only caller handle this case. Fixes: 99feaa717e55 ("s390/kexec_file: Create ipl report and pass to next kernel") Signed-off-by: Heiko Carstens Signed-off-by: Sasha Levin --- arch/s390/kernel/ipl.c | 3 ++- arch/s390/kernel/machine_kexec_file.c | 8 +++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 6837affc19e81..7795cdee6427d 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -1783,7 +1783,7 @@ void *ipl_report_finish(struct ipl_report *report) buf = vzalloc(report->size); if (!buf) - return ERR_PTR(-ENOMEM); + goto out; ptr = buf; memcpy(ptr, report->ipib, report->ipib->hdr.len); @@ -1822,6 +1822,7 @@ void *ipl_report_finish(struct ipl_report *report) } BUG_ON(ptr > buf + report->size); +out: return buf; } diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index f9e4baa64b675..c1090f0b1f6a6 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ b/arch/s390/kernel/machine_kexec_file.c @@ -170,6 +170,7 @@ static int kexec_file_add_ipl_report(struct kimage *image, struct kexec_buf buf; unsigned long addr; void *ptr, *end; + int ret; buf.image = image; @@ -199,7 +200,10 @@ static int kexec_file_add_ipl_report(struct kimage *image, ptr += len; } + ret = -ENOMEM; buf.buffer = ipl_report_finish(data->report); + if (!buf.buffer) + goto out; buf.bufsz = data->report->size; buf.memsz = buf.bufsz; @@ -209,7 +213,9 @@ static int kexec_file_add_ipl_report(struct kimage *image, data->kernel_buf + offsetof(struct lowcore, ipl_parmblock_ptr); *lc_ipl_parmblock_ptr = (__u32)buf.mem; - return kexec_add_buffer(&buf); + ret = kexec_add_buffer(&buf); +out: + return ret; } void *kexec_file_add_components(struct kimage *image, From 03389f2b5d4f56b2e4d1854e81bde4a8b6535022 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Tue, 19 Oct 2021 15:36:45 -0700 Subject: [PATCH 1764/5344] arm64: vdso32: suppress error message for 'make mrproper' commit 14831fad73f5ac30ac61760487d95a538e6ab3cb upstream. When running the following command without arm-linux-gnueabi-gcc in one's $PATH, the following warning is observed: $ ARCH=arm64 CROSS_COMPILE_COMPAT=arm-linux-gnueabi- make -j72 LLVM=1 mrproper make[1]: arm-linux-gnueabi-gcc: No such file or directory This is because KCONFIG is not run for mrproper, so CONFIG_CC_IS_CLANG is not set, and we end up eagerly evaluating various variables that try to invoke CC_COMPAT. This is a similar problem to what was observed in commit dc960bfeedb0 ("h8300: suppress error messages for 'make clean'") Reported-by: Lucas Henneman Suggested-by: Masahiro Yamada Signed-off-by: Nick Desaulniers Reviewed-by: Vincenzo Frascino Reviewed-by: Nathan Chancellor Tested-by: Nathan Chancellor Link: https://lore.kernel.org/r/20211019223646.1146945-4-ndesaulniers@google.com Signed-off-by: Will Deacon Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kernel/vdso32/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile index 40dffe60b8454..894791f03596d 100644 --- a/arch/arm64/kernel/vdso32/Makefile +++ b/arch/arm64/kernel/vdso32/Makefile @@ -32,7 +32,8 @@ cc32-as-instr = $(call try-run,\ # As a result we set our own flags here. # KBUILD_CPPFLAGS and NOSTDINC_FLAGS from top-level Makefile -VDSO_CPPFLAGS := -D__KERNEL__ -nostdinc -isystem $(shell $(CC_COMPAT) -print-file-name=include) +VDSO_CPPFLAGS := -D__KERNEL__ -nostdinc +VDSO_CPPFLAGS += -isystem $(shell $(CC_COMPAT) -print-file-name=include 2>/dev/null) VDSO_CPPFLAGS += $(LINUXINCLUDE) # Common C and assembly flags From 10ed7eb7f3a6b519b8327f16b674325a45438635 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 12 Nov 2021 08:56:03 +0100 Subject: [PATCH 1765/5344] tun: fix bonding active backup with arp monitoring commit a31d27fbed5d518734cb60956303eb15089a7634 upstream. As stated in the bonding doc, trans_start must be set manually for drivers using NETIF_F_LLTX: Drivers that use NETIF_F_LLTX flag must also update netdev_queue->trans_start. If they do not, then the ARP monitor will immediately fail any slaves using that driver, and those slaves will stay down. Link: https://www.kernel.org/doc/html/v5.15/networking/bonding.html#arp-monitor-operation Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/tun.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 7c40ae058e6d1..10211ea605140 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1071,6 +1071,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); int txq = skb->queue_mapping; + struct netdev_queue *queue; struct tun_file *tfile; int len = skb->len; @@ -1117,6 +1118,10 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) if (ptr_ring_produce(&tfile->tx_ring, skb)) goto drop; + /* NETIF_F_LLTX requires to do our own update of trans_start */ + queue = netdev_get_tx_queue(dev, txq); + queue->trans_start = jiffies; + /* Notify and wake up reader process */ if (tfile->flags & TUN_FASYNC) kill_fasync(&tfile->fasync, SIGIO, POLL_IN); From 3382642d461c5e76b747987d1be8507f122632d0 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 19 Nov 2021 16:43:28 -0800 Subject: [PATCH 1766/5344] hexagon: export raw I/O routines for modules commit ffb92ce826fd801acb0f4e15b75e4ddf0d189bde upstream. Patch series "Fixes for ARCH=hexagon allmodconfig", v2. This series fixes some issues noticed with ARCH=hexagon allmodconfig. This patch (of 3): When building ARCH=hexagon allmodconfig, the following errors occur: ERROR: modpost: "__raw_readsl" [drivers/i3c/master/svc-i3c-master.ko] undefined! ERROR: modpost: "__raw_writesl" [drivers/i3c/master/dw-i3c-master.ko] undefined! ERROR: modpost: "__raw_readsl" [drivers/i3c/master/dw-i3c-master.ko] undefined! ERROR: modpost: "__raw_writesl" [drivers/i3c/master/i3c-master-cdns.ko] undefined! ERROR: modpost: "__raw_readsl" [drivers/i3c/master/i3c-master-cdns.ko] undefined! Export these symbols so that modules can use them without any errors. Link: https://lkml.kernel.org/r/20211115174250.1994179-1-nathan@kernel.org Link: https://lkml.kernel.org/r/20211115174250.1994179-2-nathan@kernel.org Fixes: 013bf24c3829 ("Hexagon: Provide basic implementation and/or stubs for I/O routines.") Signed-off-by: Nathan Chancellor Acked-by: Brian Cain Cc: Nick Desaulniers Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- arch/hexagon/lib/io.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/hexagon/lib/io.c b/arch/hexagon/lib/io.c index d35d69d6588c4..55f75392857b0 100644 --- a/arch/hexagon/lib/io.c +++ b/arch/hexagon/lib/io.c @@ -27,6 +27,7 @@ void __raw_readsw(const void __iomem *addr, void *data, int len) *dst++ = *src; } +EXPORT_SYMBOL(__raw_readsw); /* * __raw_writesw - read words a short at a time @@ -47,6 +48,7 @@ void __raw_writesw(void __iomem *addr, const void *data, int len) } +EXPORT_SYMBOL(__raw_writesw); /* Pretty sure len is pre-adjusted for the length of the access already */ void __raw_readsl(const void __iomem *addr, void *data, int len) @@ -62,6 +64,7 @@ void __raw_readsl(const void __iomem *addr, void *data, int len) } +EXPORT_SYMBOL(__raw_readsl); void __raw_writesl(void __iomem *addr, const void *data, int len) { @@ -76,3 +79,4 @@ void __raw_writesl(void __iomem *addr, const void *data, int len) } +EXPORT_SYMBOL(__raw_writesl); From 002df837f1b5ac91a077fd5ca21f6b16160649ec Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 19 Nov 2021 16:43:18 -0800 Subject: [PATCH 1767/5344] ipc: WARN if trying to remove ipc object which is absent commit 126e8bee943e9926238c891e2df5b5573aee76bc upstream. Patch series "shm: shm_rmid_forced feature fixes". Some time ago I met kernel crash after CRIU restore procedure, fortunately, it was CRIU restore, so, I had dump files and could do restore many times and crash reproduced easily. After some investigation I've constructed the minimal reproducer. It was found that it's use-after-free and it happens only if sysctl kernel.shm_rmid_forced = 1. The key of the problem is that the exit_shm() function not handles shp's object destroy when task->sysvshm.shm_clist contains items from different IPC namespaces. In most cases this list will contain only items from one IPC namespace. How can this list contain object from different namespaces? The exit_shm() function is designed to clean up this list always when process leaves IPC namespace. But we made a mistake a long time ago and did not add a exit_shm() call into the setns() syscall procedures. The first idea was just to add this call to setns() syscall but it obviously changes semantics of setns() syscall and that's userspace-visible change. So, I gave up on this idea. The first real attempt to address the issue was just to omit forced destroy if we meet shp object not from current task IPC namespace [1]. But that was not the best idea because task->sysvshm.shm_clist was protected by rwsem which belongs to current task IPC namespace. It means that list corruption may occur. Second approach is just extend exit_shm() to properly handle shp's from different IPC namespaces [2]. This is really non-trivial thing, I've put a lot of effort into that but not believed that it's possible to make it fully safe, clean and clear. Thanks to the efforts of Manfred Spraul working an elegant solution was designed. Thanks a lot, Manfred! Eric also suggested the way to address the issue in ("[RFC][PATCH] shm: In shm_exit destroy all created and never attached segments") Eric's idea was to maintain a list of shm_clists one per IPC namespace, use lock-less lists. But there is some extra memory consumption-related concerns. An alternative solution which was suggested by me was implemented in ("shm: reset shm_clist on setns but omit forced shm destroy"). The idea is pretty simple, we add exit_shm() syscall to setns() but DO NOT destroy shm segments even if sysctl kernel.shm_rmid_forced = 1, we just clean up the task->sysvshm.shm_clist list. This chages semantics of setns() syscall a little bit but in comparision to the "naive" solution when we just add exit_shm() without any special exclusions this looks like a safer option. [1] https://lkml.org/lkml/2021/7/6/1108 [2] https://lkml.org/lkml/2021/7/14/736 This patch (of 2): Let's produce a warning if we trying to remove non-existing IPC object from IPC namespace kht/idr structures. This allows us to catch possible bugs when the ipc_rmid() function was called with inconsistent struct ipc_ids*, struct kern_ipc_perm* arguments. Link: https://lkml.kernel.org/r/20211027224348.611025-1-alexander.mikhalitsyn@virtuozzo.com Link: https://lkml.kernel.org/r/20211027224348.611025-2-alexander.mikhalitsyn@virtuozzo.com Co-developed-by: Manfred Spraul Signed-off-by: Manfred Spraul Signed-off-by: Alexander Mikhalitsyn Cc: "Eric W. Biederman" Cc: Davidlohr Bueso Cc: Greg KH Cc: Andrei Vagin Cc: Pavel Tikhomirov Cc: Vasily Averin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- ipc/util.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ipc/util.c b/ipc/util.c index 1821b6386d3b4..09c3bd9f8e768 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -446,8 +446,8 @@ static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids, static void ipc_kht_remove(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) { if (ipcp->key != IPC_PRIVATE) - rhashtable_remove_fast(&ids->key_ht, &ipcp->khtnode, - ipc_kht_params); + WARN_ON_ONCE(rhashtable_remove_fast(&ids->key_ht, &ipcp->khtnode, + ipc_kht_params)); } /** @@ -462,7 +462,7 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) { int idx = ipcid_to_idx(ipcp->id); - idr_remove(&ids->ipcs_idr, idx); + WARN_ON_ONCE(idr_remove(&ids->ipcs_idr, idx) != ipcp); ipc_kht_remove(ids, ipcp); ids->in_use--; ipcp->deleted = true; From 498ea9c46862af74b234c4a1fdfae2e532298ce8 Mon Sep 17 00:00:00 2001 From: Rustam Kovhaev Date: Fri, 19 Nov 2021 16:43:37 -0800 Subject: [PATCH 1768/5344] mm: kmemleak: slob: respect SLAB_NOLEAKTRACE flag commit 34dbc3aaf5d9e89ba6cc5e24add9458c21ab1950 upstream. When kmemleak is enabled for SLOB, system does not boot and does not print anything to the console. At the very early stage in the boot process we hit infinite recursion from kmemleak_init() and eventually kernel crashes. kmemleak_init() specifies SLAB_NOLEAKTRACE for KMEM_CACHE(), but kmem_cache_create_usercopy() removes it because CACHE_CREATE_MASK is not valid for SLOB. Let's fix CACHE_CREATE_MASK and make kmemleak work with SLOB Link: https://lkml.kernel.org/r/20211115020850.3154366-1-rkovhaev@gmail.com Fixes: d8843922fba4 ("slab: Ignore internal flags in cache creation") Signed-off-by: Rustam Kovhaev Acked-by: Vlastimil Babka Reviewed-by: Muchun Song Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Catalin Marinas Cc: Greg Kroah-Hartman Cc: Glauber Costa Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/slab.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slab.h b/mm/slab.h index c918a716c81c8..1f2385583d844 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -149,7 +149,7 @@ static inline slab_flags_t kmem_cache_flags(unsigned int object_size, #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ SLAB_TEMPORARY | SLAB_ACCOUNT) #else -#define SLAB_CACHE_FLAGS (0) +#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE) #endif /* Common flags available with current configuration */ From c6af2a575be0295164ac942d53817e943af46167 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 4 Nov 2021 18:22:38 +0000 Subject: [PATCH 1769/5344] x86/hyperv: Fix NULL deref in set_hv_tscchange_cb() if Hyper-V setup fails commit daf972118c517b91f74ff1731417feb4270625a4 upstream. Check for a valid hv_vp_index array prior to derefencing hv_vp_index when setting Hyper-V's TSC change callback. If Hyper-V setup failed in hyperv_init(), the kernel will still report that it's running under Hyper-V, but will have silently disabled nearly all functionality. BUG: kernel NULL pointer dereference, address: 0000000000000010 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] SMP CPU: 4 PID: 1 Comm: swapper/0 Not tainted 5.15.0-rc2+ #75 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:set_hv_tscchange_cb+0x15/0xa0 Code: <8b> 04 82 8b 15 12 17 85 01 48 c1 e0 20 48 0d ee 00 01 00 f6 c6 08 ... Call Trace: kvm_arch_init+0x17c/0x280 kvm_init+0x31/0x330 vmx_init+0xba/0x13a do_one_initcall+0x41/0x1c0 kernel_init_freeable+0x1f2/0x23b kernel_init+0x16/0x120 ret_from_fork+0x22/0x30 Fixes: 93286261de1b ("x86/hyperv: Reenlightenment notifications support") Cc: stable@vger.kernel.org Cc: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Reviewed-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20211104182239.1302956-2-seanjc@google.com Signed-off-by: Wei Liu Signed-off-by: Greg Kroah-Hartman --- arch/x86/hyperv/hv_init.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 812db1ac8cb11..df4a4a9dc1e84 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -163,6 +163,9 @@ void set_hv_tscchange_cb(void (*cb)(void)) return; } + if (!hv_vp_index) + return; + hv_reenlightenment_cb = cb; /* Make sure callback is registered before we write to MSRs */ From dcaeab372f2ed9982f593944a958e99fc1fedab3 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 16 Nov 2021 11:31:01 +0800 Subject: [PATCH 1770/5344] s390/kexec: fix memory leak of ipl report buffer commit 4aa9340584e37debef06fa99b56d064beb723891 upstream. unreferenced object 0x38000195000 (size 4096): comm "kexec", pid 8548, jiffies 4294953647 (age 32443.270s) hex dump (first 32 bytes): 00 00 00 c8 20 00 00 00 00 00 00 c0 02 80 00 00 .... ........... 40 40 40 40 40 40 40 40 00 00 00 00 00 00 00 00 @@@@@@@@........ backtrace: [<0000000011a2f199>] __vmalloc_node_range+0xc0/0x140 [<0000000081fa2752>] vzalloc+0x5a/0x70 [<0000000063a4c92d>] ipl_report_finish+0x2c/0x180 [<00000000553304da>] kexec_file_add_ipl_report+0xf4/0x150 [<00000000862d033f>] kexec_file_add_components+0x124/0x160 [<000000000d2717bb>] arch_kexec_kernel_image_load+0x62/0x90 [<000000002e0373b6>] kimage_file_alloc_init+0x1aa/0x2e0 [<0000000060f2d14f>] __do_sys_kexec_file_load+0x17c/0x2c0 [<000000008c86fe5a>] __s390x_sys_kexec_file_load+0x40/0x50 [<000000001fdb9dac>] __do_syscall+0x1bc/0x1f0 [<000000003ee4258d>] system_call+0x78/0xa0 Signed-off-by: Baoquan He Reviewed-by: Philipp Rudo Fixes: 99feaa717e55 ("s390/kexec_file: Create ipl report and pass to next kernel") Cc: # v5.2: 20c76e242e70: s390/kexec: fix return code handling Cc: # v5.2 Link: https://lore.kernel.org/r/20211116033101.GD21646@MiWiFi-R3L-srv Signed-off-by: Heiko Carstens Signed-off-by: Greg Kroah-Hartman --- arch/s390/include/asm/kexec.h | 6 ++++++ arch/s390/kernel/machine_kexec_file.c | 10 ++++++++++ 2 files changed, 16 insertions(+) diff --git a/arch/s390/include/asm/kexec.h b/arch/s390/include/asm/kexec.h index ea398a05f6432..7f3c9ac34bd8d 100644 --- a/arch/s390/include/asm/kexec.h +++ b/arch/s390/include/asm/kexec.h @@ -74,6 +74,12 @@ void *kexec_file_add_components(struct kimage *image, int arch_kexec_do_relocs(int r_type, void *loc, unsigned long val, unsigned long addr); +#define ARCH_HAS_KIMAGE_ARCH + +struct kimage_arch { + void *ipl_buf; +}; + extern const struct kexec_file_ops s390_kexec_image_ops; extern const struct kexec_file_ops s390_kexec_elf_ops; diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index c1090f0b1f6a6..e7435f3a3d2d2 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ b/arch/s390/kernel/machine_kexec_file.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -206,6 +207,7 @@ static int kexec_file_add_ipl_report(struct kimage *image, goto out; buf.bufsz = data->report->size; buf.memsz = buf.bufsz; + image->arch.ipl_buf = buf.buffer; data->memsz += buf.memsz; @@ -327,3 +329,11 @@ int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, return kexec_image_probe_default(image, buf, buf_len); } + +int arch_kimage_file_post_load_cleanup(struct kimage *image) +{ + vfree(image->arch.ipl_buf); + image->arch.ipl_buf = NULL; + + return kexec_image_post_load_cleanup_default(image); +} From 662e8384ab2d225ea394556addf671745f7fa5ff Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 4 Nov 2021 15:22:35 +0100 Subject: [PATCH 1771/5344] udf: Fix crash after seekdir commit a48fc69fe6588b48d878d69de223b91a386a7cb4 upstream. udf_readdir() didn't validate the directory position it should start reading from. Thus when user uses lseek(2) on directory file descriptor it can trick udf_readdir() into reading from a position in the middle of directory entry which then upsets directory parsing code resulting in errors or even possible kernel crashes. Similarly when the directory is modified between two readdir calls, the directory position need not be valid anymore. Add code to validate current offset in the directory. This is actually rather expensive for UDF as we need to read from the beginning of the directory and parse all directory entries. This is because in UDF a directory is just a stream of data containing directory entries and since file names are fully under user's control we cannot depend on detecting magic numbers and checksums in the header of directory entry as a malicious attacker could fake them. We skip this step if we detect that nothing changed since the last readdir call. Reported-by: Nathan Wilson CC: stable@vger.kernel.org Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/udf/dir.c | 32 ++++++++++++++++++++++++++++++-- fs/udf/namei.c | 3 +++ fs/udf/super.c | 2 ++ 3 files changed, 35 insertions(+), 2 deletions(-) diff --git a/fs/udf/dir.c b/fs/udf/dir.c index c19dba45aa209..d0f92a52e3bab 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "udf_i.h" #include "udf_sb.h" @@ -44,7 +45,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) struct fileIdentDesc *fi = NULL; struct fileIdentDesc cfi; udf_pblk_t block, iblock; - loff_t nf_pos; + loff_t nf_pos, emit_pos = 0; int flen; unsigned char *fname = NULL, *copy_name = NULL; unsigned char *nameptr; @@ -58,6 +59,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) int i, num, ret = 0; struct extent_position epos = { NULL, 0, {0, 0} }; struct super_block *sb = dir->i_sb; + bool pos_valid = false; if (ctx->pos == 0) { if (!dir_emit_dot(file, ctx)) @@ -68,6 +70,21 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) if (nf_pos >= size) goto out; + /* + * Something changed since last readdir (either lseek was called or dir + * changed)? We need to verify the position correctly points at the + * beginning of some dir entry so that the directory parsing code does + * not get confused. Since UDF does not have any reliable way of + * identifying beginning of dir entry (names are under user control), + * we need to scan the directory from the beginning. + */ + if (!inode_eq_iversion(dir, file->f_version)) { + emit_pos = nf_pos; + nf_pos = 0; + } else { + pos_valid = true; + } + fname = kmalloc(UDF_NAME_LEN, GFP_NOFS); if (!fname) { ret = -ENOMEM; @@ -123,13 +140,21 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) while (nf_pos < size) { struct kernel_lb_addr tloc; + loff_t cur_pos = nf_pos; - ctx->pos = (nf_pos >> 2) + 1; + /* Update file position only if we got past the current one */ + if (nf_pos >= emit_pos) { + ctx->pos = (nf_pos >> 2) + 1; + pos_valid = true; + } fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc, &elen, &offset); if (!fi) goto out; + /* Still not at offset where user asked us to read from? */ + if (cur_pos < emit_pos) + continue; liu = le16_to_cpu(cfi.lengthOfImpUse); lfi = cfi.lengthFileIdent; @@ -187,8 +212,11 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) } /* end while */ ctx->pos = (nf_pos >> 2) + 1; + pos_valid = true; out: + if (pos_valid) + file->f_version = inode_query_iversion(dir); if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); brelse(fibh.sbh); diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 3c3d3b20889c8..3c009562375d3 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -30,6 +30,7 @@ #include #include #include +#include static inline int udf_match(int len1, const unsigned char *name1, int len2, const unsigned char *name2) @@ -135,6 +136,8 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, mark_buffer_dirty_inode(fibh->ebh, inode); mark_buffer_dirty_inode(fibh->sbh, inode); } + inode_inc_iversion(inode); + return 0; } diff --git a/fs/udf/super.c b/fs/udf/super.c index 5663bae95700c..5193b94c06834 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -57,6 +57,7 @@ #include #include #include +#include #include "udf_sb.h" #include "udf_i.h" @@ -149,6 +150,7 @@ static struct inode *udf_alloc_inode(struct super_block *sb) init_rwsem(&ei->i_data_sem); ei->cached_extent.lstart = -1; spin_lock_init(&ei->i_extent_cache_lock); + inode_set_iversion(&ei->vfs_inode, 1); return &ei->vfs_inode; } From 6e4c8a70443014b12a2b20bc860567520c786872 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 2 Nov 2021 14:49:16 +0200 Subject: [PATCH 1772/5344] btrfs: fix memory ordering between normal and ordered work functions commit 45da9c1767ac31857df572f0a909fbe88fd5a7e9 upstream. Ordered work functions aren't guaranteed to be handled by the same thread which executed the normal work functions. The only way execution between normal/ordered functions is synchronized is via the WORK_DONE_BIT, unfortunately the used bitops don't guarantee any ordering whatsoever. This manifested as seemingly inexplicable crashes on ARM64, where async_chunk::inode is seen as non-null in async_cow_submit which causes submit_compressed_extents to be called and crash occurs because async_chunk::inode suddenly became NULL. The call trace was similar to: pc : submit_compressed_extents+0x38/0x3d0 lr : async_cow_submit+0x50/0xd0 sp : ffff800015d4bc20 Call trace: submit_compressed_extents+0x38/0x3d0 async_cow_submit+0x50/0xd0 run_ordered_work+0xc8/0x280 btrfs_work_helper+0x98/0x250 process_one_work+0x1f0/0x4ac worker_thread+0x188/0x504 kthread+0x110/0x114 ret_from_fork+0x10/0x18 Fix this by adding respective barrier calls which ensure that all accesses preceding setting of WORK_DONE_BIT are strictly ordered before setting the flag. At the same time add a read barrier after reading of WORK_DONE_BIT in run_ordered_work which ensures all subsequent loads would be strictly ordered after reading the bit. This in turn ensures are all accesses before WORK_DONE_BIT are going to be strictly ordered before any access that can occur in ordered_func. Reported-by: Chris Murphy Fixes: 08a9ff326418 ("btrfs: Added btrfs_workqueue_struct implemented ordered execution based on kernel workqueue") CC: stable@vger.kernel.org # 4.4+ Link: https://bugzilla.redhat.com/show_bug.cgi?id=2011928 Reviewed-by: Josef Bacik Tested-by: Chris Murphy Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/async-thread.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 11be02459b876..eb592b92aa9cd 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -237,6 +237,13 @@ static void run_ordered_work(struct __btrfs_workqueue *wq, ordered_list); if (!test_bit(WORK_DONE_BIT, &work->flags)) break; + /* + * Orders all subsequent loads after reading WORK_DONE_BIT, + * paired with the smp_mb__before_atomic in btrfs_work_helper + * this guarantees that the ordered function will see all + * updates from ordinary work function. + */ + smp_rmb(); /* * we are going to call the ordered done function, but @@ -325,6 +332,13 @@ static void btrfs_work_helper(struct work_struct *normal_work) thresh_exec_hook(wq); work->func(work); if (need_order) { + /* + * Ensures all memory accesses done in the work function are + * ordered before setting the WORK_DONE_BIT. Ensuring the thread + * which is going to executed the ordered work sees them. + * Pairs with the smp_rmb in run_ordered_work. + */ + smp_mb__before_atomic(); set_bit(WORK_DONE_BIT, &work->flags); run_ordered_work(wq, work); } From c912637767aa6127a39b434fb85871a0a6acb929 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Sun, 14 Nov 2021 17:08:17 +0100 Subject: [PATCH 1773/5344] parisc/sticon: fix reverse colors commit bec05f33ebc1006899c6d3e59a00c58881fe7626 upstream. sticon_build_attr() checked the reverse argument and flipped background and foreground color, but returned the non-reverse value afterwards. Fix this and also add two local variables for foreground and background color to make the code easier to read. Signed-off-by: Sven Schnelle Cc: Signed-off-by: Helge Deller Signed-off-by: Greg Kroah-Hartman --- drivers/video/console/sticon.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/video/console/sticon.c b/drivers/video/console/sticon.c index 79c9bd8d30254..559a7305cadaa 100644 --- a/drivers/video/console/sticon.c +++ b/drivers/video/console/sticon.c @@ -291,13 +291,13 @@ static unsigned long sticon_getxy(struct vc_data *conp, unsigned long pos, static u8 sticon_build_attr(struct vc_data *conp, u8 color, u8 intens, u8 blink, u8 underline, u8 reverse, u8 italic) { - u8 attr = ((color & 0x70) >> 1) | ((color & 7)); + u8 fg = color & 7; + u8 bg = (color & 0x70) >> 4; - if (reverse) { - color = ((color >> 3) & 0x7) | ((color & 0x7) << 3); - } - - return attr; + if (reverse) + return (fg << 3) | bg; + else + return (bg << 3) | fg; } static void sticon_invert_region(struct vc_data *conp, u16 *p, int count) From 4665c28aae35d485be44886c4b9174ce2e31ee50 Mon Sep 17 00:00:00 2001 From: Nguyen Dinh Phi Date: Thu, 28 Oct 2021 01:37:22 +0800 Subject: [PATCH 1774/5344] cfg80211: call cfg80211_stop_ap when switch from P2P_GO type commit 563fbefed46ae4c1f70cffb8eb54c02df480b2c2 upstream. If the userspace tools switch from NL80211_IFTYPE_P2P_GO to NL80211_IFTYPE_ADHOC via send_msg(NL80211_CMD_SET_INTERFACE), it does not call the cleanup cfg80211_stop_ap(), this leads to the initialization of in-use data. For example, this path re-init the sdata->assigned_chanctx_list while it is still an element of assigned_vifs list, and makes that linked list corrupt. Signed-off-by: Nguyen Dinh Phi Reported-by: syzbot+bbf402b783eeb6d908db@syzkaller.appspotmail.com Link: https://lore.kernel.org/r/20211027173722.777287-1-phind.uet@gmail.com Cc: stable@vger.kernel.org Fixes: ac800140c20e ("cfg80211: .stop_ap when interface is going down") Signed-off-by: Johannes Berg Signed-off-by: Greg Kroah-Hartman --- net/wireless/util.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/wireless/util.c b/net/wireless/util.c index aaefaf3422a1a..95533732f9d6f 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -991,6 +991,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, switch (otype) { case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: cfg80211_stop_ap(rdev, dev, true); break; case NL80211_IFTYPE_ADHOC: From d196482d09cb34933a2532039375d34008aca479 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 25 Oct 2021 13:53:53 +0200 Subject: [PATCH 1775/5344] drm/udl: fix control-message timeout commit 5591c8f79db1729d9c5ac7f5b4d3a5c26e262d93 upstream. USB control-message timeouts are specified in milliseconds and should specifically not vary with CONFIG_HZ. Fixes: 5320918b9a87 ("drm/udl: initial UDL driver (v4)") Cc: stable@vger.kernel.org # 3.4 Signed-off-by: Johan Hovold Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20211025115353.5089-1-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/udl/udl_connector.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/udl/udl_connector.c b/drivers/gpu/drm/udl/udl_connector.c index ddb61a60c6104..ef69773b5bed5 100644 --- a/drivers/gpu/drm/udl/udl_connector.c +++ b/drivers/gpu/drm/udl/udl_connector.c @@ -29,7 +29,7 @@ static int udl_get_edid_block(void *data, u8 *buf, unsigned int block, ret = usb_control_msg(udl->udev, usb_rcvctrlpipe(udl->udev, 0), (0x02), (0x80 | (0x02 << 5)), bval, - 0xA1, read_buff, 2, HZ); + 0xA1, read_buff, 2, 1000); if (ret < 1) { DRM_ERROR("Read EDID byte %d failed err %x\n", i, ret); kfree(read_buff); From a496ba63f12e45820133da1357932ac0784a7ca0 Mon Sep 17 00:00:00 2001 From: Jeremy Cline Date: Wed, 25 Nov 2020 15:26:46 -0500 Subject: [PATCH 1776/5344] drm/nouveau: use drm_dev_unplug() during device removal commit aff2299e0d81b26304ccc6a1ec0170e437f38efc upstream. Nouveau does not currently support hot-unplugging, but it still makes sense to switch from drm_dev_unregister() to drm_dev_unplug(). drm_dev_unplug() calls drm_dev_unregister() after marking the device as unplugged, but only after any device critical sections are finished. Since nouveau isn't using drm_dev_enter() and drm_dev_exit(), there are no critical sections so this is nearly functionally equivalent. However, the DRM layer does check to see if the device is unplugged, and if it is returns appropriate error codes. In the future nouveau can add critical sections in order to truly support hot-unplugging. Cc: stable@vger.kernel.org # 5.4+ Signed-off-by: Jeremy Cline Reviewed-by: Lyude Paul Reviewed-by: Ben Skeggs Tested-by: Karol Herbst Signed-off-by: Karol Herbst Link: https://patchwork.freedesktop.org/patch/msgid/20201125202648.5220-2-jcline@redhat.com Link: https://gitlab.freedesktop.org/drm/nouveau/-/merge_requests/14 Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/nouveau/nouveau_drm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c index 5347e5bdee8cc..e18938972a895 100644 --- a/drivers/gpu/drm/nouveau/nouveau_drm.c +++ b/drivers/gpu/drm/nouveau/nouveau_drm.c @@ -779,7 +779,7 @@ nouveau_drm_device_remove(struct drm_device *dev) struct nvkm_client *client; struct nvkm_device *device; - drm_dev_unregister(dev); + drm_dev_unplug(dev); dev->irq_enabled = false; client = nvxx_client(&drm->client.base); From e73eeb2f70b690faf6a02707d76cec8b6b0bbf86 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Mon, 18 Oct 2021 17:34:17 +0300 Subject: [PATCH 1777/5344] drm/i915/dp: Ensure sink rate values are always valid MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 6c34bd4532a3f39952952ddc102737595729afc4 upstream. Atm, there are no sink rate values set for DP (vs. eDP) sinks until the DPCD capabilities are successfully read from the sink. During this time intel_dp->num_common_rates is 0 which can lead to a intel_dp->common_rates[-1] (*) access, which is an undefined behaviour, in the following cases: - In intel_dp_sync_state(), if the encoder is enabled without a sink connected to the encoder's connector (BIOS enabled a monitor, but the user unplugged the monitor until the driver loaded). - In intel_dp_sync_state() if the encoder is enabled with a sink connected, but for some reason the DPCD read has failed. - In intel_dp_compute_link_config() if modesetting a connector without a sink connected on it. - In intel_dp_compute_link_config() if modesetting a connector with a a sink connected on it, but before probing the connector first. To avoid the (*) access in all the above cases, make sure that the sink rate table - and hence the common rate table - is always valid, by setting a default minimum sink rate when registering the connector before anything could use it. I also considered setting all the DP link rates by default, so that modesetting with higher resolution modes also succeeds in the last two cases above. However in case a sink is not connected that would stop working after the first modeset, due to the LT fallback logic. So this would need more work, beyond the scope of this fix. As I mentioned in the previous patch, I don't think the issue this patch fixes is user visible, however it is an undefined behaviour by definition and triggers a BUG() in CONFIG_UBSAN builds, hence CC:stable. v2: Clear the default sink rates, before initializing these for eDP. Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/4297 References: https://gitlab.freedesktop.org/drm/intel/-/issues/4298 Suggested-by: Ville Syrjälä Cc: Ville Syrjälä Cc: Signed-off-by: Imre Deak Reviewed-by: Ville Syrjälä Acked-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20211018143417.1452632-1-imre.deak@intel.com (cherry picked from commit 3f61ef9777c0ab0f03f4af0ed6fd3e5250537a8d) Signed-off-by: Rodrigo Vivi Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/display/intel_dp.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index abc8c42b8b0c1..85583f9146305 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -166,6 +166,12 @@ static void vlv_steal_power_sequencer(struct drm_i915_private *dev_priv, enum pipe pipe); static void intel_dp_unset_edid(struct intel_dp *intel_dp); +static void intel_dp_set_default_sink_rates(struct intel_dp *intel_dp) +{ + intel_dp->sink_rates[0] = 162000; + intel_dp->num_sink_rates = 1; +} + /* update sink rates from dpcd */ static void intel_dp_set_sink_rates(struct intel_dp *intel_dp) { @@ -4261,6 +4267,9 @@ intel_edp_init_dpcd(struct intel_dp *intel_dp) */ intel_psr_init_dpcd(intel_dp); + /* Clear the default sink rates */ + intel_dp->num_sink_rates = 0; + /* Read the eDP 1.4+ supported link rates. */ if (intel_dp->edp_dpcd[0] >= DP_EDP_14) { __le16 sink_rates[DP_MAX_SUPPORTED_RATES]; @@ -7167,6 +7176,8 @@ intel_dp_init_connector(struct intel_digital_port *intel_dig_port, return false; intel_dp_set_source_rates(intel_dp); + intel_dp_set_default_sink_rates(intel_dp); + intel_dp_set_common_rates(intel_dp); intel_dp->reset_link_params = true; intel_dp->pps_pipe = INVALID_PIPE; From 8e9df28af31ec4f80fe3d83baed47bb8e30a8a5a Mon Sep 17 00:00:00 2001 From: hongao Date: Thu, 11 Nov 2021 11:32:07 +0800 Subject: [PATCH 1778/5344] drm/amdgpu: fix set scaling mode Full/Full aspect/Center not works on vga and dvi connectors commit bf552083916a7f8800477b5986940d1c9a31b953 upstream. amdgpu_connector_vga_get_modes missed function amdgpu_get_native_mode which assign amdgpu_encoder->native_mode with *preferred_mode result in amdgpu_encoder->native_mode.clock always be 0. That will cause amdgpu_connector_set_property returned early on: if ((rmx_type != DRM_MODE_SCALE_NONE) && (amdgpu_encoder->native_mode.clock == 0)) when we try to set scaling mode Full/Full aspect/Center. Add the missing function to amdgpu_connector_vga_get_mode can fix this. It also works on dvi connectors because amdgpu_connector_dvi_helper_funcs.get_mode use the same method. Signed-off-by: hongao Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c index cda0a76a733d3..0d39e386f6e9c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c @@ -829,6 +829,7 @@ static int amdgpu_connector_vga_get_modes(struct drm_connector *connector) amdgpu_connector_get_edid(connector); ret = amdgpu_connector_ddc_get_modes(connector); + amdgpu_get_native_mode(connector); return ret; } From 520fc4c70cccc1ac485ecb08a62fa95da88484f1 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 23 Nov 2021 13:16:52 +0100 Subject: [PATCH 1779/5344] Revert "net: mvpp2: disable force link UP during port init procedure" This reverts commit f595e44b161a3c751943c256b6de80dc57d5fcf8 which is commit 87508224485323ce2d4e7fb929ec80f51adcc238 upstream. It causes reported problems so should be removed. Link: https://lore.kernel.org/r/YZv1SBrYTXmorcLJ@shell.armlinux.org.uk Reported-by: Jordan Vrtanoski Reported-by: Russell King Cc: Stefan Chulski Cc: Marcin Wojtas Cc: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index dac0e51e6aafd..16a390c77d198 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -4545,7 +4545,7 @@ static int mvpp2_port_init(struct mvpp2_port *port) struct mvpp2 *priv = port->priv; struct mvpp2_txq_pcpu *txq_pcpu; unsigned int thread; - int queue, err, val; + int queue, err; /* Checks for hardware constraints */ if (port->first_rxq + port->nrxqs > @@ -4559,18 +4559,6 @@ static int mvpp2_port_init(struct mvpp2_port *port) mvpp2_egress_disable(port); mvpp2_port_disable(port); - if (mvpp2_is_xlg(port->phy_interface)) { - val = readl(port->base + MVPP22_XLG_CTRL0_REG); - val &= ~MVPP22_XLG_CTRL0_FORCE_LINK_PASS; - val |= MVPP22_XLG_CTRL0_FORCE_LINK_DOWN; - writel(val, port->base + MVPP22_XLG_CTRL0_REG); - } else { - val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG); - val &= ~MVPP2_GMAC_FORCE_LINK_PASS; - val |= MVPP2_GMAC_FORCE_LINK_DOWN; - writel(val, port->base + MVPP2_GMAC_AUTONEG_CONFIG); - } - port->tx_time_coal = MVPP2_TXDONE_COAL_USEC; port->txqs = devm_kcalloc(dev, port->ntxqs, sizeof(*port->txqs), From 9e0b67d8018d359bed1de457cb89bdf4dcd4d3b6 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Wed, 10 Nov 2021 18:18:14 -0800 Subject: [PATCH 1780/5344] perf/core: Avoid put_page() when GUP fails commit 4716023a8f6a0f4a28047f14dd7ebdc319606b84 upstream. PEBS PERF_SAMPLE_PHYS_ADDR events use perf_virt_to_phys() to convert PMU sampled virtual addresses to physical using get_user_page_fast_only() and page_to_phys(). Some get_user_page_fast_only() error cases return false, indicating no page reference, but still initialize the output page pointer with an unreferenced page. In these error cases perf_virt_to_phys() calls put_page(). This causes page reference count underflow, which can lead to unintentional page sharing. Fix perf_virt_to_phys() to only put_page() if get_user_page_fast_only() returns a referenced page. Fixes: fc7ce9c74c3ad ("perf/core, x86: Add PERF_SAMPLE_PHYS_ADDR") Signed-off-by: Greg Thelen Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211111021814.757086-1-gthelen@google.com Signed-off-by: Greg Kroah-Hartman --- kernel/events/core.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 6ed10918668b8..b30375ba2db76 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6564,7 +6564,6 @@ void perf_output_sample(struct perf_output_handle *handle, static u64 perf_virt_to_phys(u64 virt) { u64 phys_addr = 0; - struct page *p = NULL; if (!virt) return 0; @@ -6583,14 +6582,15 @@ static u64 perf_virt_to_phys(u64 virt) * If failed, leave phys_addr as 0. */ if (current->mm != NULL) { + struct page *p; + pagefault_disable(); - if (__get_user_pages_fast(virt, 1, 0, &p) == 1) + if (__get_user_pages_fast(virt, 1, 0, &p) == 1) { phys_addr = page_to_phys(p) + virt % PAGE_SIZE; + put_page(p); + } pagefault_enable(); } - - if (p) - put_page(p); } return phys_addr; From cb6a59f8bc1616dddc5918f0bb8174be64180fab Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sat, 20 Nov 2021 13:40:51 +0100 Subject: [PATCH 1781/5344] batman-adv: Consider fragmentation for needed_headroom MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 4ca23e2c2074465bff55ea14221175fecdf63c5f upstream. If a batman-adv packets has to be fragmented, then the original batman-adv packet header is not stripped away. Instead, only a new header is added in front of the packet after it was split. This size must be considered to avoid cost intensive reallocations during the transmission through the various device layers. Fixes: 7bca68c7844b ("batman-adv: Add lower layer needed_(head|tail)room to own ones") Reported-by: Linus Lüssing Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich Signed-off-by: Sven Eckelmann Signed-off-by: Greg Kroah-Hartman --- net/batman-adv/hard-interface.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index afb52282d5bd0..18e644f3cb309 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -554,6 +554,9 @@ static void batadv_hardif_recalc_extra_skbroom(struct net_device *soft_iface) needed_headroom = lower_headroom + (lower_header_len - ETH_HLEN); needed_headroom += batadv_max_header_len(); + /* fragmentation headers don't strip the unicast/... header */ + needed_headroom += sizeof(struct batadv_frag_packet); + soft_iface->needed_headroom = needed_headroom; soft_iface->needed_tailroom = lower_tailroom; } From 9f8d8c13e581b03c51ffca25787ff46be1bb6124 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sat, 20 Nov 2021 13:40:52 +0100 Subject: [PATCH 1782/5344] batman-adv: Reserve needed_*room for fragments commit c5cbfc87558168ef4c3c27ce36eba6b83391db19 upstream. The batadv net_device is trying to propagate the needed_headroom and needed_tailroom from the lower devices. This is needed to avoid cost intensive reallocations using pskb_expand_head during the transmission. But the fragmentation code split the skb's without adding extra room at the end/beginning of the various fragments. This reduced the performance of transmissions over complex scenarios (batadv on vxlan on wireguard) because the lower devices had to perform the reallocations at least once. Fixes: ee75ed88879a ("batman-adv: Fragment and send skbs larger than mtu") Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich Signed-off-by: Sven Eckelmann Signed-off-by: Greg Kroah-Hartman --- net/batman-adv/fragmentation.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 385fccdcf69d0..f1d202ff396cd 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -391,6 +391,7 @@ bool batadv_frag_skb_fwd(struct sk_buff *skb, /** * batadv_frag_create() - create a fragment from skb + * @net_dev: outgoing device for fragment * @skb: skb to create fragment from * @frag_head: header to use in new fragment * @fragment_size: size of new fragment @@ -401,22 +402,25 @@ bool batadv_frag_skb_fwd(struct sk_buff *skb, * * Return: the new fragment, NULL on error. */ -static struct sk_buff *batadv_frag_create(struct sk_buff *skb, +static struct sk_buff *batadv_frag_create(struct net_device *net_dev, + struct sk_buff *skb, struct batadv_frag_packet *frag_head, unsigned int fragment_size) { + unsigned int ll_reserved = LL_RESERVED_SPACE(net_dev); + unsigned int tailroom = net_dev->needed_tailroom; struct sk_buff *skb_fragment; unsigned int header_size = sizeof(*frag_head); unsigned int mtu = fragment_size + header_size; - skb_fragment = netdev_alloc_skb(NULL, mtu + ETH_HLEN); + skb_fragment = dev_alloc_skb(ll_reserved + mtu + tailroom); if (!skb_fragment) goto err; skb_fragment->priority = skb->priority; /* Eat the last mtu-bytes of the skb */ - skb_reserve(skb_fragment, header_size + ETH_HLEN); + skb_reserve(skb_fragment, ll_reserved + header_size); skb_split(skb, skb_fragment, skb->len - fragment_size); /* Add the header */ @@ -439,11 +443,12 @@ int batadv_frag_send_packet(struct sk_buff *skb, struct batadv_orig_node *orig_node, struct batadv_neigh_node *neigh_node) { + struct net_device *net_dev = neigh_node->if_incoming->net_dev; struct batadv_priv *bat_priv; struct batadv_hard_iface *primary_if = NULL; struct batadv_frag_packet frag_header; struct sk_buff *skb_fragment; - unsigned int mtu = neigh_node->if_incoming->net_dev->mtu; + unsigned int mtu = net_dev->mtu; unsigned int header_size = sizeof(frag_header); unsigned int max_fragment_size, num_fragments; int ret; @@ -503,7 +508,7 @@ int batadv_frag_send_packet(struct sk_buff *skb, goto put_primary_if; } - skb_fragment = batadv_frag_create(skb, &frag_header, + skb_fragment = batadv_frag_create(net_dev, skb, &frag_header, max_fragment_size); if (!skb_fragment) { ret = -ENOMEM; From 93d44447f3d763b10814a153a960f6eeb2c5923f Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sat, 20 Nov 2021 13:40:53 +0100 Subject: [PATCH 1783/5344] batman-adv: Don't always reallocate the fragmentation skb head commit 992b03b88e36254e26e9a4977ab948683e21bd9f upstream. When a packet is fragmented by batman-adv, the original batman-adv header is not modified. Only a new fragmentation is inserted between the original one and the ethernet header. The code must therefore make sure that it has a writable region of this size in the skbuff head. But it is not useful to always reallocate the skbuff by this size even when there would be more than enough headroom still in the skb. The reallocation is just to costly during in this codepath. Fixes: ee75ed88879a ("batman-adv: Fragment and send skbs larger than mtu") Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich Signed-off-by: Sven Eckelmann Signed-off-by: Greg Kroah-Hartman --- net/batman-adv/fragmentation.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index f1d202ff396cd..0da90e73c79bf 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -527,13 +527,14 @@ int batadv_frag_send_packet(struct sk_buff *skb, frag_header.no++; } - /* Make room for the fragment header. */ - if (batadv_skb_head_push(skb, header_size) < 0 || - pskb_expand_head(skb, header_size + ETH_HLEN, 0, GFP_ATOMIC) < 0) { - ret = -ENOMEM; + /* make sure that there is at least enough head for the fragmentation + * and ethernet headers + */ + ret = skb_cow_head(skb, ETH_HLEN + header_size); + if (ret < 0) goto put_primary_if; - } + skb_push(skb, header_size); memcpy(skb->data, &frag_header, header_size); /* Send the last fragment */ From a388d7a3568882f47a9fe357e998a9592131c33e Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 5 Nov 2021 10:09:25 +0100 Subject: [PATCH 1784/5344] ASoC: DAPM: Cover regression by kctl change notification fix commit 827b0913a9d9d07a0c3e559dbb20ca4d6d285a54 upstream. The recent fix for DAPM to correct the kctl change notification by the commit 5af82c81b2c4 ("ASoC: DAPM: Fix missing kctl change notifications") caused other regressions since it changed the behavior of snd_soc_dapm_set_pin() that is called from several API functions. Formerly it returned always 0 for success, but now it returns 0 or 1. This patch addresses it, restoring the old behavior of snd_soc_dapm_set_pin() while keeping the fix in snd_soc_dapm_put_pin_switch(). Fixes: 5af82c81b2c4 ("ASoC: DAPM: Fix missing kctl change notifications") Reported-by: Yu-Hsuan Hsu Cc: Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/r/20211105090925.20575-1-tiwai@suse.de Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- sound/soc/soc-dapm.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index 66f6b698a5436..5876be5dd9bae 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -2542,8 +2542,13 @@ static struct snd_soc_dapm_widget *dapm_find_widget( return NULL; } -static int snd_soc_dapm_set_pin(struct snd_soc_dapm_context *dapm, - const char *pin, int status) +/* + * set the DAPM pin status: + * returns 1 when the value has been updated, 0 when unchanged, or a negative + * error code; called from kcontrol put callback + */ +static int __snd_soc_dapm_set_pin(struct snd_soc_dapm_context *dapm, + const char *pin, int status) { struct snd_soc_dapm_widget *w = dapm_find_widget(dapm, pin, true); int ret = 0; @@ -2569,6 +2574,18 @@ static int snd_soc_dapm_set_pin(struct snd_soc_dapm_context *dapm, return ret; } +/* + * similar as __snd_soc_dapm_set_pin(), but returns 0 when successful; + * called from several API functions below + */ +static int snd_soc_dapm_set_pin(struct snd_soc_dapm_context *dapm, + const char *pin, int status) +{ + int ret = __snd_soc_dapm_set_pin(dapm, pin, status); + + return ret < 0 ? ret : 0; +} + /** * snd_soc_dapm_sync_unlocked - scan and power dapm paths * @dapm: DAPM context @@ -3584,10 +3601,10 @@ int snd_soc_dapm_put_pin_switch(struct snd_kcontrol *kcontrol, const char *pin = (const char *)kcontrol->private_value; int ret; - if (ucontrol->value.integer.value[0]) - ret = snd_soc_dapm_enable_pin(&card->dapm, pin); - else - ret = snd_soc_dapm_disable_pin(&card->dapm, pin); + mutex_lock_nested(&card->dapm_mutex, SND_SOC_DAPM_CLASS_RUNTIME); + ret = __snd_soc_dapm_set_pin(&card->dapm, pin, + !!ucontrol->value.integer.value[0]); + mutex_unlock(&card->dapm_mutex); snd_soc_dapm_sync(&card->dapm); return ret; From 1d2dfb6025c9bbd95077107ba3b405ca0210d895 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 18 Oct 2021 22:40:28 +0200 Subject: [PATCH 1785/5344] usb: max-3421: Use driver data instead of maintaining a list of bound devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit fc153aba3ef371d0d76eb88230ed4e0dee5b38f2 upstream. Instead of maintaining a single-linked list of devices that must be searched linearly in .remove() just use spi_set_drvdata() to remember the link between the spi device and the driver struct. Then the global list and the next member can be dropped. This simplifies the driver, reduces the memory footprint and the time to search the list. Also it makes obvious that there is always a corresponding driver struct for a given device in .remove(), so the error path for !max3421_hcd can be dropped, too. As a side effect this fixes a data inconsistency when .probe() races with itself for a second max3421 device in manipulating max3421_hcd_list. A similar race is fixed in .remove(), too. Fixes: 2d53139f3162 ("Add support for using a MAX3421E chip as a host driver.") Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20211018204028.2914597-1-u.kleine-koenig@pengutronix.de Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/max3421-hcd.c | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/drivers/usb/host/max3421-hcd.c b/drivers/usb/host/max3421-hcd.c index 5ef0747225f6b..e0b25dd869094 100644 --- a/drivers/usb/host/max3421-hcd.c +++ b/drivers/usb/host/max3421-hcd.c @@ -125,8 +125,6 @@ struct max3421_hcd { struct task_struct *spi_thread; - struct max3421_hcd *next; - enum max3421_rh_state rh_state; /* lower 16 bits contain port status, upper 16 bits the change mask: */ u32 port_status; @@ -174,8 +172,6 @@ struct max3421_ep { u8 retransmit; /* packet needs retransmission */ }; -static struct max3421_hcd *max3421_hcd_list; - #define MAX3421_FIFO_SIZE 64 #define MAX3421_SPI_DIR_RD 0 /* read register from MAX3421 */ @@ -1882,9 +1878,8 @@ max3421_probe(struct spi_device *spi) } set_bit(HCD_FLAG_POLL_RH, &hcd->flags); max3421_hcd = hcd_to_max3421(hcd); - max3421_hcd->next = max3421_hcd_list; - max3421_hcd_list = max3421_hcd; INIT_LIST_HEAD(&max3421_hcd->ep_list); + spi_set_drvdata(spi, max3421_hcd); max3421_hcd->tx = kmalloc(sizeof(*max3421_hcd->tx), GFP_KERNEL); if (!max3421_hcd->tx) @@ -1934,28 +1929,18 @@ max3421_probe(struct spi_device *spi) static int max3421_remove(struct spi_device *spi) { - struct max3421_hcd *max3421_hcd = NULL, **prev; - struct usb_hcd *hcd = NULL; + struct max3421_hcd *max3421_hcd; + struct usb_hcd *hcd; unsigned long flags; - for (prev = &max3421_hcd_list; *prev; prev = &(*prev)->next) { - max3421_hcd = *prev; - hcd = max3421_to_hcd(max3421_hcd); - if (hcd->self.controller == &spi->dev) - break; - } - if (!max3421_hcd) { - dev_err(&spi->dev, "no MAX3421 HCD found for SPI device %p\n", - spi); - return -ENODEV; - } + max3421_hcd = spi_get_drvdata(spi); + hcd = max3421_to_hcd(max3421_hcd); usb_remove_hcd(hcd); spin_lock_irqsave(&max3421_hcd->lock, flags); kthread_stop(max3421_hcd->spi_thread); - *prev = max3421_hcd->next; spin_unlock_irqrestore(&max3421_hcd->lock, flags); From 072f04e0f502e8086f60f7f92c0430a6ef403ae2 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 23 Sep 2021 21:12:52 +0300 Subject: [PATCH 1786/5344] ice: Delete always true check of PF pointer commit 2ff04286a9569675948f39cec2c6ad47c3584633 upstream. PF pointer is always valid when PCI core calls its .shutdown() and .remove() callbacks. There is no need to check it again. Fixes: 837f08fdecbe ("ice: Add basic driver framework for Intel(R) E800 Series") Signed-off-by: Leon Romanovsky Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/ice/ice_main.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index d0ccb7ad447b1..6370e96ebfacc 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -3005,9 +3005,6 @@ static void ice_remove(struct pci_dev *pdev) struct ice_pf *pf = pci_get_drvdata(pdev); int i; - if (!pf) - return; - for (i = 0; i < ICE_MAX_RESET_WAIT; i++) { if (!ice_is_reset_in_progress(pf->state)) break; From 5d898248d2bddd752bd7ae2add73cbca4e643f55 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Thu, 25 Jun 2020 16:03:12 +0800 Subject: [PATCH 1787/5344] tlb: mmu_gather: add tlb_flush_*_range APIs commit 2631ed00b0498810f8d5c2163c6b5270d893687b upstream. tlb_flush_{pte|pmd|pud|p4d}_range() adjust the tlb->start and tlb->end, then set corresponding cleared_*. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Zhenyu Ye Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20200625080314.230-5-yezhenyu2@huawei.com Signed-off-by: Catalin Marinas Signed-off-by: Greg Kroah-Hartman --- include/asm-generic/tlb.h | 55 ++++++++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 15 deletions(-) diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index c716ea81e6531..46294ef620ff9 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -495,6 +495,38 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm } #endif +/* + * tlb_flush_{pte|pmd|pud|p4d}_range() adjust the tlb->start and tlb->end, + * and set corresponding cleared_*. + */ +static inline void tlb_flush_pte_range(struct mmu_gather *tlb, + unsigned long address, unsigned long size) +{ + __tlb_adjust_range(tlb, address, size); + tlb->cleared_ptes = 1; +} + +static inline void tlb_flush_pmd_range(struct mmu_gather *tlb, + unsigned long address, unsigned long size) +{ + __tlb_adjust_range(tlb, address, size); + tlb->cleared_pmds = 1; +} + +static inline void tlb_flush_pud_range(struct mmu_gather *tlb, + unsigned long address, unsigned long size) +{ + __tlb_adjust_range(tlb, address, size); + tlb->cleared_puds = 1; +} + +static inline void tlb_flush_p4d_range(struct mmu_gather *tlb, + unsigned long address, unsigned long size) +{ + __tlb_adjust_range(tlb, address, size); + tlb->cleared_p4ds = 1; +} + #ifndef __tlb_remove_tlb_entry #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) #endif @@ -508,19 +540,17 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm */ #define tlb_remove_tlb_entry(tlb, ptep, address) \ do { \ - __tlb_adjust_range(tlb, address, PAGE_SIZE); \ - tlb->cleared_ptes = 1; \ + tlb_flush_pte_range(tlb, address, PAGE_SIZE); \ __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ do { \ unsigned long _sz = huge_page_size(h); \ - __tlb_adjust_range(tlb, address, _sz); \ if (_sz == PMD_SIZE) \ - tlb->cleared_pmds = 1; \ + tlb_flush_pmd_range(tlb, address, _sz); \ else if (_sz == PUD_SIZE) \ - tlb->cleared_puds = 1; \ + tlb_flush_pud_range(tlb, address, _sz); \ __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) @@ -534,8 +564,7 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm #define tlb_remove_pmd_tlb_entry(tlb, pmdp, address) \ do { \ - __tlb_adjust_range(tlb, address, HPAGE_PMD_SIZE); \ - tlb->cleared_pmds = 1; \ + tlb_flush_pmd_range(tlb, address, HPAGE_PMD_SIZE); \ __tlb_remove_pmd_tlb_entry(tlb, pmdp, address); \ } while (0) @@ -549,8 +578,7 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm #define tlb_remove_pud_tlb_entry(tlb, pudp, address) \ do { \ - __tlb_adjust_range(tlb, address, HPAGE_PUD_SIZE); \ - tlb->cleared_puds = 1; \ + tlb_flush_pud_range(tlb, address, HPAGE_PUD_SIZE); \ __tlb_remove_pud_tlb_entry(tlb, pudp, address); \ } while (0) @@ -575,9 +603,8 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm #ifndef pte_free_tlb #define pte_free_tlb(tlb, ptep, address) \ do { \ - __tlb_adjust_range(tlb, address, PAGE_SIZE); \ + tlb_flush_pmd_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ - tlb->cleared_pmds = 1; \ __pte_free_tlb(tlb, ptep, address); \ } while (0) #endif @@ -585,9 +612,8 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm #ifndef pmd_free_tlb #define pmd_free_tlb(tlb, pmdp, address) \ do { \ - __tlb_adjust_range(tlb, address, PAGE_SIZE); \ + tlb_flush_pud_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ - tlb->cleared_puds = 1; \ __pmd_free_tlb(tlb, pmdp, address); \ } while (0) #endif @@ -596,9 +622,8 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm #ifndef pud_free_tlb #define pud_free_tlb(tlb, pudp, address) \ do { \ - __tlb_adjust_range(tlb, address, PAGE_SIZE); \ + tlb_flush_p4d_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ - tlb->cleared_p4ds = 1; \ __pud_free_tlb(tlb, pudp, address); \ } while (0) #endif From c83fde15a6c022aef816ff755bd75d2f13076832 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Sun, 21 Nov 2021 12:40:07 -0800 Subject: [PATCH 1788/5344] hugetlbfs: flush TLBs correctly after huge_pmd_unshare commit a4a118f2eead1d6c49e00765de89878288d4b890 upstream. When __unmap_hugepage_range() calls to huge_pmd_unshare() succeed, a TLB flush is missing. This TLB flush must be performed before releasing the i_mmap_rwsem, in order to prevent an unshared PMDs page from being released and reused before the TLB flush took place. Arguably, a comprehensive solution would use mmu_gather interface to batch the TLB flushes and the PMDs page release, however it is not an easy solution: (1) try_to_unmap_one() and try_to_migrate_one() also call huge_pmd_unshare() and they cannot use the mmu_gather interface; and (2) deferring the release of the page reference for the PMDs page until after i_mmap_rwsem is dropeed can confuse huge_pmd_unshare() into thinking PMDs are shared when they are not. Fix __unmap_hugepage_range() by adding the missing TLB flush, and forcing a flush when unshare is successful. Fixes: 24669e58477e ("hugetlb: use mmu_gather instead of a temporary linked list for accumulating pages)" # 3.6 Signed-off-by: Nadav Amit Reviewed-by: Mike Kravetz Cc: Aneesh Kumar K.V Cc: KAMEZAWA Hiroyuki Cc: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/hugetlb.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 30845de8db7fb..4921d9391e736 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3932,6 +3932,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); struct mmu_notifier_range range; + bool force_flush = false; WARN_ON(!is_vm_hugetlb_page(vma)); BUG_ON(start & ~huge_page_mask(h)); @@ -3960,10 +3961,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(mm, &address, ptep)) { spin_unlock(ptl); - /* - * We just unmapped a page of PMDs by clearing a PUD. - * The caller's TLB flush range should cover this area. - */ + tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE); + force_flush = true; continue; } @@ -4020,6 +4019,22 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, } mmu_notifier_invalidate_range_end(&range); tlb_end_vma(tlb, vma); + + /* + * If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We + * could defer the flush until now, since by holding i_mmap_rwsem we + * guaranteed that the last refernece would not be dropped. But we must + * do the flushing before we return, as otherwise i_mmap_rwsem will be + * dropped and the last reference to the shared PMDs page might be + * dropped as well. + * + * In theory we could defer the freeing of the PMD pages as well, but + * huge_pmd_unshare() relies on the exact page_count for the PMD page to + * detect sharing, so we cannot defer the release of the page either. + * Instead, do flush now. + */ + if (force_flush) + tlb_flush_mmu_tlbonly(tlb); } void __unmap_hugepage_range_final(struct mmu_gather *tlb, From 9d39ac9f214ec0cba49ab36355cdaf31b8939d06 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Fri, 24 Sep 2021 14:24:16 -0500 Subject: [PATCH 1789/5344] ALSA: hda: hdac_ext_stream: fix potential locking issues commit 868ddfcef31ff93ea8961b2e81ea7fe12f6f144b upstream. The code for hdac_ext_stream seems inherited from hdac_stream, and similar locking issues are present: the use of the bus->reg_lock spinlock is inconsistent, with only writes to specific fields being protected. Apply similar fix as in hdac_stream by protecting all accesses to 'link_locked' and 'decoupled' fields, with a new helper snd_hdac_ext_stream_decouple_locked() added to simplify code changes. Signed-off-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/20210924192417.169243-4-pierre-louis.bossart@linux.intel.com Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- include/sound/hdaudio_ext.h | 2 ++ sound/hda/ext/hdac_ext_stream.c | 46 ++++++++++++++++++++------------- 2 files changed, 30 insertions(+), 18 deletions(-) diff --git a/include/sound/hdaudio_ext.h b/include/sound/hdaudio_ext.h index ef88b20c7b0a5..23dc8deac344d 100644 --- a/include/sound/hdaudio_ext.h +++ b/include/sound/hdaudio_ext.h @@ -88,6 +88,8 @@ struct hdac_ext_stream *snd_hdac_ext_stream_assign(struct hdac_bus *bus, struct snd_pcm_substream *substream, int type); void snd_hdac_ext_stream_release(struct hdac_ext_stream *azx_dev, int type); +void snd_hdac_ext_stream_decouple_locked(struct hdac_bus *bus, + struct hdac_ext_stream *azx_dev, bool decouple); void snd_hdac_ext_stream_decouple(struct hdac_bus *bus, struct hdac_ext_stream *azx_dev, bool decouple); void snd_hdac_ext_stop_streams(struct hdac_bus *bus); diff --git a/sound/hda/ext/hdac_ext_stream.c b/sound/hda/ext/hdac_ext_stream.c index 6b1b4b834baef..04f4070fbf366 100644 --- a/sound/hda/ext/hdac_ext_stream.c +++ b/sound/hda/ext/hdac_ext_stream.c @@ -106,20 +106,14 @@ void snd_hdac_stream_free_all(struct hdac_bus *bus) } EXPORT_SYMBOL_GPL(snd_hdac_stream_free_all); -/** - * snd_hdac_ext_stream_decouple - decouple the hdac stream - * @bus: HD-audio core bus - * @stream: HD-audio ext core stream object to initialize - * @decouple: flag to decouple - */ -void snd_hdac_ext_stream_decouple(struct hdac_bus *bus, - struct hdac_ext_stream *stream, bool decouple) +void snd_hdac_ext_stream_decouple_locked(struct hdac_bus *bus, + struct hdac_ext_stream *stream, + bool decouple) { struct hdac_stream *hstream = &stream->hstream; u32 val; int mask = AZX_PPCTL_PROCEN(hstream->index); - spin_lock_irq(&bus->reg_lock); val = readw(bus->ppcap + AZX_REG_PP_PPCTL) & mask; if (decouple && !val) @@ -128,6 +122,20 @@ void snd_hdac_ext_stream_decouple(struct hdac_bus *bus, snd_hdac_updatel(bus->ppcap, AZX_REG_PP_PPCTL, mask, 0); stream->decoupled = decouple; +} +EXPORT_SYMBOL_GPL(snd_hdac_ext_stream_decouple_locked); + +/** + * snd_hdac_ext_stream_decouple - decouple the hdac stream + * @bus: HD-audio core bus + * @stream: HD-audio ext core stream object to initialize + * @decouple: flag to decouple + */ +void snd_hdac_ext_stream_decouple(struct hdac_bus *bus, + struct hdac_ext_stream *stream, bool decouple) +{ + spin_lock_irq(&bus->reg_lock); + snd_hdac_ext_stream_decouple_locked(bus, stream, decouple); spin_unlock_irq(&bus->reg_lock); } EXPORT_SYMBOL_GPL(snd_hdac_ext_stream_decouple); @@ -252,6 +260,7 @@ hdac_ext_link_stream_assign(struct hdac_bus *bus, return NULL; } + spin_lock_irq(&bus->reg_lock); list_for_each_entry(stream, &bus->stream_list, list) { struct hdac_ext_stream *hstream = container_of(stream, struct hdac_ext_stream, @@ -266,17 +275,16 @@ hdac_ext_link_stream_assign(struct hdac_bus *bus, } if (!hstream->link_locked) { - snd_hdac_ext_stream_decouple(bus, hstream, true); + snd_hdac_ext_stream_decouple_locked(bus, hstream, true); res = hstream; break; } } if (res) { - spin_lock_irq(&bus->reg_lock); res->link_locked = 1; res->link_substream = substream; - spin_unlock_irq(&bus->reg_lock); } + spin_unlock_irq(&bus->reg_lock); return res; } @@ -292,6 +300,7 @@ hdac_ext_host_stream_assign(struct hdac_bus *bus, return NULL; } + spin_lock_irq(&bus->reg_lock); list_for_each_entry(stream, &bus->stream_list, list) { struct hdac_ext_stream *hstream = container_of(stream, struct hdac_ext_stream, @@ -301,18 +310,17 @@ hdac_ext_host_stream_assign(struct hdac_bus *bus, if (!stream->opened) { if (!hstream->decoupled) - snd_hdac_ext_stream_decouple(bus, hstream, true); + snd_hdac_ext_stream_decouple_locked(bus, hstream, true); res = hstream; break; } } if (res) { - spin_lock_irq(&bus->reg_lock); res->hstream.opened = 1; res->hstream.running = 0; res->hstream.substream = substream; - spin_unlock_irq(&bus->reg_lock); } + spin_unlock_irq(&bus->reg_lock); return res; } @@ -378,15 +386,17 @@ void snd_hdac_ext_stream_release(struct hdac_ext_stream *stream, int type) break; case HDAC_EXT_STREAM_TYPE_HOST: + spin_lock_irq(&bus->reg_lock); if (stream->decoupled && !stream->link_locked) - snd_hdac_ext_stream_decouple(bus, stream, false); + snd_hdac_ext_stream_decouple_locked(bus, stream, false); + spin_unlock_irq(&bus->reg_lock); snd_hdac_stream_release(&stream->hstream); break; case HDAC_EXT_STREAM_TYPE_LINK: - if (stream->decoupled && !stream->hstream.opened) - snd_hdac_ext_stream_decouple(bus, stream, false); spin_lock_irq(&bus->reg_lock); + if (stream->decoupled && !stream->hstream.opened) + snd_hdac_ext_stream_decouple_locked(bus, stream, false); stream->link_locked = 0; stream->link_substream = NULL; spin_unlock_irq(&bus->reg_lock); From 8fa55bccb7f11d27dc2e6ce814c6adcd5442015d Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Fri, 24 Sep 2021 14:24:14 -0500 Subject: [PATCH 1790/5344] ALSA: hda: hdac_stream: fix potential locking issue in snd_hdac_stream_assign() commit 1465d06a6d8580e73ae65f8590392df58c5ed2fd upstream. The fields 'opened', 'running', 'assigned_key' are all protected by a spinlock, but the spinlock is not taken when looking for a stream. This can result in a possible race between assign() and release(). Fix by taking the spinlock before walking through the bus stream list. Signed-off-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/20210924192417.169243-2-pierre-louis.bossart@linux.intel.com Signed-off-by: Takashi Iwai Cc: Scott Bruce Signed-off-by: Greg Kroah-Hartman --- sound/hda/hdac_stream.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/hda/hdac_stream.c b/sound/hda/hdac_stream.c index 682ed39f79b01..b299b8b7f871a 100644 --- a/sound/hda/hdac_stream.c +++ b/sound/hda/hdac_stream.c @@ -289,6 +289,7 @@ struct hdac_stream *snd_hdac_stream_assign(struct hdac_bus *bus, int key = (substream->pcm->device << 16) | (substream->number << 2) | (substream->stream + 1); + spin_lock_irq(&bus->reg_lock); list_for_each_entry(azx_dev, &bus->stream_list, list) { if (azx_dev->direction != substream->stream) continue; @@ -302,13 +303,12 @@ struct hdac_stream *snd_hdac_stream_assign(struct hdac_bus *bus, res = azx_dev; } if (res) { - spin_lock_irq(&bus->reg_lock); res->opened = 1; res->running = 0; res->assigned_key = key; res->substream = substream; - spin_unlock_irq(&bus->reg_lock); } + spin_unlock_irq(&bus->reg_lock); return res; } EXPORT_SYMBOL_GPL(snd_hdac_stream_assign); From 3e737a274d4c1114fad60b9edea7171317695f92 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 26 Nov 2021 10:47:23 +0100 Subject: [PATCH 1791/5344] Linux 5.4.162 Link: https://lore.kernel.org/r/20211124115654.849735859@linuxfoundation.org Link: https://lore.kernel.org/r/20211125092028.153766171@linuxfoundation.org Tested-by: Jon Hunter Tested-by: Sudip Mukherjee Tested-by: Jon Hunter Tested-by: Guenter Roeck Tested-by: Linux Kernel Functional Testing Tested-by: Hulk Robot Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index f552556966f1d..e8b05f7d3b238 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 PATCHLEVEL = 4 -SUBLEVEL = 161 +SUBLEVEL = 162 EXTRAVERSION = NAME = Kleptomaniac Octopus From 2a823af210accbed7726b24160a087a2a6456725 Mon Sep 17 00:00:00 2001 From: Daniele Palmas Date: Fri, 19 Nov 2021 15:03:19 +0100 Subject: [PATCH 1792/5344] USB: serial: option: add Telit LE910S1 0x9200 composition commit e353f3e88720300c3d72f49a4bea54f42db1fa5e upstream. Add the following Telit LE910S1 composition: 0x9200: tty Signed-off-by: Daniele Palmas Link: https://lore.kernel.org/r/20211119140319.10448-1-dnlplm@gmail.com Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/option.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index a1e9cbe518c74..6c27483db1d35 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1267,6 +1267,8 @@ static const struct usb_device_id option_ids[] = { .driver_info = NCTRL(2) }, { USB_DEVICE(TELIT_VENDOR_ID, 0x9010), /* Telit SBL FN980 flashing device */ .driver_info = NCTRL(0) | ZLP }, + { USB_DEVICE(TELIT_VENDOR_ID, 0x9200), /* Telit LE910S1 flashing device */ + .driver_info = NCTRL(0) | ZLP }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, ZTE_PRODUCT_MF622, 0xff, 0xff, 0xff) }, /* ZTE WCDMA products */ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0002, 0xff, 0xff, 0xff), .driver_info = RSVD(1) }, From 9b64b12db1451d22c977ae66702cb2ea405f469e Mon Sep 17 00:00:00 2001 From: Mingjie Zhang Date: Tue, 23 Nov 2021 21:37:57 +0800 Subject: [PATCH 1793/5344] USB: serial: option: add Fibocom FM101-GL variants commit 88459e3e42760abb2299bbf6cb1026491170e02a upstream. Update the USB serial option driver support for the Fibocom FM101-GL Cat.6 LTE modules as there are actually several different variants. - VID:PID 2cb7:01a2, FM101-GL are laptop M.2 cards (with MBIM interfaces for /Linux/Chrome OS) - VID:PID 2cb7:01a4, FM101-GL for laptop debug M.2 cards(with adb interface for /Linux/Chrome OS) 0x01a2: mbim, tty, tty, diag, gnss 0x01a4: mbim, diag, tty, adb, gnss, gnss Here are the outputs of lsusb -v and usb-devices: T: Bus=02 Lev=01 Prnt=01 Port=03 Cnt=01 Dev#= 86 Spd=5000 MxCh= 0 D: Ver= 3.20 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 9 #Cfgs= 1 P: Vendor=2cb7 ProdID=01a2 Rev= 5.04 S: Manufacturer=Fibocom Wireless Inc. S: Product=Fibocom FM101-GL Module S: SerialNumber=673326ce C:* #Ifs= 6 Cfg#= 1 Atr=a0 MxPwr=896mA A: FirstIf#= 0 IfCount= 2 Cls=02(comm.) Sub=0e Prot=00 I:* If#= 0 Alt= 0 #EPs= 1 Cls=02(comm.) Sub=0e Prot=00 Driver=cdc_mbim I: If#= 1 Alt= 0 #EPs= 0 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim I:* If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim I:* If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=(none) I:* If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=(none) I:* If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=(none) I:* If#= 5 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=40 Driver=(none) Bus 002 Device 084: ID 2cb7:01a2 Fibocom Wireless Inc. Fibocom FM101-GL Module Device Descriptor: bLength 18 bDescriptorType 1 bcdUSB 3.20 bDeviceClass 0 bDeviceSubClass 0 bDeviceProtocol 0 bMaxPacketSize0 9 idVendor 0x2cb7 idProduct 0x01a2 bcdDevice 5.04 iManufacturer 1 Fibocom Wireless Inc. iProduct 2 Fibocom FM101-GL Module iSerial 3 673326ce bNumConfigurations 1 Configuration Descriptor: bLength 9 bDescriptorType 2 wTotalLength 0x015d bNumInterfaces 6 bConfigurationValue 1 iConfiguration 4 MBIM_DUN_DUN_DIAG_NMEA bmAttributes 0xa0 (Bus Powered) Remote Wakeup MaxPower 896mA Interface Association: bLength 8 bDescriptorType 11 bFirstInterface 0 bInterfaceCount 2 bFunctionClass 2 Communications bFunctionSubClass 14 bFunctionProtocol 0 iFunction 0 Interface Descriptor: bLength 9 bDescriptorType 4 bInterfaceNumber 0 bAlternateSetting 0 bNumEndpoints 1 bInterfaceClass 2 Communications bInterfaceSubClass 14 bInterfaceProtocol 0 iInterface 5 Fibocom FM101-GL LTE Modem CDC Header: bcdCDC 1.10 CDC Union: bMasterInterface 0 bSlaveInterface 1 CDC MBIM: bcdMBIMVersion 1.00 wMaxControlMessage 4096 bNumberFilters 32 bMaxFilterSize 128 wMaxSegmentSize 2048 bmNetworkCapabilities 0x20 8-byte ntb input size CDC MBIM Extended: bcdMBIMExtendedVersion 1.00 bMaxOutstandingCommandMessages 64 wMTU 1500 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x81 EP 1 IN bmAttributes 3 Transfer Type Interrupt Synch Type None Usage Type Data wMaxPacketSize 0x0040 1x 64 bytes bInterval 9 bMaxBurst 0 Interface Descriptor: bLength 9 bDescriptorType 4 bInterfaceNumber 1 bAlternateSetting 0 bNumEndpoints 0 bInterfaceClass 10 CDC Data bInterfaceSubClass 0 bInterfaceProtocol 2 iInterface 0 Interface Descriptor: bLength 9 bDescriptorType 4 bInterfaceNumber 1 bAlternateSetting 1 bNumEndpoints 2 bInterfaceClass 10 CDC Data bInterfaceSubClass 0 bInterfaceProtocol 2 iInterface 6 MBIM Data Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x8e EP 14 IN bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0400 1x 1024 bytes bInterval 0 bMaxBurst 6 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x0f EP 15 OUT bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0400 1x 1024 bytes bInterval 0 bMaxBurst 2 Interface Descriptor: bLength 9 bDescriptorType 4 bInterfaceNumber 2 bAlternateSetting 0 bNumEndpoints 3 bInterfaceClass 255 Vendor Specific Class bInterfaceSubClass 255 Vendor Specific Subclass bInterfaceProtocol 64 iInterface 0 ** UNRECOGNIZED: 05 24 00 10 01 ** UNRECOGNIZED: 05 24 01 00 00 ** UNRECOGNIZED: 04 24 02 02 ** UNRECOGNIZED: 05 24 06 00 00 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x83 EP 3 IN bmAttributes 3 Transfer Type Interrupt Synch Type None Usage Type Data wMaxPacketSize 0x000a 1x 10 bytes bInterval 9 bMaxBurst 0 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x82 EP 2 IN bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0400 1x 1024 bytes bInterval 0 bMaxBurst 0 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x01 EP 1 OUT bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0400 1x 1024 bytes bInterval 0 bMaxBurst 0 Interface Descriptor: bLength 9 bDescriptorType 4 bInterfaceNumber 3 bAlternateSetting 0 bNumEndpoints 3 bInterfaceClass 255 Vendor Specific Class bInterfaceSubClass 255 Vendor Specific Subclass bInterfaceProtocol 64 iInterface 0 ** UNRECOGNIZED: 05 24 00 10 01 ** UNRECOGNIZED: 05 24 01 00 00 ** UNRECOGNIZED: 04 24 02 02 ** UNRECOGNIZED: 05 24 06 00 00 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x85 EP 5 IN bmAttributes 3 Transfer Type Interrupt Synch Type None Usage Type Data wMaxPacketSize 0x000a 1x 10 bytes bInterval 9 bMaxBurst 0 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x84 EP 4 IN bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0400 1x 1024 bytes bInterval 0 bMaxBurst 0 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x02 EP 2 OUT bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0400 1x 1024 bytes bInterval 0 bMaxBurst 0 Interface Descriptor: bLength 9 bDescriptorType 4 bInterfaceNumber 4 bAlternateSetting 0 bNumEndpoints 2 bInterfaceClass 255 Vendor Specific Class bInterfaceSubClass 255 Vendor Specific Subclass bInterfaceProtocol 48 iInterface 0 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x03 EP 3 OUT bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0400 1x 1024 bytes bInterval 0 bMaxBurst 0 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x86 EP 6 IN bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0400 1x 1024 bytes bInterval 0 bMaxBurst 0 Interface Descriptor: bLength 9 bDescriptorType 4 bInterfaceNumber 5 bAlternateSetting 0 bNumEndpoints 3 bInterfaceClass 255 Vendor Specific Class bInterfaceSubClass 0 bInterfaceProtocol 64 iInterface 0 ** UNRECOGNIZED: 05 24 00 10 01 ** UNRECOGNIZED: 05 24 01 00 00 ** UNRECOGNIZED: 04 24 02 02 ** UNRECOGNIZED: 05 24 06 00 00 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x88 EP 8 IN bmAttributes 3 Transfer Type Interrupt Synch Type None Usage Type Data wMaxPacketSize 0x000a 1x 10 bytes bInterval 9 bMaxBurst 0 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x87 EP 7 IN bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0400 1x 1024 bytes bInterval 0 bMaxBurst 0 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x04 EP 4 OUT bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0400 1x 1024 bytes bInterval 0 bMaxBurst 0 T: Bus=02 Lev=01 Prnt=01 Port=03 Cnt=01 Dev#= 85 Spd=5000 MxCh= 0 D: Ver= 3.20 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 9 #Cfgs= 1 P: Vendor=2cb7 ProdID=01a4 Rev= 5.04 S: Manufacturer=Fibocom Wireless Inc. S: Product=Fibocom FM101-GL Module S: SerialNumber=673326ce C:* #Ifs= 7 Cfg#= 1 Atr=a0 MxPwr=896mA A: FirstIf#= 0 IfCount= 2 Cls=02(comm.) Sub=0e Prot=00 I:* If#= 0 Alt= 0 #EPs= 1 Cls=02(comm.) Sub=0e Prot=00 Driver=cdc_mbim I: If#= 1 Alt= 0 #EPs= 0 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim I:* If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim I:* If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=(none) I:* If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=(none) I:* If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=(none) I:* If#= 5 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=40 Driver=(none) I:* If#= 6 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=40 Driver=(none) Bus 002 Device 085: ID 2cb7:01a4 Fibocom Wireless Inc. Fibocom FM101-GL Module Device Descriptor: bLength 18 bDescriptorType 1 bcdUSB 3.20 bDeviceClass 0 bDeviceSubClass 0 bDeviceProtocol 0 bMaxPacketSize0 9 idVendor 0x2cb7 idProduct 0x01a4 bcdDevice 5.04 iManufacturer 1 Fibocom Wireless Inc. iProduct 2 Fibocom FM101-GL Module iSerial 3 673326ce bNumConfigurations 1 Configuration Descriptor: bLength 9 bDescriptorType 2 wTotalLength 0x0180 bNumInterfaces 7 bConfigurationValue 1 iConfiguration 4 MBIM_DIAG_DUN_ADB_GNSS_GNSS bmAttributes 0xa0 (Bus Powered) Remote Wakeup MaxPower 896mA Interface Association: bLength 8 bDescriptorType 11 bFirstInterface 0 bInterfaceCount 2 bFunctionClass 2 Communications bFunctionSubClass 14 bFunctionProtocol 0 iFunction 0 Interface Descriptor: bLength 9 bDescriptorType 4 bInterfaceNumber 0 bAlternateSetting 0 bNumEndpoints 1 bInterfaceClass 2 Communications bInterfaceSubClass 14 bInterfaceProtocol 0 iInterface 5 Fibocom FM101-GL LTE Modem CDC Header: bcdCDC 1.10 CDC Union: bMasterInterface 0 bSlaveInterface 1 CDC MBIM: bcdMBIMVersion 1.00 wMaxControlMessage 4096 bNumberFilters 32 bMaxFilterSize 128 wMaxSegmentSize 2048 bmNetworkCapabilities 0x20 8-byte ntb input size CDC MBIM Extended: bcdMBIMExtendedVersion 1.00 bMaxOutstandingCommandMessages 64 wMTU 1500 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x81 EP 1 IN bmAttributes 3 Transfer Type Interrupt Synch Type None Usage Type Data wMaxPacketSize 0x0040 1x 64 bytes bInterval 9 bMaxBurst 0 Interface Descriptor: bLength 9 bDescriptorType 4 bInterfaceNumber 1 bAlternateSetting 0 bNumEndpoints 0 bInterfaceClass 10 CDC Data bInterfaceSubClass 0 bInterfaceProtocol 2 iInterface 0 Interface Descriptor: bLength 9 bDescriptorType 4 bInterfaceNumber 1 bAlternateSetting 1 bNumEndpoints 2 bInterfaceClass 10 CDC Data bInterfaceSubClass 0 bInterfaceProtocol 2 iInterface 6 MBIM Data Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x8e EP 14 IN bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0400 1x 1024 bytes bInterval 0 bMaxBurst 6 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x0f EP 15 OUT bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0400 1x 1024 bytes bInterval 0 bMaxBurst 2 Interface Descriptor: bLength 9 bDescriptorType 4 bInterfaceNumber 2 bAlternateSetting 0 bNumEndpoints 2 bInterfaceClass 255 Vendor Specific Class bInterfaceSubClass 255 Vendor Specific Subclass bInterfaceProtocol 48 iInterface 0 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x01 EP 1 OUT bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0400 1x 1024 bytes bInterval 0 bMaxBurst 0 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x82 EP 2 IN bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0400 1x 1024 bytes bInterval 0 bMaxBurst 0 Interface Descriptor: bLength 9 bDescriptorType 4 bInterfaceNumber 3 bAlternateSetting 0 bNumEndpoints 3 bInterfaceClass 255 Vendor Specific Class bInterfaceSubClass 255 Vendor Specific Subclass bInterfaceProtocol 64 iInterface 0 ** UNRECOGNIZED: 05 24 00 10 01 ** UNRECOGNIZED: 05 24 01 00 00 ** UNRECOGNIZED: 04 24 02 02 ** UNRECOGNIZED: 05 24 06 00 00 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x84 EP 4 IN bmAttributes 3 Transfer Type Interrupt Synch Type None Usage Type Data wMaxPacketSize 0x000a 1x 10 bytes bInterval 9 bMaxBurst 0 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x83 EP 3 IN bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0400 1x 1024 bytes bInterval 0 bMaxBurst 0 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x02 EP 2 OUT bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0400 1x 1024 bytes bInterval 0 bMaxBurst 0 Interface Descriptor: bLength 9 bDescriptorType 4 bInterfaceNumber 4 bAlternateSetting 0 bNumEndpoints 2 bInterfaceClass 255 Vendor Specific Class bInterfaceSubClass 66 bInterfaceProtocol 1 iInterface 8 ADB Interface Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x03 EP 3 OUT bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0400 1x 1024 bytes bInterval 0 bMaxBurst 0 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x85 EP 5 IN bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0400 1x 1024 bytes bInterval 0 bMaxBurst 0 Interface Descriptor: bLength 9 bDescriptorType 4 bInterfaceNumber 5 bAlternateSetting 0 bNumEndpoints 3 bInterfaceClass 255 Vendor Specific Class bInterfaceSubClass 0 bInterfaceProtocol 64 iInterface 0 ** UNRECOGNIZED: 05 24 00 10 01 ** UNRECOGNIZED: 05 24 01 00 00 ** UNRECOGNIZED: 04 24 02 02 ** UNRECOGNIZED: 05 24 06 00 00 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x87 EP 7 IN bmAttributes 3 Transfer Type Interrupt Synch Type None Usage Type Data wMaxPacketSize 0x000a 1x 10 bytes bInterval 9 bMaxBurst 0 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x86 EP 6 IN bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0400 1x 1024 bytes bInterval 0 bMaxBurst 0 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x04 EP 4 OUT bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0400 1x 1024 bytes bInterval 0 bMaxBurst 0 Interface Descriptor: bLength 9 bDescriptorType 4 bInterfaceNumber 6 bAlternateSetting 0 bNumEndpoints 3 bInterfaceClass 255 Vendor Specific Class bInterfaceSubClass 0 bInterfaceProtocol 64 iInterface 0 ** UNRECOGNIZED: 05 24 00 10 01 ** UNRECOGNIZED: 05 24 01 00 00 ** UNRECOGNIZED: 04 24 02 02 ** UNRECOGNIZED: 05 24 06 00 00 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x89 EP 9 IN bmAttributes 3 Transfer Type Interrupt Synch Type None Usage Type Data wMaxPacketSize 0x000a 1x 10 bytes bInterval 9 bMaxBurst 0 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x88 EP 8 IN bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0400 1x 1024 bytes bInterval 0 bMaxBurst 0 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x05 EP 5 OUT bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0400 1x 1024 bytes bInterval 0 bMaxBurst 0 Signed-off-by: Mingjie Zhang Link: https://lore.kernel.org/r/20211123133757.37475-1-superzmj@fibocom.com Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/option.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 6c27483db1d35..74203ed5479fa 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -2096,6 +2096,9 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(0x2cb7, 0x010b, 0xff, 0xff, 0x30) }, /* Fibocom FG150 Diag */ { USB_DEVICE_AND_INTERFACE_INFO(0x2cb7, 0x010b, 0xff, 0, 0) }, /* Fibocom FG150 AT */ { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x01a0, 0xff) }, /* Fibocom NL668-AM/NL652-EU (laptop MBIM) */ + { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x01a2, 0xff) }, /* Fibocom FM101-GL (laptop MBIM) */ + { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x01a4, 0xff), /* Fibocom FM101-GL (laptop MBIM) */ + .driver_info = RSVD(4) }, { USB_DEVICE_INTERFACE_CLASS(0x2df3, 0x9d03, 0xff) }, /* LongSung M5710 */ { USB_DEVICE_INTERFACE_CLASS(0x305a, 0x1404, 0xff) }, /* GosunCn GM500 RNDIS */ { USB_DEVICE_INTERFACE_CLASS(0x305a, 0x1405, 0xff) }, /* GosunCn GM500 MBIM */ From 03738f8360f7d6daa034e08b1a8eb8eb784cecfe Mon Sep 17 00:00:00 2001 From: Minas Harutyunyan Date: Thu, 4 Nov 2021 11:36:01 +0400 Subject: [PATCH 1794/5344] usb: dwc2: gadget: Fix ISOC flow for elapsed frames commit 7ad4a0b1d46b2612f4429a72afd8f137d7efa9a9 upstream. Added updating of request frame number for elapsed frames, otherwise frame number will remain as previous use of request. This will allow function driver to correctly track frames in case of Missed ISOC occurs. Added setting request actual length to 0 for elapsed frames. In Slave mode when pushing data to RxFIFO by dwords, request actual length incrementing accordingly. But before whole packet will be pushed into RxFIFO and send to host can occurs Missed ISOC and data will not send to host. So, in this case request actual length should be reset to 0. Fixes: 91bb163e1e4f ("usb: dwc2: gadget: Fix ISOC flow for BDMA and Slave") Cc: stable Reviewed-by: John Keeping Signed-off-by: Minas Harutyunyan Link: https://lore.kernel.org/r/c356baade6e9716d312d43df08d53ae557cb8037.1636011277.git.Minas.Harutyunyan@synopsys.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc2/gadget.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/usb/dwc2/gadget.c b/drivers/usb/dwc2/gadget.c index e8b25dae09499..249e8e6aa9282 100644 --- a/drivers/usb/dwc2/gadget.c +++ b/drivers/usb/dwc2/gadget.c @@ -1198,6 +1198,8 @@ static void dwc2_hsotg_start_req(struct dwc2_hsotg *hsotg, } ctrl |= DXEPCTL_CNAK; } else { + hs_req->req.frame_number = hs_ep->target_frame; + hs_req->req.actual = 0; dwc2_hsotg_complete_request(hsotg, hs_ep, hs_req, -ENODATA); return; } @@ -2855,9 +2857,12 @@ static void dwc2_gadget_handle_ep_disabled(struct dwc2_hsotg_ep *hs_ep) do { hs_req = get_ep_head(hs_ep); - if (hs_req) + if (hs_req) { + hs_req->req.frame_number = hs_ep->target_frame; + hs_req->req.actual = 0; dwc2_hsotg_complete_request(hsotg, hs_ep, hs_req, -ENODATA); + } dwc2_gadget_incr_frame_num(hs_ep); /* Update current frame number value. */ hsotg->frame_number = dwc2_hsotg_read_frameno(hsotg); @@ -2910,8 +2915,11 @@ static void dwc2_gadget_handle_out_token_ep_disabled(struct dwc2_hsotg_ep *ep) while (dwc2_gadget_target_frame_elapsed(ep)) { hs_req = get_ep_head(ep); - if (hs_req) + if (hs_req) { + hs_req->req.frame_number = ep->target_frame; + hs_req->req.actual = 0; dwc2_hsotg_complete_request(hsotg, ep, hs_req, -ENODATA); + } dwc2_gadget_incr_frame_num(ep); /* Update current frame number value. */ @@ -3000,8 +3008,11 @@ static void dwc2_gadget_handle_nak(struct dwc2_hsotg_ep *hs_ep) while (dwc2_gadget_target_frame_elapsed(hs_ep)) { hs_req = get_ep_head(hs_ep); - if (hs_req) + if (hs_req) { + hs_req->req.frame_number = hs_ep->target_frame; + hs_req->req.actual = 0; dwc2_hsotg_complete_request(hsotg, hs_ep, hs_req, -ENODATA); + } dwc2_gadget_incr_frame_num(hs_ep); /* Update current frame number value. */ From 679fa77a25dd09d32ac7f1cdec8fb35dcbef2d61 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 5 Nov 2021 07:58:03 -0700 Subject: [PATCH 1795/5344] usb: dwc2: hcd_queue: Fix use of floating point literal commit 310780e825f3ffd211b479b8f828885a6faedd63 upstream. A new commit in LLVM causes an error on the use of 'long double' when '-mno-x87' is used, which the kernel does through an alias, '-mno-80387' (see the LLVM commit below for more details around why it does this). drivers/usb/dwc2/hcd_queue.c:1744:25: error: expression requires 'long double' type support, but target 'x86_64-unknown-linux-gnu' does not support it delay = ktime_set(0, DWC2_RETRY_WAIT_DELAY); ^ drivers/usb/dwc2/hcd_queue.c:62:34: note: expanded from macro 'DWC2_RETRY_WAIT_DELAY' #define DWC2_RETRY_WAIT_DELAY (1 * 1E6L) ^ 1 error generated. This happens due to the use of a 'long double' literal. The 'E6' part of '1E6L' causes the literal to be a 'double' then the 'L' suffix promotes it to 'long double'. There is no visible reason for a floating point value in this driver, as the value is only used as a parameter to a function that expects an integer type. Use NSEC_PER_MSEC, which is the same integer value as '1E6L', to avoid changing functionality but fix the error. Link: https://github.com/ClangBuiltLinux/linux/issues/1497 Link: https://github.com/llvm/llvm-project/commit/a8083d42b1c346e21623a1d36d1f0cadd7801d83 Fixes: 6ed30a7d8ec2 ("usb: dwc2: host: use hrtimer for NAK retries") Cc: stable Reviewed-by: Nick Desaulniers Reviewed-by: John Keeping Acked-by: Minas Harutyunyan Signed-off-by: Nathan Chancellor Link: https://lore.kernel.org/r/20211105145802.2520658-1-nathan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc2/hcd_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/dwc2/hcd_queue.c b/drivers/usb/dwc2/hcd_queue.c index 68bbac64b7536..94af71e9856f2 100644 --- a/drivers/usb/dwc2/hcd_queue.c +++ b/drivers/usb/dwc2/hcd_queue.c @@ -59,7 +59,7 @@ #define DWC2_UNRESERVE_DELAY (msecs_to_jiffies(5)) /* If we get a NAK, wait this long before retrying */ -#define DWC2_RETRY_WAIT_DELAY 1*1E6L +#define DWC2_RETRY_WAIT_DELAY (1 * NSEC_PER_MSEC) /** * dwc2_periodic_channel_available() - Checks that a channel is available for a From d78a1d8e18ec90d31daa701e9bbb16724abfdd18 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 23 Nov 2021 12:27:19 +0200 Subject: [PATCH 1796/5344] net: nexthop: fix null pointer dereference when IPv6 is not enabled commit 1c743127cc54b112b155f434756bd4b5fa565a99 upstream. When we try to add an IPv6 nexthop and IPv6 is not enabled (!CONFIG_IPV6) we'll hit a NULL pointer dereference[1] in the error path of nh_create_ipv6() due to calling ipv6_stub->fib6_nh_release. The bug has been present since the beginning of IPv6 nexthop gateway support. Commit 1aefd3de7bc6 ("ipv6: Add fib6_nh_init and release to stubs") tells us that only fib6_nh_init has a dummy stub because fib6_nh_release should not be called if fib6_nh_init returns an error, but the commit below added a call to ipv6_stub->fib6_nh_release in its error path. To fix it return the dummy stub's -EAFNOSUPPORT error directly without calling ipv6_stub->fib6_nh_release in nh_create_ipv6()'s error path. [1] Output is a bit truncated, but it clearly shows the error. BUG: kernel NULL pointer dereference, address: 000000000000000000 #PF: supervisor instruction fetch in kernel modede #PF: error_code(0x0010) - not-present pagege PGD 0 P4D 0 Oops: 0010 [#1] PREEMPT SMP NOPTI CPU: 4 PID: 638 Comm: ip Kdump: loaded Not tainted 5.16.0-rc1+ #446 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-4.fc34 04/01/2014 RIP: 0010:0x0 Code: Unable to access opcode bytes at RIP 0xffffffffffffffd6. RSP: 0018:ffff888109f5b8f0 EFLAGS: 00010286^Ac RAX: 0000000000000000 RBX: ffff888109f5ba28 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8881008a2860 RBP: ffff888109f5b9d8 R08: 0000000000000000 R09: 0000000000000000 R10: ffff888109f5b978 R11: ffff888109f5b948 R12: 00000000ffffff9f R13: ffff8881008a2a80 R14: ffff8881008a2860 R15: ffff8881008a2840 FS: 00007f98de70f100(0000) GS:ffff88822bf00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffffffffd6 CR3: 0000000100efc000 CR4: 00000000000006e0 Call Trace: nh_create_ipv6+0xed/0x10c rtm_new_nexthop+0x6d7/0x13f3 ? check_preemption_disabled+0x3d/0xf2 ? lock_is_held_type+0xbe/0xfd rtnetlink_rcv_msg+0x23f/0x26a ? check_preemption_disabled+0x3d/0xf2 ? rtnl_calcit.isra.0+0x147/0x147 netlink_rcv_skb+0x61/0xb2 netlink_unicast+0x100/0x187 netlink_sendmsg+0x37f/0x3a0 ? netlink_unicast+0x187/0x187 sock_sendmsg_nosec+0x67/0x9b ____sys_sendmsg+0x19d/0x1f9 ? copy_msghdr_from_user+0x4c/0x5e ? rcu_read_lock_any_held+0x2a/0x78 ___sys_sendmsg+0x6c/0x8c ? asm_sysvec_apic_timer_interrupt+0x12/0x20 ? lockdep_hardirqs_on+0xd9/0x102 ? sockfd_lookup_light+0x69/0x99 __sys_sendmsg+0x50/0x6e do_syscall_64+0xcb/0xf2 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f98dea28914 Code: 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b5 0f 1f 80 00 00 00 00 48 8d 05 e9 5d 0c 00 8b 00 85 c0 75 13 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 54 c3 0f 1f 00 41 54 41 89 d4 55 48 89 f5 53 RSP: 002b:00007fff859f5e68 EFLAGS: 00000246 ORIG_RAX: 000000000000002e2e RAX: ffffffffffffffda RBX: 00000000619cb810 RCX: 00007f98dea28914 RDX: 0000000000000000 RSI: 00007fff859f5ed0 RDI: 0000000000000003 RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000008 R10: fffffffffffffce6 R11: 0000000000000246 R12: 0000000000000001 R13: 000055c0097ae520 R14: 000055c0097957fd R15: 00007fff859f63a0 Modules linked in: bridge stp llc bonding virtio_net Cc: stable@vger.kernel.org Fixes: 53010f991a9f ("nexthop: Add support for IPv6 gateways") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/nexthop.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 858bb10d8341e..8bdffdf0502a1 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -1231,11 +1231,15 @@ static int nh_create_ipv6(struct net *net, struct nexthop *nh, /* sets nh_dev if successful */ err = ipv6_stub->fib6_nh_init(net, fib6_nh, &fib6_cfg, GFP_KERNEL, extack); - if (err) + if (err) { + /* IPv6 is not enabled, don't call fib6_nh_release */ + if (err == -EAFNOSUPPORT) + goto out; ipv6_stub->fib6_nh_release(fib6_nh); - else + } else { nh->nh_flags = fib6_nh->fib_nh_flags; - + } +out: return err; } From 85d9efeb4cca44c3049dff342c244dfd78b0aee0 Mon Sep 17 00:00:00 2001 From: Ondrej Jirman Date: Mon, 8 Nov 2021 11:28:32 +0100 Subject: [PATCH 1797/5344] usb: typec: fusb302: Fix masking of comparator and bc_lvl interrupts commit 362468830dd5bea8bf6ad5203b2ea61f8a4e8288 upstream. The code that enables either BC_LVL or COMP_CHNG interrupt in tcpm_set_cc wrongly assumes that the interrupt is unmasked by writing 1 to the apropriate bit in the mask register. In fact, interrupts are enabled when the mask is 0, so the tcpm_set_cc enables interrupt for COMP_CHNG when it expects BC_LVL interrupt to be enabled. This causes inability of the driver to recognize cable unplug events in host mode (unplug is recognized only via a COMP_CHNG interrupt). In device mode this bug was masked by simultaneous triggering of the VBUS change interrupt, because of loss of VBUS when the port peer is providing power. Fixes: 48242e30532b ("usb: typec: fusb302: Revert "Resolve fixed power role contract setup"") Cc: stable Cc: Hans de Goede Reviewed-by: Hans de Goede Acked-by: Heikki Krogerus Signed-off-by: Ondrej Jirman Link: https://lore.kernel.org/r/20211108102833.2793803-1-megous@megous.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/fusb302.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/usb/typec/tcpm/fusb302.c b/drivers/usb/typec/tcpm/fusb302.c index b498960ff72b5..5e661bae39972 100644 --- a/drivers/usb/typec/tcpm/fusb302.c +++ b/drivers/usb/typec/tcpm/fusb302.c @@ -669,25 +669,27 @@ static int tcpm_set_cc(struct tcpc_dev *dev, enum typec_cc_status cc) ret = fusb302_i2c_mask_write(chip, FUSB_REG_MASK, FUSB_REG_MASK_BC_LVL | FUSB_REG_MASK_COMP_CHNG, - FUSB_REG_MASK_COMP_CHNG); + FUSB_REG_MASK_BC_LVL); if (ret < 0) { fusb302_log(chip, "cannot set SRC interrupt, ret=%d", ret); goto done; } chip->intr_comp_chng = true; + chip->intr_bc_lvl = false; break; case TYPEC_CC_RD: ret = fusb302_i2c_mask_write(chip, FUSB_REG_MASK, FUSB_REG_MASK_BC_LVL | FUSB_REG_MASK_COMP_CHNG, - FUSB_REG_MASK_BC_LVL); + FUSB_REG_MASK_COMP_CHNG); if (ret < 0) { fusb302_log(chip, "cannot set SRC interrupt, ret=%d", ret); goto done; } chip->intr_bc_lvl = true; + chip->intr_comp_chng = false; break; default: break; From a425bf488839a341617cf98a6fac3988fcedb2d5 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Tue, 16 Nov 2021 00:16:30 +0200 Subject: [PATCH 1798/5344] usb: hub: Fix usb enumeration issue due to address0 race commit 6ae6dc22d2d1ce6aa77a6da8a761e61aca216f8b upstream. xHC hardware can only have one slot in default state with address 0 waiting for a unique address at a time, otherwise "undefined behavior may occur" according to xhci spec 5.4.3.4 The address0_mutex exists to prevent this across both xhci roothubs. If hub_port_init() fails, it may unlock the mutex and exit with a xhci slot in default state. If the other xhci roothub calls hub_port_init() at this point we end up with two slots in default state. Make sure the address0_mutex protects the slot default state across hub_port_init() retries, until slot is addressed or disabled. Note, one known minor case is not fixed by this patch. If device needs to be reset during resume, but fails all hub_port_init() retries in usb_reset_and_verify_device(), then it's possible the slot is still left in default state when address0_mutex is unlocked. Cc: Fixes: 638139eb95d2 ("usb: hub: allow to process more usb hub events in parallel") Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20211115221630.871204-1-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hub.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 303e8b3c1bdae..d6e9cf3092ff1 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -4609,8 +4609,6 @@ hub_port_init(struct usb_hub *hub, struct usb_device *udev, int port1, if (oldspeed == USB_SPEED_LOW) delay = HUB_LONG_RESET_TIME; - mutex_lock(hcd->address0_mutex); - /* Reset the device; full speed may morph to high speed */ /* FIXME a USB 2.0 device may morph into SuperSpeed on reset. */ retval = hub_port_reset(hub, port1, udev, delay, false); @@ -4925,7 +4923,6 @@ hub_port_init(struct usb_hub *hub, struct usb_device *udev, int port1, hub_port_disable(hub, port1, 0); update_devnum(udev, devnum); /* for disconnect processing */ } - mutex_unlock(hcd->address0_mutex); return retval; } @@ -5070,6 +5067,9 @@ static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus, unit_load = 100; status = 0; + + mutex_lock(hcd->address0_mutex); + for (i = 0; i < SET_CONFIG_TRIES; i++) { /* reallocate for each attempt, since references @@ -5106,6 +5106,8 @@ static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus, if (status < 0) goto loop; + mutex_unlock(hcd->address0_mutex); + if (udev->quirks & USB_QUIRK_DELAY_INIT) msleep(2000); @@ -5194,6 +5196,7 @@ static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus, loop_disable: hub_port_disable(hub, port1, 1); + mutex_lock(hcd->address0_mutex); loop: usb_ep0_reinit(udev); release_devnum(udev); @@ -5220,6 +5223,8 @@ static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus, } done: + mutex_unlock(hcd->address0_mutex); + hub_port_disable(hub, port1, 1); if (hcd->driver->relinquish_port && !hub->hdev->parent) { if (status != -ENOTCONN && status != -ENODEV) @@ -5794,6 +5799,8 @@ static int usb_reset_and_verify_device(struct usb_device *udev) bos = udev->bos; udev->bos = NULL; + mutex_lock(hcd->address0_mutex); + for (i = 0; i < SET_CONFIG_TRIES; ++i) { /* ep0 maxpacket size may change; let the HCD know about it. @@ -5803,6 +5810,7 @@ static int usb_reset_and_verify_device(struct usb_device *udev) if (ret >= 0 || ret == -ENOTCONN || ret == -ENODEV) break; } + mutex_unlock(hcd->address0_mutex); if (ret < 0) goto re_enumerate; From 36cfdd97f127c1e8b346247c7612cf4da7db8b92 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Tue, 23 Nov 2021 12:16:56 +0200 Subject: [PATCH 1799/5344] usb: hub: Fix locking issues with address0_mutex commit 6cca13de26eea6d32a98d96d916a048d16a12822 upstream. Fix the circular lock dependency and unbalanced unlock of addess0_mutex introduced when fixing an address0_mutex enumeration retry race in commit ae6dc22d2d1 ("usb: hub: Fix usb enumeration issue due to address0 race") Make sure locking order between port_dev->status_lock and address0_mutex is correct, and that address0_mutex is not unlocked in hub_port_connect "done:" codepath which may be reached without locking address0_mutex Fixes: 6ae6dc22d2d1 ("usb: hub: Fix usb enumeration issue due to address0 race") Cc: Reported-by: Marek Szyprowski Tested-by: Hans de Goede Tested-by: Marek Szyprowski Acked-by: Hans de Goede Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20211123101656.1113518-1-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hub.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index d6e9cf3092ff1..d7ab2e88631a0 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -5012,6 +5012,7 @@ static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus, struct usb_port *port_dev = hub->ports[port1 - 1]; struct usb_device *udev = port_dev->child; static int unreliable_port = -1; + bool retry_locked; /* Disconnect any existing devices under this port */ if (udev) { @@ -5068,9 +5069,10 @@ static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus, status = 0; - mutex_lock(hcd->address0_mutex); - for (i = 0; i < SET_CONFIG_TRIES; i++) { + usb_lock_port(port_dev); + mutex_lock(hcd->address0_mutex); + retry_locked = true; /* reallocate for each attempt, since references * to the previous one can escape in various ways @@ -5079,6 +5081,8 @@ static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus, if (!udev) { dev_err(&port_dev->dev, "couldn't allocate usb_device\n"); + mutex_unlock(hcd->address0_mutex); + usb_unlock_port(port_dev); goto done; } @@ -5100,13 +5104,13 @@ static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus, } /* reset (non-USB 3.0 devices) and get descriptor */ - usb_lock_port(port_dev); status = hub_port_init(hub, udev, port1, i); - usb_unlock_port(port_dev); if (status < 0) goto loop; mutex_unlock(hcd->address0_mutex); + usb_unlock_port(port_dev); + retry_locked = false; if (udev->quirks & USB_QUIRK_DELAY_INIT) msleep(2000); @@ -5196,11 +5200,14 @@ static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus, loop_disable: hub_port_disable(hub, port1, 1); - mutex_lock(hcd->address0_mutex); loop: usb_ep0_reinit(udev); release_devnum(udev); hub_free_dev(udev); + if (retry_locked) { + mutex_unlock(hcd->address0_mutex); + usb_unlock_port(port_dev); + } usb_put_dev(udev); if ((status == -ENOTCONN) || (status == -ENOTSUPP)) break; @@ -5223,8 +5230,6 @@ static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus, } done: - mutex_unlock(hcd->address0_mutex); - hub_port_disable(hub, port1, 1); if (hcd->driver->relinquish_port && !hub->hdev->parent) { if (status != -ENOTCONN && status != -ENODEV) From 77e2ff5f18e804c122b83b44c64d3e375677c116 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 12 Nov 2021 10:07:20 -0800 Subject: [PATCH 1800/5344] binder: fix test regression due to sender_euid change commit c21a80ca0684ec2910344d72556c816cb8940c01 upstream. This is a partial revert of commit 29bc22ac5e5b ("binder: use euid from cred instead of using task"). Setting sender_euid using proc->cred caused some Android system test regressions that need further investigation. It is a partial reversion because subsequent patches rely on proc->cred. Fixes: 29bc22ac5e5b ("binder: use euid from cred instead of using task") Cc: stable@vger.kernel.org # 4.4+ Acked-by: Christian Brauner Signed-off-by: Todd Kjos Change-Id: I9b1769a3510fed250bb21859ef8beebabe034c66 Link: https://lore.kernel.org/r/20211112180720.2858135-1-tkjos@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 33765b85c3366..6d76fff323673 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -3097,7 +3097,7 @@ static void binder_transaction(struct binder_proc *proc, t->from = thread; else t->from = NULL; - t->sender_euid = proc->cred->euid; + t->sender_euid = task_euid(proc->tsk); t->to_proc = target_proc; t->to_thread = target_thread; t->code = tr->code; From dc796281a7b8d91c0f4d21de3fbc425d8628b656 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 18 Nov 2021 22:57:29 +0100 Subject: [PATCH 1801/5344] ALSA: ctxfi: Fix out-of-range access commit 76c47183224c86e4011048b80f0e2d0d166f01c2 upstream. The master and next_conj of rcs_ops are used for iterating the resource list entries, and currently those are supposed to return the current value. The problem is that next_conf may go over the last entry before the loop abort condition is evaluated, and it may return the "current" value that is beyond the array size. It was caught recently as a GPF, for example. Those return values are, however, never actually evaluated, hence basically we don't have to consider the current value as the return at all. By dropping those return values, the potential out-of-range access above is also fixed automatically. This patch changes the return type of master and next_conj callbacks to void and drop the superfluous code accordingly. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=214985 Cc: Link: https://lore.kernel.org/r/20211118215729.26257-1-tiwai@suse.de Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/ctxfi/ctamixer.c | 14 ++++++-------- sound/pci/ctxfi/ctdaio.c | 16 ++++++++-------- sound/pci/ctxfi/ctresource.c | 7 +++---- sound/pci/ctxfi/ctresource.h | 4 ++-- sound/pci/ctxfi/ctsrc.c | 7 +++---- 5 files changed, 22 insertions(+), 26 deletions(-) diff --git a/sound/pci/ctxfi/ctamixer.c b/sound/pci/ctxfi/ctamixer.c index d4ff377eb3a34..6d636bdcaa5a3 100644 --- a/sound/pci/ctxfi/ctamixer.c +++ b/sound/pci/ctxfi/ctamixer.c @@ -23,16 +23,15 @@ #define BLANK_SLOT 4094 -static int amixer_master(struct rsc *rsc) +static void amixer_master(struct rsc *rsc) { rsc->conj = 0; - return rsc->idx = container_of(rsc, struct amixer, rsc)->idx[0]; + rsc->idx = container_of(rsc, struct amixer, rsc)->idx[0]; } -static int amixer_next_conj(struct rsc *rsc) +static void amixer_next_conj(struct rsc *rsc) { rsc->conj++; - return container_of(rsc, struct amixer, rsc)->idx[rsc->conj]; } static int amixer_index(const struct rsc *rsc) @@ -331,16 +330,15 @@ int amixer_mgr_destroy(struct amixer_mgr *amixer_mgr) /* SUM resource management */ -static int sum_master(struct rsc *rsc) +static void sum_master(struct rsc *rsc) { rsc->conj = 0; - return rsc->idx = container_of(rsc, struct sum, rsc)->idx[0]; + rsc->idx = container_of(rsc, struct sum, rsc)->idx[0]; } -static int sum_next_conj(struct rsc *rsc) +static void sum_next_conj(struct rsc *rsc) { rsc->conj++; - return container_of(rsc, struct sum, rsc)->idx[rsc->conj]; } static int sum_index(const struct rsc *rsc) diff --git a/sound/pci/ctxfi/ctdaio.c b/sound/pci/ctxfi/ctdaio.c index 27441d498968d..b5e1296af09ee 100644 --- a/sound/pci/ctxfi/ctdaio.c +++ b/sound/pci/ctxfi/ctdaio.c @@ -51,12 +51,12 @@ static struct daio_rsc_idx idx_20k2[NUM_DAIOTYP] = { [SPDIFIO] = {.left = 0x05, .right = 0x85}, }; -static int daio_master(struct rsc *rsc) +static void daio_master(struct rsc *rsc) { /* Actually, this is not the resource index of DAIO. * For DAO, it is the input mapper index. And, for DAI, * it is the output time-slot index. */ - return rsc->conj = rsc->idx; + rsc->conj = rsc->idx; } static int daio_index(const struct rsc *rsc) @@ -64,19 +64,19 @@ static int daio_index(const struct rsc *rsc) return rsc->conj; } -static int daio_out_next_conj(struct rsc *rsc) +static void daio_out_next_conj(struct rsc *rsc) { - return rsc->conj += 2; + rsc->conj += 2; } -static int daio_in_next_conj_20k1(struct rsc *rsc) +static void daio_in_next_conj_20k1(struct rsc *rsc) { - return rsc->conj += 0x200; + rsc->conj += 0x200; } -static int daio_in_next_conj_20k2(struct rsc *rsc) +static void daio_in_next_conj_20k2(struct rsc *rsc) { - return rsc->conj += 0x100; + rsc->conj += 0x100; } static const struct rsc_ops daio_out_rsc_ops = { diff --git a/sound/pci/ctxfi/ctresource.c b/sound/pci/ctxfi/ctresource.c index 0bb5696e44b37..ec5f597b580ad 100644 --- a/sound/pci/ctxfi/ctresource.c +++ b/sound/pci/ctxfi/ctresource.c @@ -109,18 +109,17 @@ static int audio_ring_slot(const struct rsc *rsc) return (rsc->conj << 4) + offset_in_audio_slot_block[rsc->type]; } -static int rsc_next_conj(struct rsc *rsc) +static void rsc_next_conj(struct rsc *rsc) { unsigned int i; for (i = 0; (i < 8) && (!(rsc->msr & (0x1 << i))); ) i++; rsc->conj += (AUDIO_SLOT_BLOCK_NUM >> i); - return rsc->conj; } -static int rsc_master(struct rsc *rsc) +static void rsc_master(struct rsc *rsc) { - return rsc->conj = rsc->idx; + rsc->conj = rsc->idx; } static const struct rsc_ops rsc_generic_ops = { diff --git a/sound/pci/ctxfi/ctresource.h b/sound/pci/ctxfi/ctresource.h index 93e47488a1c1c..92146054af582 100644 --- a/sound/pci/ctxfi/ctresource.h +++ b/sound/pci/ctxfi/ctresource.h @@ -39,8 +39,8 @@ struct rsc { }; struct rsc_ops { - int (*master)(struct rsc *rsc); /* Move to master resource */ - int (*next_conj)(struct rsc *rsc); /* Move to next conjugate resource */ + void (*master)(struct rsc *rsc); /* Move to master resource */ + void (*next_conj)(struct rsc *rsc); /* Move to next conjugate resource */ int (*index)(const struct rsc *rsc); /* Return the index of resource */ /* Return the output slot number */ int (*output_slot)(const struct rsc *rsc); diff --git a/sound/pci/ctxfi/ctsrc.c b/sound/pci/ctxfi/ctsrc.c index 37c18ce84974a..7d2bda0c3d3de 100644 --- a/sound/pci/ctxfi/ctsrc.c +++ b/sound/pci/ctxfi/ctsrc.c @@ -590,16 +590,15 @@ int src_mgr_destroy(struct src_mgr *src_mgr) /* SRCIMP resource manager operations */ -static int srcimp_master(struct rsc *rsc) +static void srcimp_master(struct rsc *rsc) { rsc->conj = 0; - return rsc->idx = container_of(rsc, struct srcimp, rsc)->idx[0]; + rsc->idx = container_of(rsc, struct srcimp, rsc)->idx[0]; } -static int srcimp_next_conj(struct rsc *rsc) +static void srcimp_next_conj(struct rsc *rsc) { rsc->conj++; - return container_of(rsc, struct srcimp, rsc)->idx[rsc->conj]; } static int srcimp_index(const struct rsc *rsc) From 9e4e357fe72d99ff8ea3c873889f7b23587fe173 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 2 Nov 2021 12:24:26 +0000 Subject: [PATCH 1802/5344] media: cec: copy sequence field for the reply commit 13cbaa4c2b7bf9f8285e1164d005dbf08244ecd5 upstream. When the reply for a non-blocking transmit arrives, the sequence field for that reply was never filled in, so userspace would have no way of associating the reply to the original transmit. Copy the sequence field to ensure that this is now possible. Signed-off-by: Hans Verkuil Fixes: 0dbacebede1e ([media] cec: move the CEC framework out of staging and to media) Cc: Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/cec/cec-adap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/media/cec/cec-adap.c b/drivers/media/cec/cec-adap.c index 06383b26712b6..56857ac0a0be2 100644 --- a/drivers/media/cec/cec-adap.c +++ b/drivers/media/cec/cec-adap.c @@ -1191,6 +1191,7 @@ void cec_received_msg_ts(struct cec_adapter *adap, if (abort) dst->rx_status |= CEC_RX_STATUS_FEATURE_ABORT; msg->flags = dst->flags; + msg->sequence = dst->sequence; /* Remove it from the wait_queue */ list_del_init(&data->list); From 7fce013901431ec448865500f830667da0c60725 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sun, 21 Nov 2021 11:10:55 +0100 Subject: [PATCH 1803/5344] Revert "parisc: Fix backtrace to always include init funtion names" commit 98400ad75e95860e9a10ec78b0b90ab66184a2ce upstream. This reverts commit 279917e27edc293eb645a25428c6ab3f3bca3f86. With the CONFIG_HARDENED_USERCOPY option enabled, this patch triggers kernel bugs at runtime: usercopy: Kernel memory overwrite attempt detected to kernel text (offset 2084839, size 6)! kernel BUG at mm/usercopy.c:99! Backtrace: IAOQ[0]: usercopy_abort+0xc4/0xe8 [<00000000406ed1c8>] __check_object_size+0x174/0x238 [<00000000407086d4>] copy_strings.isra.0+0x3e8/0x708 [<0000000040709a20>] do_execveat_common.isra.0+0x1bc/0x328 [<000000004070b760>] compat_sys_execve+0x7c/0xb8 [<0000000040303eb8>] syscall_exit+0x0/0x14 The problem is, that we have an init section of at least 2MB size which starts at _stext and is freed after bootup. If then later some kernel data is (temporarily) stored in this free memory, check_kernel_text_object() will trigger a bug since the data appears to be inside the kernel text (>=_stext) area: if (overlaps(ptr, len, _stext, _etext)) usercopy_abort("kernel text"); Signed-off-by: Helge Deller Cc: stable@kernel.org # 5.4+ Signed-off-by: Greg Kroah-Hartman --- arch/parisc/kernel/vmlinux.lds.S | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S index 164483b37d854..99cd24f2ea01b 100644 --- a/arch/parisc/kernel/vmlinux.lds.S +++ b/arch/parisc/kernel/vmlinux.lds.S @@ -56,8 +56,6 @@ SECTIONS { . = KERNEL_BINARY_TEXT_START; - _stext = .; /* start of kernel text, includes init code & data */ - __init_begin = .; HEAD_TEXT_SECTION MLONGCALL_DISCARD(INIT_TEXT_SECTION(8)) @@ -81,6 +79,7 @@ SECTIONS /* freed after init ends here */ _text = .; /* Text and read-only data */ + _stext = .; MLONGCALL_KEEP(INIT_TEXT_SECTION(8)) .text ALIGN(PAGE_SIZE) : { TEXT_TEXT From 0a9757d1d0be0e585b58999e20aa689f0ee08ec9 Mon Sep 17 00:00:00 2001 From: Jason Gerecke Date: Mon, 8 Nov 2021 16:31:01 -0800 Subject: [PATCH 1804/5344] HID: wacom: Use "Confidence" flag to prevent reporting invalid contacts commit 7fb0413baa7f8a04caef0c504df9af7e0623d296 upstream. The HID descriptor of many of Wacom's touch input devices include a "Confidence" usage that signals if a particular touch collection contains useful data. The driver does not look at this flag, however, which causes even invalid contacts to be reported to userspace. A lucky combination of kernel event filtering and device behavior (specifically: contact ID 0 == invalid, contact ID >0 == valid; and order all data so that all valid contacts are reported before any invalid contacts) spare most devices from any visibly-bad behavior. The DTH-2452 is one example of an unlucky device that misbehaves. It uses ID 0 for both the first valid contact and all invalid contacts. Because we report both the valid and invalid contacts, the kernel reports that contact 0 first goes down (valid) and then goes up (invalid) in every report. This causes ~100 clicks per second simply by touching the screen. This patch inroduces new `confidence` flag in our `hid_data` structure. The value is initially set to `true` at the start of a report and can be set to `false` if an invalid touch usage is seen. Link: https://github.com/linuxwacom/input-wacom/issues/270 Fixes: f8b6a74719b5 ("HID: wacom: generic: Support multiple tools per report") Signed-off-by: Jason Gerecke Tested-by: Joshua Dickens Cc: Signed-off-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- drivers/hid/wacom_wac.c | 8 +++++++- drivers/hid/wacom_wac.h | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index f6be2e70a4967..e011839f19f89 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -2578,6 +2578,9 @@ static void wacom_wac_finger_event(struct hid_device *hdev, return; switch (equivalent_usage) { + case HID_DG_CONFIDENCE: + wacom_wac->hid_data.confidence = value; + break; case HID_GD_X: wacom_wac->hid_data.x = value; break; @@ -2610,7 +2613,8 @@ static void wacom_wac_finger_event(struct hid_device *hdev, } if (usage->usage_index + 1 == field->report_count) { - if (equivalent_usage == wacom_wac->hid_data.last_slot_field) + if (equivalent_usage == wacom_wac->hid_data.last_slot_field && + wacom_wac->hid_data.confidence) wacom_wac_finger_slot(wacom_wac, wacom_wac->touch_input); } } @@ -2625,6 +2629,8 @@ static void wacom_wac_finger_pre_report(struct hid_device *hdev, wacom_wac->is_invalid_bt_frame = false; + hid_data->confidence = true; + for (i = 0; i < report->maxfield; i++) { struct hid_field *field = report->field[i]; int j; diff --git a/drivers/hid/wacom_wac.h b/drivers/hid/wacom_wac.h index e3835407e8d23..8dea7cb298e69 100644 --- a/drivers/hid/wacom_wac.h +++ b/drivers/hid/wacom_wac.h @@ -300,6 +300,7 @@ struct hid_data { bool tipswitch; bool barrelswitch; bool barrelswitch2; + bool confidence; int x; int y; int pressure; From 8d7dbe665bb3cf97e30d8a3a803f7781ac24cf6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Noralf=20Tr=C3=B8nnes?= Date: Fri, 5 Nov 2021 21:43:58 +0100 Subject: [PATCH 1805/5344] staging/fbtft: Fix backlight MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 7865dd24934ad580d1bcde8f63c39f324211a23b upstream. Commit b4a1ed0cd18b ("fbdev: make FB_BACKLIGHT a tristate") forgot to update fbtft breaking its backlight support when FB_BACKLIGHT is a module. Since FB_TFT selects FB_BACKLIGHT there's no need for this conditional so just remove it and we're good. Fixes: b4a1ed0cd18b ("fbdev: make FB_BACKLIGHT a tristate") Cc: Acked-by: Sam Ravnborg Signed-off-by: Noralf Trønnes Link: https://lore.kernel.org/r/20211105204358.2991-1-noralf@tronnes.org Signed-off-by: Greg Kroah-Hartman --- drivers/staging/fbtft/fb_ssd1351.c | 4 ---- drivers/staging/fbtft/fbtft-core.c | 9 +-------- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/drivers/staging/fbtft/fb_ssd1351.c b/drivers/staging/fbtft/fb_ssd1351.c index cf263a58a1489..6fd549a424d53 100644 --- a/drivers/staging/fbtft/fb_ssd1351.c +++ b/drivers/staging/fbtft/fb_ssd1351.c @@ -187,7 +187,6 @@ static struct fbtft_display display = { }, }; -#ifdef CONFIG_FB_BACKLIGHT static int update_onboard_backlight(struct backlight_device *bd) { struct fbtft_par *par = bl_get_data(bd); @@ -231,9 +230,6 @@ static void register_onboard_backlight(struct fbtft_par *par) if (!par->fbtftops.unregister_backlight) par->fbtftops.unregister_backlight = fbtft_unregister_backlight; } -#else -static void register_onboard_backlight(struct fbtft_par *par) { }; -#endif FBTFT_REGISTER_DRIVER(DRVNAME, "solomon,ssd1351", &display); diff --git a/drivers/staging/fbtft/fbtft-core.c b/drivers/staging/fbtft/fbtft-core.c index bc53d68bfcaa3..771697508cec8 100644 --- a/drivers/staging/fbtft/fbtft-core.c +++ b/drivers/staging/fbtft/fbtft-core.c @@ -136,7 +136,6 @@ static int fbtft_request_gpios_dt(struct fbtft_par *par) } #endif -#ifdef CONFIG_FB_BACKLIGHT static int fbtft_backlight_update_status(struct backlight_device *bd) { struct fbtft_par *par = bl_get_data(bd); @@ -169,6 +168,7 @@ void fbtft_unregister_backlight(struct fbtft_par *par) par->info->bl_dev = NULL; } } +EXPORT_SYMBOL(fbtft_unregister_backlight); static const struct backlight_ops fbtft_bl_ops = { .get_brightness = fbtft_backlight_get_brightness, @@ -206,12 +206,7 @@ void fbtft_register_backlight(struct fbtft_par *par) if (!par->fbtftops.unregister_backlight) par->fbtftops.unregister_backlight = fbtft_unregister_backlight; } -#else -void fbtft_register_backlight(struct fbtft_par *par) { }; -void fbtft_unregister_backlight(struct fbtft_par *par) { }; -#endif EXPORT_SYMBOL(fbtft_register_backlight); -EXPORT_SYMBOL(fbtft_unregister_backlight); static void fbtft_set_addr_win(struct fbtft_par *par, int xs, int ys, int xe, int ye) @@ -860,13 +855,11 @@ int fbtft_register_framebuffer(struct fb_info *fb_info) fb_info->fix.smem_len >> 10, text1, HZ / fb_info->fbdefio->delay, text2); -#ifdef CONFIG_FB_BACKLIGHT /* Turn on backlight if available */ if (fb_info->bl_dev) { fb_info->bl_dev->props.power = FB_BLANK_UNBLANK; fb_info->bl_dev->ops->update_status(fb_info->bl_dev); } -#endif return 0; From 0e67a9de0afc0de2c8a8ec37de05280fe82062ad Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 17 Nov 2021 10:20:16 +0300 Subject: [PATCH 1806/5344] staging: rtl8192e: Fix use after free in _rtl92e_pci_disconnect() commit b535917c51acc97fb0761b1edec85f1f3d02bda4 upstream. The free_rtllib() function frees the "dev" pointer so there is use after free on the next line. Re-arrange things to avoid that. Fixes: 66898177e7e5 ("staging: rtl8192e: Fix unload/reload problem") Cc: stable Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/20211117072016.GA5237@kili Signed-off-by: Greg Kroah-Hartman --- drivers/staging/rtl8192e/rtl8192e/rtl_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/staging/rtl8192e/rtl8192e/rtl_core.c b/drivers/staging/rtl8192e/rtl8192e/rtl_core.c index c702ee9691b1d..bcbf0c8cd4209 100644 --- a/drivers/staging/rtl8192e/rtl8192e/rtl_core.c +++ b/drivers/staging/rtl8192e/rtl8192e/rtl_core.c @@ -2559,13 +2559,14 @@ static void _rtl92e_pci_disconnect(struct pci_dev *pdev) free_irq(dev->irq, dev); priv->irq = 0; } - free_rtllib(dev); if (dev->mem_start != 0) { iounmap((void __iomem *)dev->mem_start); release_mem_region(pci_resource_start(pdev, 1), pci_resource_len(pdev, 1)); } + + free_rtllib(dev); } else { priv = rtllib_priv(dev); } From db819fd4d8ccc5f772568cb3763f4f1bf4f29958 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 15 Nov 2021 14:27:19 -0800 Subject: [PATCH 1807/5344] xen: don't continue xenstore initialization in case of errors commit 08f6c2b09ebd4b326dbe96d13f94fee8f9814c78 upstream. In case of errors in xenbus_init (e.g. missing xen_store_gfn parameter), we goto out_error but we forget to reset xen_store_domain_type to XS_UNKNOWN. As a consequence xenbus_probe_initcall and other initcalls will still try to initialize xenstore resulting into a crash at boot. [ 2.479830] Call trace: [ 2.482314] xb_init_comms+0x18/0x150 [ 2.486354] xs_init+0x34/0x138 [ 2.489786] xenbus_probe+0x4c/0x70 [ 2.498432] xenbus_probe_initcall+0x2c/0x7c [ 2.503944] do_one_initcall+0x54/0x1b8 [ 2.507358] kernel_init_freeable+0x1ac/0x210 [ 2.511617] kernel_init+0x28/0x130 [ 2.516112] ret_from_fork+0x10/0x20 Cc: Cc: jbeulich@suse.com Signed-off-by: Stefano Stabellini Link: https://lore.kernel.org/r/20211115222719.2558207-1-sstabellini@kernel.org Reviewed-by: Jan Beulich Signed-off-by: Boris Ostrovsky Signed-off-by: Greg Kroah-Hartman --- drivers/xen/xenbus/xenbus_probe.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 652894d619677..5d5efff206501 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -846,7 +846,7 @@ static struct notifier_block xenbus_resume_nb = { static int __init xenbus_init(void) { - int err = 0; + int err; uint64_t v = 0; xen_store_domain_type = XS_UNKNOWN; @@ -920,8 +920,10 @@ static int __init xenbus_init(void) */ proc_create_mount_point("xen"); #endif + return 0; out_error: + xen_store_domain_type = XS_UNKNOWN; return err; } From c6720a180199a4f9f2ea73497ef953c9c4794dcd Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Tue, 23 Nov 2021 13:07:48 -0800 Subject: [PATCH 1808/5344] xen: detect uninitialized xenbus in xenbus_init commit 36e8f60f0867d3b70d398d653c17108459a04efe upstream. If the xenstore page hasn't been allocated properly, reading the value of the related hvm_param (HVM_PARAM_STORE_PFN) won't actually return error. Instead, it will succeed and return zero. Instead of attempting to xen_remap a bad guest physical address, detect this condition and return early. Note that although a guest physical address of zero for HVM_PARAM_STORE_PFN is theoretically possible, it is not a good choice and zero has never been validly used in that capacity. Also recognize all bits set as an invalid value. For 32-bit Linux, any pfn above ULONG_MAX would get truncated. Pfns above ULONG_MAX should never be passed by the Xen tools to HVM guests anyway, so check for this condition and return early. Cc: stable@vger.kernel.org Signed-off-by: Stefano Stabellini Reviewed-by: Juergen Gross Reviewed-by: Jan Beulich Link: https://lore.kernel.org/r/20211123210748.1910236-1-sstabellini@kernel.org Signed-off-by: Boris Ostrovsky Signed-off-by: Greg Kroah-Hartman --- drivers/xen/xenbus/xenbus_probe.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 5d5efff206501..b911a91bce6b7 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -886,6 +886,29 @@ static int __init xenbus_init(void) err = hvm_get_parameter(HVM_PARAM_STORE_PFN, &v); if (err) goto out_error; + /* + * Uninitialized hvm_params are zero and return no error. + * Although it is theoretically possible to have + * HVM_PARAM_STORE_PFN set to zero on purpose, in reality it is + * not zero when valid. If zero, it means that Xenstore hasn't + * been properly initialized. Instead of attempting to map a + * wrong guest physical address return error. + * + * Also recognize all bits set as an invalid value. + */ + if (!v || !~v) { + err = -ENOENT; + goto out_error; + } + /* Avoid truncation on 32-bit. */ +#if BITS_PER_LONG == 32 + if (v > ULONG_MAX) { + pr_err("%s: cannot handle HVM_PARAM_STORE_PFN=%llx > ULONG_MAX\n", + __func__, v); + err = -EINVAL; + goto out_error; + } +#endif xen_store_gfn = (unsigned long)v; xen_store_interface = xen_remap(xen_store_gfn << XEN_PAGE_SHIFT, From d286c6d57d21e5f73f41e5cdfb52c086d09404b1 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 19 Nov 2021 13:16:27 +1000 Subject: [PATCH 1809/5344] KVM: PPC: Book3S HV: Prevent POWER7/8 TLB flush flushing SLB commit cf0b0e3712f7af90006f8317ff27278094c2c128 upstream. The POWER9 ERAT flush instruction is a SLBIA with IH=7, which is a reserved value on POWER7/8. On POWER8 this invalidates the SLB entries above index 0, similarly to SLBIA IH=0. If the SLB entries are invalidated, and then the guest is bypassed, the host SLB does not get re-loaded, so the bolted entries above 0 will be lost. This can result in kernel stack access causing a SLB fault. Kernel stack access causing a SLB fault was responsible for the infamous mega bug (search "Fix SLB reload bug"). Although since commit 48e7b7695745 ("powerpc/64s/hash: Convert SLB miss handlers to C") that starts using the kernel stack in the SLB miss handler, it might only result in an infinite loop of SLB faults. In any case it's a bug. Fix this by only executing the instruction on >= POWER9 where IH=7 is defined not to invalidate the SLB. POWER7/8 don't require this ERAT flush. Fixes: 500871125920 ("KVM: PPC: Book3S HV: Invalidate ERAT when flushing guest TLB entries") Cc: stable@vger.kernel.org # v5.2+ Signed-off-by: Nicholas Piggin Reviewed-by: Fabiano Rosas Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211119031627.577853-1-npiggin@gmail.com Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kvm/book3s_hv_builtin.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 4a91b543a8540..6d34b69729854 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -821,6 +821,7 @@ static void flush_guest_tlb(struct kvm *kvm) "r" (0) : "memory"); } asm volatile("ptesync": : :"memory"); + // POWER9 congruence-class TLBIEL leaves ERAT. Flush it now. asm volatile(PPC_RADIX_INVALIDATE_ERAT_GUEST : : :"memory"); } else { for (set = 0; set < kvm->arch.tlb_sets; ++set) { @@ -831,7 +832,9 @@ static void flush_guest_tlb(struct kvm *kvm) rb += PPC_BIT(51); /* increment set number */ } asm volatile("ptesync": : :"memory"); - asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT : : :"memory"); + // POWER9 congruence-class TLBIEL leaves ERAT. Flush it now. + if (cpu_has_feature(CPU_FTR_ARCH_300)) + asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT : : :"memory"); } } From fda620ac483da31c794b6f49e1fd0692cfd8dc66 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 23 Nov 2021 15:28:01 +0100 Subject: [PATCH 1810/5344] tracing/uprobe: Fix uprobe_perf_open probes iteration commit 1880ed71ce863318c1ce93bf324876fb5f92854f upstream. Add missing 'tu' variable initialization in the probes loop, otherwise the head 'tu' is used instead of added probes. Link: https://lkml.kernel.org/r/20211123142801.182530-1-jolsa@kernel.org Cc: stable@vger.kernel.org Fixes: 99c9a923e97a ("tracing/uprobe: Fix double perf_event linking on multiprobe uprobe") Acked-by: Masami Hiramatsu Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace_uprobe.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index b515db036becc..efb51a23a14f2 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1299,6 +1299,7 @@ static int uprobe_perf_open(struct trace_event_call *call, return 0; list_for_each_entry(pos, trace_probe_probe_list(tp), list) { + tu = container_of(pos, struct trace_uprobe, tp); err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); if (err) { uprobe_perf_close(call, event); From 38583fdc962452b28dd8d882da404fa3e49f977a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 26 Nov 2021 17:34:42 -0500 Subject: [PATCH 1811/5344] tracing: Fix pid filtering when triggers are attached commit a55f224ff5f238013de8762c4287117e47b86e22 upstream. If a event is filtered by pid and a trigger that requires processing of the event to happen is a attached to the event, the discard portion does not take the pid filtering into account, and the event will then be recorded when it should not have been. Cc: stable@vger.kernel.org Fixes: 3fdaf80f4a836 ("tracing: Implement event pid filtering") Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace.h | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 35e9a01b54800..1d514a1a31554 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1423,14 +1423,26 @@ __event_trigger_test_discard(struct trace_event_file *file, if (eflags & EVENT_FILE_FL_TRIGGER_COND) *tt = event_triggers_call(file, entry, event); - if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) || - (unlikely(file->flags & EVENT_FILE_FL_FILTERED) && - !filter_match_preds(file->filter, entry))) { - __trace_event_discard_commit(buffer, event); - return true; - } + if (likely(!(file->flags & (EVENT_FILE_FL_SOFT_DISABLED | + EVENT_FILE_FL_FILTERED | + EVENT_FILE_FL_PID_FILTER)))) + return false; + + if (file->flags & EVENT_FILE_FL_SOFT_DISABLED) + goto discard; + + if (file->flags & EVENT_FILE_FL_FILTERED && + !filter_match_preds(file->filter, entry)) + goto discard; + + if ((file->flags & EVENT_FILE_FL_PID_FILTER) && + trace_event_ignore_this_pid(file)) + goto discard; return false; + discard: + __trace_event_discard_commit(buffer, event); + return true; } /** From 8aa86a4a6707812ac419841aece702c66ad89cc8 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 15 Nov 2021 10:23:45 +0200 Subject: [PATCH 1812/5344] mmc: sdhci: Fix ADMA for PAGE_SIZE >= 64KiB commit 3d7c194b7c9ad414264935ad4f943a6ce285ebb1 upstream. The block layer forces a minimum segment size of PAGE_SIZE, so a segment can be too big for the ADMA table, if PAGE_SIZE >= 64KiB. Fix by writing multiple descriptors, noting that the ADMA table is sized for 4KiB chunks anyway, so it will be big enough. Reported-and-tested-by: Bough Chen Signed-off-by: Adrian Hunter Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20211115082345.802238-1-adrian.hunter@intel.com Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/host/sdhci.c | 21 ++++++++++++++++++--- drivers/mmc/host/sdhci.h | 4 +++- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index cb54fa2120d72..deafcc56adee6 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -749,7 +749,19 @@ static void sdhci_adma_table_pre(struct sdhci_host *host, len -= offset; } - BUG_ON(len > 65536); + /* + * The block layer forces a minimum segment size of PAGE_SIZE, + * so 'len' can be too big here if PAGE_SIZE >= 64KiB. Write + * multiple descriptors, noting that the ADMA table is sized + * for 4KiB chunks anyway, so it will be big enough. + */ + while (len > host->max_adma) { + int n = 32 * 1024; /* 32KiB*/ + + __sdhci_adma_write_desc(host, &desc, addr, n, ADMA2_TRAN_VALID); + addr += n; + len -= n; + } /* tran, valid */ if (len) @@ -3568,6 +3580,7 @@ struct sdhci_host *sdhci_alloc_host(struct device *dev, * descriptor for each segment, plus 1 for a nop end descriptor. */ host->adma_table_cnt = SDHCI_MAX_SEGS * 2 + 1; + host->max_adma = 65536; return host; } @@ -4221,10 +4234,12 @@ int sdhci_setup_host(struct sdhci_host *host) * be larger than 64 KiB though. */ if (host->flags & SDHCI_USE_ADMA) { - if (host->quirks & SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC) + if (host->quirks & SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC) { + host->max_adma = 65532; /* 32-bit alignment */ mmc->max_seg_size = 65535; - else + } else { mmc->max_seg_size = 65536; + } } else { mmc->max_seg_size = mmc->max_req_size; } diff --git a/drivers/mmc/host/sdhci.h b/drivers/mmc/host/sdhci.h index 96a0a8f97f559..54f9d6720f132 100644 --- a/drivers/mmc/host/sdhci.h +++ b/drivers/mmc/host/sdhci.h @@ -349,7 +349,8 @@ struct sdhci_adma2_64_desc { /* * Maximum segments assuming a 512KiB maximum requisition size and a minimum - * 4KiB page size. + * 4KiB page size. Note this also allows enough for multiple descriptors in + * case of PAGE_SIZE >= 64KiB. */ #define SDHCI_MAX_SEGS 128 @@ -547,6 +548,7 @@ struct sdhci_host { unsigned int blocks; /* remaining PIO blocks */ int sg_count; /* Mapped sg entries */ + int max_adma; /* Max. length in ADMA descriptor */ void *adma_table; /* ADMA descriptor table */ void *align_buffer; /* Bounce buffer */ From fb921d0c618024f3a83fa37f01b27bcb3a688c34 Mon Sep 17 00:00:00 2001 From: Dylan Hung Date: Thu, 25 Nov 2021 10:44:32 +0800 Subject: [PATCH 1813/5344] mdio: aspeed: Fix "Link is Down" issue commit 9dbe33cf371bd70330858370bdbc35c7668f00c3 upstream. The issue happened randomly in runtime. The message "Link is Down" is popped but soon it recovered to "Link is Up". The "Link is Down" results from the incorrect read data for reading the PHY register via MDIO bus. The correct sequence for reading the data shall be: 1. fire the command 2. wait for command done (this step was missing) 3. wait for data idle 4. read data from data register Cc: stable@vger.kernel.org Fixes: f160e99462c6 ("net: phy: Add mdio-aspeed") Reviewed-by: Joel Stanley Signed-off-by: Dylan Hung Reviewed-by: Andrew Lunn Reviewed-by: Russell King (Oracle) Link: https://lore.kernel.org/r/20211125024432.15809-1-dylan_hung@aspeedtech.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/net/phy/mdio-aspeed.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/phy/mdio-aspeed.c b/drivers/net/phy/mdio-aspeed.c index cad820568f751..966c3b4ad59d1 100644 --- a/drivers/net/phy/mdio-aspeed.c +++ b/drivers/net/phy/mdio-aspeed.c @@ -61,6 +61,13 @@ static int aspeed_mdio_read(struct mii_bus *bus, int addr, int regnum) iowrite32(ctrl, ctx->base + ASPEED_MDIO_CTRL); + rc = readl_poll_timeout(ctx->base + ASPEED_MDIO_CTRL, ctrl, + !(ctrl & ASPEED_MDIO_CTRL_FIRE), + ASPEED_MDIO_INTERVAL_US, + ASPEED_MDIO_TIMEOUT_US); + if (rc < 0) + return rc; + rc = readl_poll_timeout(ctx->base + ASPEED_MDIO_DATA, data, data & ASPEED_MDIO_DATA_IDLE, ASPEED_MDIO_INTERVAL_US, From e4e00fc184709bd63c45e4a0ceaaac9624651961 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Tue, 5 Oct 2021 20:09:47 +0200 Subject: [PATCH 1814/5344] PCI: aardvark: Deduplicate code in advk_pcie_rd_conf() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 67cb2a4c93499c2c22704998fd1fd2bc35194d8e upstream. Avoid code repetition in advk_pcie_rd_conf() by handling errors with goto jump, as is customary in kernel. Link: https://lore.kernel.org/r/20211005180952.6812-9-kabel@kernel.org Fixes: 43f5c77bcbd2 ("PCI: aardvark: Fix reporting CRS value") Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-aardvark.c | 48 +++++++++++---------------- 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 45794ba643d40..574f5a7be762c 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -773,18 +773,8 @@ static int advk_pcie_rd_conf(struct pci_bus *bus, u32 devfn, (le16_to_cpu(pcie->bridge.pcie_conf.rootctl) & PCI_EXP_RTCTL_CRSSVE); - if (advk_pcie_pio_is_running(pcie)) { - /* - * If it is possible return Completion Retry Status so caller - * tries to issue the request again instead of failing. - */ - if (allow_crs) { - *val = CFG_RD_CRS_VAL; - return PCIBIOS_SUCCESSFUL; - } - *val = 0xffffffff; - return PCIBIOS_SET_FAILED; - } + if (advk_pcie_pio_is_running(pcie)) + goto try_crs; /* Program the control register */ reg = advk_readl(pcie, PIO_CTRL); @@ -808,25 +798,13 @@ static int advk_pcie_rd_conf(struct pci_bus *bus, u32 devfn, advk_writel(pcie, 1, PIO_START); ret = advk_pcie_wait_pio(pcie); - if (ret < 0) { - /* - * If it is possible return Completion Retry Status so caller - * tries to issue the request again instead of failing. - */ - if (allow_crs) { - *val = CFG_RD_CRS_VAL; - return PCIBIOS_SUCCESSFUL; - } - *val = 0xffffffff; - return PCIBIOS_SET_FAILED; - } + if (ret < 0) + goto try_crs; /* Check PIO status and get the read result */ ret = advk_pcie_check_pio_status(pcie, allow_crs, val); - if (ret < 0) { - *val = 0xffffffff; - return PCIBIOS_SET_FAILED; - } + if (ret < 0) + goto fail; if (size == 1) *val = (*val >> (8 * (where & 3))) & 0xff; @@ -834,6 +812,20 @@ static int advk_pcie_rd_conf(struct pci_bus *bus, u32 devfn, *val = (*val >> (8 * (where & 3))) & 0xffff; return PCIBIOS_SUCCESSFUL; + +try_crs: + /* + * If it is possible, return Completion Retry Status so that caller + * tries to issue the request again instead of failing. + */ + if (allow_crs) { + *val = CFG_RD_CRS_VAL; + return PCIBIOS_SUCCESSFUL; + } + +fail: + *val = 0xffffffff; + return PCIBIOS_SET_FAILED; } static int advk_pcie_wr_conf(struct pci_bus *bus, u32 devfn, From 5f55e8a9d5ce021ceecda8fea009d8c03e694b03 Mon Sep 17 00:00:00 2001 From: Remi Pommarel Date: Thu, 25 Nov 2021 01:25:55 +0100 Subject: [PATCH 1815/5344] PCI: aardvark: Wait for endpoint to be ready before training link MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit f4c7d053d7f77cd5c1a1ba7c7ce085ddba13d1d7 upstream. When configuring pcie reset pin from gpio (e.g. initially set by u-boot) to pcie function this pin goes low for a brief moment asserting the PERST# signal. Thus connected device enters fundamental reset process and link configuration can only begin after a minimal 100ms delay (see [1]). Because the pin configuration comes from the "default" pinctrl it is implicitly configured before the probe callback is called: driver_probe_device() really_probe() ... pinctrl_bind_pins() /* Here pin goes from gpio to PCIE reset function and PERST# is asserted */ ... drv->probe() [1] "PCI Express Base Specification", REV. 4.0 PCI Express, February 19 2014, 6.6.1 Conventional Reset Signed-off-by: Remi Pommarel Signed-off-by: Lorenzo Pieralisi Acked-by: Thomas Petazzoni Signed-off-by: Marek Behún Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-aardvark.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 574f5a7be762c..ebefa4577b354 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -432,6 +432,14 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) reg |= PIO_CTRL_ADDR_WIN_DISABLE; advk_writel(pcie, reg, PIO_CTRL); + /* + * PERST# signal could have been asserted by pinctrl subsystem before + * probe() callback has been called, making the endpoint going into + * fundamental reset. As required by PCI Express spec a delay for at + * least 100ms after such a reset before link training is needed. + */ + msleep(PCI_PM_D3COLD_WAIT); + /* Start link training */ reg = advk_readl(pcie, PCIE_CORE_LINK_CTRL_STAT_REG); reg |= PCIE_CORE_LINK_TRAINING; From 8c3aac5f3b74d3d90cc92ef6670860105834eccc Mon Sep 17 00:00:00 2001 From: Grzegorz Jaszczyk Date: Thu, 25 Nov 2021 01:25:56 +0100 Subject: [PATCH 1816/5344] PCI: aardvark: Fix big endian support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit e078723f9cccd509482fd7f30a4afb1125ca7a2a upstream. Initialise every multiple-byte field of emulated PCI bridge config space with proper cpu_to_le* macro. This is required since the structure describing config space of emulated bridge assumes little-endian convention. Signed-off-by: Grzegorz Jaszczyk Signed-off-by: Lorenzo Pieralisi Signed-off-by: Marek Behún Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-aardvark.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index ebefa4577b354..db20bc903c0e9 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -686,18 +686,20 @@ static int advk_sw_pci_bridge_init(struct advk_pcie *pcie) struct pci_bridge_emul *bridge = &pcie->bridge; int ret; - bridge->conf.vendor = advk_readl(pcie, PCIE_CORE_DEV_ID_REG) & 0xffff; - bridge->conf.device = advk_readl(pcie, PCIE_CORE_DEV_ID_REG) >> 16; + bridge->conf.vendor = + cpu_to_le16(advk_readl(pcie, PCIE_CORE_DEV_ID_REG) & 0xffff); + bridge->conf.device = + cpu_to_le16(advk_readl(pcie, PCIE_CORE_DEV_ID_REG) >> 16); bridge->conf.class_revision = - advk_readl(pcie, PCIE_CORE_DEV_REV_REG) & 0xff; + cpu_to_le32(advk_readl(pcie, PCIE_CORE_DEV_REV_REG) & 0xff); /* Support 32 bits I/O addressing */ bridge->conf.iobase = PCI_IO_RANGE_TYPE_32; bridge->conf.iolimit = PCI_IO_RANGE_TYPE_32; /* Support 64 bits memory pref */ - bridge->conf.pref_mem_base = PCI_PREF_RANGE_TYPE_64; - bridge->conf.pref_mem_limit = PCI_PREF_RANGE_TYPE_64; + bridge->conf.pref_mem_base = cpu_to_le16(PCI_PREF_RANGE_TYPE_64); + bridge->conf.pref_mem_limit = cpu_to_le16(PCI_PREF_RANGE_TYPE_64); /* Support interrupt A for MSI feature */ bridge->conf.intpin = PCIE_CORE_INT_A_ASSERT_ENABLE; From f6e7b89f3a75c569d998034f83913fd02b676a21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 25 Nov 2021 01:25:57 +0100 Subject: [PATCH 1817/5344] PCI: aardvark: Train link immediately after enabling training MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 6964494582f56a3882c2c53b0edbfe99eb32b2e1 upstream. Adding even 100ms (PCI_PM_D3COLD_WAIT) delay between enabling link training and starting link training causes detection issues with some buggy cards (such as Compex WLE900VX). Move the code which enables link training immediately before the one which starts link traning. This fixes detection issues of Compex WLE900VX card on Turris MOX after cold boot. Link: https://lore.kernel.org/r/20200430080625.26070-2-pali@kernel.org Fixes: f4c7d053d7f7 ("PCI: aardvark: Wait for endpoint to be ready...") Tested-by: Tomasz Maciej Nowak Signed-off-by: Pali Rohár Signed-off-by: Lorenzo Pieralisi Acked-by: Rob Herring Acked-by: Thomas Petazzoni Signed-off-by: Marek Behún Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-aardvark.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index db20bc903c0e9..c23ccbca49d55 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -394,11 +394,6 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) reg |= LANE_COUNT_1; advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG); - /* Enable link training */ - reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG); - reg |= LINK_TRAINING_EN; - advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG); - /* Enable MSI */ reg = advk_readl(pcie, PCIE_CORE_CTRL2_REG); reg |= PCIE_CORE_CTRL2_MSI_ENABLE; @@ -440,7 +435,15 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) */ msleep(PCI_PM_D3COLD_WAIT); - /* Start link training */ + /* Enable link training */ + reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG); + reg |= LINK_TRAINING_EN; + advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG); + + /* + * Start link training immediately after enabling it. + * This solves problems for some buggy cards. + */ reg = advk_readl(pcie, PCIE_CORE_LINK_CTRL_STAT_REG); reg |= PCIE_CORE_LINK_TRAINING; advk_writel(pcie, reg, PCIE_CORE_LINK_CTRL_STAT_REG); From c0b034d369715b7c72532f297fef284cb3a77184 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Thu, 25 Nov 2021 01:25:58 +0100 Subject: [PATCH 1818/5344] PCI: aardvark: Improve link training MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 43fc679ced18006b12d918d7a8a4af392b7fbfe7 upstream. Currently the aardvark driver trains link in PCIe gen2 mode. This may cause some buggy gen1 cards (such as Compex WLE900VX) to be unstable or even not detected. Moreover when ASPM code tries to retrain link second time, these cards may stop responding and link goes down. If gen1 is used this does not happen. Unconditionally forcing gen1 is not a good solution since it may have performance impact on gen2 cards. To overcome this, read 'max-link-speed' property (as defined in PCI device tree bindings) and use this as max gen mode. Then iteratively try link training at this mode or lower until successful. After successful link training choose final controller gen based on Negotiated Link Speed from Link Status register, which should match card speed. Link: https://lore.kernel.org/r/20200430080625.26070-5-pali@kernel.org Tested-by: Tomasz Maciej Nowak Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Acked-by: Thomas Petazzoni Signed-off-by: Marek Behún Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-aardvark.c | 114 ++++++++++++++++++++------ 1 file changed, 89 insertions(+), 25 deletions(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index c23ccbca49d55..ae2e8eb85bf8d 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -39,6 +39,7 @@ #define PCIE_CORE_LINK_CTRL_STAT_REG 0xd0 #define PCIE_CORE_LINK_L0S_ENTRY BIT(0) #define PCIE_CORE_LINK_TRAINING BIT(5) +#define PCIE_CORE_LINK_SPEED_SHIFT 16 #define PCIE_CORE_LINK_WIDTH_SHIFT 20 #define PCIE_CORE_ERR_CAPCTL_REG 0x118 #define PCIE_CORE_ERR_CAPCTL_ECRC_CHK_TX BIT(5) @@ -249,6 +250,7 @@ struct advk_pcie { struct mutex msi_used_lock; u16 msi_msg; int root_bus_nr; + int link_gen; struct pci_bridge_emul bridge; }; @@ -309,20 +311,16 @@ static inline bool advk_pcie_link_training(struct advk_pcie *pcie) static int advk_pcie_wait_for_link(struct advk_pcie *pcie) { - struct device *dev = &pcie->pdev->dev; int retries; /* check if the link is up or not */ for (retries = 0; retries < LINK_WAIT_MAX_RETRIES; retries++) { - if (advk_pcie_link_up(pcie)) { - dev_info(dev, "link up\n"); + if (advk_pcie_link_up(pcie)) return 0; - } usleep_range(LINK_WAIT_USLEEP_MIN, LINK_WAIT_USLEEP_MAX); } - dev_err(dev, "link never came up\n"); return -ETIMEDOUT; } @@ -337,6 +335,85 @@ static void advk_pcie_wait_for_retrain(struct advk_pcie *pcie) } } +static int advk_pcie_train_at_gen(struct advk_pcie *pcie, int gen) +{ + int ret, neg_gen; + u32 reg; + + /* Setup link speed */ + reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG); + reg &= ~PCIE_GEN_SEL_MSK; + if (gen == 3) + reg |= SPEED_GEN_3; + else if (gen == 2) + reg |= SPEED_GEN_2; + else + reg |= SPEED_GEN_1; + advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG); + + /* + * Enable link training. This is not needed in every call to this + * function, just once suffices, but it does not break anything either. + */ + reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG); + reg |= LINK_TRAINING_EN; + advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG); + + /* + * Start link training immediately after enabling it. + * This solves problems for some buggy cards. + */ + reg = advk_readl(pcie, PCIE_CORE_LINK_CTRL_STAT_REG); + reg |= PCIE_CORE_LINK_TRAINING; + advk_writel(pcie, reg, PCIE_CORE_LINK_CTRL_STAT_REG); + + ret = advk_pcie_wait_for_link(pcie); + if (ret) + return ret; + + reg = advk_readl(pcie, PCIE_CORE_LINK_CTRL_STAT_REG); + neg_gen = (reg >> PCIE_CORE_LINK_SPEED_SHIFT) & 0xf; + + return neg_gen; +} + +static void advk_pcie_train_link(struct advk_pcie *pcie) +{ + struct device *dev = &pcie->pdev->dev; + int neg_gen = -1, gen; + + /* + * Try link training at link gen specified by device tree property + * 'max-link-speed'. If this fails, iteratively train at lower gen. + */ + for (gen = pcie->link_gen; gen > 0; --gen) { + neg_gen = advk_pcie_train_at_gen(pcie, gen); + if (neg_gen > 0) + break; + } + + if (neg_gen < 0) + goto err; + + /* + * After successful training if negotiated gen is lower than requested, + * train again on negotiated gen. This solves some stability issues for + * some buggy gen1 cards. + */ + if (neg_gen < gen) { + gen = neg_gen; + neg_gen = advk_pcie_train_at_gen(pcie, gen); + } + + if (neg_gen == gen) { + dev_info(dev, "link up at gen %i\n", gen); + return; + } + +err: + dev_err(dev, "link never came up\n"); +} + static void advk_pcie_setup_hw(struct advk_pcie *pcie) { u32 reg; @@ -382,12 +459,6 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) PCIE_CORE_CTRL2_TD_ENABLE; advk_writel(pcie, reg, PCIE_CORE_CTRL2_REG); - /* Set GEN2 */ - reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG); - reg &= ~PCIE_GEN_SEL_MSK; - reg |= SPEED_GEN_2; - advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG); - /* Set lane X1 */ reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG); reg &= ~LANE_CNT_MSK; @@ -435,20 +506,7 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) */ msleep(PCI_PM_D3COLD_WAIT); - /* Enable link training */ - reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG); - reg |= LINK_TRAINING_EN; - advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG); - - /* - * Start link training immediately after enabling it. - * This solves problems for some buggy cards. - */ - reg = advk_readl(pcie, PCIE_CORE_LINK_CTRL_STAT_REG); - reg |= PCIE_CORE_LINK_TRAINING; - advk_writel(pcie, reg, PCIE_CORE_LINK_CTRL_STAT_REG); - - advk_pcie_wait_for_link(pcie); + advk_pcie_train_link(pcie); reg = advk_readl(pcie, PCIE_CORE_CMD_STATUS_REG); reg |= PCIE_CORE_CMD_MEM_ACCESS_EN | @@ -1278,6 +1336,12 @@ static int advk_pcie_probe(struct platform_device *pdev) return ret; } + ret = of_pci_get_max_link_speed(dev->of_node); + if (ret <= 0 || ret > 3) + pcie->link_gen = 3; + else + pcie->link_gen = ret; + advk_pcie_setup_hw(pcie); ret = advk_sw_pci_bridge_init(pcie); From 4895f2a41e1553695cf08d74d4395262592cf0fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 25 Nov 2021 01:25:59 +0100 Subject: [PATCH 1819/5344] PCI: aardvark: Issue PERST via GPIO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 5169a9851daaa2782a7bd2bb83d5b1bd224b2879 upstream. Add support for issuing PERST via GPIO specified in 'reset-gpios' property (as described in PCI device tree bindings). Some buggy cards (e.g. Compex WLE900VX or WLE1216) are not detected after reboot when PERST is not issued during driver initialization. If bootloader already enabled link training then issuing PERST has no effect for some buggy cards (e.g. Compex WLE900VX) and these cards are not detected. We therefore clear the LINK_TRAINING_EN register before. It was observed that Compex WLE900VX card needs to be in PERST reset for at least 10ms if bootloader enabled link training. Tested on Turris MOX. Link: https://lore.kernel.org/r/20200430080625.26070-6-pali@kernel.org Tested-by: Tomasz Maciej Nowak Signed-off-by: Pali Rohár Signed-off-by: Lorenzo Pieralisi Acked-by: Thomas Petazzoni Signed-off-by: Marek Behún Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-aardvark.c | 43 ++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index ae2e8eb85bf8d..1b16256ba6967 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -17,6 +18,7 @@ #include #include #include +#include #include #include "../pci.h" @@ -252,6 +254,7 @@ struct advk_pcie { int root_bus_nr; int link_gen; struct pci_bridge_emul bridge; + struct gpio_desc *reset_gpio; }; static inline void advk_writel(struct advk_pcie *pcie, u32 val, u64 reg) @@ -414,10 +417,31 @@ static void advk_pcie_train_link(struct advk_pcie *pcie) dev_err(dev, "link never came up\n"); } +static void advk_pcie_issue_perst(struct advk_pcie *pcie) +{ + u32 reg; + + if (!pcie->reset_gpio) + return; + + /* PERST does not work for some cards when link training is enabled */ + reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG); + reg &= ~LINK_TRAINING_EN; + advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG); + + /* 10ms delay is needed for some cards */ + dev_info(&pcie->pdev->dev, "issuing PERST via reset GPIO for 10ms\n"); + gpiod_set_value_cansleep(pcie->reset_gpio, 1); + usleep_range(10000, 11000); + gpiod_set_value_cansleep(pcie->reset_gpio, 0); +} + static void advk_pcie_setup_hw(struct advk_pcie *pcie) { u32 reg; + advk_pcie_issue_perst(pcie); + /* Set to Direct mode */ reg = advk_readl(pcie, CTRL_CONFIG_REG); reg &= ~(CTRL_MODE_MASK << CTRL_MODE_SHIFT); @@ -500,7 +524,8 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) /* * PERST# signal could have been asserted by pinctrl subsystem before - * probe() callback has been called, making the endpoint going into + * probe() callback has been called or issued explicitly by reset gpio + * function advk_pcie_issue_perst(), making the endpoint going into * fundamental reset. As required by PCI Express spec a delay for at * least 100ms after such a reset before link training is needed. */ @@ -1336,6 +1361,22 @@ static int advk_pcie_probe(struct platform_device *pdev) return ret; } + pcie->reset_gpio = devm_gpiod_get_from_of_node(dev, dev->of_node, + "reset-gpios", 0, + GPIOD_OUT_LOW, + "pcie1-reset"); + ret = PTR_ERR_OR_ZERO(pcie->reset_gpio); + if (ret) { + if (ret == -ENOENT) { + pcie->reset_gpio = NULL; + } else { + if (ret != -EPROBE_DEFER) + dev_err(dev, "Failed to get reset-gpio: %i\n", + ret); + return ret; + } + } + ret = of_pci_get_max_link_speed(dev->of_node); if (ret <= 0 || ret > 3) pcie->link_gen = 3; From 17855e4e27846d85568245f5992ee286b6818853 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 25 Nov 2021 01:26:00 +0100 Subject: [PATCH 1820/5344] PCI: aardvark: Replace custom macros by standard linux/pci_regs.h macros MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 96be36dbffacea0aa9e6ec4839583e79faa141a1 upstream. PCI-E capability macros are already defined in linux/pci_regs.h. Remove their reimplementation in pcie-aardvark. Link: https://lore.kernel.org/r/20200430080625.26070-9-pali@kernel.org Tested-by: Tomasz Maciej Nowak Signed-off-by: Pali Rohár Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Acked-by: Thomas Petazzoni Signed-off-by: Marek Behún Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-aardvark.c | 41 ++++++++++++--------------- 1 file changed, 18 insertions(+), 23 deletions(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 1b16256ba6967..97ad321cfc7cd 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -32,17 +32,6 @@ #define PCIE_CORE_CMD_MEM_IO_REQ_EN BIT(2) #define PCIE_CORE_DEV_REV_REG 0x8 #define PCIE_CORE_PCIEXP_CAP 0xc0 -#define PCIE_CORE_DEV_CTRL_STATS_REG 0xc8 -#define PCIE_CORE_DEV_CTRL_STATS_RELAX_ORDER_DISABLE (0 << 4) -#define PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ_SHIFT 5 -#define PCIE_CORE_DEV_CTRL_STATS_SNOOP_DISABLE (0 << 11) -#define PCIE_CORE_DEV_CTRL_STATS_MAX_RD_REQ_SIZE_SHIFT 12 -#define PCIE_CORE_DEV_CTRL_STATS_MAX_RD_REQ_SZ 0x2 -#define PCIE_CORE_LINK_CTRL_STAT_REG 0xd0 -#define PCIE_CORE_LINK_L0S_ENTRY BIT(0) -#define PCIE_CORE_LINK_TRAINING BIT(5) -#define PCIE_CORE_LINK_SPEED_SHIFT 16 -#define PCIE_CORE_LINK_WIDTH_SHIFT 20 #define PCIE_CORE_ERR_CAPCTL_REG 0x118 #define PCIE_CORE_ERR_CAPCTL_ECRC_CHK_TX BIT(5) #define PCIE_CORE_ERR_CAPCTL_ECRC_CHK_TX_EN BIT(6) @@ -267,6 +256,11 @@ static inline u32 advk_readl(struct advk_pcie *pcie, u64 reg) return readl(pcie->base + reg); } +static inline u16 advk_read16(struct advk_pcie *pcie, u64 reg) +{ + return advk_readl(pcie, (reg & ~0x3)) >> ((reg & 0x3) * 8); +} + static u8 advk_pcie_ltssm_state(struct advk_pcie *pcie) { u32 val; @@ -366,16 +360,16 @@ static int advk_pcie_train_at_gen(struct advk_pcie *pcie, int gen) * Start link training immediately after enabling it. * This solves problems for some buggy cards. */ - reg = advk_readl(pcie, PCIE_CORE_LINK_CTRL_STAT_REG); - reg |= PCIE_CORE_LINK_TRAINING; - advk_writel(pcie, reg, PCIE_CORE_LINK_CTRL_STAT_REG); + reg = advk_readl(pcie, PCIE_CORE_PCIEXP_CAP + PCI_EXP_LNKCTL); + reg |= PCI_EXP_LNKCTL_RL; + advk_writel(pcie, reg, PCIE_CORE_PCIEXP_CAP + PCI_EXP_LNKCTL); ret = advk_pcie_wait_for_link(pcie); if (ret) return ret; - reg = advk_readl(pcie, PCIE_CORE_LINK_CTRL_STAT_REG); - neg_gen = (reg >> PCIE_CORE_LINK_SPEED_SHIFT) & 0xf; + reg = advk_read16(pcie, PCIE_CORE_PCIEXP_CAP + PCI_EXP_LNKSTA); + neg_gen = reg & PCI_EXP_LNKSTA_CLS; return neg_gen; } @@ -470,13 +464,14 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) PCIE_CORE_ERR_CAPCTL_ECRC_CHCK_RCV; advk_writel(pcie, reg, PCIE_CORE_ERR_CAPCTL_REG); - /* Set PCIe Device Control and Status 1 PF0 register */ - reg = PCIE_CORE_DEV_CTRL_STATS_RELAX_ORDER_DISABLE | - (7 << PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ_SHIFT) | - PCIE_CORE_DEV_CTRL_STATS_SNOOP_DISABLE | - (PCIE_CORE_DEV_CTRL_STATS_MAX_RD_REQ_SZ << - PCIE_CORE_DEV_CTRL_STATS_MAX_RD_REQ_SIZE_SHIFT); - advk_writel(pcie, reg, PCIE_CORE_DEV_CTRL_STATS_REG); + /* Set PCIe Device Control register */ + reg = advk_readl(pcie, PCIE_CORE_PCIEXP_CAP + PCI_EXP_DEVCTL); + reg &= ~PCI_EXP_DEVCTL_RELAX_EN; + reg &= ~PCI_EXP_DEVCTL_NOSNOOP_EN; + reg &= ~PCI_EXP_DEVCTL_READRQ; + reg |= PCI_EXP_DEVCTL_PAYLOAD; /* Set max payload size */ + reg |= PCI_EXP_DEVCTL_READRQ_512B; + advk_writel(pcie, reg, PCIE_CORE_PCIEXP_CAP + PCI_EXP_DEVCTL); /* Program PCIe Control 2 to disable strict ordering */ reg = PCIE_CORE_CTRL2_RESERVED | From 7505357da267067f431c7c301ced90977b8baf10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 25 Nov 2021 01:26:01 +0100 Subject: [PATCH 1821/5344] PCI: aardvark: Don't touch PCIe registers if no card connected MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 70e380250c3621c55ff218cbaf2272830d9dbb1d upstream. When there is no PCIe card connected and advk_pcie_rd_conf() or advk_pcie_wr_conf() is called for PCI bus which doesn't belong to emulated root bridge, the aardvark driver throws the following error message: advk-pcie d0070000.pcie: config read/write timed out Obviously accessing PCIe registers of disconnected card is not possible. Extend check in advk_pcie_valid_device() function for validating availability of PCIe bus. If PCIe link is down, then the device is marked as Not Found and the driver does not try to access these registers. This is just an optimization to prevent accessing PCIe registers when card is disconnected. Trying to access PCIe registers of disconnected card does not cause any crash, kernel just needs to wait for a timeout. So if card disappear immediately after checking for PCIe link (before accessing PCIe registers), it does not cause any problems. Link: https://lore.kernel.org/r/20200702083036.12230-1-pali@kernel.org Signed-off-by: Pali Rohár Signed-off-by: Lorenzo Pieralisi Signed-off-by: Marek Behún Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-aardvark.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 97ad321cfc7cd..3a41ff16ae57d 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -806,6 +806,13 @@ static bool advk_pcie_valid_device(struct advk_pcie *pcie, struct pci_bus *bus, if ((bus->number == pcie->root_bus_nr) && PCI_SLOT(devfn) != 0) return false; + /* + * If the link goes down after we check for link-up, nothing bad + * happens but the config access times out. + */ + if (bus->number != pcie->root_bus_nr && !advk_pcie_link_up(pcie)) + return false; + return true; } From 8a7f561be1ab4d7b2d1555bc42abf48ae2e48ef8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 25 Nov 2021 01:26:02 +0100 Subject: [PATCH 1822/5344] PCI: aardvark: Fix compilation on s390 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit b32c012e4b98f0126aa327be2d1f409963057643 upstream. Include linux/gpio/consumer.h instead of linux/gpio.h, as is said in the latter file. This was reported by kernel test bot when compiling for s390. drivers/pci/controller/pci-aardvark.c:350:2: error: implicit declaration of function 'gpiod_set_value_cansleep' [-Werror,-Wimplicit-function-declaration] drivers/pci/controller/pci-aardvark.c:1074:21: error: implicit declaration of function 'devm_gpiod_get_from_of_node' [-Werror,-Wimplicit-function-declaration] drivers/pci/controller/pci-aardvark.c:1076:14: error: use of undeclared identifier 'GPIOD_OUT_LOW' Link: https://lore.kernel.org/r/202006211118.LxtENQfl%25lkp@intel.com Link: https://lore.kernel.org/r/20200907111038.5811-2-pali@kernel.org Fixes: 5169a9851daa ("PCI: aardvark: Issue PERST via GPIO") Reported-by: kernel test robot Signed-off-by: Pali Rohár Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marek Behún Signed-off-by: Marek Behún Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-aardvark.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 3a41ff16ae57d..d8dc913cabd90 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -9,7 +9,7 @@ */ #include -#include +#include #include #include #include From d8ad05b06956860126253a22c0203410ccfbbcb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 25 Nov 2021 01:26:03 +0100 Subject: [PATCH 1823/5344] PCI: aardvark: Move PCIe reset card code to advk_pcie_train_link() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit d0c6a3475b033960e85ae2bf176b14cab0a627d2 upstream. Move code which belongs to link training (delays and resets) into advk_pcie_train_link() function, so everything related to link training, including timings is at one place. After experiments it can be observed that link training in aardvark hardware is very sensitive to timings and delays, so it is a good idea to have this code at the same place as link training calls. This patch does not change behavior of aardvark initialization. Link: https://lore.kernel.org/r/20200907111038.5811-6-pali@kernel.org Tested-by: Marek Behún Signed-off-by: Pali Rohár Signed-off-by: Lorenzo Pieralisi Signed-off-by: Marek Behún Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-aardvark.c | 64 ++++++++++++++------------- 1 file changed, 34 insertions(+), 30 deletions(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index d8dc913cabd90..ed33932ac769d 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -332,6 +332,25 @@ static void advk_pcie_wait_for_retrain(struct advk_pcie *pcie) } } +static void advk_pcie_issue_perst(struct advk_pcie *pcie) +{ + u32 reg; + + if (!pcie->reset_gpio) + return; + + /* PERST does not work for some cards when link training is enabled */ + reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG); + reg &= ~LINK_TRAINING_EN; + advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG); + + /* 10ms delay is needed for some cards */ + dev_info(&pcie->pdev->dev, "issuing PERST via reset GPIO for 10ms\n"); + gpiod_set_value_cansleep(pcie->reset_gpio, 1); + usleep_range(10000, 11000); + gpiod_set_value_cansleep(pcie->reset_gpio, 0); +} + static int advk_pcie_train_at_gen(struct advk_pcie *pcie, int gen) { int ret, neg_gen; @@ -379,6 +398,21 @@ static void advk_pcie_train_link(struct advk_pcie *pcie) struct device *dev = &pcie->pdev->dev; int neg_gen = -1, gen; + /* + * Reset PCIe card via PERST# signal. Some cards are not detected + * during link training when they are in some non-initial state. + */ + advk_pcie_issue_perst(pcie); + + /* + * PERST# signal could have been asserted by pinctrl subsystem before + * probe() callback has been called or issued explicitly by reset gpio + * function advk_pcie_issue_perst(), making the endpoint going into + * fundamental reset. As required by PCI Express spec a delay for at + * least 100ms after such a reset before link training is needed. + */ + msleep(PCI_PM_D3COLD_WAIT); + /* * Try link training at link gen specified by device tree property * 'max-link-speed'. If this fails, iteratively train at lower gen. @@ -411,31 +445,10 @@ static void advk_pcie_train_link(struct advk_pcie *pcie) dev_err(dev, "link never came up\n"); } -static void advk_pcie_issue_perst(struct advk_pcie *pcie) -{ - u32 reg; - - if (!pcie->reset_gpio) - return; - - /* PERST does not work for some cards when link training is enabled */ - reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG); - reg &= ~LINK_TRAINING_EN; - advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG); - - /* 10ms delay is needed for some cards */ - dev_info(&pcie->pdev->dev, "issuing PERST via reset GPIO for 10ms\n"); - gpiod_set_value_cansleep(pcie->reset_gpio, 1); - usleep_range(10000, 11000); - gpiod_set_value_cansleep(pcie->reset_gpio, 0); -} - static void advk_pcie_setup_hw(struct advk_pcie *pcie) { u32 reg; - advk_pcie_issue_perst(pcie); - /* Set to Direct mode */ reg = advk_readl(pcie, CTRL_CONFIG_REG); reg &= ~(CTRL_MODE_MASK << CTRL_MODE_SHIFT); @@ -517,15 +530,6 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) reg |= PIO_CTRL_ADDR_WIN_DISABLE; advk_writel(pcie, reg, PIO_CTRL); - /* - * PERST# signal could have been asserted by pinctrl subsystem before - * probe() callback has been called or issued explicitly by reset gpio - * function advk_pcie_issue_perst(), making the endpoint going into - * fundamental reset. As required by PCI Express spec a delay for at - * least 100ms after such a reset before link training is needed. - */ - msleep(PCI_PM_D3COLD_WAIT); - advk_pcie_train_link(pcie); reg = advk_readl(pcie, PCIE_CORE_CMD_STATUS_REG); From 77355e58e28c25ad0fd4e7aa0dcffa3ad587662b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 25 Nov 2021 01:26:04 +0100 Subject: [PATCH 1824/5344] PCI: aardvark: Update comment about disabling link training MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 1d1cd163d0de22a4041a6f1aeabcf78f80076539 upstream. According to PCI Express Base Specifications (rev 4.0, 6.6.1 "Conventional reset"), after fundamental reset a 100ms delay is needed prior to enabling link training. Update comment in code to reflect this requirement. Link: https://lore.kernel.org/r/20201202184659.3795-1-pali@kernel.org Signed-off-by: Pali Rohár Signed-off-by: Lorenzo Pieralisi Signed-off-by: Marek Behún Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-aardvark.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index ed33932ac769d..b358f59b8945d 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -339,7 +339,14 @@ static void advk_pcie_issue_perst(struct advk_pcie *pcie) if (!pcie->reset_gpio) return; - /* PERST does not work for some cards when link training is enabled */ + /* + * As required by PCI Express spec (PCI Express Base Specification, REV. + * 4.0 PCI Express, February 19 2014, 6.6.1 Conventional Reset) a delay + * for at least 100ms after de-asserting PERST# signal is needed before + * link training is enabled. So ensure that link training is disabled + * prior de-asserting PERST# signal to fulfill that PCI Express spec + * requirement. + */ reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG); reg &= ~LINK_TRAINING_EN; advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG); From 050dc70df349c1c21e02ff5855dcb358130f7740 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 25 Nov 2021 01:26:05 +0100 Subject: [PATCH 1825/5344] PCI: pci-bridge-emul: Fix array overruns, improve safety MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit f8ee579d53aca887d93f5f411462f25c085a5106 upstream. We allow up to PCI_EXP_SLTSTA2 registers to be accessed, but the pcie_cap_regs_behavior[] array only covers up to PCI_EXP_RTSTA. Expand this array to avoid walking off the end of it. Do the same for pci_regs_behavior for consistency[], and add a BUILD_BUG_ON() to also check the bridge->conf structure size. Fixes: 23a5fba4d941 ("PCI: Introduce PCI bridge emulated config space common logic") Link: https://lore.kernel.org/r/E1l6z9W-0006Re-MQ@rmk-PC.armlinux.org.uk Signed-off-by: Russell King Signed-off-by: Bjorn Helgaas Reviewed-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Greg Kroah-Hartman --- drivers/pci/pci-bridge-emul.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/pci/pci-bridge-emul.c b/drivers/pci/pci-bridge-emul.c index b3d63e319bb39..3026346ccb18c 100644 --- a/drivers/pci/pci-bridge-emul.c +++ b/drivers/pci/pci-bridge-emul.c @@ -21,8 +21,9 @@ #include "pci-bridge-emul.h" #define PCI_BRIDGE_CONF_END PCI_STD_HEADER_SIZEOF +#define PCI_CAP_PCIE_SIZEOF (PCI_EXP_SLTSTA2 + 2) #define PCI_CAP_PCIE_START PCI_BRIDGE_CONF_END -#define PCI_CAP_PCIE_END (PCI_CAP_PCIE_START + PCI_EXP_SLTSTA2 + 2) +#define PCI_CAP_PCIE_END (PCI_CAP_PCIE_START + PCI_CAP_PCIE_SIZEOF) struct pci_bridge_reg_behavior { /* Read-only bits */ @@ -38,7 +39,8 @@ struct pci_bridge_reg_behavior { u32 rsvd; }; -static const struct pci_bridge_reg_behavior pci_regs_behavior[] = { +static const +struct pci_bridge_reg_behavior pci_regs_behavior[PCI_STD_HEADER_SIZEOF / 4] = { [PCI_VENDOR_ID / 4] = { .ro = ~0 }, [PCI_COMMAND / 4] = { .rw = (PCI_COMMAND_IO | PCI_COMMAND_MEMORY | @@ -173,7 +175,8 @@ static const struct pci_bridge_reg_behavior pci_regs_behavior[] = { }, }; -static const struct pci_bridge_reg_behavior pcie_cap_regs_behavior[] = { +static const +struct pci_bridge_reg_behavior pcie_cap_regs_behavior[PCI_CAP_PCIE_SIZEOF / 4] = { [PCI_CAP_LIST_ID / 4] = { /* * Capability ID, Next Capability Pointer and @@ -270,6 +273,8 @@ static const struct pci_bridge_reg_behavior pcie_cap_regs_behavior[] = { int pci_bridge_emul_init(struct pci_bridge_emul *bridge, unsigned int flags) { + BUILD_BUG_ON(sizeof(bridge->conf) != PCI_BRIDGE_CONF_END); + bridge->conf.class_revision |= cpu_to_le32(PCI_CLASS_BRIDGE_PCI << 16); bridge->conf.header_type = PCI_HEADER_TYPE_BRIDGE; bridge->conf.cache_line_size = 0x10; From c312c1ff7e7d1f5b5b677bcb9331dd8742095d23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 25 Nov 2021 01:26:06 +0100 Subject: [PATCH 1826/5344] PCI: aardvark: Configure PCIe resources from 'ranges' DT property MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 64f160e19e9264a7f6d89c516baae1473b6f8359 upstream. In commit 6df6ba974a55 ("PCI: aardvark: Remove PCIe outbound window configuration") was removed aardvark PCIe outbound window configuration and commit description said that was recommended solution by HW designers. But that commit completely removed support for configuring PCIe IO resources without removing PCIe IO 'ranges' from DTS files. After that commit PCIe IO space started to be treated as PCIe MEM space and accessing it just caused kernel crash. Moreover implementation of PCIe outbound windows prior that commit was incorrect. It completely ignored offset between CPU address and PCIe bus address and expected that in DTS is CPU address always same as PCIe bus address without doing any checks. Also it completely ignored size of every PCIe resource specified in 'ranges' DTS property and expected that every PCIe resource has size 128 MB (also for PCIe IO range). Again without any check. Apparently none of PCIe resource has in DTS specified size of 128 MB. So it was completely broken and thanks to how aardvark mask works, configuration was completely ignored. This patch reverts back support for PCIe outbound window configuration but implementation is a new without issues mentioned above. PCIe outbound window is required when DTS specify in 'ranges' property non-zero offset between CPU and PCIe address space. To address recommendation by HW designers as specified in commit description of 6df6ba974a55, set default outbound parameters as PCIe MEM access without translation and therefore for this PCIe 'ranges' it is not needed to configure PCIe outbound window. For PCIe IO space is needed to configure aardvark PCIe outbound window. This patch fixes kernel crash when trying to access PCIe IO space. Link: https://lore.kernel.org/r/20210624215546.4015-2-pali@kernel.org Signed-off-by: Pali Rohár Signed-off-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org # 6df6ba974a55 ("PCI: aardvark: Remove PCIe outbound window configuration") Signed-off-by: Marek Behún Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-aardvark.c | 190 +++++++++++++++++++++++++- 1 file changed, 189 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index b358f59b8945d..4dabeaacd7be9 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -114,6 +114,46 @@ #define PCIE_MSI_PAYLOAD_REG (CONTROL_BASE_ADDR + 0x9C) #define PCIE_MSI_DATA_MASK GENMASK(15, 0) +/* PCIe window configuration */ +#define OB_WIN_BASE_ADDR 0x4c00 +#define OB_WIN_BLOCK_SIZE 0x20 +#define OB_WIN_COUNT 8 +#define OB_WIN_REG_ADDR(win, offset) (OB_WIN_BASE_ADDR + \ + OB_WIN_BLOCK_SIZE * (win) + \ + (offset)) +#define OB_WIN_MATCH_LS(win) OB_WIN_REG_ADDR(win, 0x00) +#define OB_WIN_ENABLE BIT(0) +#define OB_WIN_MATCH_MS(win) OB_WIN_REG_ADDR(win, 0x04) +#define OB_WIN_REMAP_LS(win) OB_WIN_REG_ADDR(win, 0x08) +#define OB_WIN_REMAP_MS(win) OB_WIN_REG_ADDR(win, 0x0c) +#define OB_WIN_MASK_LS(win) OB_WIN_REG_ADDR(win, 0x10) +#define OB_WIN_MASK_MS(win) OB_WIN_REG_ADDR(win, 0x14) +#define OB_WIN_ACTIONS(win) OB_WIN_REG_ADDR(win, 0x18) +#define OB_WIN_DEFAULT_ACTIONS (OB_WIN_ACTIONS(OB_WIN_COUNT-1) + 0x4) +#define OB_WIN_FUNC_NUM_MASK GENMASK(31, 24) +#define OB_WIN_FUNC_NUM_SHIFT 24 +#define OB_WIN_FUNC_NUM_ENABLE BIT(23) +#define OB_WIN_BUS_NUM_BITS_MASK GENMASK(22, 20) +#define OB_WIN_BUS_NUM_BITS_SHIFT 20 +#define OB_WIN_MSG_CODE_ENABLE BIT(22) +#define OB_WIN_MSG_CODE_MASK GENMASK(21, 14) +#define OB_WIN_MSG_CODE_SHIFT 14 +#define OB_WIN_MSG_PAYLOAD_LEN BIT(12) +#define OB_WIN_ATTR_ENABLE BIT(11) +#define OB_WIN_ATTR_TC_MASK GENMASK(10, 8) +#define OB_WIN_ATTR_TC_SHIFT 8 +#define OB_WIN_ATTR_RELAXED BIT(7) +#define OB_WIN_ATTR_NOSNOOP BIT(6) +#define OB_WIN_ATTR_POISON BIT(5) +#define OB_WIN_ATTR_IDO BIT(4) +#define OB_WIN_TYPE_MASK GENMASK(3, 0) +#define OB_WIN_TYPE_SHIFT 0 +#define OB_WIN_TYPE_MEM 0x0 +#define OB_WIN_TYPE_IO 0x4 +#define OB_WIN_TYPE_CONFIG_TYPE0 0x8 +#define OB_WIN_TYPE_CONFIG_TYPE1 0x9 +#define OB_WIN_TYPE_MSG 0xc + /* LMI registers base address and register offsets */ #define LMI_BASE_ADDR 0x6000 #define CFG_REG (LMI_BASE_ADDR + 0x0) @@ -229,6 +269,13 @@ struct advk_pcie { struct platform_device *pdev; void __iomem *base; struct list_head resources; + struct { + phys_addr_t match; + phys_addr_t remap; + phys_addr_t mask; + u32 actions; + } wins[OB_WIN_COUNT]; + u8 wins_count; struct irq_domain *irq_domain; struct irq_chip irq_chip; raw_spinlock_t irq_lock; @@ -452,9 +499,39 @@ static void advk_pcie_train_link(struct advk_pcie *pcie) dev_err(dev, "link never came up\n"); } +/* + * Set PCIe address window register which could be used for memory + * mapping. + */ +static void advk_pcie_set_ob_win(struct advk_pcie *pcie, u8 win_num, + phys_addr_t match, phys_addr_t remap, + phys_addr_t mask, u32 actions) +{ + advk_writel(pcie, OB_WIN_ENABLE | + lower_32_bits(match), OB_WIN_MATCH_LS(win_num)); + advk_writel(pcie, upper_32_bits(match), OB_WIN_MATCH_MS(win_num)); + advk_writel(pcie, lower_32_bits(remap), OB_WIN_REMAP_LS(win_num)); + advk_writel(pcie, upper_32_bits(remap), OB_WIN_REMAP_MS(win_num)); + advk_writel(pcie, lower_32_bits(mask), OB_WIN_MASK_LS(win_num)); + advk_writel(pcie, upper_32_bits(mask), OB_WIN_MASK_MS(win_num)); + advk_writel(pcie, actions, OB_WIN_ACTIONS(win_num)); +} + +static void advk_pcie_disable_ob_win(struct advk_pcie *pcie, u8 win_num) +{ + advk_writel(pcie, 0, OB_WIN_MATCH_LS(win_num)); + advk_writel(pcie, 0, OB_WIN_MATCH_MS(win_num)); + advk_writel(pcie, 0, OB_WIN_REMAP_LS(win_num)); + advk_writel(pcie, 0, OB_WIN_REMAP_MS(win_num)); + advk_writel(pcie, 0, OB_WIN_MASK_LS(win_num)); + advk_writel(pcie, 0, OB_WIN_MASK_MS(win_num)); + advk_writel(pcie, 0, OB_WIN_ACTIONS(win_num)); +} + static void advk_pcie_setup_hw(struct advk_pcie *pcie) { u32 reg; + int i; /* Set to Direct mode */ reg = advk_readl(pcie, CTRL_CONFIG_REG); @@ -528,15 +605,51 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) reg = PCIE_IRQ_ALL_MASK & (~PCIE_IRQ_ENABLE_INTS_MASK); advk_writel(pcie, reg, HOST_CTRL_INT_MASK_REG); + /* + * Enable AXI address window location generation: + * When it is enabled, the default outbound window + * configurations (Default User Field: 0xD0074CFC) + * are used to transparent address translation for + * the outbound transactions. Thus, PCIe address + * windows are not required for transparent memory + * access when default outbound window configuration + * is set for memory access. + */ reg = advk_readl(pcie, PCIE_CORE_CTRL2_REG); reg |= PCIE_CORE_CTRL2_OB_WIN_ENABLE; advk_writel(pcie, reg, PCIE_CORE_CTRL2_REG); - /* Bypass the address window mapping for PIO */ + /* + * Set memory access in Default User Field so it + * is not required to configure PCIe address for + * transparent memory access. + */ + advk_writel(pcie, OB_WIN_TYPE_MEM, OB_WIN_DEFAULT_ACTIONS); + + /* + * Bypass the address window mapping for PIO: + * Since PIO access already contains all required + * info over AXI interface by PIO registers, the + * address window is not required. + */ reg = advk_readl(pcie, PIO_CTRL); reg |= PIO_CTRL_ADDR_WIN_DISABLE; advk_writel(pcie, reg, PIO_CTRL); + /* + * Configure PCIe address windows for non-memory or + * non-transparent access as by default PCIe uses + * transparent memory access. + */ + for (i = 0; i < pcie->wins_count; i++) + advk_pcie_set_ob_win(pcie, i, + pcie->wins[i].match, pcie->wins[i].remap, + pcie->wins[i].mask, pcie->wins[i].actions); + + /* Disable remaining PCIe outbound windows */ + for (i = pcie->wins_count; i < OB_WIN_COUNT; i++) + advk_pcie_disable_ob_win(pcie, i); + advk_pcie_train_link(pcie); reg = advk_readl(pcie, PCIE_CORE_CMD_STATUS_REG); @@ -1345,6 +1458,7 @@ static int advk_pcie_probe(struct platform_device *pdev) struct advk_pcie *pcie; struct resource *res; struct pci_host_bridge *bridge; + struct resource_entry *entry; int ret, irq; bridge = devm_pci_alloc_host_bridge(dev, sizeof(struct advk_pcie)); @@ -1374,6 +1488,80 @@ static int advk_pcie_probe(struct platform_device *pdev) return ret; } + resource_list_for_each_entry(entry, &pcie->resources) { + resource_size_t start = entry->res->start; + resource_size_t size = resource_size(entry->res); + unsigned long type = resource_type(entry->res); + u64 win_size; + + /* + * Aardvark hardware allows to configure also PCIe window + * for config type 0 and type 1 mapping, but driver uses + * only PIO for issuing configuration transfers which does + * not use PCIe window configuration. + */ + if (type != IORESOURCE_MEM && type != IORESOURCE_MEM_64 && + type != IORESOURCE_IO) + continue; + + /* + * Skip transparent memory resources. Default outbound access + * configuration is set to transparent memory access so it + * does not need window configuration. + */ + if ((type == IORESOURCE_MEM || type == IORESOURCE_MEM_64) && + entry->offset == 0) + continue; + + /* + * The n-th PCIe window is configured by tuple (match, remap, mask) + * and an access to address A uses this window if A matches the + * match with given mask. + * So every PCIe window size must be a power of two and every start + * address must be aligned to window size. Minimal size is 64 KiB + * because lower 16 bits of mask must be zero. Remapped address + * may have set only bits from the mask. + */ + while (pcie->wins_count < OB_WIN_COUNT && size > 0) { + /* Calculate the largest aligned window size */ + win_size = (1ULL << (fls64(size)-1)) | + (start ? (1ULL << __ffs64(start)) : 0); + win_size = 1ULL << __ffs64(win_size); + if (win_size < 0x10000) + break; + + dev_dbg(dev, + "Configuring PCIe window %d: [0x%llx-0x%llx] as %lu\n", + pcie->wins_count, (unsigned long long)start, + (unsigned long long)start + win_size, type); + + if (type == IORESOURCE_IO) { + pcie->wins[pcie->wins_count].actions = OB_WIN_TYPE_IO; + pcie->wins[pcie->wins_count].match = pci_pio_to_address(start); + } else { + pcie->wins[pcie->wins_count].actions = OB_WIN_TYPE_MEM; + pcie->wins[pcie->wins_count].match = start; + } + pcie->wins[pcie->wins_count].remap = start - entry->offset; + pcie->wins[pcie->wins_count].mask = ~(win_size - 1); + + if (pcie->wins[pcie->wins_count].remap & (win_size - 1)) + break; + + start += win_size; + size -= win_size; + pcie->wins_count++; + } + + if (size > 0) { + dev_err(&pcie->pdev->dev, + "Invalid PCIe region [0x%llx-0x%llx]\n", + (unsigned long long)entry->res->start, + (unsigned long long)entry->res->end + 1); + return -EINVAL; + } + } + pcie->reset_gpio = devm_gpiod_get_from_of_node(dev, dev->of_node, "reset-gpios", 0, GPIOD_OUT_LOW, From 4cf23a5d3cb7ac8a2522b79f2e6c0627f4780d51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 25 Nov 2021 01:26:07 +0100 Subject: [PATCH 1827/5344] PCI: aardvark: Fix PCIe Max Payload Size setting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit a4e17d65dafdd3513042d8f00404c9b6068a825c upstream. Change PCIe Max Payload Size setting in PCIe Device Control register to 512 bytes to align with PCIe Link Initialization sequence as defined in Marvell Armada 3700 Functional Specification. According to the specification, maximal Max Payload Size supported by this device is 512 bytes. Without this kernel prints suspicious line: pci 0000:01:00.0: Upstream bridge's Max Payload Size set to 256 (was 16384, max 512) With this change it changes to: pci 0000:01:00.0: Upstream bridge's Max Payload Size set to 256 (was 512, max 512) Link: https://lore.kernel.org/r/20211005180952.6812-3-kabel@kernel.org Fixes: 8c39d710363c ("PCI: aardvark: Add Aardvark PCI host controller driver") Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marek Behún Cc: stable@vger.kernel.org Signed-off-by: Marek Behún Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-aardvark.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 4dabeaacd7be9..47becbaf21038 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -565,8 +565,9 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) reg = advk_readl(pcie, PCIE_CORE_PCIEXP_CAP + PCI_EXP_DEVCTL); reg &= ~PCI_EXP_DEVCTL_RELAX_EN; reg &= ~PCI_EXP_DEVCTL_NOSNOOP_EN; + reg &= ~PCI_EXP_DEVCTL_PAYLOAD; reg &= ~PCI_EXP_DEVCTL_READRQ; - reg |= PCI_EXP_DEVCTL_PAYLOAD; /* Set max payload size */ + reg |= PCI_EXP_DEVCTL_PAYLOAD_512B; reg |= PCI_EXP_DEVCTL_READRQ_512B; advk_writel(pcie, reg, PCIE_CORE_PCIEXP_CAP + PCI_EXP_DEVCTL); From 4595fb47467875127d3ce9f112639e94d30a008e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 25 Nov 2021 01:26:09 +0100 Subject: [PATCH 1828/5344] PCI: aardvark: Implement re-issuing config requests on CRS response MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 223dec14a05337a4155f1deed46d2becce4d00fd upstream. Commit 43f5c77bcbd2 ("PCI: aardvark: Fix reporting CRS value") fixed handling of CRS response and when CRSSVE flag was not enabled it marked CRS response as failed transaction (due to simplicity). But pci-aardvark.c driver is already waiting up to the PIO_RETRY_CNT count for PIO config response and so we can with a small change implement re-issuing of config requests as described in PCIe base specification. This change implements re-issuing of config requests when response is CRS. Set upper bound of wait cycles to around PIO_RETRY_CNT, afterwards the transaction is marked as failed and an all-ones value is returned as before. We do this by returning appropriate error codes from function advk_pcie_check_pio_status(). On CRS we return -EAGAIN and caller then reissues transaction. Link: https://lore.kernel.org/r/20211005180952.6812-10-kabel@kernel.org Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marek Behún Signed-off-by: Marek Behún Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-aardvark.c | 67 +++++++++++++++++---------- 1 file changed, 43 insertions(+), 24 deletions(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 47becbaf21038..bd5d309232a99 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -666,6 +666,7 @@ static int advk_pcie_check_pio_status(struct advk_pcie *pcie, bool allow_crs, u3 u32 reg; unsigned int status; char *strcomp_status, *str_posted; + int ret; reg = advk_readl(pcie, PIO_STAT); status = (reg & PIO_COMPLETION_STATUS_MASK) >> @@ -690,6 +691,7 @@ static int advk_pcie_check_pio_status(struct advk_pcie *pcie, bool allow_crs, u3 case PIO_COMPLETION_STATUS_OK: if (reg & PIO_ERR_STATUS) { strcomp_status = "COMP_ERR"; + ret = -EFAULT; break; } /* Get the read result */ @@ -697,9 +699,11 @@ static int advk_pcie_check_pio_status(struct advk_pcie *pcie, bool allow_crs, u3 *val = advk_readl(pcie, PIO_RD_DATA); /* No error */ strcomp_status = NULL; + ret = 0; break; case PIO_COMPLETION_STATUS_UR: strcomp_status = "UR"; + ret = -EOPNOTSUPP; break; case PIO_COMPLETION_STATUS_CRS: if (allow_crs && val) { @@ -717,6 +721,7 @@ static int advk_pcie_check_pio_status(struct advk_pcie *pcie, bool allow_crs, u3 */ *val = CFG_RD_CRS_VAL; strcomp_status = NULL; + ret = 0; break; } /* PCIe r4.0, sec 2.3.2, says: @@ -732,21 +737,24 @@ static int advk_pcie_check_pio_status(struct advk_pcie *pcie, bool allow_crs, u3 * Request and taking appropriate action, e.g., complete the * Request to the host as a failed transaction. * - * To simplify implementation do not re-issue the Configuration - * Request and complete the Request as a failed transaction. + * So return -EAGAIN and caller (pci-aardvark.c driver) will + * re-issue request again up to the PIO_RETRY_CNT retries. */ strcomp_status = "CRS"; + ret = -EAGAIN; break; case PIO_COMPLETION_STATUS_CA: strcomp_status = "CA"; + ret = -ECANCELED; break; default: strcomp_status = "Unknown"; + ret = -EINVAL; break; } if (!strcomp_status) - return 0; + return ret; if (reg & PIO_NON_POSTED_REQ) str_posted = "Non-posted"; @@ -756,7 +764,7 @@ static int advk_pcie_check_pio_status(struct advk_pcie *pcie, bool allow_crs, u3 dev_dbg(dev, "%s PIO Response Status: %s, %#x @ %#x\n", str_posted, strcomp_status, reg, advk_readl(pcie, PIO_ADDR_LS)); - return -EFAULT; + return ret; } static int advk_pcie_wait_pio(struct advk_pcie *pcie) @@ -764,13 +772,13 @@ static int advk_pcie_wait_pio(struct advk_pcie *pcie) struct device *dev = &pcie->pdev->dev; int i; - for (i = 0; i < PIO_RETRY_CNT; i++) { + for (i = 1; i <= PIO_RETRY_CNT; i++) { u32 start, isr; start = advk_readl(pcie, PIO_START); isr = advk_readl(pcie, PIO_ISR); if (!start && isr) - return 0; + return i; udelay(PIO_RETRY_DELAY); } @@ -974,6 +982,7 @@ static int advk_pcie_rd_conf(struct pci_bus *bus, u32 devfn, int where, int size, u32 *val) { struct advk_pcie *pcie = bus->sysdata; + int retry_count; bool allow_crs; u32 reg; int ret; @@ -1016,16 +1025,22 @@ static int advk_pcie_rd_conf(struct pci_bus *bus, u32 devfn, /* Program the data strobe */ advk_writel(pcie, 0xf, PIO_WR_DATA_STRB); - /* Clear PIO DONE ISR and start the transfer */ - advk_writel(pcie, 1, PIO_ISR); - advk_writel(pcie, 1, PIO_START); + retry_count = 0; + do { + /* Clear PIO DONE ISR and start the transfer */ + advk_writel(pcie, 1, PIO_ISR); + advk_writel(pcie, 1, PIO_START); - ret = advk_pcie_wait_pio(pcie); - if (ret < 0) - goto try_crs; + ret = advk_pcie_wait_pio(pcie); + if (ret < 0) + goto try_crs; + + retry_count += ret; + + /* Check PIO status and get the read result */ + ret = advk_pcie_check_pio_status(pcie, allow_crs, val); + } while (ret == -EAGAIN && retry_count < PIO_RETRY_CNT); - /* Check PIO status and get the read result */ - ret = advk_pcie_check_pio_status(pcie, allow_crs, val); if (ret < 0) goto fail; @@ -1057,6 +1072,7 @@ static int advk_pcie_wr_conf(struct pci_bus *bus, u32 devfn, struct advk_pcie *pcie = bus->sysdata; u32 reg; u32 data_strobe = 0x0; + int retry_count; int offset; int ret; @@ -1098,19 +1114,22 @@ static int advk_pcie_wr_conf(struct pci_bus *bus, u32 devfn, /* Program the data strobe */ advk_writel(pcie, data_strobe, PIO_WR_DATA_STRB); - /* Clear PIO DONE ISR and start the transfer */ - advk_writel(pcie, 1, PIO_ISR); - advk_writel(pcie, 1, PIO_START); + retry_count = 0; + do { + /* Clear PIO DONE ISR and start the transfer */ + advk_writel(pcie, 1, PIO_ISR); + advk_writel(pcie, 1, PIO_START); - ret = advk_pcie_wait_pio(pcie); - if (ret < 0) - return PCIBIOS_SET_FAILED; + ret = advk_pcie_wait_pio(pcie); + if (ret < 0) + return PCIBIOS_SET_FAILED; - ret = advk_pcie_check_pio_status(pcie, false, NULL); - if (ret < 0) - return PCIBIOS_SET_FAILED; + retry_count += ret; - return PCIBIOS_SUCCESSFUL; + ret = advk_pcie_check_pio_status(pcie, false, NULL); + } while (ret == -EAGAIN && retry_count < PIO_RETRY_CNT); + + return ret < 0 ? PCIBIOS_SET_FAILED : PCIBIOS_SUCCESSFUL; } static struct pci_ops advk_pcie_ops = { From ceee0d9514be138b134f0d69a8c2972f5f1c7c8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 25 Nov 2021 01:26:10 +0100 Subject: [PATCH 1829/5344] PCI: aardvark: Simplify initialization of rootcap on virtual bridge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 454c53271fc11f3aa5e44e41fd99ca181bd32c62 upstream. PCIe config space can be initialized also before pci_bridge_emul_init() call, so move rootcap initialization after PCI config space initialization. This simplifies the function a little since it removes one if (ret < 0) check. Link: https://lore.kernel.org/r/20211005180952.6812-11-kabel@kernel.org Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marek Behún Signed-off-by: Marek Behún Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-aardvark.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index bd5d309232a99..bb1d038346bdf 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -898,7 +898,6 @@ static struct pci_bridge_emul_ops advk_pci_bridge_emul_ops = { static int advk_sw_pci_bridge_init(struct advk_pcie *pcie) { struct pci_bridge_emul *bridge = &pcie->bridge; - int ret; bridge->conf.vendor = cpu_to_le16(advk_readl(pcie, PCIE_CORE_DEV_ID_REG) & 0xffff); @@ -918,19 +917,14 @@ static int advk_sw_pci_bridge_init(struct advk_pcie *pcie) /* Support interrupt A for MSI feature */ bridge->conf.intpin = PCIE_CORE_INT_A_ASSERT_ENABLE; + /* Indicates supports for Completion Retry Status */ + bridge->pcie_conf.rootcap = cpu_to_le16(PCI_EXP_RTCAP_CRSVIS); + bridge->has_pcie = true; bridge->data = pcie; bridge->ops = &advk_pci_bridge_emul_ops; - /* PCIe config space can be initialized after pci_bridge_emul_init() */ - ret = pci_bridge_emul_init(bridge, 0); - if (ret < 0) - return ret; - - /* Indicates supports for Completion Retry Status */ - bridge->pcie_conf.rootcap = cpu_to_le16(PCI_EXP_RTCAP_CRSVIS); - - return 0; + return pci_bridge_emul_init(bridge, 0); } static bool advk_pcie_valid_device(struct advk_pcie *pcie, struct pci_bus *bus, From 6289d79376052a57207e84cd330b073ce1debd62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 25 Nov 2021 01:26:11 +0100 Subject: [PATCH 1830/5344] PCI: aardvark: Fix link training MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit f76b36d40beee0a13aa8f6aa011df0d7cbbb8a7f upstream. Fix multiple link training issues in aardvark driver. The main reason of these issues was misunderstanding of what certain registers do, since their names and comments were misleading: before commit 96be36dbffac ("PCI: aardvark: Replace custom macros by standard linux/pci_regs.h macros"), the pci-aardvark.c driver used custom macros for accessing standard PCIe Root Bridge registers, and misleading comments did not help to understand what the code was really doing. After doing more tests and experiments I've come to the conclusion that the SPEED_GEN register in aardvark sets the PCIe revision / generation compliance and forces maximal link speed. Both GEN3 and GEN2 values set the read-only PCI_EXP_FLAGS_VERS bits (PCIe capabilities version of Root Bridge) to value 2, while GEN1 value sets PCI_EXP_FLAGS_VERS to 1, which matches with PCI Express specifications revisions 3, 2 and 1 respectively. Changing SPEED_GEN also sets the read-only bits PCI_EXP_LNKCAP_SLS and PCI_EXP_LNKCAP2_SLS to corresponding speed. (Note that PCI Express rev 1 specification does not define PCI_EXP_LNKCAP2 and PCI_EXP_LNKCTL2 registers and when SPEED_GEN is set to GEN1 (which also sets PCI_EXP_FLAGS_VERS set to 1), lspci cannot access PCI_EXP_LNKCAP2 and PCI_EXP_LNKCTL2 registers.) Changing PCIe link speed can be done via PCI_EXP_LNKCTL2_TLS bits of PCI_EXP_LNKCTL2 register. Armada 3700 Functional Specifications says that the default value of PCI_EXP_LNKCTL2_TLS is based on SPEED_GEN value, but tests showed that the default value is always 8.0 GT/s, independently of speed set by SPEED_GEN. So after setting SPEED_GEN, we must also set value in PCI_EXP_LNKCTL2 register via PCI_EXP_LNKCTL2_TLS bits. Triggering PCI_EXP_LNKCTL_RL bit immediately after setting LINK_TRAINING_EN bit actually doesn't do anything. Tests have shown that a delay is needed after enabling LINK_TRAINING_EN bit. As triggering PCI_EXP_LNKCTL_RL currently does nothing, remove it. Commit 43fc679ced18 ("PCI: aardvark: Improve link training") introduced code which sets SPEED_GEN register based on negotiated link speed from PCI_EXP_LNKSTA_CLS bits of PCI_EXP_LNKSTA register. This code was added to fix detection of Compex WLE900VX (Atheros QCA9880) WiFi GEN1 PCIe cards, as otherwise these cards were "invisible" on PCIe bus (probably because they crashed). But apparently more people reported the same issues with these cards also with other PCIe controllers [1] and I was able to reproduce this issue also with other "noname" WiFi cards based on Atheros QCA9890 chip (with the same PCI vendor/device ids as Atheros QCA9880). So this is not an issue in aardvark but rather an issue in Atheros QCA98xx chips. Also, this issue only exists if the kernel is compiled with PCIe ASPM support, and a generic workaround for this is to change PCIe Bridge to 2.5 GT/s link speed via PCI_EXP_LNKCTL2_TLS_2_5GT bits in PCI_EXP_LNKCTL2 register [2], before triggering PCI_EXP_LNKCTL_RL bit. This workaround also works when SPEED_GEN is set to value GEN2 (5 GT/s). So remove this hack completely in the aardvark driver and always set SPEED_GEN to value from 'max-link-speed' DT property. Fix for Atheros QCA98xx chips is handled separately by patch [2]. These two things (code for triggering PCI_EXP_LNKCTL_RL bit and changing SPEED_GEN value) also explain why commit 6964494582f5 ("PCI: aardvark: Train link immediately after enabling training") somehow fixed detection of those problematic Compex cards with Atheros chips: if triggering link retraining (via PCI_EXP_LNKCTL_RL bit) was done immediately after enabling link training (via LINK_TRAINING_EN), it did nothing. If there was a specific delay, aardvark HW already initialized PCIe link and therefore triggering link retraining caused the above issue. Compex cards triggered link down event and disappeared from the PCIe bus. Commit f4c7d053d7f7 ("PCI: aardvark: Wait for endpoint to be ready before training link") added 100ms sleep before calling 'Start link training' command and explained that it is a requirement of PCI Express specification. But the code after this 100ms sleep was not doing 'Start link training', rather it triggered PCI_EXP_LNKCTL_RL bit via PCIe Root Bridge to put link into Recovery state. The required delay after fundamental reset is already done in function advk_pcie_wait_for_link() which also checks whether PCIe link is up. So after removing the code which triggers PCI_EXP_LNKCTL_RL bit on PCIe Root Bridge, there is no need to wait 100ms again. Remove the extra msleep() call and update comment about the delay required by the PCI Express specification. According to Marvell Armada 3700 Functional Specifications, Link training should be enabled via aardvark register LINK_TRAINING_EN after selecting PCIe generation and x1 lane. There is no need to disable it prior resetting card via PERST# signal. This disabling code was introduced in commit 5169a9851daa ("PCI: aardvark: Issue PERST via GPIO") as a workaround for some Atheros cards. It turns out that this also is Atheros specific issue and affects any PCIe controller, not only aardvark. Moreover this Atheros issue was triggered by juggling with PCI_EXP_LNKCTL_RL, LINK_TRAINING_EN and SPEED_GEN bits interleaved with sleeps. Now, after removing triggering PCI_EXP_LNKCTL_RL, there is no need to explicitly disable LINK_TRAINING_EN bit. So remove this code too. The problematic Compex cards described in previous git commits are correctly detected in advk_pcie_train_link() function even after applying all these changes. Note that with this patch, and also prior this patch, some NVMe disks which support PCIe GEN3 with 8 GT/s speed are negotiated only at the lowest link speed 2.5 GT/s, independently of SPEED_GEN value. After manually triggering PCI_EXP_LNKCTL_RL bit (e.g. from userspace via setpci), these NVMe disks change link speed to 5 GT/s when SPEED_GEN was configured to GEN2. This issue first needs to be properly investigated. I will send a fix in the future. On the other hand, some other GEN2 PCIe cards with 5 GT/s speed are autonomously by HW autonegotiated at full 5 GT/s speed without need of any software interaction. Armada 3700 Functional Specifications describes the following steps for link training: set SPEED_GEN to GEN2, enable LINK_TRAINING_EN, poll until link training is complete, trigger PCI_EXP_LNKCTL_RL, poll until signal rate is 5 GT/s, poll until link training is complete, enable ASPM L0s. The requirement for triggering PCI_EXP_LNKCTL_RL can be explained by the need to achieve 5 GT/s speed (as changing link speed is done by throw to recovery state entered by PCI_EXP_LNKCTL_RL) or maybe as a part of enabling ASPM L0s (but in this case ASPM L0s should have been enabled prior PCI_EXP_LNKCTL_RL). It is unknown why the original pci-aardvark.c driver was triggering PCI_EXP_LNKCTL_RL bit before waiting for the link to be up. This does not align with neither PCIe base specifications nor with Armada 3700 Functional Specification. (Note that in older versions of aardvark, this bit was called incorrectly PCIE_CORE_LINK_TRAINING, so this may be the reason.) It is also unknown why Armada 3700 Functional Specification says that it is needed to trigger PCI_EXP_LNKCTL_RL for GEN2 mode, as according to PCIe base specification 5 GT/s speed negotiation is supposed to be entirely autonomous, even if initial speed is 2.5 GT/s. [1] - https://lore.kernel.org/linux-pci/87h7l8axqp.fsf@toke.dk/ [2] - https://lore.kernel.org/linux-pci/20210326124326.21163-1-pali@kernel.org/ Link: https://lore.kernel.org/r/20211005180952.6812-12-kabel@kernel.org Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marek Behún Signed-off-by: Marek Behún Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-aardvark.c | 117 ++++++++------------------ 1 file changed, 34 insertions(+), 83 deletions(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index bb1d038346bdf..5898e55426c4c 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -303,11 +303,6 @@ static inline u32 advk_readl(struct advk_pcie *pcie, u64 reg) return readl(pcie->base + reg); } -static inline u16 advk_read16(struct advk_pcie *pcie, u64 reg) -{ - return advk_readl(pcie, (reg & ~0x3)) >> ((reg & 0x3) * 8); -} - static u8 advk_pcie_ltssm_state(struct advk_pcie *pcie) { u32 val; @@ -381,23 +376,9 @@ static void advk_pcie_wait_for_retrain(struct advk_pcie *pcie) static void advk_pcie_issue_perst(struct advk_pcie *pcie) { - u32 reg; - if (!pcie->reset_gpio) return; - /* - * As required by PCI Express spec (PCI Express Base Specification, REV. - * 4.0 PCI Express, February 19 2014, 6.6.1 Conventional Reset) a delay - * for at least 100ms after de-asserting PERST# signal is needed before - * link training is enabled. So ensure that link training is disabled - * prior de-asserting PERST# signal to fulfill that PCI Express spec - * requirement. - */ - reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG); - reg &= ~LINK_TRAINING_EN; - advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG); - /* 10ms delay is needed for some cards */ dev_info(&pcie->pdev->dev, "issuing PERST via reset GPIO for 10ms\n"); gpiod_set_value_cansleep(pcie->reset_gpio, 1); @@ -405,53 +386,46 @@ static void advk_pcie_issue_perst(struct advk_pcie *pcie) gpiod_set_value_cansleep(pcie->reset_gpio, 0); } -static int advk_pcie_train_at_gen(struct advk_pcie *pcie, int gen) +static void advk_pcie_train_link(struct advk_pcie *pcie) { - int ret, neg_gen; + struct device *dev = &pcie->pdev->dev; u32 reg; + int ret; - /* Setup link speed */ + /* + * Setup PCIe rev / gen compliance based on device tree property + * 'max-link-speed' which also forces maximal link speed. + */ reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG); reg &= ~PCIE_GEN_SEL_MSK; - if (gen == 3) + if (pcie->link_gen == 3) reg |= SPEED_GEN_3; - else if (gen == 2) + else if (pcie->link_gen == 2) reg |= SPEED_GEN_2; else reg |= SPEED_GEN_1; advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG); /* - * Enable link training. This is not needed in every call to this - * function, just once suffices, but it does not break anything either. + * Set maximal link speed value also into PCIe Link Control 2 register. + * Armada 3700 Functional Specification says that default value is based + * on SPEED_GEN but tests showed that default value is always 8.0 GT/s. */ + reg = advk_readl(pcie, PCIE_CORE_PCIEXP_CAP + PCI_EXP_LNKCTL2); + reg &= ~PCI_EXP_LNKCTL2_TLS; + if (pcie->link_gen == 3) + reg |= PCI_EXP_LNKCTL2_TLS_8_0GT; + else if (pcie->link_gen == 2) + reg |= PCI_EXP_LNKCTL2_TLS_5_0GT; + else + reg |= PCI_EXP_LNKCTL2_TLS_2_5GT; + advk_writel(pcie, reg, PCIE_CORE_PCIEXP_CAP + PCI_EXP_LNKCTL2); + + /* Enable link training after selecting PCIe generation */ reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG); reg |= LINK_TRAINING_EN; advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG); - /* - * Start link training immediately after enabling it. - * This solves problems for some buggy cards. - */ - reg = advk_readl(pcie, PCIE_CORE_PCIEXP_CAP + PCI_EXP_LNKCTL); - reg |= PCI_EXP_LNKCTL_RL; - advk_writel(pcie, reg, PCIE_CORE_PCIEXP_CAP + PCI_EXP_LNKCTL); - - ret = advk_pcie_wait_for_link(pcie); - if (ret) - return ret; - - reg = advk_read16(pcie, PCIE_CORE_PCIEXP_CAP + PCI_EXP_LNKSTA); - neg_gen = reg & PCI_EXP_LNKSTA_CLS; - - return neg_gen; -} - -static void advk_pcie_train_link(struct advk_pcie *pcie) -{ - struct device *dev = &pcie->pdev->dev; - int neg_gen = -1, gen; - /* * Reset PCIe card via PERST# signal. Some cards are not detected * during link training when they are in some non-initial state. @@ -462,41 +436,18 @@ static void advk_pcie_train_link(struct advk_pcie *pcie) * PERST# signal could have been asserted by pinctrl subsystem before * probe() callback has been called or issued explicitly by reset gpio * function advk_pcie_issue_perst(), making the endpoint going into - * fundamental reset. As required by PCI Express spec a delay for at - * least 100ms after such a reset before link training is needed. + * fundamental reset. As required by PCI Express spec (PCI Express + * Base Specification, REV. 4.0 PCI Express, February 19 2014, 6.6.1 + * Conventional Reset) a delay for at least 100ms after such a reset + * before sending a Configuration Request to the device is needed. + * So wait until PCIe link is up. Function advk_pcie_wait_for_link() + * waits for link at least 900ms. */ - msleep(PCI_PM_D3COLD_WAIT); - - /* - * Try link training at link gen specified by device tree property - * 'max-link-speed'. If this fails, iteratively train at lower gen. - */ - for (gen = pcie->link_gen; gen > 0; --gen) { - neg_gen = advk_pcie_train_at_gen(pcie, gen); - if (neg_gen > 0) - break; - } - - if (neg_gen < 0) - goto err; - - /* - * After successful training if negotiated gen is lower than requested, - * train again on negotiated gen. This solves some stability issues for - * some buggy gen1 cards. - */ - if (neg_gen < gen) { - gen = neg_gen; - neg_gen = advk_pcie_train_at_gen(pcie, gen); - } - - if (neg_gen == gen) { - dev_info(dev, "link up at gen %i\n", gen); - return; - } - -err: - dev_err(dev, "link never came up\n"); + ret = advk_pcie_wait_for_link(pcie); + if (ret < 0) + dev_err(dev, "link never came up\n"); + else + dev_info(dev, "link up\n"); } /* From c531f69637a78afd67a03d8893db530065184f38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 25 Nov 2021 01:26:12 +0100 Subject: [PATCH 1831/5344] PCI: aardvark: Fix support for bus mastering and PCI_COMMAND on emulated bridge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 771153fc884f566a89af2d30033b7f3bc6e24e84 upstream. >From very vague, ambiguous and incomplete information from Marvell we deduced that the 32-bit Aardvark register at address 0x4 (PCIE_CORE_CMD_STATUS_REG), which is not documented for Root Complex mode in the Functional Specification (only for Endpoint mode), controls two 16-bit PCIe registers: Command Register and Status Registers of PCIe Root Port. This means that bit 2 controls bus mastering and forwarding of memory and I/O requests in the upstream direction. According to PCI specifications bits [0:2] of Command Register, this should be by default disabled on reset. So explicitly disable these bits at early setup of the Aardvark driver. Remove code which unconditionally enables all 3 bits and let kernel code (via pci_set_master() function) to handle bus mastering of Root PCIe Bridge via emulated PCI_COMMAND on emulated bridge. Link: https://lore.kernel.org/r/20211028185659.20329-5-kabel@kernel.org Fixes: 8a3ebd8de328 ("PCI: aardvark: Implement emulated root PCI bridge config space") Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org # b2a56469d550 ("PCI: aardvark: Add FIXME comment for PCIE_CORE_CMD_STATUS_REG access") Signed-off-by: Marek Behún Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-aardvark.c | 47 ++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 9 deletions(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 5898e55426c4c..2fe6b3aa89194 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -27,9 +27,6 @@ /* PCIe core registers */ #define PCIE_CORE_DEV_ID_REG 0x0 #define PCIE_CORE_CMD_STATUS_REG 0x4 -#define PCIE_CORE_CMD_IO_ACCESS_EN BIT(0) -#define PCIE_CORE_CMD_MEM_ACCESS_EN BIT(1) -#define PCIE_CORE_CMD_MEM_IO_REQ_EN BIT(2) #define PCIE_CORE_DEV_REV_REG 0x8 #define PCIE_CORE_PCIEXP_CAP 0xc0 #define PCIE_CORE_ERR_CAPCTL_REG 0x118 @@ -505,6 +502,11 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) reg = (PCI_VENDOR_ID_MARVELL << 16) | PCI_VENDOR_ID_MARVELL; advk_writel(pcie, reg, VENDOR_ID_REG); + /* Disable Root Bridge I/O space, memory space and bus mastering */ + reg = advk_readl(pcie, PCIE_CORE_CMD_STATUS_REG); + reg &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER); + advk_writel(pcie, reg, PCIE_CORE_CMD_STATUS_REG); + /* Set Advanced Error Capabilities and Control PF0 register */ reg = PCIE_CORE_ERR_CAPCTL_ECRC_CHK_TX | PCIE_CORE_ERR_CAPCTL_ECRC_CHK_TX_EN | @@ -603,12 +605,6 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) advk_pcie_disable_ob_win(pcie, i); advk_pcie_train_link(pcie); - - reg = advk_readl(pcie, PCIE_CORE_CMD_STATUS_REG); - reg |= PCIE_CORE_CMD_MEM_ACCESS_EN | - PCIE_CORE_CMD_IO_ACCESS_EN | - PCIE_CORE_CMD_MEM_IO_REQ_EN; - advk_writel(pcie, reg, PCIE_CORE_CMD_STATUS_REG); } static int advk_pcie_check_pio_status(struct advk_pcie *pcie, bool allow_crs, u32 *val) @@ -737,6 +733,37 @@ static int advk_pcie_wait_pio(struct advk_pcie *pcie) return -ETIMEDOUT; } +static pci_bridge_emul_read_status_t +advk_pci_bridge_emul_base_conf_read(struct pci_bridge_emul *bridge, + int reg, u32 *value) +{ + struct advk_pcie *pcie = bridge->data; + + switch (reg) { + case PCI_COMMAND: + *value = advk_readl(pcie, PCIE_CORE_CMD_STATUS_REG); + return PCI_BRIDGE_EMUL_HANDLED; + + default: + return PCI_BRIDGE_EMUL_NOT_HANDLED; + } +} + +static void +advk_pci_bridge_emul_base_conf_write(struct pci_bridge_emul *bridge, + int reg, u32 old, u32 new, u32 mask) +{ + struct advk_pcie *pcie = bridge->data; + + switch (reg) { + case PCI_COMMAND: + advk_writel(pcie, new, PCIE_CORE_CMD_STATUS_REG); + break; + + default: + break; + } +} static pci_bridge_emul_read_status_t advk_pci_bridge_emul_pcie_conf_read(struct pci_bridge_emul *bridge, @@ -838,6 +865,8 @@ advk_pci_bridge_emul_pcie_conf_write(struct pci_bridge_emul *bridge, } static struct pci_bridge_emul_ops advk_pci_bridge_emul_ops = { + .read_base = advk_pci_bridge_emul_base_conf_read, + .write_base = advk_pci_bridge_emul_base_conf_write, .read_pcie = advk_pci_bridge_emul_pcie_conf_read, .write_pcie = advk_pci_bridge_emul_pcie_conf_write, }; From 330f84f3ff00a5934ed15d8e86ac0671e56d7d48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 25 Nov 2021 01:26:13 +0100 Subject: [PATCH 1832/5344] PCI: aardvark: Set PCI Bridge Class Code to PCI Bridge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 84e1b4045dc887b78bdc87d92927093dc3a465aa upstream. Aardvark controller has something like config space of a Root Port available at offset 0x0 of internal registers - these registers are used for implementation of the emulated bridge. The default value of Class Code of this bridge corresponds to a RAID Mass storage controller, though. (This is probably intended for when the controller is used as Endpoint.) Change the Class Code to correspond to a PCI Bridge. Add comment explaining this change. Link: https://lore.kernel.org/r/20211028185659.20329-6-kabel@kernel.org Fixes: 8a3ebd8de328 ("PCI: aardvark: Implement emulated root PCI bridge config space") Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org Signed-off-by: Marek Behún Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-aardvark.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 2fe6b3aa89194..3265709bfba96 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -502,6 +502,26 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) reg = (PCI_VENDOR_ID_MARVELL << 16) | PCI_VENDOR_ID_MARVELL; advk_writel(pcie, reg, VENDOR_ID_REG); + /* + * Change Class Code of PCI Bridge device to PCI Bridge (0x600400), + * because the default value is Mass storage controller (0x010400). + * + * Note that this Aardvark PCI Bridge does not have compliant Type 1 + * Configuration Space and it even cannot be accessed via Aardvark's + * PCI config space access method. Something like config space is + * available in internal Aardvark registers starting at offset 0x0 + * and is reported as Type 0. In range 0x10 - 0x34 it has totally + * different registers. + * + * Therefore driver uses emulation of PCI Bridge which emulates + * access to configuration space via internal Aardvark registers or + * emulated configuration buffer. + */ + reg = advk_readl(pcie, PCIE_CORE_DEV_REV_REG); + reg &= ~0xffffff00; + reg |= (PCI_CLASS_BRIDGE_PCI << 8) << 8; + advk_writel(pcie, reg, PCIE_CORE_DEV_REV_REG); + /* Disable Root Bridge I/O space, memory space and bus mastering */ reg = advk_readl(pcie, PCIE_CORE_CMD_STATUS_REG); reg &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER); From 82e99870893a531be4be5e3d3386e5b01a66411d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 25 Nov 2021 01:26:14 +0100 Subject: [PATCH 1833/5344] PCI: aardvark: Fix support for PCI_BRIDGE_CTL_BUS_RESET on emulated bridge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit bc4fac42e5f8460af09c0a7f2f1915be09e20c71 upstream. Aardvark supports PCIe Hot Reset via PCIE_CORE_CTRL1_REG. Use it for implementing PCI_BRIDGE_CTL_BUS_RESET bit of PCI_BRIDGE_CONTROL register on emulated bridge. With this, the function pci_reset_secondary_bus() starts working and can reset connected PCIe card. Custom userspace script [1] which uses setpci can trigger PCIe Hot Reset and reset the card manually. [1] https://alexforencich.com/wiki/en/pcie/hot-reset-linux Link: https://lore.kernel.org/r/20211028185659.20329-7-kabel@kernel.org Fixes: 8a3ebd8de328 ("PCI: aardvark: Implement emulated root PCI bridge config space") Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org Signed-off-by: Marek Behún Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pci-aardvark.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 3265709bfba96..9e208294946cd 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -764,6 +764,22 @@ advk_pci_bridge_emul_base_conf_read(struct pci_bridge_emul *bridge, *value = advk_readl(pcie, PCIE_CORE_CMD_STATUS_REG); return PCI_BRIDGE_EMUL_HANDLED; + case PCI_INTERRUPT_LINE: { + /* + * From the whole 32bit register we support reading from HW only + * one bit: PCI_BRIDGE_CTL_BUS_RESET. + * Other bits are retrieved only from emulated config buffer. + */ + __le32 *cfgspace = (__le32 *)&bridge->conf; + u32 val = le32_to_cpu(cfgspace[PCI_INTERRUPT_LINE / 4]); + if (advk_readl(pcie, PCIE_CORE_CTRL1_REG) & HOT_RESET_GEN) + val |= PCI_BRIDGE_CTL_BUS_RESET << 16; + else + val &= ~(PCI_BRIDGE_CTL_BUS_RESET << 16); + *value = val; + return PCI_BRIDGE_EMUL_HANDLED; + } + default: return PCI_BRIDGE_EMUL_NOT_HANDLED; } @@ -780,6 +796,17 @@ advk_pci_bridge_emul_base_conf_write(struct pci_bridge_emul *bridge, advk_writel(pcie, new, PCIE_CORE_CMD_STATUS_REG); break; + case PCI_INTERRUPT_LINE: + if (mask & (PCI_BRIDGE_CTL_BUS_RESET << 16)) { + u32 val = advk_readl(pcie, PCIE_CORE_CTRL1_REG); + if (new & (PCI_BRIDGE_CTL_BUS_RESET << 16)) + val |= HOT_RESET_GEN; + else + val &= ~HOT_RESET_GEN; + advk_writel(pcie, val, PCIE_CORE_CTRL1_REG); + } + break; + default: break; } From a1dece0a6e9bf0ac7b382e2aa0a8c88609ab68e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Thu, 25 Nov 2021 01:26:15 +0100 Subject: [PATCH 1834/5344] pinctrl: armada-37xx: Correct PWM pins definitions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit baf8d6899b1e8906dc076ef26cc633e96a8bb0c3 upstream. The PWM pins on North Bridge on Armada 37xx can be configured into PWM or GPIO functions. When in PWM function, each pin can also be configured to drive low on 0 and tri-state on 1 (LED mode). The current definitions handle this by declaring two pin groups for each pin: - group "pwmN" with functions "pwm" and "gpio" - group "ledN_od" ("od" for open drain) with functions "led" and "gpio" This is semantically incorrect. The correct definition for each pin should be one group with three functions: "pwm", "led" and "gpio". Change the "pwmN" groups to support "led" function. Remove "ledN_od" groups. This cannot break backwards compatibility with older device trees: no device tree uses it since there is no PWM driver for this SOC yet. Also "ledN_od" groups are not even documented. Fixes: b835d6953009 ("pinctrl: armada-37xx: swap polarity on LED group") Signed-off-by: Marek Behún Acked-by: Rob Herring Link: https://lore.kernel.org/r/20210719112938.27594-1-kabel@kernel.org Signed-off-by: Linus Walleij Signed-off-by: Marek Behún Signed-off-by: Greg Kroah-Hartman --- .../pinctrl/marvell,armada-37xx-pinctrl.txt | 8 ++++---- drivers/pinctrl/mvebu/pinctrl-armada-37xx.c | 17 ++++++++--------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/Documentation/devicetree/bindings/pinctrl/marvell,armada-37xx-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/marvell,armada-37xx-pinctrl.txt index 38dc56a577604..ecec514b31550 100644 --- a/Documentation/devicetree/bindings/pinctrl/marvell,armada-37xx-pinctrl.txt +++ b/Documentation/devicetree/bindings/pinctrl/marvell,armada-37xx-pinctrl.txt @@ -43,19 +43,19 @@ group emmc_nb group pwm0 - pin 11 (GPIO1-11) - - functions pwm, gpio + - functions pwm, led, gpio group pwm1 - pin 12 - - functions pwm, gpio + - functions pwm, led, gpio group pwm2 - pin 13 - - functions pwm, gpio + - functions pwm, led, gpio group pwm3 - pin 14 - - functions pwm, gpio + - functions pwm, led, gpio group pmic1 - pin 7 diff --git a/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c b/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c index 83e585c5a6132..f56add78d58ce 100644 --- a/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c +++ b/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c @@ -166,10 +166,14 @@ static struct armada_37xx_pin_group armada_37xx_nb_groups[] = { PIN_GRP_GPIO("jtag", 20, 5, BIT(0), "jtag"), PIN_GRP_GPIO("sdio0", 8, 3, BIT(1), "sdio"), PIN_GRP_GPIO("emmc_nb", 27, 9, BIT(2), "emmc"), - PIN_GRP_GPIO("pwm0", 11, 1, BIT(3), "pwm"), - PIN_GRP_GPIO("pwm1", 12, 1, BIT(4), "pwm"), - PIN_GRP_GPIO("pwm2", 13, 1, BIT(5), "pwm"), - PIN_GRP_GPIO("pwm3", 14, 1, BIT(6), "pwm"), + PIN_GRP_GPIO_3("pwm0", 11, 1, BIT(3) | BIT(20), 0, BIT(20), BIT(3), + "pwm", "led"), + PIN_GRP_GPIO_3("pwm1", 12, 1, BIT(4) | BIT(21), 0, BIT(21), BIT(4), + "pwm", "led"), + PIN_GRP_GPIO_3("pwm2", 13, 1, BIT(5) | BIT(22), 0, BIT(22), BIT(5), + "pwm", "led"), + PIN_GRP_GPIO_3("pwm3", 14, 1, BIT(6) | BIT(23), 0, BIT(23), BIT(6), + "pwm", "led"), PIN_GRP_GPIO("pmic1", 7, 1, BIT(7), "pmic"), PIN_GRP_GPIO("pmic0", 6, 1, BIT(8), "pmic"), PIN_GRP_GPIO("i2c2", 2, 2, BIT(9), "i2c"), @@ -183,11 +187,6 @@ static struct armada_37xx_pin_group armada_37xx_nb_groups[] = { PIN_GRP_EXTRA("uart2", 9, 2, BIT(1) | BIT(13) | BIT(14) | BIT(19), BIT(1) | BIT(13) | BIT(14), BIT(1) | BIT(19), 18, 2, "gpio", "uart"), - PIN_GRP_GPIO_2("led0_od", 11, 1, BIT(20), BIT(20), 0, "led"), - PIN_GRP_GPIO_2("led1_od", 12, 1, BIT(21), BIT(21), 0, "led"), - PIN_GRP_GPIO_2("led2_od", 13, 1, BIT(22), BIT(22), 0, "led"), - PIN_GRP_GPIO_2("led3_od", 14, 1, BIT(23), BIT(23), 0, "led"), - }; static struct armada_37xx_pin_group armada_37xx_sb_groups[] = { From 1875a8f680888880f74ce52f9febd8100b2d52a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Thu, 25 Nov 2021 01:26:16 +0100 Subject: [PATCH 1835/5344] arm64: dts: marvell: armada-37xx: Set pcie_reset_pin to gpio function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 715878016984b2617f6c1f177c50039e12e7bd5b upstream. We found out that we are unable to control the PERST# signal via the default pin dedicated to be PERST# pin (GPIO2[3] pin) on A3700 SOC when this pin is in EP_PCIE1_Resetn mode. There is a register in the PCIe register space called PERSTN_GPIO_EN (D0088004[3]), but changing the value of this register does not change the pin output when measuring with voltmeter. We do not know if this is a bug in the SOC, or if it works only when PCIe controller is in a certain state. Commit f4c7d053d7f7 ("PCI: aardvark: Wait for endpoint to be ready before training link") says that when this pin changes pinctrl mode from EP_PCIE1_Resetn to GPIO, the PERST# signal is asserted for a brief moment. So currently the situation is that on A3700 boards the PERST# signal is asserted in U-Boot (because the code in U-Boot issues reset via this pin via GPIO mode), and then in Linux by the obscure and undocumented mechanism described by the above mentioned commit. We want to issue PERST# signal in a known way, therefore this patch changes the pcie_reset_pin function from "pcie" to "gpio" and adds the reset-gpios property to the PCIe node in device tree files of EspressoBin and Armada 3720 Dev Board (Turris Mox device tree already has this property and uDPU does not have a PCIe port). Signed-off-by: Marek Behún Cc: Remi Pommarel Tested-by: Tomasz Maciej Nowak Acked-by: Thomas Petazzoni Signed-off-by: Gregory CLEMENT Signed-off-by: Marek Behún Signed-off-by: Greg Kroah-Hartman --- arch/arm64/boot/dts/marvell/armada-3720-db.dts | 3 +++ arch/arm64/boot/dts/marvell/armada-3720-espressobin.dts | 1 + arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts | 4 ---- arch/arm64/boot/dts/marvell/armada-37xx.dtsi | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arm64/boot/dts/marvell/armada-3720-db.dts b/arch/arm64/boot/dts/marvell/armada-3720-db.dts index f2cc00594d64a..3e5789f372069 100644 --- a/arch/arm64/boot/dts/marvell/armada-3720-db.dts +++ b/arch/arm64/boot/dts/marvell/armada-3720-db.dts @@ -128,6 +128,9 @@ /* CON15(V2.0)/CON17(V1.4) : PCIe / CON15(V2.0)/CON12(V1.4) :mini-PCIe */ &pcie0 { + pinctrl-names = "default"; + pinctrl-0 = <&pcie_reset_pins &pcie_clkreq_pins>; + reset-gpios = <&gpiosb 3 GPIO_ACTIVE_LOW>; status = "okay"; }; diff --git a/arch/arm64/boot/dts/marvell/armada-3720-espressobin.dts b/arch/arm64/boot/dts/marvell/armada-3720-espressobin.dts index 6226e7e809807..a75bb2ea3506d 100644 --- a/arch/arm64/boot/dts/marvell/armada-3720-espressobin.dts +++ b/arch/arm64/boot/dts/marvell/armada-3720-espressobin.dts @@ -59,6 +59,7 @@ phys = <&comphy1 0>; pinctrl-names = "default"; pinctrl-0 = <&pcie_reset_pins &pcie_clkreq_pins>; + reset-gpios = <&gpiosb 3 GPIO_ACTIVE_LOW>; }; /* J6 */ diff --git a/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts b/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts index de0eabff29353..16e73597bb78c 100644 --- a/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts +++ b/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts @@ -127,10 +127,6 @@ }; }; -&pcie_reset_pins { - function = "gpio"; -}; - &pcie0 { pinctrl-names = "default"; pinctrl-0 = <&pcie_reset_pins &pcie_clkreq_pins>; diff --git a/arch/arm64/boot/dts/marvell/armada-37xx.dtsi b/arch/arm64/boot/dts/marvell/armada-37xx.dtsi index c28611c1c251a..3d15e4ab3f53a 100644 --- a/arch/arm64/boot/dts/marvell/armada-37xx.dtsi +++ b/arch/arm64/boot/dts/marvell/armada-37xx.dtsi @@ -318,7 +318,7 @@ pcie_reset_pins: pcie-reset-pins { groups = "pcie1"; - function = "pcie"; + function = "gpio"; }; pcie_clkreq_pins: pcie-clkreq-pins { From a4b5a2257a0ba54a1872d5b6ff4a179d963b6133 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 19 Nov 2021 16:43:58 -0800 Subject: [PATCH 1836/5344] proc/vmcore: fix clearing user buffer by properly using clear_user() commit c1e63117711977cc4295b2ce73de29dd17066c82 upstream. To clear a user buffer we cannot simply use memset, we have to use clear_user(). With a virtio-mem device that registers a vmcore_cb and has some logically unplugged memory inside an added Linux memory block, I can easily trigger a BUG by copying the vmcore via "cp": systemd[1]: Starting Kdump Vmcore Save Service... kdump[420]: Kdump is using the default log level(3). kdump[453]: saving to /sysroot/var/crash/127.0.0.1-2021-11-11-14:59:22/ kdump[458]: saving vmcore-dmesg.txt to /sysroot/var/crash/127.0.0.1-2021-11-11-14:59:22/ kdump[465]: saving vmcore-dmesg.txt complete kdump[467]: saving vmcore BUG: unable to handle page fault for address: 00007f2374e01000 #PF: supervisor write access in kernel mode #PF: error_code(0x0003) - permissions violation PGD 7a523067 P4D 7a523067 PUD 7a528067 PMD 7a525067 PTE 800000007048f867 Oops: 0003 [#1] PREEMPT SMP NOPTI CPU: 0 PID: 468 Comm: cp Not tainted 5.15.0+ #6 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.14.0-27-g64f37cc530f1-prebuilt.qemu.org 04/01/2014 RIP: 0010:read_from_oldmem.part.0.cold+0x1d/0x86 Code: ff ff ff e8 05 ff fe ff e9 b9 e9 7f ff 48 89 de 48 c7 c7 38 3b 60 82 e8 f1 fe fe ff 83 fd 08 72 3c 49 8d 7d 08 4c 89 e9 89 e8 <49> c7 45 00 00 00 00 00 49 c7 44 05 f8 00 00 00 00 48 83 e7 f81 RSP: 0018:ffffc9000073be08 EFLAGS: 00010212 RAX: 0000000000001000 RBX: 00000000002fd000 RCX: 00007f2374e01000 RDX: 0000000000000001 RSI: 00000000ffffdfff RDI: 00007f2374e01008 RBP: 0000000000001000 R08: 0000000000000000 R09: ffffc9000073bc50 R10: ffffc9000073bc48 R11: ffffffff829461a8 R12: 000000000000f000 R13: 00007f2374e01000 R14: 0000000000000000 R15: ffff88807bd421e8 FS: 00007f2374e12140(0000) GS:ffff88807f000000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f2374e01000 CR3: 000000007a4aa000 CR4: 0000000000350eb0 Call Trace: read_vmcore+0x236/0x2c0 proc_reg_read+0x55/0xa0 vfs_read+0x95/0x190 ksys_read+0x4f/0xc0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae Some x86-64 CPUs have a CPU feature called "Supervisor Mode Access Prevention (SMAP)", which is used to detect wrong access from the kernel to user buffers like this: SMAP triggers a permissions violation on wrong access. In the x86-64 variant of clear_user(), SMAP is properly handled via clac()+stac(). To fix, properly use clear_user() when we're dealing with a user buffer. Link: https://lkml.kernel.org/r/20211112092750.6921-1-david@redhat.com Fixes: 997c136f518c ("fs/proc/vmcore.c: add hook to read_from_oldmem() to check for non-ram pages") Signed-off-by: David Hildenbrand Acked-by: Baoquan He Cc: Dave Young Cc: Baoquan He Cc: Vivek Goyal Cc: Philipp Rudo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: David Hildenbrand Signed-off-by: Greg Kroah-Hartman --- fs/proc/vmcore.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 080ca9d5eccbb..b1102a31a1085 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -125,9 +125,13 @@ ssize_t read_from_oldmem(char *buf, size_t count, nr_bytes = count; /* If pfn is not ram, return zeros for sparse dump files */ - if (pfn_is_ram(pfn) == 0) - memset(buf, 0, nr_bytes); - else { + if (pfn_is_ram(pfn) == 0) { + tmp = 0; + if (!userbuf) + memset(buf, 0, nr_bytes); + else if (clear_user(buf, nr_bytes)) + tmp = -EFAULT; + } else { if (encrypted) tmp = copy_oldmem_page_encrypted(pfn, buf, nr_bytes, @@ -136,10 +140,10 @@ ssize_t read_from_oldmem(char *buf, size_t count, else tmp = copy_oldmem_page(pfn, buf, nr_bytes, offset, userbuf); - - if (tmp < 0) - return tmp; } + if (tmp < 0) + return tmp; + *ppos += nr_bytes; count -= nr_bytes; buf += nr_bytes; From b9a7f9a7b34bdc1e06aa85b300a9a75241c1f383 Mon Sep 17 00:00:00 2001 From: yangxingwu Date: Thu, 4 Nov 2021 03:10:29 +0100 Subject: [PATCH 1837/5344] netfilter: ipvs: Fix reuse connection if RS weight is 0 [ Upstream commit c95c07836fa4c1767ed11d8eca0769c652760e32 ] We are changing expire_nodest_conn to work even for reused connections when conn_reuse_mode=0, just as what was done with commit dc7b3eb900aa ("ipvs: Fix reuse connection if real server is dead"). For controlled and persistent connections, the new connection will get the needed real server depending on the rules in ip_vs_check_template(). Fixes: d752c3645717 ("ipvs: allow rescheduling of new connections when port reuse is detected") Co-developed-by: Chuanqi Liu Signed-off-by: Chuanqi Liu Signed-off-by: yangxingwu Acked-by: Simon Horman Acked-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- Documentation/networking/ipvs-sysctl.txt | 3 +-- net/netfilter/ipvs/ip_vs_core.c | 8 ++++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/Documentation/networking/ipvs-sysctl.txt b/Documentation/networking/ipvs-sysctl.txt index 056898685d408..fc531c29a2e83 100644 --- a/Documentation/networking/ipvs-sysctl.txt +++ b/Documentation/networking/ipvs-sysctl.txt @@ -30,8 +30,7 @@ conn_reuse_mode - INTEGER 0: disable any special handling on port reuse. The new connection will be delivered to the same real server that was - servicing the previous connection. This will effectively - disable expire_nodest_conn. + servicing the previous connection. bit 1: enable rescheduling of new connections when it is safe. That is, whenever expire_nodest_conn and for TCP sockets, when diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index b12c43701f401..df26529f81899 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1976,7 +1976,6 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int struct ip_vs_proto_data *pd; struct ip_vs_conn *cp; int ret, pkts; - int conn_reuse_mode; struct sock *sk; /* Already marked as IPVS request or reply? */ @@ -2053,15 +2052,16 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto, ipvs, af, skb, &iph); - conn_reuse_mode = sysctl_conn_reuse_mode(ipvs); - if (conn_reuse_mode && !iph.fragoffs && is_new_conn(skb, &iph) && cp) { + if (!iph.fragoffs && is_new_conn(skb, &iph) && cp) { + int conn_reuse_mode = sysctl_conn_reuse_mode(ipvs); bool old_ct = false, resched = false; if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest && unlikely(!atomic_read(&cp->dest->weight))) { resched = true; old_ct = ip_vs_conn_uses_old_conntrack(cp, skb); - } else if (is_new_conn_expected(cp, conn_reuse_mode)) { + } else if (conn_reuse_mode && + is_new_conn_expected(cp, conn_reuse_mode)) { old_ct = ip_vs_conn_uses_old_conntrack(cp, skb); if (!atomic_read(&cp->n_control)) { resched = true; From 84088aa70ee5810e4d0ddbfe114c1d64991c43b7 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 27 Oct 2021 12:37:29 -0700 Subject: [PATCH 1838/5344] ARM: dts: BCM5301X: Fix I2C controller interrupt [ Upstream commit 754c4050a00e802e122690112fc2c3a6abafa7e2 ] The I2C interrupt controller line is off by 32 because the datasheet describes interrupt inputs into the GIC which are for Shared Peripheral Interrupts and are starting at offset 32. The ARM GIC binding expects the SPI interrupts to be numbered from 0 relative to the SPI base. Fixes: bb097e3e0045 ("ARM: dts: BCM5301X: Add I2C support to the DT") Tested-by: Christian Lamparter Signed-off-by: Florian Fainelli Signed-off-by: Sasha Levin --- arch/arm/boot/dts/bcm5301x.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/bcm5301x.dtsi b/arch/arm/boot/dts/bcm5301x.dtsi index 9711170649b69..51f20ded92f04 100644 --- a/arch/arm/boot/dts/bcm5301x.dtsi +++ b/arch/arm/boot/dts/bcm5301x.dtsi @@ -387,7 +387,7 @@ i2c0: i2c@18009000 { compatible = "brcm,iproc-i2c"; reg = <0x18009000 0x50>; - interrupts = ; + interrupts = ; #address-cells = <1>; #size-cells = <0>; clock-frequency = <100000>; From b028e495fea2bb1f7015dfd2ce970c4295df587b Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 28 Oct 2021 09:46:53 -0700 Subject: [PATCH 1839/5344] ARM: dts: BCM5301X: Add interrupt properties to GPIO node [ Upstream commit 40f7342f0587639e5ad625adaa15efdd3cffb18f ] The GPIO controller is also an interrupt controller provider and is currently missing the appropriate 'interrupt-controller' and '#interrupt-cells' properties to denote that. Fixes: fb026d3de33b ("ARM: BCM5301X: Add Broadcom's bus-axi to the DTS file") Signed-off-by: Florian Fainelli Signed-off-by: Sasha Levin --- arch/arm/boot/dts/bcm5301x.dtsi | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/boot/dts/bcm5301x.dtsi b/arch/arm/boot/dts/bcm5301x.dtsi index 51f20ded92f04..05d67f9769118 100644 --- a/arch/arm/boot/dts/bcm5301x.dtsi +++ b/arch/arm/boot/dts/bcm5301x.dtsi @@ -242,6 +242,8 @@ gpio-controller; #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; }; pcie0: pcie@12000 { From 8ba79bbb0047c066e93358deb0e6551aee9c4444 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Tue, 16 Nov 2021 11:47:18 +0000 Subject: [PATCH 1840/5344] ASoC: qdsp6: q6routing: Conditionally reset FrontEnd Mixer [ Upstream commit 861afeac7990587588d057b2c0b3222331c3da29 ] Stream IDs are reused across multiple BackEnd mixers, do not reset the stream mixers if they are not already set for that particular FrontEnd. Ex: amixer cset iface=MIXER,name='SLIMBUS_0_RX Audio Mixer MultiMedia1' 1 would set the MultiMedia1 steam for SLIMBUS_0_RX, however doing below command will reset previously setup MultiMedia1 stream, because both of them are using MultiMedia1 PCM stream. amixer cset iface=MIXER,name='SLIMBUS_2_RX Audio Mixer MultiMedia1' 0 reset the FrontEnd Mixers conditionally to fix this issue. This is more noticeable in desktop setup, where in alsactl tries to restore the alsa state and overwriting the previous mixer settings. Fixes: e3a33673e845 ("ASoC: qdsp6: q6routing: Add q6routing driver") Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20211116114721.12517-3-srinivas.kandagatla@linaro.org Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/qcom/qdsp6/q6routing.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sound/soc/qcom/qdsp6/q6routing.c b/sound/soc/qcom/qdsp6/q6routing.c index 745cc9dd14f38..16f26dd2d59ed 100644 --- a/sound/soc/qcom/qdsp6/q6routing.c +++ b/sound/soc/qcom/qdsp6/q6routing.c @@ -443,7 +443,11 @@ static int msm_routing_put_audio_mixer(struct snd_kcontrol *kcontrol, session->port_id = be_id; snd_soc_dapm_mixer_update_power(dapm, kcontrol, 1, update); } else { - session->port_id = -1; + if (session->port_id == be_id) { + session->port_id = -1; + return 0; + } + snd_soc_dapm_mixer_update_power(dapm, kcontrol, 0, update); } From 89e9382f565d46dacd370bfaac775544ebc89527 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 16 Nov 2021 08:18:12 +0100 Subject: [PATCH 1841/5344] ASoC: topology: Add missing rwsem around snd_ctl_remove() calls [ Upstream commit 7e567b5ae06315ef2d70666b149962e2bb4b97af ] snd_ctl_remove() has to be called with card->controls_rwsem held (when called after the card instantiation). This patch add the missing rwsem calls around it. Fixes: 8a9782346dcc ("ASoC: topology: Add topology core") Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/r/20211116071812.18109-1-tiwai@suse.de Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/soc-topology.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c index c367609433bfc..21f859e56b700 100644 --- a/sound/soc/soc-topology.c +++ b/sound/soc/soc-topology.c @@ -2777,6 +2777,7 @@ EXPORT_SYMBOL_GPL(snd_soc_tplg_widget_remove_all); /* remove dynamic controls from the component driver */ int snd_soc_tplg_component_remove(struct snd_soc_component *comp, u32 index) { + struct snd_card *card = comp->card->snd_card; struct snd_soc_dobj *dobj, *next_dobj; int pass = SOC_TPLG_PASS_END; @@ -2784,6 +2785,7 @@ int snd_soc_tplg_component_remove(struct snd_soc_component *comp, u32 index) while (pass >= SOC_TPLG_PASS_START) { /* remove mixer controls */ + down_write(&card->controls_rwsem); list_for_each_entry_safe(dobj, next_dobj, &comp->dobj_list, list) { @@ -2827,6 +2829,7 @@ int snd_soc_tplg_component_remove(struct snd_soc_component *comp, u32 index) break; } } + up_write(&card->controls_rwsem); pass--; } From af70c02739e2e816ef41d23818a044a9c064e512 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 11 Nov 2021 22:09:16 -0500 Subject: [PATCH 1842/5344] net: ieee802154: handle iftypes as u32 [ Upstream commit 451dc48c806a7ce9fbec5e7a24ccf4b2c936e834 ] This patch fixes an issue that an u32 netlink value is handled as a signed enum value which doesn't fit into the range of u32 netlink type. If it's handled as -1 value some BIT() evaluation ends in a shift-out-of-bounds issue. To solve the issue we set the to u32 max which is s32 "-1" value to keep backwards compatibility and let the followed enum values start counting at 0. This brings the compiler to never handle the enum as signed and a check if the value is above NL802154_IFTYPE_MAX should filter -1 out. Fixes: f3ea5e44231a ("ieee802154: add new interface command") Signed-off-by: Alexander Aring Link: https://lore.kernel.org/r/20211112030916.685793-1-aahringo@redhat.com Signed-off-by: Stefan Schmidt Signed-off-by: Sasha Levin --- include/net/nl802154.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/net/nl802154.h b/include/net/nl802154.h index ddcee128f5d9a..145acb8f25095 100644 --- a/include/net/nl802154.h +++ b/include/net/nl802154.h @@ -19,6 +19,8 @@ * */ +#include + #define NL802154_GENL_NAME "nl802154" enum nl802154_commands { @@ -150,10 +152,9 @@ enum nl802154_attrs { }; enum nl802154_iftype { - /* for backwards compatibility TODO */ - NL802154_IFTYPE_UNSPEC = -1, + NL802154_IFTYPE_UNSPEC = (~(__u32)0), - NL802154_IFTYPE_NODE, + NL802154_IFTYPE_NODE = 0, NL802154_IFTYPE_MONITOR, NL802154_IFTYPE_COORD, From 76d09eaa0e6dd4875a538a0c5d1dbbf9ddd8864a Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Tue, 16 Nov 2021 14:42:27 +0800 Subject: [PATCH 1843/5344] firmware: arm_scmi: pm: Propagate return value to caller [ Upstream commit 1446fc6c678e8d8b31606a4b877abe205f344b38 ] of_genpd_add_provider_onecell may return error, so let's propagate its return value to caller Link: https://lore.kernel.org/r/20211116064227.20571-1-peng.fan@oss.nxp.com Fixes: 898216c97ed2 ("firmware: arm_scmi: add device power domain support using genpd") Signed-off-by: Peng Fan Signed-off-by: Sudeep Holla Signed-off-by: Sasha Levin --- drivers/firmware/arm_scmi/scmi_pm_domain.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/firmware/arm_scmi/scmi_pm_domain.c b/drivers/firmware/arm_scmi/scmi_pm_domain.c index 041f8152272bf..177874adccf0d 100644 --- a/drivers/firmware/arm_scmi/scmi_pm_domain.c +++ b/drivers/firmware/arm_scmi/scmi_pm_domain.c @@ -106,9 +106,7 @@ static int scmi_pm_domain_probe(struct scmi_device *sdev) scmi_pd_data->domains = domains; scmi_pd_data->num_domains = num_domains; - of_genpd_add_provider_onecell(np, scmi_pd_data); - - return 0; + return of_genpd_add_provider_onecell(np, scmi_pd_data); } static const struct scmi_device_id scmi_id_table[] = { From 90a3be254035a8dd3604425c95f52b0206588946 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 16 Nov 2021 09:55:01 -0500 Subject: [PATCH 1844/5344] NFSv42: Don't fail clone() unless the OP_CLONE operation failed [ Upstream commit d3c45824ad65aebf765fcf51366d317a29538820 ] The failure to retrieve post-op attributes has no bearing on whether or not the clone operation itself was successful. We must therefore ignore the return value of decode_getfattr() when looking at the success or failure of nfs4_xdr_dec_clone(). Fixes: 36022770de6c ("nfs42: add CLONE xdr functions") Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin --- fs/nfs/nfs42xdr.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index aed865a846296..2b78f7b8d5467 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -769,8 +769,7 @@ static int nfs4_xdr_dec_clone(struct rpc_rqst *rqstp, status = decode_clone(xdr); if (status) goto out; - status = decode_getfattr(xdr, res->dst_fattr, res->server); - + decode_getfattr(xdr, res->dst_fattr, res->server); out: res->rpc_status = status; return status; From 3dab363fe34e01b4b12df152a3e010bd726d422a Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 18 Nov 2021 15:25:08 +0100 Subject: [PATCH 1845/5344] ARM: socfpga: Fix crash with CONFIG_FORTIRY_SOURCE [ Upstream commit 187bea472600dcc8d2eb714335053264dd437172 ] When CONFIG_FORTIFY_SOURCE is set, memcpy() checks the potential buffer overflow and panics. The code in sofcpga bootstrapping contains the memcpy() calls are mistakenly translated as the shorter size, hence it triggers a panic as if it were overflowing. This patch changes the secondary_trampoline and *_end definitions to arrays for avoiding the false-positive crash above. Fixes: 9c4566a117a6 ("ARM: socfpga: Enable SMP for socfpga") Suggested-by: Kees Cook Buglink: https://bugzilla.suse.com/show_bug.cgi?id=1192473 Link: https://lore.kernel.org/r/20211117193244.31162-1-tiwai@suse.de Signed-off-by: Takashi Iwai Signed-off-by: Dinh Nguyen Signed-off-by: Sasha Levin --- arch/arm/mach-socfpga/core.h | 2 +- arch/arm/mach-socfpga/platsmp.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arm/mach-socfpga/core.h b/arch/arm/mach-socfpga/core.h index fc2608b18a0d0..18f01190dcfd4 100644 --- a/arch/arm/mach-socfpga/core.h +++ b/arch/arm/mach-socfpga/core.h @@ -33,7 +33,7 @@ extern void __iomem *sdr_ctl_base_addr; u32 socfpga_sdram_self_refresh(u32 sdr_base); extern unsigned int socfpga_sdram_self_refresh_sz; -extern char secondary_trampoline, secondary_trampoline_end; +extern char secondary_trampoline[], secondary_trampoline_end[]; extern unsigned long socfpga_cpu1start_addr; diff --git a/arch/arm/mach-socfpga/platsmp.c b/arch/arm/mach-socfpga/platsmp.c index fbb80b883e5dd..201191cf68f32 100644 --- a/arch/arm/mach-socfpga/platsmp.c +++ b/arch/arm/mach-socfpga/platsmp.c @@ -20,14 +20,14 @@ static int socfpga_boot_secondary(unsigned int cpu, struct task_struct *idle) { - int trampoline_size = &secondary_trampoline_end - &secondary_trampoline; + int trampoline_size = secondary_trampoline_end - secondary_trampoline; if (socfpga_cpu1start_addr) { /* This will put CPU #1 into reset. */ writel(RSTMGR_MPUMODRST_CPU1, rst_manager_base_addr + SOCFPGA_RSTMGR_MODMPURST); - memcpy(phys_to_virt(0), &secondary_trampoline, trampoline_size); + memcpy(phys_to_virt(0), secondary_trampoline, trampoline_size); writel(__pa_symbol(secondary_startup), sys_manager_base_addr + (socfpga_cpu1start_addr & 0x000000ff)); @@ -45,12 +45,12 @@ static int socfpga_boot_secondary(unsigned int cpu, struct task_struct *idle) static int socfpga_a10_boot_secondary(unsigned int cpu, struct task_struct *idle) { - int trampoline_size = &secondary_trampoline_end - &secondary_trampoline; + int trampoline_size = secondary_trampoline_end - secondary_trampoline; if (socfpga_cpu1start_addr) { writel(RSTMGR_MPUMODRST_CPU1, rst_manager_base_addr + SOCFPGA_A10_RSTMGR_MODMPURST); - memcpy(phys_to_virt(0), &secondary_trampoline, trampoline_size); + memcpy(phys_to_virt(0), secondary_trampoline, trampoline_size); writel(__pa_symbol(secondary_startup), sys_manager_base_addr + (socfpga_cpu1start_addr & 0x00000fff)); From 030d2b8a63765c6aa7e22cbf789d05eabb91fc79 Mon Sep 17 00:00:00 2001 From: Sreekanth Reddy Date: Wed, 17 Nov 2021 16:19:09 +0530 Subject: [PATCH 1846/5344] scsi: mpt3sas: Fix kernel panic during drive powercycle test [ Upstream commit 0ee4ba13e09c9d9c1cb6abb59da8295d9952328b ] While looping over shost's sdev list it is possible that one of the drives is getting removed and its sas_target object is freed but its sdev object remains intact. Consequently, a kernel panic can occur while the driver is trying to access the sas_address field of sas_target object without also checking the sas_target object for NULL. Link: https://lore.kernel.org/r/20211117104909.2069-1-sreekanth.reddy@broadcom.com Fixes: f92363d12359 ("[SCSI] mpt3sas: add new driver supporting 12GB SAS") Signed-off-by: Sreekanth Reddy Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/mpt3sas/mpt3sas_scsih.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c index 3654cfc4376fa..97c1f242ef0a3 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c +++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c @@ -3387,7 +3387,7 @@ _scsih_ublock_io_device(struct MPT3SAS_ADAPTER *ioc, u64 sas_address) shost_for_each_device(sdev, ioc->shost) { sas_device_priv_data = sdev->hostdata; - if (!sas_device_priv_data) + if (!sas_device_priv_data || !sas_device_priv_data->sas_target) continue; if (sas_device_priv_data->sas_target->sas_address != sas_address) From aa8ff7b6d5c26f08fddcf0150abd7043b2c90a6d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 18 Nov 2021 14:14:16 +0300 Subject: [PATCH 1847/5344] drm/vc4: fix error code in vc4_create_object() [ Upstream commit 96c5f82ef0a145d3e56e5b26f2bf6dcd2ffeae1c ] The ->gem_create_object() functions are supposed to return NULL if there is an error. None of the callers expect error pointers so returing one will lead to an Oops. See drm_gem_vram_create(), for example. Fixes: c826a6e10644 ("drm/vc4: Add a BO cache.") Signed-off-by: Dan Carpenter Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20211118111416.GC1147@kili Signed-off-by: Sasha Levin --- drivers/gpu/drm/vc4/vc4_bo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/vc4/vc4_bo.c b/drivers/gpu/drm/vc4/vc4_bo.c index 72d30d90b856c..0af246a5609ca 100644 --- a/drivers/gpu/drm/vc4/vc4_bo.c +++ b/drivers/gpu/drm/vc4/vc4_bo.c @@ -389,7 +389,7 @@ struct drm_gem_object *vc4_create_object(struct drm_device *dev, size_t size) bo = kzalloc(sizeof(*bo), GFP_KERNEL); if (!bo) - return ERR_PTR(-ENOMEM); + return NULL; bo->madv = VC4_MADV_WILLNEED; refcount_set(&bo->usecnt, 0); From 9246e99b3dd5288f35723bd2af19f81f736903e5 Mon Sep 17 00:00:00 2001 From: Nitesh B Venkatesh Date: Fri, 4 Jun 2021 09:53:31 -0700 Subject: [PATCH 1848/5344] iavf: Prevent changing static ITR values if adaptive moderation is on [ Upstream commit e792779e6b639c182df91b46ac1e5803460b0b15 ] Resolve being able to change static values on VF when adaptive interrupt moderation is enabled. This problem is fixed by checking the interrupt settings is not a combination of change of static value while adaptive interrupt moderation is turned on. Without this fix, the user would be able to change static values on VF with adaptive moderation enabled. Fixes: 65e87c0398f5 ("i40evf: support queue-specific settings for interrupt moderation") Signed-off-by: Nitesh B Venkatesh Tested-by: George Kuruvinakunnel Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- .../net/ethernet/intel/iavf/iavf_ethtool.c | 30 ++++++++++++++++--- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c index ad1e796e5544a..4e0e1b02d615e 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c +++ b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c @@ -719,12 +719,31 @@ static int iavf_get_per_queue_coalesce(struct net_device *netdev, u32 queue, * * Change the ITR settings for a specific queue. **/ -static void iavf_set_itr_per_queue(struct iavf_adapter *adapter, - struct ethtool_coalesce *ec, int queue) +static int iavf_set_itr_per_queue(struct iavf_adapter *adapter, + struct ethtool_coalesce *ec, int queue) { struct iavf_ring *rx_ring = &adapter->rx_rings[queue]; struct iavf_ring *tx_ring = &adapter->tx_rings[queue]; struct iavf_q_vector *q_vector; + u16 itr_setting; + + itr_setting = rx_ring->itr_setting & ~IAVF_ITR_DYNAMIC; + + if (ec->rx_coalesce_usecs != itr_setting && + ec->use_adaptive_rx_coalesce) { + netif_info(adapter, drv, adapter->netdev, + "Rx interrupt throttling cannot be changed if adaptive-rx is enabled\n"); + return -EINVAL; + } + + itr_setting = tx_ring->itr_setting & ~IAVF_ITR_DYNAMIC; + + if (ec->tx_coalesce_usecs != itr_setting && + ec->use_adaptive_tx_coalesce) { + netif_info(adapter, drv, adapter->netdev, + "Tx interrupt throttling cannot be changed if adaptive-tx is enabled\n"); + return -EINVAL; + } rx_ring->itr_setting = ITR_REG_ALIGN(ec->rx_coalesce_usecs); tx_ring->itr_setting = ITR_REG_ALIGN(ec->tx_coalesce_usecs); @@ -747,6 +766,7 @@ static void iavf_set_itr_per_queue(struct iavf_adapter *adapter, * the Tx and Rx ITR values based on the values we have entered * into the q_vector, no need to write the values now. */ + return 0; } /** @@ -788,9 +808,11 @@ static int __iavf_set_coalesce(struct net_device *netdev, */ if (queue < 0) { for (i = 0; i < adapter->num_active_queues; i++) - iavf_set_itr_per_queue(adapter, ec, i); + if (iavf_set_itr_per_queue(adapter, ec, i)) + return -EINVAL; } else if (queue < adapter->num_active_queues) { - iavf_set_itr_per_queue(adapter, ec, queue); + if (iavf_set_itr_per_queue(adapter, ec, queue)) + return -EINVAL; } else { netif_info(adapter, drv, netdev, "Invalid queue value, queue range is 0 - %d\n", adapter->num_active_queues - 1); From caf7254b40e7d4b8be40d661a323b82e1ac2487e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 18 Nov 2021 17:37:58 -0800 Subject: [PATCH 1849/5344] ipv6: fix typos in __ip6_finish_output() [ Upstream commit 19d36c5f294879949c9d6f57cb61d39cc4c48553 ] We deal with IPv6 packets, so we need to use IP6CB(skb)->flags and IP6SKB_REROUTED, instead of IPCB(skb)->flags and IPSKB_REROUTED Found by code inspection, please double check that fixing this bug does not surface other bugs. Fixes: 09ee9dba9611 ("ipv6: Reinject IPv6 packets if IPsec policy matches after SNAT") Signed-off-by: Eric Dumazet Cc: Tobias Brunner Cc: Steffen Klassert Cc: David Ahern Reviewed-by: David Ahern Tested-by: Tobias Brunner Acked-by: Tobias Brunner Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/ipv6/ip6_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index fc913f09606db..d847aa32628da 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -192,7 +192,7 @@ static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ if (skb_dst(skb)->xfrm) { - IPCB(skb)->flags |= IPSKB_REROUTED; + IP6CB(skb)->flags |= IP6SKB_REROUTED; return dst_output(net, sk, skb); } #endif From 9951912ea838577d296766017417830ffc7e987a Mon Sep 17 00:00:00 2001 From: Diana Wang Date: Fri, 19 Nov 2021 14:38:03 +0100 Subject: [PATCH 1850/5344] nfp: checking parameter process for rx-usecs/tx-usecs is invalid [ Upstream commit 3bd6b2a838ba6a3b86d41b077f570b1b61174def ] Use nn->tlv_caps.me_freq_mhz instead of nn->me_freq_mhz to check whether rx-usecs/tx-usecs is valid. This is because nn->tlv_caps.me_freq_mhz represents the clock_freq (MHz) of the flow processing cores (FPC) on the NIC. While nn->me_freq_mhz is not be set. Fixes: ce991ab6662a ("nfp: read ME frequency from vNIC ctrl memory") Signed-off-by: Diana Wang Signed-off-by: Simon Horman Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/netronome/nfp/nfp_net.h | 3 --- drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h b/drivers/net/ethernet/netronome/nfp/nfp_net.h index 250f510b1d212..3dcb09f17b77f 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h @@ -557,7 +557,6 @@ struct nfp_net_dp { * @exn_name: Name for Exception interrupt * @shared_handler: Handler for shared interrupts * @shared_name: Name for shared interrupt - * @me_freq_mhz: ME clock_freq (MHz) * @reconfig_lock: Protects @reconfig_posted, @reconfig_timer_active, * @reconfig_sync_present and HW reconfiguration request * regs/machinery from async requests (sync must take @@ -639,8 +638,6 @@ struct nfp_net { irq_handler_t shared_handler; char shared_name[IFNAMSIZ + 8]; - u32 me_freq_mhz; - bool link_up; spinlock_t link_status_lock; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c index 2354dec994184..89e578e25ff8f 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c @@ -1269,7 +1269,7 @@ static int nfp_net_set_coalesce(struct net_device *netdev, * ME timestamp ticks. There are 16 ME clock cycles for each timestamp * count. */ - factor = nn->me_freq_mhz / 16; + factor = nn->tlv_caps.me_freq_mhz / 16; /* Each pair of (usecs, max_frames) fields specifies that interrupts * should be coalesced until From 73c6a82769341b7b96d6c0d34ac8b9bfa671f680 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 22 Nov 2021 17:15:12 +0200 Subject: [PATCH 1851/5344] net: ipv6: add fib6_nh_release_dsts stub [ Upstream commit 8837cbbf854246f5f4d565f21e6baa945d37aded ] We need a way to release a fib6_nh's per-cpu dsts when replacing nexthops otherwise we can end up with stale per-cpu dsts which hold net device references, so add a new IPv6 stub called fib6_nh_release_dsts. It must be used after an RCU grace period, so no new dsts can be created through a group's nexthop entry. Similar to fib6_nh_release it shouldn't be used if fib6_nh_init has failed so it doesn't need a dummy stub when IPv6 is not enabled. Fixes: 7bf4796dd099 ("nexthops: add support for replace") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/net/ip6_fib.h | 1 + include/net/ipv6_stubs.h | 1 + net/ipv6/af_inet6.c | 1 + net/ipv6/route.c | 19 +++++++++++++++++++ 4 files changed, 22 insertions(+) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index bd0f1595bdc71..05ecaefeb6322 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -451,6 +451,7 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); void fib6_nh_release(struct fib6_nh *fib6_nh); +void fib6_nh_release_dsts(struct fib6_nh *fib6_nh); int call_fib6_entry_notifiers(struct net *net, enum fib_event_type event_type, diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index 3e7d2c0e79ca1..af9e127779adf 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -47,6 +47,7 @@ struct ipv6_stub { struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); void (*fib6_nh_release)(struct fib6_nh *fib6_nh); + void (*fib6_nh_release_dsts)(struct fib6_nh *fib6_nh); void (*fib6_update_sernum)(struct net *net, struct fib6_info *rt); int (*ip6_del_rt)(struct net *net, struct fib6_info *rt); void (*fib6_rt_update)(struct net *net, struct fib6_info *rt, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index be83c829f1169..92b587accc2bc 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -958,6 +958,7 @@ static const struct ipv6_stub ipv6_stub_impl = { .ip6_mtu_from_fib6 = ip6_mtu_from_fib6, .fib6_nh_init = fib6_nh_init, .fib6_nh_release = fib6_nh_release, + .fib6_nh_release_dsts = fib6_nh_release_dsts, .fib6_update_sernum = fib6_update_sernum_stub, .fib6_rt_update = fib6_rt_update, .ip6_del_rt = ip6_del_rt, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index daa876c6ae8db..f36db3dd97346 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3585,6 +3585,25 @@ void fib6_nh_release(struct fib6_nh *fib6_nh) fib_nh_common_release(&fib6_nh->nh_common); } +void fib6_nh_release_dsts(struct fib6_nh *fib6_nh) +{ + int cpu; + + if (!fib6_nh->rt6i_pcpu) + return; + + for_each_possible_cpu(cpu) { + struct rt6_info *pcpu_rt, **ppcpu_rt; + + ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); + pcpu_rt = xchg(ppcpu_rt, NULL); + if (pcpu_rt) { + dst_dev_put(&pcpu_rt->dst); + dst_release(&pcpu_rt->dst); + } + } +} + static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) From a050ac20aed0c8005e6d3bb1f53bcdfd7ed29945 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 22 Nov 2021 17:15:13 +0200 Subject: [PATCH 1852/5344] net: nexthop: release IPv6 per-cpu dsts when replacing a nexthop group [ Upstream commit 1005f19b9357b81aa64e1decd08d6e332caaa284 ] When replacing a nexthop group, we must release the IPv6 per-cpu dsts of the removed nexthop entries after an RCU grace period because they contain references to the nexthop's net device and to the fib6 info. With specific series of events[1] we can reach net device refcount imbalance which is unrecoverable. IPv4 is not affected because dsts don't take a refcount on the route. [1] $ ip nexthop list id 200 via 2002:db8::2 dev bridge.10 scope link onlink id 201 via 2002:db8::3 dev bridge scope link onlink id 203 group 201/200 $ ip -6 route 2001:db8::10 nhid 203 metric 1024 pref medium nexthop via 2002:db8::3 dev bridge weight 1 onlink nexthop via 2002:db8::2 dev bridge.10 weight 1 onlink Create rt6_info through one of the multipath legs, e.g.: $ taskset -a -c 1 ./pkt_inj 24 bridge.10 2001:db8::10 (pkt_inj is just a custom packet generator, nothing special) Then remove that leg from the group by replace (let's assume it is id 200 in this case): $ ip nexthop replace id 203 group 201 Now remove the IPv6 route: $ ip -6 route del 2001:db8::10/128 The route won't be really deleted due to the stale rt6_info holding 1 refcnt in nexthop id 200. At this point we have the following reference count dependency: (deleted) IPv6 route holds 1 reference over nhid 203 nh 203 holds 1 ref over id 201 nh 200 holds 1 ref over the net device and the route due to the stale rt6_info Now to create circular dependency between nh 200 and the IPv6 route, and also to get a reference over nh 200, restore nhid 200 in the group: $ ip nexthop replace id 203 group 201/200 And now we have a permanent circular dependncy because nhid 203 holds a reference over nh 200 and 201, but the route holds a ref over nh 203 and is deleted. To trigger the bug just delete the group (nhid 203): $ ip nexthop del id 203 It won't really be deleted due to the IPv6 route dependency, and now we have 2 unlinked and deleted objects that reference each other: the group and the IPv6 route. Since the group drops the reference it holds over its entries at free time (i.e. its own refcount needs to drop to 0) that will never happen and we get a permanent ref on them, since one of the entries holds a reference over the IPv6 route it will also never be released. At this point the dependencies are: (deleted, only unlinked) IPv6 route holds reference over group nh 203 (deleted, only unlinked) group nh 203 holds reference over nh 201 and 200 nh 200 holds 1 ref over the net device and the route due to the stale rt6_info This is the last point where it can be fixed by running traffic through nh 200, and specifically through the same CPU so the rt6_info (dst) will get released due to the IPv6 genid, that in turn will free the IPv6 route, which in turn will free the ref count over the group nh 203. If nh 200 is deleted at this point, it will never be released due to the ref from the unlinked group 203, it will only be unlinked: $ ip nexthop del id 200 $ ip nexthop $ Now we can never release that stale rt6_info, we have IPv6 route with ref over group nh 203, group nh 203 with ref over nh 200 and 201, nh 200 with rt6_info (dst) with ref over the net device and the IPv6 route. All of these objects are only unlinked, and cannot be released, thus they can't release their ref counts. Message from syslogd@dev at Nov 19 14:04:10 ... kernel:[73501.828730] unregister_netdevice: waiting for bridge.10 to become free. Usage count = 3 Message from syslogd@dev at Nov 19 14:04:20 ... kernel:[73512.068811] unregister_netdevice: waiting for bridge.10 to become free. Usage count = 3 Fixes: 7bf4796dd099 ("nexthops: add support for replace") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/ipv4/nexthop.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 8bdffdf0502a1..4d69b3de980a6 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -839,15 +839,36 @@ static void remove_nexthop(struct net *net, struct nexthop *nh, /* if any FIB entries reference this nexthop, any dst entries * need to be regenerated */ -static void nh_rt_cache_flush(struct net *net, struct nexthop *nh) +static void nh_rt_cache_flush(struct net *net, struct nexthop *nh, + struct nexthop *replaced_nh) { struct fib6_info *f6i; + struct nh_group *nhg; + int i; if (!list_empty(&nh->fi_list)) rt_cache_flush(net); list_for_each_entry(f6i, &nh->f6i_list, nh_list) ipv6_stub->fib6_update_sernum(net, f6i); + + /* if an IPv6 group was replaced, we have to release all old + * dsts to make sure all refcounts are released + */ + if (!replaced_nh->is_group) + return; + + /* new dsts must use only the new nexthop group */ + synchronize_net(); + + nhg = rtnl_dereference(replaced_nh->nh_grp); + for (i = 0; i < nhg->num_nh; i++) { + struct nh_grp_entry *nhge = &nhg->nh_entries[i]; + struct nh_info *nhi = rtnl_dereference(nhge->nh->nh_info); + + if (nhi->family == AF_INET6) + ipv6_stub->fib6_nh_release_dsts(&nhi->fib6_nh); + } } static int replace_nexthop_grp(struct net *net, struct nexthop *old, @@ -994,7 +1015,7 @@ static int replace_nexthop(struct net *net, struct nexthop *old, err = replace_nexthop_single(net, old, new, extack); if (!err) { - nh_rt_cache_flush(net, old); + nh_rt_cache_flush(net, old, new); __remove_nexthop(net, new, NULL); nexthop_put(new); From cf03bdb7ef628a0108b40a7886c2470198ae53d1 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Sat, 20 Nov 2021 10:49:17 -0600 Subject: [PATCH 1853/5344] scsi: core: sysfs: Fix setting device state to SDEV_RUNNING [ Upstream commit eb97545d6264b341b06ba7603f52ff6c0b2af6ea ] This fixes an issue added in commit 4edd8cd4e86d ("scsi: core: sysfs: Fix hang when device state is set via sysfs") where if userspace is requesting to set the device state to SDEV_RUNNING when the state is already SDEV_RUNNING, we return -EINVAL instead of count. The commmit above set ret to count for this case, when it should have set it to 0. Link: https://lore.kernel.org/r/20211120164917.4924-1-michael.christie@oracle.com Fixes: 4edd8cd4e86d ("scsi: core: sysfs: Fix hang when device state is set via sysfs") Reviewed-by: Lee Duncan Signed-off-by: Mike Christie Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/scsi_sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c index 16432d42a50aa..6faf1d6451b0c 100644 --- a/drivers/scsi/scsi_sysfs.c +++ b/drivers/scsi/scsi_sysfs.c @@ -796,7 +796,7 @@ store_state_field(struct device *dev, struct device_attribute *attr, mutex_lock(&sdev->state_mutex); if (sdev->sdev_state == SDEV_RUNNING && state == SDEV_RUNNING) { - ret = count; + ret = 0; } else { ret = scsi_device_set_state(sdev, state); if (ret == 0 && state == SDEV_RUNNING) From ad7b1981ae7e1158ead782fc4e78794d95caae71 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Tue, 23 Nov 2021 16:25:18 +0800 Subject: [PATCH 1854/5344] net/smc: Ensure the active closing peer first closes clcsock [ Upstream commit 606a63c9783a32a45bd2ef0eee393711d75b3284 ] The side that actively closed socket, it's clcsock doesn't enter TIME_WAIT state, but the passive side does it. It should show the same behavior as TCP sockets. Consider this, when client actively closes the socket, the clcsock in server enters TIME_WAIT state, which means the address is occupied and won't be reused before TIME_WAIT dismissing. If we restarted server, the service would be unavailable for a long time. To solve this issue, shutdown the clcsock in [A], perform the TCP active close progress first, before the passive closed side closing it. So that the actively closed side enters TIME_WAIT, not the passive one. Client | Server close() // client actively close | smc_release() | smc_close_active() // PEERCLOSEWAIT1 | smc_close_final() // abort or closed = 1| smc_cdc_get_slot_and_msg_send() | [A] | |smc_cdc_msg_recv_action() // ACTIVE | queue_work(smc_close_wq, &conn->close_work) | smc_close_passive_work() // PROCESSABORT or APPCLOSEWAIT1 | smc_close_passive_abort_received() // only in abort | |close() // server recv zero, close | smc_release() // PROCESSABORT or APPCLOSEWAIT1 | smc_close_active() | smc_close_abort() or smc_close_final() // CLOSED | smc_cdc_get_slot_and_msg_send() // abort or closed = 1 smc_cdc_msg_recv_action() | smc_clcsock_release() queue_work(smc_close_wq, &conn->close_work) | sock_release(tcp) // actively close clc, enter TIME_WAIT smc_close_passive_work() // PEERCLOSEWAIT1 | smc_conn_free() smc_close_passive_abort_received() // CLOSED| smc_conn_free() | smc_clcsock_release() | sock_release(tcp) // passive close clc | Link: https://www.spinics.net/lists/netdev/msg780407.html Fixes: b38d732477e4 ("smc: socket closing and linkgroup cleanup") Signed-off-by: Tony Lu Reviewed-by: Wen Gu Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/smc/smc_close.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index fc06720b53c14..2eabf39dee74d 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -218,6 +218,12 @@ int smc_close_active(struct smc_sock *smc) if (rc) break; sk->sk_state = SMC_PEERCLOSEWAIT1; + + /* actively shutdown clcsock before peer close it, + * prevent peer from entering TIME_WAIT state. + */ + if (smc->clcsock && smc->clcsock->sk) + rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); } else { /* peer event has changed the state */ goto again; From 1fe9894030941a5f34f4a80f237f16ab9fdcc533 Mon Sep 17 00:00:00 2001 From: Varun Prakash Date: Mon, 22 Nov 2021 15:38:41 +0530 Subject: [PATCH 1855/5344] nvmet-tcp: fix incomplete data digest send [ Upstream commit 102110efdff6beedece6ab9b51664c32ac01e2db ] Current nvmet_try_send_ddgst() code does not check whether all data digest bytes are transmitted, fix this by returning -EAGAIN if all data digest bytes are not transmitted. Fixes: 872d26a391da ("nvmet-tcp: add NVMe over TCP target driver") Signed-off-by: Varun Prakash Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Sasha Levin --- drivers/nvme/target/tcp.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 3940be28d31cd..b06a67a02506b 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -631,10 +631,11 @@ static int nvmet_try_send_r2t(struct nvmet_tcp_cmd *cmd, bool last_in_batch) static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd) { struct nvmet_tcp_queue *queue = cmd->queue; + int left = NVME_TCP_DIGEST_LENGTH - cmd->offset; struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; struct kvec iov = { .iov_base = (u8 *)&cmd->exp_ddgst + cmd->offset, - .iov_len = NVME_TCP_DIGEST_LENGTH - cmd->offset + .iov_len = left }; int ret; @@ -643,6 +644,10 @@ static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd) return ret; cmd->offset += ret; + left -= ret; + + if (left) + return -EAGAIN; if (queue->nvme_sq.sqhd_disabled) { cmd->queue->snd_cmd = NULL; From 375cc5b404c7933d0ff29f62ec82ccecbd1b3342 Mon Sep 17 00:00:00 2001 From: Kumar Thangavel Date: Mon, 22 Nov 2021 22:08:18 +0530 Subject: [PATCH 1856/5344] net/ncsi : Add payload to be 32-bit aligned to fix dropped packets [ Upstream commit ac132852147ad303a938dda318970dd1bbdfda4e ] Update NC-SI command handler (both standard and OEM) to take into account of payload paddings in allocating skb (in case of payload size is not 32-bit aligned). The checksum field follows payload field, without taking payload padding into account can cause checksum being truncated, leading to dropped packets. Fixes: fb4ee67529ff ("net/ncsi: Add NCSI OEM command support") Signed-off-by: Kumar Thangavel Acked-by: Samuel Mendoza-Jonas Reviewed-by: Paul Menzel Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/ncsi/ncsi-cmd.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/net/ncsi/ncsi-cmd.c b/net/ncsi/ncsi-cmd.c index 0187e65176c05..114ef47db76d3 100644 --- a/net/ncsi/ncsi-cmd.c +++ b/net/ncsi/ncsi-cmd.c @@ -18,6 +18,8 @@ #include "internal.h" #include "ncsi-pkt.h" +static const int padding_bytes = 26; + u32 ncsi_calculate_checksum(unsigned char *data, int len) { u32 checksum = 0; @@ -213,12 +215,17 @@ static int ncsi_cmd_handler_oem(struct sk_buff *skb, { struct ncsi_cmd_oem_pkt *cmd; unsigned int len; + int payload; + /* NC-SI spec DSP_0222_1.2.0, section 8.2.2.2 + * requires payload to be padded with 0 to + * 32-bit boundary before the checksum field. + * Ensure the padding bytes are accounted for in + * skb allocation + */ + payload = ALIGN(nca->payload, 4); len = sizeof(struct ncsi_cmd_pkt_hdr) + 4; - if (nca->payload < 26) - len += 26; - else - len += nca->payload; + len += max(payload, padding_bytes); cmd = skb_put_zero(skb, len); memcpy(&cmd->mfr_id, nca->data, nca->payload); @@ -272,6 +279,7 @@ static struct ncsi_request *ncsi_alloc_command(struct ncsi_cmd_arg *nca) struct net_device *dev = nd->dev; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; + int payload; int len = hlen + tlen; struct sk_buff *skb; struct ncsi_request *nr; @@ -281,14 +289,14 @@ static struct ncsi_request *ncsi_alloc_command(struct ncsi_cmd_arg *nca) return NULL; /* NCSI command packet has 16-bytes header, payload, 4 bytes checksum. + * Payload needs padding so that the checksum field following payload is + * aligned to 32-bit boundary. * The packet needs padding if its payload is less than 26 bytes to * meet 64 bytes minimal ethernet frame length. */ len += sizeof(struct ncsi_cmd_pkt_hdr) + 4; - if (nca->payload < 26) - len += 26; - else - len += nca->payload; + payload = ALIGN(nca->payload, 4); + len += max(payload, padding_bytes); /* Allocate skb */ skb = alloc_skb(len, GFP_ATOMIC); From cdfeee578772dd4209f9fdd4089637fc0e561633 Mon Sep 17 00:00:00 2001 From: Thomas Zeitlhofer Date: Tue, 23 Nov 2021 20:18:43 +0100 Subject: [PATCH 1857/5344] PM: hibernate: use correct mode for swsusp_close() [ Upstream commit cefcf24b4d351daf70ecd945324e200d3736821e ] Commit 39fbef4b0f77 ("PM: hibernate: Get block device exclusively in swsusp_check()") changed the opening mode of the block device to (FMODE_READ | FMODE_EXCL). In the corresponding calls to swsusp_close(), the mode is still just FMODE_READ which triggers the warning in blkdev_flush_mapping() on resume from hibernate. So, use the mode (FMODE_READ | FMODE_EXCL) also when closing the device. Fixes: 39fbef4b0f77 ("PM: hibernate: Get block device exclusively in swsusp_check()") Signed-off-by: Thomas Zeitlhofer Signed-off-by: Rafael J. Wysocki Signed-off-by: Sasha Levin --- kernel/power/hibernate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 69c4cd472def3..6cafb2e910a11 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -676,7 +676,7 @@ static int load_image_and_restore(void) goto Unlock; error = swsusp_read(&flags); - swsusp_close(FMODE_READ); + swsusp_close(FMODE_READ | FMODE_EXCL); if (!error) hibernation_restore(flags & SF_PLATFORM_MODE); @@ -871,7 +871,7 @@ static int software_resume(void) /* The snapshot device should not be opened while we're running */ if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { error = -EBUSY; - swsusp_close(FMODE_READ); + swsusp_close(FMODE_READ | FMODE_EXCL); goto Unlock; } @@ -907,7 +907,7 @@ static int software_resume(void) pm_pr_dbg("Hibernation image not present or could not be loaded.\n"); return error; Close_Finish: - swsusp_close(FMODE_READ); + swsusp_close(FMODE_READ | FMODE_EXCL); goto Finish; } From 6b60b3585764d9c6395256410caff6498ccdc173 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 23 Nov 2021 12:25:35 -0800 Subject: [PATCH 1858/5344] tcp_cubic: fix spurious Hystart ACK train detections for not-cwnd-limited flows [ Upstream commit 4e1fddc98d2585ddd4792b5e44433dcee7ece001 ] While testing BIG TCP patch series, I was expecting that TCP_RR workloads with 80KB requests/answers would send one 80KB TSO packet, then being received as a single GRO packet. It turns out this was not happening, and the root cause was that cubic Hystart ACK train was triggering after a few (2 or 3) rounds of RPC. Hystart was wrongly setting CWND/SSTHRESH to 30, while my RPC needed a budget of ~20 segments. Ideally these TCP_RR flows should not exit slow start. Cubic Hystart should reset itself at each round, instead of assuming every TCP flow is a bulk one. Note that even after this patch, Hystart can still trigger, depending on scheduling artifacts, but at a higher CWND/SSTHRESH threshold, keeping optimal TSO packet sizes. Tested: ip link set dev eth0 gro_ipv6_max_size 131072 gso_ipv6_max_size 131072 nstat -n; netperf -H ... -t TCP_RR -l 5 -- -r 80000,80000 -K cubic; nstat|egrep "Ip6InReceives|Hystart|Ip6OutRequests" Before: 8605 Ip6InReceives 87541 0.0 Ip6OutRequests 129496 0.0 TcpExtTCPHystartTrainDetect 1 0.0 TcpExtTCPHystartTrainCwnd 30 0.0 After: 8760 Ip6InReceives 88514 0.0 Ip6OutRequests 87975 0.0 Fixes: ae27e98a5152 ("[TCP] CUBIC v2.3") Co-developed-by: Neal Cardwell Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Cc: Stephen Hemminger Cc: Yuchung Cheng Cc: Soheil Hassas Yeganeh Link: https://lore.kernel.org/r/20211123202535.1843771-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/ipv4/tcp_cubic.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index ee6c38a73325d..44be7a5a13911 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -341,8 +341,6 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) return; if (tcp_in_slow_start(tp)) { - if (hystart && after(ack, ca->end_seq)) - bictcp_hystart_reset(sk); acked = tcp_slow_start(tp, acked); if (!acked) return; @@ -384,6 +382,9 @@ static void hystart_update(struct sock *sk, u32 delay) if (ca->found & hystart_detect) return; + if (after(tp->snd_una, ca->end_seq)) + bictcp_hystart_reset(sk); + if (hystart_detect & HYSTART_ACK_TRAIN) { u32 now = bictcp_clock(); From 320541d0a34c7bfc202bf4b54a806953be3eebb6 Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Mon, 22 Nov 2021 11:08:27 +0100 Subject: [PATCH 1859/5344] nvmet: use IOCB_NOWAIT only if the filesystem supports it [ Upstream commit c024b226a417c4eb9353ff500b1c823165d4d508 ] Submit I/O requests with the IOCB_NOWAIT flag set only if the underlying filesystem supports it. Fixes: 50a909db36f2 ("nvmet: use IOCB_NOWAIT for file-ns buffered I/O") Signed-off-by: Maurizio Lombardi Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig Signed-off-by: Sasha Levin --- drivers/nvme/target/io-cmd-file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c index d67789b13057e..220c2534a21e4 100644 --- a/drivers/nvme/target/io-cmd-file.c +++ b/drivers/nvme/target/io-cmd-file.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "nvmet.h" #define NVMET_MAX_MPOOL_BVEC 16 @@ -257,7 +258,8 @@ static void nvmet_file_execute_rw(struct nvmet_req *req) if (req->ns->buffered_io) { if (likely(!req->f.mpool_alloc) && - nvmet_file_execute_io(req, IOCB_NOWAIT)) + (req->ns->file->f_mode & FMODE_NOWAIT) && + nvmet_file_execute_io(req, IOCB_NOWAIT)) return; nvmet_file_submit_buffered_io(req); } else From aa1b576960227356a9e28862df5b6d5b3340b85c Mon Sep 17 00:00:00 2001 From: Jesse Brandeburg Date: Tue, 23 Nov 2021 12:40:00 -0800 Subject: [PATCH 1860/5344] igb: fix netpoll exit with traffic [ Upstream commit eaeace60778e524a2820d0c0ad60bf80289e292c ] Oleksandr brought a bug report where netpoll causes trace messages in the log on igb. Danielle brought this back up as still occurring, so we'll try again. [22038.710800] ------------[ cut here ]------------ [22038.710801] igb_poll+0x0/0x1440 [igb] exceeded budget in poll [22038.710802] WARNING: CPU: 12 PID: 40362 at net/core/netpoll.c:155 netpoll_poll_dev+0x18a/0x1a0 As Alex suggested, change the driver to return work_done at the exit of napi_poll, which should be safe to do in this driver because it is not polling multiple queues in this single napi context (multiple queues attached to one MSI-X vector). Several other drivers contain the same simple sequence, so I hope this will not create new problems. Fixes: 16eb8815c235 ("igb: Refactor clean_rx_irq to reduce overhead and improve performance") Reported-by: Oleksandr Natalenko Reported-by: Danielle Ratson Suggested-by: Alexander Duyck Signed-off-by: Jesse Brandeburg Tested-by: Oleksandr Natalenko Tested-by: Danielle Ratson Link: https://lore.kernel.org/r/20211123204000.1597971-1-jesse.brandeburg@intel.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/igb/igb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 158feb0ab2739..c11244a9b7e69 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -7752,7 +7752,7 @@ static int igb_poll(struct napi_struct *napi, int budget) if (likely(napi_complete_done(napi, work_done))) igb_ring_irq_enable(q_vector); - return min(work_done, budget - 1); + return work_done; } /** From 32d5171b2d85b4154648d6f4b6a1d7209c4f1313 Mon Sep 17 00:00:00 2001 From: Huang Pei Date: Thu, 25 Nov 2021 18:59:48 +0800 Subject: [PATCH 1861/5344] MIPS: use 3-level pgtable for 64KB page size on MIPS_VA_BITS_48 [ Upstream commit 41ce097f714401e6ad8f3f5eb30d7f91b0b5e495 ] It hangup when booting Loongson 3A1000 with BOTH CONFIG_PAGE_SIZE_64KB and CONFIG_MIPS_VA_BITS_48, that it turn out to use 2-level pgtable instead of 3-level. 64KB page size with 2-level pgtable only cover 42 bits VA, use 3-level pgtable to cover all 48 bits VA(55 bits) Fixes: 1e321fa917fb ("MIPS64: Support of at least 48 bits of SEGBITS) Signed-off-by: Huang Pei Signed-off-by: Thomas Bogendoerfer Signed-off-by: Sasha Levin --- arch/mips/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 9749818eed6d6..2811ecc1f3c71 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -3059,7 +3059,7 @@ config STACKTRACE_SUPPORT config PGTABLE_LEVELS int default 4 if PAGE_SIZE_4KB && MIPS_VA_BITS_48 - default 3 if 64BIT && !PAGE_SIZE_64KB + default 3 if 64BIT && (!PAGE_SIZE_64KB || MIPS_VA_BITS_48) default 2 config MIPS_AUTO_PFN_OFFSET From 93678b1b90cd67e03d7451854b26e1c884a2713e Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Fri, 26 Nov 2021 09:59:42 +0800 Subject: [PATCH 1862/5344] net: vlan: fix underflow for the real_dev refcnt [ Upstream commit 01d9cc2dea3fde3bad6d27f464eff463496e2b00 ] Inject error before dev_hold(real_dev) in register_vlan_dev(), and execute the following testcase: ip link add dev dummy1 type dummy ip link add name dummy1.100 link dummy1 type vlan id 100 ip link del dev dummy1 When the dummy netdevice is removed, we will get a WARNING as following: ======================================================================= refcount_t: decrement hit 0; leaking memory. WARNING: CPU: 2 PID: 0 at lib/refcount.c:31 refcount_warn_saturate+0xbf/0x1e0 and an endless loop of: ======================================================================= unregister_netdevice: waiting for dummy1 to become free. Usage count = -1073741824 That is because dev_put(real_dev) in vlan_dev_free() be called without dev_hold(real_dev) in register_vlan_dev(). It makes the refcnt of real_dev underflow. Move the dev_hold(real_dev) to vlan_dev_init() which is the call-back of ndo_init(). That makes dev_hold() and dev_put() for vlan's real_dev symmetrical. Fixes: 563bcbae3ba2 ("net: vlan: fix a UAF in vlan_dev_real_dev()") Reported-by: Petr Machata Suggested-by: Jakub Kicinski Signed-off-by: Ziyang Xuan Link: https://lore.kernel.org/r/20211126015942.2918542-1-william.xuanziyang@huawei.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/8021q/vlan.c | 3 --- net/8021q/vlan_dev.c | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index cd7c0429cddf8..796d95797ab40 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -177,9 +177,6 @@ int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack) if (err) goto out_unregister_netdev; - /* Account for reference in struct vlan_dev_priv */ - dev_hold(real_dev); - vlan_stacked_transfer_operstate(real_dev, dev, vlan); linkwatch_fire_event(dev); /* _MUST_ call rfc2863_policy() */ diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 415a29d42cdf0..589615ec490bb 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -583,6 +583,9 @@ static int vlan_dev_init(struct net_device *dev) if (!vlan->vlan_pcpu_stats) return -ENOMEM; + /* Get vlan's reference to real_dev */ + dev_hold(real_dev); + return 0; } From 67c464d64ded38e526f25ab8b4d1090293a24a67 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Fri, 26 Nov 2021 10:41:35 +0800 Subject: [PATCH 1863/5344] net/smc: Don't call clcsock shutdown twice when smc shutdown [ Upstream commit bacb6c1e47691cda4a95056c21b5487fb7199fcc ] When applications call shutdown() with SHUT_RDWR in userspace, smc_close_active() calls kernel_sock_shutdown(), and it is called twice in smc_shutdown(). This fixes this by checking sk_state before do clcsock shutdown, and avoids missing the application's call of smc_shutdown(). Link: https://lore.kernel.org/linux-s390/1f67548e-cbf6-0dce-82b5-10288a4583bd@linux.ibm.com/ Fixes: 606a63c9783a ("net/smc: Ensure the active closing peer first closes clcsock") Signed-off-by: Tony Lu Reviewed-by: Wen Gu Acked-by: Karsten Graul Link: https://lore.kernel.org/r/20211126024134.45693-1-tonylu@linux.alibaba.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/smc/af_smc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 6b0f09c5b195f..5e1493f8deba7 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1658,8 +1658,10 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, static int smc_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; + bool do_shutdown = true; struct smc_sock *smc; int rc = -EINVAL; + int old_state; int rc1 = 0; smc = smc_sk(sk); @@ -1686,7 +1688,11 @@ static int smc_shutdown(struct socket *sock, int how) } switch (how) { case SHUT_RDWR: /* shutdown in both directions */ + old_state = sk->sk_state; rc = smc_close_active(smc); + if (old_state == SMC_ACTIVE && + sk->sk_state == SMC_PEERCLOSEWAIT1) + do_shutdown = false; break; case SHUT_WR: rc = smc_close_shutdown_write(smc); @@ -1696,7 +1702,7 @@ static int smc_shutdown(struct socket *sock, int how) /* nothing more to do because peer is not involved */ break; } - if (smc->clcsock) + if (do_shutdown && smc->clcsock) rc1 = kernel_sock_shutdown(smc->clcsock, how); /* map sock_shutdown_cmd constants to sk_shutdown value range */ sk->sk_shutdown |= how + 1; From c4936d91d2520e535c28b91f431e694cc7a64aba Mon Sep 17 00:00:00 2001 From: Guangbin Huang Date: Fri, 26 Nov 2021 20:03:15 +0800 Subject: [PATCH 1864/5344] net: hns3: fix VF RSS failed problem after PF enable multi-TCs [ Upstream commit 8d2ad993aa05c0768f00c886c9d369cd97a337ac ] When PF is set to multi-TCs and configured mapping relationship between priorities and TCs, the hardware will active these settings for this PF and its VFs. In this case when VF just uses one TC and its rx packets contain priority, and if the priority is not mapped to TC0, as other TCs of VF is not valid, hardware always put this kind of packets to the queue 0. It cause this kind of packets of VF can not be used RSS function. To fix this problem, set tc mode of all unused TCs of VF to the setting of TC0, then rx packet with priority which map to unused TC will be direct to TC0. Fixes: e2cb1dec9779 ("net: hns3: Add HNS3 VF HCL(Hardware Compatibility Layer) Support") Signed-off-by: Guangbin Huang Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index db2e9dd5681eb..ce6a4e1965e1d 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -644,9 +644,9 @@ static int hclgevf_set_rss_tc_mode(struct hclgevf_dev *hdev, u16 rss_size) roundup_size = ilog2(roundup_size); for (i = 0; i < HCLGEVF_MAX_TC_NUM; i++) { - tc_valid[i] = !!(hdev->hw_tc_map & BIT(i)); + tc_valid[i] = 1; tc_size[i] = roundup_size; - tc_offset[i] = rss_size * i; + tc_offset[i] = (hdev->hw_tc_map & BIT(i)) ? rss_size * i : 0; } hclgevf_cmd_setup_basic_desc(&desc, HCLGEVF_OPC_RSS_TC_MODE, false); From 242d9b5e22778e4df913fe36c0647c12298e52a9 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 26 Nov 2021 19:28:41 +0200 Subject: [PATCH 1865/5344] net: mscc: ocelot: don't downgrade timestamping RX filters in SIOCSHWTSTAMP [ Upstream commit 8a075464d1e9317ffae0973dfe538a7511291a06 ] The ocelot driver, when asked to timestamp all receiving packets, 1588 v1 or NTP, says "nah, here's 1588 v2 for you". According to this discussion: https://patchwork.kernel.org/project/netdevbpf/patch/20211104133204.19757-8-martin.kaistra@linutronix.de/#24577647 drivers that downgrade from a wider request to a narrower response (or even a response where the intersection with the request is empty) are buggy, and should return -ERANGE instead. This patch fixes that. Fixes: 4e3b0468e6d7 ("net: mscc: PTP Hardware Clock (PHC) support") Suggested-by: Richard Cochran Signed-off-by: Vladimir Oltean Acked-by: Richard Cochran Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/mscc/ocelot.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 6030c90d50ccb..3f83647c5802e 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -1024,12 +1024,6 @@ static int ocelot_hwstamp_set(struct ocelot_port *port, struct ifreq *ifr) switch (cfg.rx_filter) { case HWTSTAMP_FILTER_NONE: break; - case HWTSTAMP_FILTER_ALL: - case HWTSTAMP_FILTER_SOME: - case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: - case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: - case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ: - case HWTSTAMP_FILTER_NTP_ALL: case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: From 5c340bdb43fe933186a2af65616b942e3ce283c4 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 26 Nov 2021 19:28:45 +0200 Subject: [PATCH 1866/5344] net: mscc: ocelot: correctly report the timestamping RX filters in ethtool [ Upstream commit c49a35eedfef08bffd46b53c25dbf9d6016a86ff ] The driver doesn't support RX timestamping for non-PTP packets, but it declares that it does. Restrict the reported RX filters to PTP v2 over L2 and over L4. Fixes: 4e3b0468e6d7 ("net: mscc: PTP Hardware Clock (PHC) support") Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/mscc/ocelot.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 3f83647c5802e..bf7832b34a000 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -1183,7 +1183,10 @@ static int ocelot_get_ts_info(struct net_device *dev, SOF_TIMESTAMPING_RAW_HARDWARE; info->tx_types = BIT(HWTSTAMP_TX_OFF) | BIT(HWTSTAMP_TX_ON) | BIT(HWTSTAMP_TX_ONESTEP_SYNC); - info->rx_filters = BIT(HWTSTAMP_FILTER_NONE) | BIT(HWTSTAMP_FILTER_ALL); + info->rx_filters = BIT(HWTSTAMP_FILTER_NONE) | + BIT(HWTSTAMP_FILTER_PTP_V2_EVENT) | + BIT(HWTSTAMP_FILTER_PTP_V2_L2_EVENT) | + BIT(HWTSTAMP_FILTER_PTP_V2_L4_EVENT); return 0; } From b04970757a5a99f7a9f9934a5bce2bb2994fa472 Mon Sep 17 00:00:00 2001 From: Weichao Guo Date: Sat, 18 Sep 2021 20:46:36 +0800 Subject: [PATCH 1867/5344] f2fs: set SBI_NEED_FSCK flag when inconsistent node block found [ Upstream commit 6663b138ded1a59e630c9e605e42aa7fde490cdc ] Inconsistent node block will cause a file fail to open or read, which could make the user process crashes or stucks. Let's mark SBI_NEED_FSCK flag to trigger a fix at next fsck time. After unlinking the corrupted file, the user process could regenerate a new one and work correctly. Signed-off-by: Weichao Guo Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- fs/f2fs/node.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 4cb182c20eedd..0cd1d51dde06d 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1385,6 +1385,7 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, nid, nid_of_node(page), ino_of_node(page), ofs_of_node(page), cpver_of_node(page), next_blkaddr_of_node(page)); + set_sbi_flag(sbi, SBI_NEED_FSCK); err = -EINVAL; out_err: ClearPageUptodate(page); From 4216ca35b040ea8ea09d2563647e14e068da09b8 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 10 Nov 2021 01:47:48 -0600 Subject: [PATCH 1868/5344] smb3: do not error on fsync when readonly [ Upstream commit 71e6864eacbef0b2645ca043cdfbac272cb6cea3 ] Linux allows doing a flush/fsync on a file open for read-only, but the protocol does not allow that. If the file passed in on the flush is read-only try to find a writeable handle for the same inode, if that is not possible skip sending the fsync call to the server to avoid breaking the apps. Reported-by: Julian Sikorski Tested-by: Julian Sikorski Suggested-by: Jeremy Allison Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French Signed-off-by: Sasha Levin --- fs/cifs/file.c | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index a9746af5a44db..03c85beecec10 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2577,12 +2577,23 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end, tcon = tlink_tcon(smbfile->tlink); if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) { server = tcon->ses->server; - if (server->ops->flush) - rc = server->ops->flush(xid, tcon, &smbfile->fid); - else + if (server->ops->flush == NULL) { rc = -ENOSYS; + goto strict_fsync_exit; + } + + if ((OPEN_FMODE(smbfile->f_flags) & FMODE_WRITE) == 0) { + smbfile = find_writable_file(CIFS_I(inode), FIND_WR_ANY); + if (smbfile) { + rc = server->ops->flush(xid, tcon, &smbfile->fid); + cifsFileInfo_put(smbfile); + } else + cifs_dbg(FYI, "ignore fsync for file not open for write\n"); + } else + rc = server->ops->flush(xid, tcon, &smbfile->fid); } +strict_fsync_exit: free_xid(xid); return rc; } @@ -2594,6 +2605,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) struct cifs_tcon *tcon; struct TCP_Server_Info *server; struct cifsFileInfo *smbfile = file->private_data; + struct inode *inode = file_inode(file); struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file); rc = file_write_and_wait_range(file, start, end); @@ -2608,12 +2620,23 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) tcon = tlink_tcon(smbfile->tlink); if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) { server = tcon->ses->server; - if (server->ops->flush) - rc = server->ops->flush(xid, tcon, &smbfile->fid); - else + if (server->ops->flush == NULL) { rc = -ENOSYS; + goto fsync_exit; + } + + if ((OPEN_FMODE(smbfile->f_flags) & FMODE_WRITE) == 0) { + smbfile = find_writable_file(CIFS_I(inode), FIND_WR_ANY); + if (smbfile) { + rc = server->ops->flush(xid, tcon, &smbfile->fid); + cifsFileInfo_put(smbfile); + } else + cifs_dbg(FYI, "ignore fsync for file not open for write\n"); + } else + rc = server->ops->flush(xid, tcon, &smbfile->fid); } +fsync_exit: free_xid(xid); return rc; } From 5b0236703a994a30231714cd2436ebd6f45ce6c6 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Mon, 22 Nov 2021 17:35:24 +0100 Subject: [PATCH 1869/5344] vhost/vsock: fix incorrect used length reported to the guest commit 49d8c5ffad07ca014cfae72a1b9b8c52b6ad9cb8 upstream. The "used length" reported by calling vhost_add_used() must be the number of bytes written by the device (using "in" buffers). In vhost_vsock_handle_tx_kick() the device only reads the guest buffers (they are all "out" buffers), without writing anything, so we must pass 0 as "used length" to comply virtio spec. Fixes: 433fc58e6bf2 ("VSOCK: Introduce vhost_vsock.ko") Cc: stable@vger.kernel.org Reported-by: Halil Pasic Suggested-by: Jason Wang Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20211122163525.294024-2-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefan Hajnoczi Reviewed-by: Halil Pasic Signed-off-by: Greg Kroah-Hartman --- drivers/vhost/vsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index 6d9943dad5e1e..e3c6a4105305e 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -491,7 +491,7 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work) virtio_transport_free_pkt(pkt); len += sizeof(pkt->hdr); - vhost_add_used(vq, head, len); + vhost_add_used(vq, head, 0); total_len += len; added = true; } while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len))); From 7c426241d1a73616a5c5c28ffd646298d1b61e0a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 26 Nov 2021 13:35:26 -0500 Subject: [PATCH 1870/5344] tracing: Check pid filtering when creating events commit 6cb206508b621a9a0a2c35b60540e399225c8243 upstream. When pid filtering is activated in an instance, all of the events trace files for that instance has the PID_FILTER flag set. This determines whether or not pid filtering needs to be done on the event, otherwise the event is executed as normal. If pid filtering is enabled when an event is created (via a dynamic event or modules), its flag is not updated to reflect the current state, and the events are not filtered properly. Cc: stable@vger.kernel.org Fixes: 3fdaf80f4a836 ("tracing: Implement event pid filtering") Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace_events.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e31ee325dad16..4acc77e049e5f 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2247,12 +2247,19 @@ static struct trace_event_file * trace_create_new_event(struct trace_event_call *call, struct trace_array *tr) { + struct trace_pid_list *pid_list; struct trace_event_file *file; file = kmem_cache_alloc(file_cachep, GFP_TRACE); if (!file) return NULL; + pid_list = rcu_dereference_protected(tr->filtered_pids, + lockdep_is_held(&event_mutex)); + + if (pid_list) + file->flags |= EVENT_FILE_FL_PID_FILTER; + file->event_call = call; file->tr = tr; atomic_set(&file->sm_ref, 0); From 56954d0b303e7ecce88b9e1649c12ee1e816587f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 9 Sep 2021 18:22:42 +0200 Subject: [PATCH 1871/5344] s390/mm: validate VMA in PGSTE manipulation functions commit fe3d10024073f06f04c74b9674bd71ccc1d787cf upstream. We should not walk/touch page tables outside of VMA boundaries when holding only the mmap sem in read mode. Evil user space can modify the VMA layout just before this function runs and e.g., trigger races with page table removal code since commit dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in munmap"). gfn_to_hva() will only translate using KVM memory regions, but won't validate the VMA. Further, we should not allocate page tables outside of VMA boundaries: if evil user space decides to map hugetlbfs to these ranges, bad things will happen because we suddenly have PTE or PMD page tables where we shouldn't have them. Similarly, we have to check if we suddenly find a hugetlbfs VMA, before calling get_locked_pte(). Fixes: 2d42f9477320 ("s390/kvm: Add PGSTE manipulation functions") Signed-off-by: David Hildenbrand Reviewed-by: Claudio Imbrenda Acked-by: Heiko Carstens Link: https://lore.kernel.org/r/20210909162248.14969-4-david@redhat.com Signed-off-by: Christian Borntraeger Signed-off-by: Greg Kroah-Hartman --- arch/s390/mm/pgtable.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 9ebd01219812c..4438c00acb656 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -970,6 +970,7 @@ EXPORT_SYMBOL(get_guest_storage_key); int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc, unsigned long *oldpte, unsigned long *oldpgste) { + struct vm_area_struct *vma; unsigned long pgstev; spinlock_t *ptl; pgste_t pgste; @@ -979,6 +980,10 @@ int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc, WARN_ON_ONCE(orc > ESSA_MAX); if (unlikely(orc > ESSA_MAX)) return -EINVAL; + + vma = find_vma(mm, hva); + if (!vma || hva < vma->vm_start || is_vm_hugetlb_page(vma)) + return -EFAULT; ptep = get_locked_pte(mm, hva, &ptl); if (unlikely(!ptep)) return -EFAULT; @@ -1071,10 +1076,14 @@ EXPORT_SYMBOL(pgste_perform_essa); int set_pgste_bits(struct mm_struct *mm, unsigned long hva, unsigned long bits, unsigned long value) { + struct vm_area_struct *vma; spinlock_t *ptl; pgste_t new; pte_t *ptep; + vma = find_vma(mm, hva); + if (!vma || hva < vma->vm_start || is_vm_hugetlb_page(vma)) + return -EFAULT; ptep = get_locked_pte(mm, hva, &ptl); if (unlikely(!ptep)) return -EFAULT; @@ -1099,9 +1108,13 @@ EXPORT_SYMBOL(set_pgste_bits); */ int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep) { + struct vm_area_struct *vma; spinlock_t *ptl; pte_t *ptep; + vma = find_vma(mm, hva); + if (!vma || hva < vma->vm_start || is_vm_hugetlb_page(vma)) + return -EFAULT; ptep = get_locked_pte(mm, hva, &ptl); if (unlikely(!ptep)) return -EFAULT; From f58b7fe6c31fd0f29c3a9c12246e7c0e1e8a8e0e Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 19 Nov 2021 16:43:21 -0800 Subject: [PATCH 1872/5344] shm: extend forced shm destroy to support objects from several IPC nses commit 85b6d24646e4125c591639841169baa98a2da503 upstream. Currently, the exit_shm() function not designed to work properly when task->sysvshm.shm_clist holds shm objects from different IPC namespaces. This is a real pain when sysctl kernel.shm_rmid_forced = 1, because it leads to use-after-free (reproducer exists). This is an attempt to fix the problem by extending exit_shm mechanism to handle shm's destroy from several IPC ns'es. To achieve that we do several things: 1. add a namespace (non-refcounted) pointer to the struct shmid_kernel 2. during new shm object creation (newseg()/shmget syscall) we initialize this pointer by current task IPC ns 3. exit_shm() fully reworked such that it traverses over all shp's in task->sysvshm.shm_clist and gets IPC namespace not from current task as it was before but from shp's object itself, then call shm_destroy(shp, ns). Note: We need to be really careful here, because as it was said before (1), our pointer to IPC ns non-refcnt'ed. To be on the safe side we using special helper get_ipc_ns_not_zero() which allows to get IPC ns refcounter only if IPC ns not in the "state of destruction". Q/A Q: Why can we access shp->ns memory using non-refcounted pointer? A: Because shp object lifetime is always shorther than IPC namespace lifetime, so, if we get shp object from the task->sysvshm.shm_clist while holding task_lock(task) nobody can steal our namespace. Q: Does this patch change semantics of unshare/setns/clone syscalls? A: No. It's just fixes non-covered case when process may leave IPC namespace without getting task->sysvshm.shm_clist list cleaned up. Link: https://lkml.kernel.org/r/67bb03e5-f79c-1815-e2bf-949c67047418@colorfullife.com Link: https://lkml.kernel.org/r/20211109151501.4921-1-manfred@colorfullife.com Fixes: ab602f79915 ("shm: make exit_shm work proportional to task activity") Co-developed-by: Manfred Spraul Signed-off-by: Manfred Spraul Signed-off-by: Alexander Mikhalitsyn Cc: "Eric W. Biederman" Cc: Davidlohr Bueso Cc: Greg KH Cc: Andrei Vagin Cc: Pavel Tikhomirov Cc: Vasily Averin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/ipc_namespace.h | 15 +++ include/linux/sched/task.h | 2 +- ipc/shm.c | 189 +++++++++++++++++++++++++--------- 3 files changed, 159 insertions(+), 47 deletions(-) diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index c309f43bde45e..f8c4d9f97819f 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -130,6 +130,16 @@ static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) return ns; } +static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns) +{ + if (ns) { + if (refcount_inc_not_zero(&ns->count)) + return ns; + } + + return NULL; +} + extern void put_ipc_ns(struct ipc_namespace *ns); #else static inline struct ipc_namespace *copy_ipcs(unsigned long flags, @@ -146,6 +156,11 @@ static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) return ns; } +static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns) +{ + return ns; +} + static inline void put_ipc_ns(struct ipc_namespace *ns) { } diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 9a2710cd63197..56d99bc728006 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -163,7 +163,7 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring * subscriptions and synchronises with wait4(). Also used in procfs. Also * pins the final release of task.io_context. Also protects ->cpuset and - * ->cgroup.subsys[]. And ->vfork_done. + * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist. * * Nests both inside and outside of read_lock(&tasklist_lock). * It must not be nested with write_lock_irq(&tasklist_lock), diff --git a/ipc/shm.c b/ipc/shm.c index ca3f1e89b02ba..c08f81ae9b475 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -62,9 +62,18 @@ struct shmid_kernel /* private to the kernel */ struct pid *shm_lprid; struct user_struct *mlock_user; - /* The task created the shm object. NULL if the task is dead. */ + /* + * The task created the shm object, for + * task_lock(shp->shm_creator) + */ struct task_struct *shm_creator; - struct list_head shm_clist; /* list by creator */ + + /* + * List by creator. task_lock(->shm_creator) required for read/write. + * If list_empty(), then the creator is dead already. + */ + struct list_head shm_clist; + struct ipc_namespace *ns; } __randomize_layout; /* shm_mode upper byte flags */ @@ -115,6 +124,7 @@ static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) struct shmid_kernel *shp; shp = container_of(ipcp, struct shmid_kernel, shm_perm); + WARN_ON(ns != shp->ns); if (shp->shm_nattch) { shp->shm_perm.mode |= SHM_DEST; @@ -225,10 +235,43 @@ static void shm_rcu_free(struct rcu_head *head) kvfree(shp); } -static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s) +/* + * It has to be called with shp locked. + * It must be called before ipc_rmid() + */ +static inline void shm_clist_rm(struct shmid_kernel *shp) { - list_del(&s->shm_clist); - ipc_rmid(&shm_ids(ns), &s->shm_perm); + struct task_struct *creator; + + /* ensure that shm_creator does not disappear */ + rcu_read_lock(); + + /* + * A concurrent exit_shm may do a list_del_init() as well. + * Just do nothing if exit_shm already did the work + */ + if (!list_empty(&shp->shm_clist)) { + /* + * shp->shm_creator is guaranteed to be valid *only* + * if shp->shm_clist is not empty. + */ + creator = shp->shm_creator; + + task_lock(creator); + /* + * list_del_init() is a nop if the entry was already removed + * from the list. + */ + list_del_init(&shp->shm_clist); + task_unlock(creator); + } + rcu_read_unlock(); +} + +static inline void shm_rmid(struct shmid_kernel *s) +{ + shm_clist_rm(s); + ipc_rmid(&shm_ids(s->ns), &s->shm_perm); } @@ -283,7 +326,7 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) shm_file = shp->shm_file; shp->shm_file = NULL; ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; - shm_rmid(ns, shp); + shm_rmid(shp); shm_unlock(shp); if (!is_file_hugepages(shm_file)) shmem_lock(shm_file, 0, shp->mlock_user); @@ -306,10 +349,10 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) * * 2) sysctl kernel.shm_rmid_forced is set to 1. */ -static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) +static bool shm_may_destroy(struct shmid_kernel *shp) { return (shp->shm_nattch == 0) && - (ns->shm_rmid_forced || + (shp->ns->shm_rmid_forced || (shp->shm_perm.mode & SHM_DEST)); } @@ -340,7 +383,7 @@ static void shm_close(struct vm_area_struct *vma) ipc_update_pid(&shp->shm_lprid, task_tgid(current)); shp->shm_dtim = ktime_get_real_seconds(); shp->shm_nattch--; - if (shm_may_destroy(ns, shp)) + if (shm_may_destroy(shp)) shm_destroy(ns, shp); else shm_unlock(shp); @@ -361,10 +404,10 @@ static int shm_try_destroy_orphaned(int id, void *p, void *data) * * As shp->* are changed under rwsem, it's safe to skip shp locking. */ - if (shp->shm_creator != NULL) + if (!list_empty(&shp->shm_clist)) return 0; - if (shm_may_destroy(ns, shp)) { + if (shm_may_destroy(shp)) { shm_lock_by_ptr(shp); shm_destroy(ns, shp); } @@ -382,48 +425,97 @@ void shm_destroy_orphaned(struct ipc_namespace *ns) /* Locking assumes this will only be called with task == current */ void exit_shm(struct task_struct *task) { - struct ipc_namespace *ns = task->nsproxy->ipc_ns; - struct shmid_kernel *shp, *n; + for (;;) { + struct shmid_kernel *shp; + struct ipc_namespace *ns; - if (list_empty(&task->sysvshm.shm_clist)) - return; + task_lock(task); + + if (list_empty(&task->sysvshm.shm_clist)) { + task_unlock(task); + break; + } + + shp = list_first_entry(&task->sysvshm.shm_clist, struct shmid_kernel, + shm_clist); - /* - * If kernel.shm_rmid_forced is not set then only keep track of - * which shmids are orphaned, so that a later set of the sysctl - * can clean them up. - */ - if (!ns->shm_rmid_forced) { - down_read(&shm_ids(ns).rwsem); - list_for_each_entry(shp, &task->sysvshm.shm_clist, shm_clist) - shp->shm_creator = NULL; /* - * Only under read lock but we are only called on current - * so no entry on the list will be shared. + * 1) Get pointer to the ipc namespace. It is worth to say + * that this pointer is guaranteed to be valid because + * shp lifetime is always shorter than namespace lifetime + * in which shp lives. + * We taken task_lock it means that shp won't be freed. */ - list_del(&task->sysvshm.shm_clist); - up_read(&shm_ids(ns).rwsem); - return; - } + ns = shp->ns; - /* - * Destroy all already created segments, that were not yet mapped, - * and mark any mapped as orphan to cover the sysctl toggling. - * Destroy is skipped if shm_may_destroy() returns false. - */ - down_write(&shm_ids(ns).rwsem); - list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) { - shp->shm_creator = NULL; + /* + * 2) If kernel.shm_rmid_forced is not set then only keep track of + * which shmids are orphaned, so that a later set of the sysctl + * can clean them up. + */ + if (!ns->shm_rmid_forced) + goto unlink_continue; - if (shm_may_destroy(ns, shp)) { - shm_lock_by_ptr(shp); - shm_destroy(ns, shp); + /* + * 3) get a reference to the namespace. + * The refcount could be already 0. If it is 0, then + * the shm objects will be free by free_ipc_work(). + */ + ns = get_ipc_ns_not_zero(ns); + if (!ns) { +unlink_continue: + list_del_init(&shp->shm_clist); + task_unlock(task); + continue; } - } - /* Remove the list head from any segments still attached. */ - list_del(&task->sysvshm.shm_clist); - up_write(&shm_ids(ns).rwsem); + /* + * 4) get a reference to shp. + * This cannot fail: shm_clist_rm() is called before + * ipc_rmid(), thus the refcount cannot be 0. + */ + WARN_ON(!ipc_rcu_getref(&shp->shm_perm)); + + /* + * 5) unlink the shm segment from the list of segments + * created by current. + * This must be done last. After unlinking, + * only the refcounts obtained above prevent IPC_RMID + * from destroying the segment or the namespace. + */ + list_del_init(&shp->shm_clist); + + task_unlock(task); + + /* + * 6) we have all references + * Thus lock & if needed destroy shp. + */ + down_write(&shm_ids(ns).rwsem); + shm_lock_by_ptr(shp); + /* + * rcu_read_lock was implicitly taken in shm_lock_by_ptr, it's + * safe to call ipc_rcu_putref here + */ + ipc_rcu_putref(&shp->shm_perm, shm_rcu_free); + + if (ipc_valid_object(&shp->shm_perm)) { + if (shm_may_destroy(shp)) + shm_destroy(ns, shp); + else + shm_unlock(shp); + } else { + /* + * Someone else deleted the shp from namespace + * idr/kht while we have waited. + * Just unlock and continue. + */ + shm_unlock(shp); + } + + up_write(&shm_ids(ns).rwsem); + put_ipc_ns(ns); /* paired with get_ipc_ns_not_zero */ + } } static vm_fault_t shm_fault(struct vm_fault *vmf) @@ -680,7 +772,11 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) if (error < 0) goto no_id; + shp->ns = ns; + + task_lock(current); list_add(&shp->shm_clist, ¤t->sysvshm.shm_clist); + task_unlock(current); /* * shmid gets reported as "inode#" in /proc/pid/maps. @@ -1575,7 +1671,8 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, down_write(&shm_ids(ns).rwsem); shp = shm_lock(ns, shmid); shp->shm_nattch--; - if (shm_may_destroy(ns, shp)) + + if (shm_may_destroy(shp)) shm_destroy(ns, shp); else shm_unlock(shp); From d176c6363e9a20352dbe8949f7bc9044a8b0743b Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Tue, 16 Nov 2021 23:27:32 +0800 Subject: [PATCH 1873/5344] NFC: add NCI_UNREG flag to eliminate the race commit 48b71a9e66c2eab60564b1b1c85f4928ed04e406 upstream. There are two sites that calls queue_work() after the destroy_workqueue() and lead to possible UAF. The first site is nci_send_cmd(), which can happen after the nci_close_device as below nfcmrvl_nci_unregister_dev | nfc_genl_dev_up nci_close_device | flush_workqueue | del_timer_sync | nci_unregister_device | nfc_get_device destroy_workqueue | nfc_dev_up nfc_unregister_device | nci_dev_up device_del | nci_open_device | __nci_request | nci_send_cmd | queue_work !!! Another site is nci_cmd_timer, awaked by the nci_cmd_work from the nci_send_cmd. ... | ... nci_unregister_device | queue_work destroy_workqueue | nfc_unregister_device | ... device_del | nci_cmd_work | mod_timer | ... | nci_cmd_timer | queue_work !!! For the above two UAF, the root cause is that the nfc_dev_up can race between the nci_unregister_device routine. Therefore, this patch introduce NCI_UNREG flag to easily eliminate the possible race. In addition, the mutex_lock in nci_close_device can act as a barrier. Signed-off-by: Lin Ma Fixes: 6a2968aaf50c ("NFC: basic NCI protocol implementation") Reviewed-by: Jakub Kicinski Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20211116152732.19238-1-linma@zju.edu.cn Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- include/net/nfc/nci_core.h | 1 + net/nfc/nci/core.c | 19 +++++++++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/include/net/nfc/nci_core.h b/include/net/nfc/nci_core.h index 33979017b7824..004e49f748419 100644 --- a/include/net/nfc/nci_core.h +++ b/include/net/nfc/nci_core.h @@ -30,6 +30,7 @@ enum nci_flag { NCI_UP, NCI_DATA_EXCHANGE, NCI_DATA_EXCHANGE_TO, + NCI_UNREG, }; /* NCI device states */ diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 1d0aa9e6044bf..b8ecb002e6238 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -473,6 +473,11 @@ static int nci_open_device(struct nci_dev *ndev) mutex_lock(&ndev->req_lock); + if (test_bit(NCI_UNREG, &ndev->flags)) { + rc = -ENODEV; + goto done; + } + if (test_bit(NCI_UP, &ndev->flags)) { rc = -EALREADY; goto done; @@ -536,6 +541,10 @@ static int nci_open_device(struct nci_dev *ndev) static int nci_close_device(struct nci_dev *ndev) { nci_req_cancel(ndev, ENODEV); + + /* This mutex needs to be held as a barrier for + * caller nci_unregister_device + */ mutex_lock(&ndev->req_lock); if (!test_and_clear_bit(NCI_UP, &ndev->flags)) { @@ -573,8 +582,8 @@ static int nci_close_device(struct nci_dev *ndev) /* Flush cmd wq */ flush_workqueue(ndev->cmd_wq); - /* Clear flags */ - ndev->flags = 0; + /* Clear flags except NCI_UNREG */ + ndev->flags &= BIT(NCI_UNREG); mutex_unlock(&ndev->req_lock); @@ -1256,6 +1265,12 @@ void nci_unregister_device(struct nci_dev *ndev) { struct nci_conn_info *conn_info, *n; + /* This set_bit is not protected with specialized barrier, + * However, it is fine because the mutex_lock(&ndev->req_lock); + * in nci_close_device() will help to emit one. + */ + set_bit(NCI_UNREG, &ndev->flags); + nci_close_device(ndev); destroy_workqueue(ndev->cmd_wq); From 8026ddcf4c570dfef688f5656c2004a937f9f47f Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 25 Nov 2021 14:05:18 +0100 Subject: [PATCH 1874/5344] fuse: release pipe buf after last use commit 473441720c8616dfaf4451f9c7ea14f0eb5e5d65 upstream. Checking buf->flags should be done before the pipe_buf_release() is called on the pipe buffer, since releasing the buffer might modify the flags. This is exactly what page_cache_pipe_buf_release() does, and which results in the same VM_BUG_ON_PAGE(PageLRU(page)) that the original patch was trying to fix. Reported-by: Justin Forbes Fixes: 712a951025c0 ("fuse: fix page stealing") Cc: # v2.6.35 Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/fuse/dev.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 1861d85b3abcf..8fcddea9a46ce 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -839,17 +839,17 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) goto out_put_old; } + get_page(newpage); + + if (!(buf->flags & PIPE_BUF_FLAG_LRU)) + lru_cache_add_file(newpage); + /* * Release while we have extra ref on stolen page. Otherwise * anon_pipe_buf_release() might think the page can be reused. */ pipe_buf_release(cs->pipe, buf); - get_page(newpage); - - if (!(buf->flags & PIPE_BUF_FLAG_LRU)) - lru_cache_add_file(newpage); - err = 0; spin_lock(&cs->req->waitq.lock); if (test_bit(FR_ABORTED, &cs->req->flags)) From a68cf929e7ab3ea6ad37a327e7cd0b939ac7c00b Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 29 Nov 2021 08:36:21 +0100 Subject: [PATCH 1875/5344] xen: sync include/xen/interface/io/ring.h with Xen's newest version commit 629a5d87e26fe96bcaab44cbb81f5866af6f7008 upstream. Sync include/xen/interface/io/ring.h with Xen's newest version in order to get the RING_COPY_RESPONSE() and RING_RESPONSE_PROD_OVERFLOW() macros. Note that this will correct the wrong license info by adding the missing original copyright notice. Signed-off-by: Juergen Gross Signed-off-by: Greg Kroah-Hartman --- include/xen/interface/io/ring.h | 293 +++++++++++++++++--------------- 1 file changed, 158 insertions(+), 135 deletions(-) diff --git a/include/xen/interface/io/ring.h b/include/xen/interface/io/ring.h index 3f40501fc60b1..b39cdbc522ec7 100644 --- a/include/xen/interface/io/ring.h +++ b/include/xen/interface/io/ring.h @@ -1,21 +1,53 @@ -/* SPDX-License-Identifier: GPL-2.0 */ /****************************************************************************** * ring.h * * Shared producer-consumer ring macros. * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * * Tim Deegan and Andrew Warfield November 2004. */ #ifndef __XEN_PUBLIC_IO_RING_H__ #define __XEN_PUBLIC_IO_RING_H__ +/* + * When #include'ing this header, you need to provide the following + * declaration upfront: + * - standard integers types (uint8_t, uint16_t, etc) + * They are provided by stdint.h of the standard headers. + * + * In addition, if you intend to use the FLEX macros, you also need to + * provide the following, before invoking the FLEX macros: + * - size_t + * - memcpy + * - grant_ref_t + * These declarations are provided by string.h of the standard headers, + * and grant_table.h from the Xen public headers. + */ + #include typedef unsigned int RING_IDX; /* Round a 32-bit unsigned constant down to the nearest power of two. */ -#define __RD2(_x) (((_x) & 0x00000002) ? 0x2 : ((_x) & 0x1)) +#define __RD2(_x) (((_x) & 0x00000002) ? 0x2 : ((_x) & 0x1)) #define __RD4(_x) (((_x) & 0x0000000c) ? __RD2((_x)>>2)<<2 : __RD2(_x)) #define __RD8(_x) (((_x) & 0x000000f0) ? __RD4((_x)>>4)<<4 : __RD4(_x)) #define __RD16(_x) (((_x) & 0x0000ff00) ? __RD8((_x)>>8)<<8 : __RD8(_x)) @@ -27,82 +59,79 @@ typedef unsigned int RING_IDX; * A ring contains as many entries as will fit, rounded down to the nearest * power of two (so we can mask with (size-1) to loop around). */ -#define __CONST_RING_SIZE(_s, _sz) \ - (__RD32(((_sz) - offsetof(struct _s##_sring, ring)) / \ - sizeof(((struct _s##_sring *)0)->ring[0]))) - +#define __CONST_RING_SIZE(_s, _sz) \ + (__RD32(((_sz) - offsetof(struct _s##_sring, ring)) / \ + sizeof(((struct _s##_sring *)0)->ring[0]))) /* * The same for passing in an actual pointer instead of a name tag. */ -#define __RING_SIZE(_s, _sz) \ - (__RD32(((_sz) - (long)&(_s)->ring + (long)(_s)) / sizeof((_s)->ring[0]))) +#define __RING_SIZE(_s, _sz) \ + (__RD32(((_sz) - (long)(_s)->ring + (long)(_s)) / sizeof((_s)->ring[0]))) /* * Macros to make the correct C datatypes for a new kind of ring. * * To make a new ring datatype, you need to have two message structures, - * let's say struct request, and struct response already defined. + * let's say request_t, and response_t already defined. * * In a header where you want the ring datatype declared, you then do: * - * DEFINE_RING_TYPES(mytag, struct request, struct response); + * DEFINE_RING_TYPES(mytag, request_t, response_t); * * These expand out to give you a set of types, as you can see below. * The most important of these are: * - * struct mytag_sring - The shared ring. - * struct mytag_front_ring - The 'front' half of the ring. - * struct mytag_back_ring - The 'back' half of the ring. + * mytag_sring_t - The shared ring. + * mytag_front_ring_t - The 'front' half of the ring. + * mytag_back_ring_t - The 'back' half of the ring. * * To initialize a ring in your code you need to know the location and size * of the shared memory area (PAGE_SIZE, for instance). To initialise * the front half: * - * struct mytag_front_ring front_ring; - * SHARED_RING_INIT((struct mytag_sring *)shared_page); - * FRONT_RING_INIT(&front_ring, (struct mytag_sring *)shared_page, - * PAGE_SIZE); + * mytag_front_ring_t front_ring; + * SHARED_RING_INIT((mytag_sring_t *)shared_page); + * FRONT_RING_INIT(&front_ring, (mytag_sring_t *)shared_page, PAGE_SIZE); * * Initializing the back follows similarly (note that only the front * initializes the shared ring): * - * struct mytag_back_ring back_ring; - * BACK_RING_INIT(&back_ring, (struct mytag_sring *)shared_page, - * PAGE_SIZE); + * mytag_back_ring_t back_ring; + * BACK_RING_INIT(&back_ring, (mytag_sring_t *)shared_page, PAGE_SIZE); */ -#define DEFINE_RING_TYPES(__name, __req_t, __rsp_t) \ - \ -/* Shared ring entry */ \ -union __name##_sring_entry { \ - __req_t req; \ - __rsp_t rsp; \ -}; \ - \ -/* Shared ring page */ \ -struct __name##_sring { \ - RING_IDX req_prod, req_event; \ - RING_IDX rsp_prod, rsp_event; \ - uint8_t pad[48]; \ - union __name##_sring_entry ring[1]; /* variable-length */ \ -}; \ - \ -/* "Front" end's private variables */ \ -struct __name##_front_ring { \ - RING_IDX req_prod_pvt; \ - RING_IDX rsp_cons; \ - unsigned int nr_ents; \ - struct __name##_sring *sring; \ -}; \ - \ -/* "Back" end's private variables */ \ -struct __name##_back_ring { \ - RING_IDX rsp_prod_pvt; \ - RING_IDX req_cons; \ - unsigned int nr_ents; \ - struct __name##_sring *sring; \ -}; - +#define DEFINE_RING_TYPES(__name, __req_t, __rsp_t) \ + \ +/* Shared ring entry */ \ +union __name##_sring_entry { \ + __req_t req; \ + __rsp_t rsp; \ +}; \ + \ +/* Shared ring page */ \ +struct __name##_sring { \ + RING_IDX req_prod, req_event; \ + RING_IDX rsp_prod, rsp_event; \ + uint8_t __pad[48]; \ + union __name##_sring_entry ring[1]; /* variable-length */ \ +}; \ + \ +/* "Front" end's private variables */ \ +struct __name##_front_ring { \ + RING_IDX req_prod_pvt; \ + RING_IDX rsp_cons; \ + unsigned int nr_ents; \ + struct __name##_sring *sring; \ +}; \ + \ +/* "Back" end's private variables */ \ +struct __name##_back_ring { \ + RING_IDX rsp_prod_pvt; \ + RING_IDX req_cons; \ + unsigned int nr_ents; \ + struct __name##_sring *sring; \ +}; \ + \ /* * Macros for manipulating rings. * @@ -119,105 +148,99 @@ struct __name##_back_ring { \ */ /* Initialising empty rings */ -#define SHARED_RING_INIT(_s) do { \ - (_s)->req_prod = (_s)->rsp_prod = 0; \ - (_s)->req_event = (_s)->rsp_event = 1; \ - memset((_s)->pad, 0, sizeof((_s)->pad)); \ +#define SHARED_RING_INIT(_s) do { \ + (_s)->req_prod = (_s)->rsp_prod = 0; \ + (_s)->req_event = (_s)->rsp_event = 1; \ + (void)memset((_s)->__pad, 0, sizeof((_s)->__pad)); \ } while(0) -#define FRONT_RING_INIT(_r, _s, __size) do { \ - (_r)->req_prod_pvt = 0; \ - (_r)->rsp_cons = 0; \ - (_r)->nr_ents = __RING_SIZE(_s, __size); \ - (_r)->sring = (_s); \ +#define FRONT_RING_ATTACH(_r, _s, _i, __size) do { \ + (_r)->req_prod_pvt = (_i); \ + (_r)->rsp_cons = (_i); \ + (_r)->nr_ents = __RING_SIZE(_s, __size); \ + (_r)->sring = (_s); \ } while (0) -#define BACK_RING_INIT(_r, _s, __size) do { \ - (_r)->rsp_prod_pvt = 0; \ - (_r)->req_cons = 0; \ - (_r)->nr_ents = __RING_SIZE(_s, __size); \ - (_r)->sring = (_s); \ -} while (0) +#define FRONT_RING_INIT(_r, _s, __size) FRONT_RING_ATTACH(_r, _s, 0, __size) -/* Initialize to existing shared indexes -- for recovery */ -#define FRONT_RING_ATTACH(_r, _s, __size) do { \ - (_r)->sring = (_s); \ - (_r)->req_prod_pvt = (_s)->req_prod; \ - (_r)->rsp_cons = (_s)->rsp_prod; \ - (_r)->nr_ents = __RING_SIZE(_s, __size); \ +#define BACK_RING_ATTACH(_r, _s, _i, __size) do { \ + (_r)->rsp_prod_pvt = (_i); \ + (_r)->req_cons = (_i); \ + (_r)->nr_ents = __RING_SIZE(_s, __size); \ + (_r)->sring = (_s); \ } while (0) -#define BACK_RING_ATTACH(_r, _s, __size) do { \ - (_r)->sring = (_s); \ - (_r)->rsp_prod_pvt = (_s)->rsp_prod; \ - (_r)->req_cons = (_s)->req_prod; \ - (_r)->nr_ents = __RING_SIZE(_s, __size); \ -} while (0) +#define BACK_RING_INIT(_r, _s, __size) BACK_RING_ATTACH(_r, _s, 0, __size) /* How big is this ring? */ -#define RING_SIZE(_r) \ +#define RING_SIZE(_r) \ ((_r)->nr_ents) /* Number of free requests (for use on front side only). */ -#define RING_FREE_REQUESTS(_r) \ +#define RING_FREE_REQUESTS(_r) \ (RING_SIZE(_r) - ((_r)->req_prod_pvt - (_r)->rsp_cons)) /* Test if there is an empty slot available on the front ring. * (This is only meaningful from the front. ) */ -#define RING_FULL(_r) \ +#define RING_FULL(_r) \ (RING_FREE_REQUESTS(_r) == 0) /* Test if there are outstanding messages to be processed on a ring. */ -#define RING_HAS_UNCONSUMED_RESPONSES(_r) \ +#define RING_HAS_UNCONSUMED_RESPONSES(_r) \ ((_r)->sring->rsp_prod - (_r)->rsp_cons) -#define RING_HAS_UNCONSUMED_REQUESTS(_r) \ - ({ \ - unsigned int req = (_r)->sring->req_prod - (_r)->req_cons; \ - unsigned int rsp = RING_SIZE(_r) - \ - ((_r)->req_cons - (_r)->rsp_prod_pvt); \ - req < rsp ? req : rsp; \ - }) +#define RING_HAS_UNCONSUMED_REQUESTS(_r) ({ \ + unsigned int req = (_r)->sring->req_prod - (_r)->req_cons; \ + unsigned int rsp = RING_SIZE(_r) - \ + ((_r)->req_cons - (_r)->rsp_prod_pvt); \ + req < rsp ? req : rsp; \ +}) /* Direct access to individual ring elements, by index. */ -#define RING_GET_REQUEST(_r, _idx) \ +#define RING_GET_REQUEST(_r, _idx) \ (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].req)) +#define RING_GET_RESPONSE(_r, _idx) \ + (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].rsp)) + /* - * Get a local copy of a request. + * Get a local copy of a request/response. * - * Use this in preference to RING_GET_REQUEST() so all processing is + * Use this in preference to RING_GET_{REQUEST,RESPONSE}() so all processing is * done on a local copy that cannot be modified by the other end. * * Note that https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58145 may cause this - * to be ineffective where _req is a struct which consists of only bitfields. + * to be ineffective where dest is a struct which consists of only bitfields. */ -#define RING_COPY_REQUEST(_r, _idx, _req) do { \ - /* Use volatile to force the copy into _req. */ \ - *(_req) = *(volatile typeof(_req))RING_GET_REQUEST(_r, _idx); \ +#define RING_COPY_(type, r, idx, dest) do { \ + /* Use volatile to force the copy into dest. */ \ + *(dest) = *(volatile typeof(dest))RING_GET_##type(r, idx); \ } while (0) -#define RING_GET_RESPONSE(_r, _idx) \ - (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].rsp)) +#define RING_COPY_REQUEST(r, idx, req) RING_COPY_(REQUEST, r, idx, req) +#define RING_COPY_RESPONSE(r, idx, rsp) RING_COPY_(RESPONSE, r, idx, rsp) /* Loop termination condition: Would the specified index overflow the ring? */ -#define RING_REQUEST_CONS_OVERFLOW(_r, _cons) \ +#define RING_REQUEST_CONS_OVERFLOW(_r, _cons) \ (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r)) /* Ill-behaved frontend determination: Can there be this many requests? */ -#define RING_REQUEST_PROD_OVERFLOW(_r, _prod) \ +#define RING_REQUEST_PROD_OVERFLOW(_r, _prod) \ (((_prod) - (_r)->rsp_prod_pvt) > RING_SIZE(_r)) +/* Ill-behaved backend determination: Can there be this many responses? */ +#define RING_RESPONSE_PROD_OVERFLOW(_r, _prod) \ + (((_prod) - (_r)->rsp_cons) > RING_SIZE(_r)) -#define RING_PUSH_REQUESTS(_r) do { \ - virt_wmb(); /* back sees requests /before/ updated producer index */ \ - (_r)->sring->req_prod = (_r)->req_prod_pvt; \ +#define RING_PUSH_REQUESTS(_r) do { \ + virt_wmb(); /* back sees requests /before/ updated producer index */\ + (_r)->sring->req_prod = (_r)->req_prod_pvt; \ } while (0) -#define RING_PUSH_RESPONSES(_r) do { \ - virt_wmb(); /* front sees responses /before/ updated producer index */ \ - (_r)->sring->rsp_prod = (_r)->rsp_prod_pvt; \ +#define RING_PUSH_RESPONSES(_r) do { \ + virt_wmb(); /* front sees resps /before/ updated producer index */ \ + (_r)->sring->rsp_prod = (_r)->rsp_prod_pvt; \ } while (0) /* @@ -250,40 +273,40 @@ struct __name##_back_ring { \ * field appropriately. */ -#define RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(_r, _notify) do { \ - RING_IDX __old = (_r)->sring->req_prod; \ - RING_IDX __new = (_r)->req_prod_pvt; \ - virt_wmb(); /* back sees requests /before/ updated producer index */ \ - (_r)->sring->req_prod = __new; \ - virt_mb(); /* back sees new requests /before/ we check req_event */ \ - (_notify) = ((RING_IDX)(__new - (_r)->sring->req_event) < \ - (RING_IDX)(__new - __old)); \ +#define RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(_r, _notify) do { \ + RING_IDX __old = (_r)->sring->req_prod; \ + RING_IDX __new = (_r)->req_prod_pvt; \ + virt_wmb(); /* back sees requests /before/ updated producer index */\ + (_r)->sring->req_prod = __new; \ + virt_mb(); /* back sees new requests /before/ we check req_event */ \ + (_notify) = ((RING_IDX)(__new - (_r)->sring->req_event) < \ + (RING_IDX)(__new - __old)); \ } while (0) -#define RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(_r, _notify) do { \ - RING_IDX __old = (_r)->sring->rsp_prod; \ - RING_IDX __new = (_r)->rsp_prod_pvt; \ - virt_wmb(); /* front sees responses /before/ updated producer index */ \ - (_r)->sring->rsp_prod = __new; \ - virt_mb(); /* front sees new responses /before/ we check rsp_event */ \ - (_notify) = ((RING_IDX)(__new - (_r)->sring->rsp_event) < \ - (RING_IDX)(__new - __old)); \ +#define RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(_r, _notify) do { \ + RING_IDX __old = (_r)->sring->rsp_prod; \ + RING_IDX __new = (_r)->rsp_prod_pvt; \ + virt_wmb(); /* front sees resps /before/ updated producer index */ \ + (_r)->sring->rsp_prod = __new; \ + virt_mb(); /* front sees new resps /before/ we check rsp_event */ \ + (_notify) = ((RING_IDX)(__new - (_r)->sring->rsp_event) < \ + (RING_IDX)(__new - __old)); \ } while (0) -#define RING_FINAL_CHECK_FOR_REQUESTS(_r, _work_to_do) do { \ - (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r); \ - if (_work_to_do) break; \ - (_r)->sring->req_event = (_r)->req_cons + 1; \ - virt_mb(); \ - (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r); \ +#define RING_FINAL_CHECK_FOR_REQUESTS(_r, _work_to_do) do { \ + (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r); \ + if (_work_to_do) break; \ + (_r)->sring->req_event = (_r)->req_cons + 1; \ + virt_mb(); \ + (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r); \ } while (0) -#define RING_FINAL_CHECK_FOR_RESPONSES(_r, _work_to_do) do { \ - (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r); \ - if (_work_to_do) break; \ - (_r)->sring->rsp_event = (_r)->rsp_cons + 1; \ - virt_mb(); \ - (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r); \ +#define RING_FINAL_CHECK_FOR_RESPONSES(_r, _work_to_do) do { \ + (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r); \ + if (_work_to_do) break; \ + (_r)->sring->rsp_event = (_r)->rsp_cons + 1; \ + virt_mb(); \ + (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r); \ } while (0) From 25c15565472565f08d93f9f44755cc2c8b8f5352 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 29 Nov 2021 08:41:07 +0100 Subject: [PATCH 1876/5344] xen/blkfront: read response from backend only once MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 71b66243f9898d0e54296b4e7035fb33cdcb0707 upstream. In order to avoid problems in case the backend is modifying a response on the ring page while the frontend has already seen it, just read the response into a local buffer in one go and then operate on that buffer only. Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich Acked-by: Roger Pau Monné Link: https://lore.kernel.org/r/20210730103854.12681-2-jgross@suse.com Signed-off-by: Juergen Gross Signed-off-by: Greg Kroah-Hartman --- drivers/block/xen-blkfront.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index def41e1bd7364..ce0f314817c9d 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1549,7 +1549,7 @@ static bool blkif_completion(unsigned long *id, static irqreturn_t blkif_interrupt(int irq, void *dev_id) { struct request *req; - struct blkif_response *bret; + struct blkif_response bret; RING_IDX i, rp; unsigned long flags; struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)dev_id; @@ -1566,8 +1566,9 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) for (i = rinfo->ring.rsp_cons; i != rp; i++) { unsigned long id; - bret = RING_GET_RESPONSE(&rinfo->ring, i); - id = bret->id; + RING_COPY_RESPONSE(&rinfo->ring, i, &bret); + id = bret.id; + /* * The backend has messed up and given us an id that we would * never have given to it (we stamp it up to BLK_RING_SIZE - @@ -1575,39 +1576,39 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) */ if (id >= BLK_RING_SIZE(info)) { WARN(1, "%s: response to %s has incorrect id (%ld)\n", - info->gd->disk_name, op_name(bret->operation), id); + info->gd->disk_name, op_name(bret.operation), id); /* We can't safely get the 'struct request' as * the id is busted. */ continue; } req = rinfo->shadow[id].request; - if (bret->operation != BLKIF_OP_DISCARD) { + if (bret.operation != BLKIF_OP_DISCARD) { /* * We may need to wait for an extra response if the * I/O request is split in 2 */ - if (!blkif_completion(&id, rinfo, bret)) + if (!blkif_completion(&id, rinfo, &bret)) continue; } if (add_id_to_freelist(rinfo, id)) { WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n", - info->gd->disk_name, op_name(bret->operation), id); + info->gd->disk_name, op_name(bret.operation), id); continue; } - if (bret->status == BLKIF_RSP_OKAY) + if (bret.status == BLKIF_RSP_OKAY) blkif_req(req)->error = BLK_STS_OK; else blkif_req(req)->error = BLK_STS_IOERR; - switch (bret->operation) { + switch (bret.operation) { case BLKIF_OP_DISCARD: - if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { + if (unlikely(bret.status == BLKIF_RSP_EOPNOTSUPP)) { struct request_queue *rq = info->rq; printk(KERN_WARNING "blkfront: %s: %s op failed\n", - info->gd->disk_name, op_name(bret->operation)); + info->gd->disk_name, op_name(bret.operation)); blkif_req(req)->error = BLK_STS_NOTSUPP; info->feature_discard = 0; info->feature_secdiscard = 0; @@ -1617,15 +1618,15 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) break; case BLKIF_OP_FLUSH_DISKCACHE: case BLKIF_OP_WRITE_BARRIER: - if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { + if (unlikely(bret.status == BLKIF_RSP_EOPNOTSUPP)) { printk(KERN_WARNING "blkfront: %s: %s op failed\n", - info->gd->disk_name, op_name(bret->operation)); + info->gd->disk_name, op_name(bret.operation)); blkif_req(req)->error = BLK_STS_NOTSUPP; } - if (unlikely(bret->status == BLKIF_RSP_ERROR && + if (unlikely(bret.status == BLKIF_RSP_ERROR && rinfo->shadow[id].req.u.rw.nr_segments == 0)) { printk(KERN_WARNING "blkfront: %s: empty %s op failed\n", - info->gd->disk_name, op_name(bret->operation)); + info->gd->disk_name, op_name(bret.operation)); blkif_req(req)->error = BLK_STS_NOTSUPP; } if (unlikely(blkif_req(req)->error)) { @@ -1638,9 +1639,9 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) /* fall through */ case BLKIF_OP_READ: case BLKIF_OP_WRITE: - if (unlikely(bret->status != BLKIF_RSP_OKAY)) + if (unlikely(bret.status != BLKIF_RSP_OKAY)) dev_dbg(&info->xbdev->dev, "Bad return from blkdev data " - "request: %x\n", bret->status); + "request: %x\n", bret.status); break; default: From bb0ffe969b2210c2a5336febe9b0f469764bc8de Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 29 Nov 2021 08:45:38 +0100 Subject: [PATCH 1877/5344] xen/blkfront: don't take local copy of a request from the ring page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 8f5a695d99000fc3aa73934d7ced33cfc64dcdab upstream. In order to avoid a malicious backend being able to influence the local copy of a request build the request locally first and then copy it to the ring page instead of doing it the other way round as today. Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich Acked-by: Roger Pau Monné Link: https://lore.kernel.org/r/20210730103854.12681-3-jgross@suse.com Signed-off-by: Juergen Gross Signed-off-by: Greg Kroah-Hartman --- drivers/block/xen-blkfront.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index ce0f314817c9d..d24d3d7f5b522 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -536,7 +536,7 @@ static unsigned long blkif_ring_get_request(struct blkfront_ring_info *rinfo, rinfo->shadow[id].status = REQ_WAITING; rinfo->shadow[id].associated_id = NO_ASSOCIATED_ID; - (*ring_req)->u.rw.id = id; + rinfo->shadow[id].req.u.rw.id = id; return id; } @@ -544,11 +544,12 @@ static unsigned long blkif_ring_get_request(struct blkfront_ring_info *rinfo, static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_info *rinfo) { struct blkfront_info *info = rinfo->dev_info; - struct blkif_request *ring_req; + struct blkif_request *ring_req, *final_ring_req; unsigned long id; /* Fill out a communications ring structure. */ - id = blkif_ring_get_request(rinfo, req, &ring_req); + id = blkif_ring_get_request(rinfo, req, &final_ring_req); + ring_req = &rinfo->shadow[id].req; ring_req->operation = BLKIF_OP_DISCARD; ring_req->u.discard.nr_sectors = blk_rq_sectors(req); @@ -559,8 +560,8 @@ static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_inf else ring_req->u.discard.flag = 0; - /* Keep a private copy so we can reissue requests when recovering. */ - rinfo->shadow[id].req = *ring_req; + /* Copy the request to the ring page. */ + *final_ring_req = *ring_req; return 0; } @@ -693,6 +694,7 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri { struct blkfront_info *info = rinfo->dev_info; struct blkif_request *ring_req, *extra_ring_req = NULL; + struct blkif_request *final_ring_req, *final_extra_ring_req = NULL; unsigned long id, extra_id = NO_ASSOCIATED_ID; bool require_extra_req = false; int i; @@ -737,7 +739,8 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri } /* Fill out a communications ring structure. */ - id = blkif_ring_get_request(rinfo, req, &ring_req); + id = blkif_ring_get_request(rinfo, req, &final_ring_req); + ring_req = &rinfo->shadow[id].req; num_sg = blk_rq_map_sg(req->q, req, rinfo->shadow[id].sg); num_grant = 0; @@ -788,7 +791,9 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri ring_req->u.rw.nr_segments = num_grant; if (unlikely(require_extra_req)) { extra_id = blkif_ring_get_request(rinfo, req, - &extra_ring_req); + &final_extra_ring_req); + extra_ring_req = &rinfo->shadow[extra_id].req; + /* * Only the first request contains the scatter-gather * list. @@ -830,10 +835,10 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri if (setup.segments) kunmap_atomic(setup.segments); - /* Keep a private copy so we can reissue requests when recovering. */ - rinfo->shadow[id].req = *ring_req; + /* Copy request(s) to the ring page. */ + *final_ring_req = *ring_req; if (unlikely(require_extra_req)) - rinfo->shadow[extra_id].req = *extra_ring_req; + *final_extra_ring_req = *extra_ring_req; if (new_persistent_gnts) gnttab_free_grant_references(setup.gref_head); From 5683787ec83ffa18ec0dd58e45a51bf4c6421296 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 29 Nov 2021 08:47:08 +0100 Subject: [PATCH 1878/5344] xen/blkfront: don't trust the backend response data blindly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit b94e4b147fd1992ad450e1fea1fdaa3738753373 upstream. Today blkfront will trust the backend to send only sane response data. In order to avoid privilege escalations or crashes in case of malicious backends verify the data to be within expected limits. Especially make sure that the response always references an outstanding request. Introduce a new state of the ring BLKIF_STATE_ERROR which will be switched to in case an inconsistency is being detected. Recovering from this state is possible only via removing and adding the virtual device again (e.g. via a suspend/resume cycle). Make all warning messages issued due to valid error responses rate limited in order to avoid message floods being triggered by a malicious backend. Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich Acked-by: Roger Pau Monné Link: https://lore.kernel.org/r/20210730103854.12681-4-jgross@suse.com Signed-off-by: Juergen Gross Signed-off-by: Greg Kroah-Hartman --- drivers/block/xen-blkfront.c | 70 +++++++++++++++++++++++++++--------- 1 file changed, 53 insertions(+), 17 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index d24d3d7f5b522..baf10b73675e2 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -80,6 +80,7 @@ enum blkif_state { BLKIF_STATE_DISCONNECTED, BLKIF_STATE_CONNECTED, BLKIF_STATE_SUSPENDED, + BLKIF_STATE_ERROR, }; struct grant { @@ -89,6 +90,7 @@ struct grant { }; enum blk_req_status { + REQ_PROCESSING, REQ_WAITING, REQ_DONE, REQ_ERROR, @@ -533,7 +535,7 @@ static unsigned long blkif_ring_get_request(struct blkfront_ring_info *rinfo, id = get_id_from_freelist(rinfo); rinfo->shadow[id].request = req; - rinfo->shadow[id].status = REQ_WAITING; + rinfo->shadow[id].status = REQ_PROCESSING; rinfo->shadow[id].associated_id = NO_ASSOCIATED_ID; rinfo->shadow[id].req.u.rw.id = id; @@ -562,6 +564,7 @@ static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_inf /* Copy the request to the ring page. */ *final_ring_req = *ring_req; + rinfo->shadow[id].status = REQ_WAITING; return 0; } @@ -837,8 +840,11 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri /* Copy request(s) to the ring page. */ *final_ring_req = *ring_req; - if (unlikely(require_extra_req)) + rinfo->shadow[id].status = REQ_WAITING; + if (unlikely(require_extra_req)) { *final_extra_ring_req = *extra_ring_req; + rinfo->shadow[extra_id].status = REQ_WAITING; + } if (new_persistent_gnts) gnttab_free_grant_references(setup.gref_head); @@ -1412,8 +1418,8 @@ static enum blk_req_status blkif_rsp_to_req_status(int rsp) static int blkif_get_final_status(enum blk_req_status s1, enum blk_req_status s2) { - BUG_ON(s1 == REQ_WAITING); - BUG_ON(s2 == REQ_WAITING); + BUG_ON(s1 < REQ_DONE); + BUG_ON(s2 < REQ_DONE); if (s1 == REQ_ERROR || s2 == REQ_ERROR) return BLKIF_RSP_ERROR; @@ -1446,7 +1452,7 @@ static bool blkif_completion(unsigned long *id, s->status = blkif_rsp_to_req_status(bret->status); /* Wait the second response if not yet here. */ - if (s2->status == REQ_WAITING) + if (s2->status < REQ_DONE) return false; bret->status = blkif_get_final_status(s->status, @@ -1565,11 +1571,17 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) spin_lock_irqsave(&rinfo->ring_lock, flags); again: - rp = rinfo->ring.sring->rsp_prod; - rmb(); /* Ensure we see queued responses up to 'rp'. */ + rp = READ_ONCE(rinfo->ring.sring->rsp_prod); + virt_rmb(); /* Ensure we see queued responses up to 'rp'. */ + if (RING_RESPONSE_PROD_OVERFLOW(&rinfo->ring, rp)) { + pr_alert("%s: illegal number of responses %u\n", + info->gd->disk_name, rp - rinfo->ring.rsp_cons); + goto err; + } for (i = rinfo->ring.rsp_cons; i != rp; i++) { unsigned long id; + unsigned int op; RING_COPY_RESPONSE(&rinfo->ring, i, &bret); id = bret.id; @@ -1580,14 +1592,28 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) * look in get_id_from_freelist. */ if (id >= BLK_RING_SIZE(info)) { - WARN(1, "%s: response to %s has incorrect id (%ld)\n", - info->gd->disk_name, op_name(bret.operation), id); - /* We can't safely get the 'struct request' as - * the id is busted. */ - continue; + pr_alert("%s: response has incorrect id (%ld)\n", + info->gd->disk_name, id); + goto err; + } + if (rinfo->shadow[id].status != REQ_WAITING) { + pr_alert("%s: response references no pending request\n", + info->gd->disk_name); + goto err; } + + rinfo->shadow[id].status = REQ_PROCESSING; req = rinfo->shadow[id].request; + op = rinfo->shadow[id].req.operation; + if (op == BLKIF_OP_INDIRECT) + op = rinfo->shadow[id].req.u.indirect.indirect_op; + if (bret.operation != op) { + pr_alert("%s: response has wrong operation (%u instead of %u)\n", + info->gd->disk_name, bret.operation, op); + goto err; + } + if (bret.operation != BLKIF_OP_DISCARD) { /* * We may need to wait for an extra response if the @@ -1612,7 +1638,8 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) case BLKIF_OP_DISCARD: if (unlikely(bret.status == BLKIF_RSP_EOPNOTSUPP)) { struct request_queue *rq = info->rq; - printk(KERN_WARNING "blkfront: %s: %s op failed\n", + + pr_warn_ratelimited("blkfront: %s: %s op failed\n", info->gd->disk_name, op_name(bret.operation)); blkif_req(req)->error = BLK_STS_NOTSUPP; info->feature_discard = 0; @@ -1624,13 +1651,13 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) case BLKIF_OP_FLUSH_DISKCACHE: case BLKIF_OP_WRITE_BARRIER: if (unlikely(bret.status == BLKIF_RSP_EOPNOTSUPP)) { - printk(KERN_WARNING "blkfront: %s: %s op failed\n", + pr_warn_ratelimited("blkfront: %s: %s op failed\n", info->gd->disk_name, op_name(bret.operation)); blkif_req(req)->error = BLK_STS_NOTSUPP; } if (unlikely(bret.status == BLKIF_RSP_ERROR && rinfo->shadow[id].req.u.rw.nr_segments == 0)) { - printk(KERN_WARNING "blkfront: %s: empty %s op failed\n", + pr_warn_ratelimited("blkfront: %s: empty %s op failed\n", info->gd->disk_name, op_name(bret.operation)); blkif_req(req)->error = BLK_STS_NOTSUPP; } @@ -1645,8 +1672,9 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) case BLKIF_OP_READ: case BLKIF_OP_WRITE: if (unlikely(bret.status != BLKIF_RSP_OKAY)) - dev_dbg(&info->xbdev->dev, "Bad return from blkdev data " - "request: %x\n", bret.status); + dev_dbg_ratelimited(&info->xbdev->dev, + "Bad return from blkdev data request: %#x\n", + bret.status); break; default: @@ -1671,6 +1699,14 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) spin_unlock_irqrestore(&rinfo->ring_lock, flags); return IRQ_HANDLED; + + err: + info->connected = BLKIF_STATE_ERROR; + + spin_unlock_irqrestore(&rinfo->ring_lock, flags); + + pr_alert("%s disabled for further use\n", info->gd->disk_name); + return IRQ_HANDLED; } From d421a7bf9e184118b346eba8c57aec9abc83d6e8 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 29 Nov 2021 08:58:22 +0100 Subject: [PATCH 1879/5344] xen/netfront: read response from backend only once commit 8446066bf8c1f9f7b7412c43fbea0fb87464d75b upstream. In order to avoid problems in case the backend is modifying a response on the ring page while the frontend has already seen it, just read the response into a local buffer in one go and then operate on that buffer only. Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/xen-netfront.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 7d389c2cc9026..a193c7ba8f38c 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -387,13 +387,13 @@ static void xennet_tx_buf_gc(struct netfront_queue *queue) rmb(); /* Ensure we see responses up to 'rp'. */ for (cons = queue->tx.rsp_cons; cons != prod; cons++) { - struct xen_netif_tx_response *txrsp; + struct xen_netif_tx_response txrsp; - txrsp = RING_GET_RESPONSE(&queue->tx, cons); - if (txrsp->status == XEN_NETIF_RSP_NULL) + RING_COPY_RESPONSE(&queue->tx, cons, &txrsp); + if (txrsp.status == XEN_NETIF_RSP_NULL) continue; - id = txrsp->id; + id = txrsp.id; skb = queue->tx_skbs[id].skb; if (unlikely(gnttab_query_foreign_access( queue->grant_tx_ref[id]) != 0)) { @@ -741,7 +741,7 @@ static int xennet_get_extras(struct netfront_queue *queue, RING_IDX rp) { - struct xen_netif_extra_info *extra; + struct xen_netif_extra_info extra; struct device *dev = &queue->info->netdev->dev; RING_IDX cons = queue->rx.rsp_cons; int err = 0; @@ -757,24 +757,22 @@ static int xennet_get_extras(struct netfront_queue *queue, break; } - extra = (struct xen_netif_extra_info *) - RING_GET_RESPONSE(&queue->rx, ++cons); + RING_COPY_RESPONSE(&queue->rx, ++cons, &extra); - if (unlikely(!extra->type || - extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) { + if (unlikely(!extra.type || + extra.type >= XEN_NETIF_EXTRA_TYPE_MAX)) { if (net_ratelimit()) dev_warn(dev, "Invalid extra type: %d\n", - extra->type); + extra.type); err = -EINVAL; } else { - memcpy(&extras[extra->type - 1], extra, - sizeof(*extra)); + extras[extra.type - 1] = extra; } skb = xennet_get_rx_skb(queue, cons); ref = xennet_get_rx_ref(queue, cons); xennet_move_rx_slot(queue, skb, ref); - } while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE); + } while (extra.flags & XEN_NETIF_EXTRA_FLAG_MORE); queue->rx.rsp_cons = cons; return err; @@ -784,7 +782,7 @@ static int xennet_get_responses(struct netfront_queue *queue, struct netfront_rx_info *rinfo, RING_IDX rp, struct sk_buff_head *list) { - struct xen_netif_rx_response *rx = &rinfo->rx; + struct xen_netif_rx_response *rx = &rinfo->rx, rx_local; struct xen_netif_extra_info *extras = rinfo->extras; struct device *dev = &queue->info->netdev->dev; RING_IDX cons = queue->rx.rsp_cons; @@ -842,7 +840,8 @@ static int xennet_get_responses(struct netfront_queue *queue, break; } - rx = RING_GET_RESPONSE(&queue->rx, cons + slots); + RING_COPY_RESPONSE(&queue->rx, cons + slots, &rx_local); + rx = &rx_local; skb = xennet_get_rx_skb(queue, cons + slots); ref = xennet_get_rx_ref(queue, cons + slots); slots++; @@ -897,10 +896,11 @@ static int xennet_fill_frags(struct netfront_queue *queue, struct sk_buff *nskb; while ((nskb = __skb_dequeue(list))) { - struct xen_netif_rx_response *rx = - RING_GET_RESPONSE(&queue->rx, ++cons); + struct xen_netif_rx_response rx; skb_frag_t *nfrag = &skb_shinfo(nskb)->frags[0]; + RING_COPY_RESPONSE(&queue->rx, ++cons, &rx); + if (skb_shinfo(skb)->nr_frags == MAX_SKB_FRAGS) { unsigned int pull_to = NETFRONT_SKB_CB(skb)->pull_to; @@ -915,7 +915,7 @@ static int xennet_fill_frags(struct netfront_queue *queue, skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, skb_frag_page(nfrag), - rx->offset, rx->status, PAGE_SIZE); + rx.offset, rx.status, PAGE_SIZE); skb_shinfo(nskb)->nr_frags = 0; kfree_skb(nskb); @@ -1013,7 +1013,7 @@ static int xennet_poll(struct napi_struct *napi, int budget) i = queue->rx.rsp_cons; work_done = 0; while ((i != rp) && (work_done < budget)) { - memcpy(rx, RING_GET_RESPONSE(&queue->rx, i), sizeof(*rx)); + RING_COPY_RESPONSE(&queue->rx, i, rx); memset(extras, 0, sizeof(rinfo.extras)); err = xennet_get_responses(queue, &rinfo, rp, &tmpq); From 0b24d930dd8c39797b6ff9e4b14c54da8b15998d Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 29 Nov 2021 09:01:05 +0100 Subject: [PATCH 1880/5344] xen/netfront: don't read data from request on the ring page commit 162081ec33c2686afa29d91bf8d302824aa846c7 upstream. In order to avoid a malicious backend being able to influence the local processing of a request build the request locally first and then copy it to the ring page. Any reading from the request influencing the processing in the frontend needs to be done on the local instance. Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/xen-netfront.c | 78 ++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 42 deletions(-) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index a193c7ba8f38c..145511afe173b 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -423,7 +423,8 @@ struct xennet_gnttab_make_txreq { struct netfront_queue *queue; struct sk_buff *skb; struct page *page; - struct xen_netif_tx_request *tx; /* Last request */ + struct xen_netif_tx_request *tx; /* Last request on ring page */ + struct xen_netif_tx_request tx_local; /* Last request local copy*/ unsigned int size; }; @@ -451,30 +452,27 @@ static void xennet_tx_setup_grant(unsigned long gfn, unsigned int offset, queue->grant_tx_page[id] = page; queue->grant_tx_ref[id] = ref; - tx->id = id; - tx->gref = ref; - tx->offset = offset; - tx->size = len; - tx->flags = 0; + info->tx_local.id = id; + info->tx_local.gref = ref; + info->tx_local.offset = offset; + info->tx_local.size = len; + info->tx_local.flags = 0; + + *tx = info->tx_local; info->tx = tx; - info->size += tx->size; + info->size += info->tx_local.size; } static struct xen_netif_tx_request *xennet_make_first_txreq( - struct netfront_queue *queue, struct sk_buff *skb, - struct page *page, unsigned int offset, unsigned int len) + struct xennet_gnttab_make_txreq *info, + unsigned int offset, unsigned int len) { - struct xennet_gnttab_make_txreq info = { - .queue = queue, - .skb = skb, - .page = page, - .size = 0, - }; + info->size = 0; - gnttab_for_one_grant(page, offset, len, xennet_tx_setup_grant, &info); + gnttab_for_one_grant(info->page, offset, len, xennet_tx_setup_grant, info); - return info.tx; + return info->tx; } static void xennet_make_one_txreq(unsigned long gfn, unsigned int offset, @@ -487,35 +485,27 @@ static void xennet_make_one_txreq(unsigned long gfn, unsigned int offset, xennet_tx_setup_grant(gfn, offset, len, data); } -static struct xen_netif_tx_request *xennet_make_txreqs( - struct netfront_queue *queue, struct xen_netif_tx_request *tx, - struct sk_buff *skb, struct page *page, +static void xennet_make_txreqs( + struct xennet_gnttab_make_txreq *info, + struct page *page, unsigned int offset, unsigned int len) { - struct xennet_gnttab_make_txreq info = { - .queue = queue, - .skb = skb, - .tx = tx, - }; - /* Skip unused frames from start of page */ page += offset >> PAGE_SHIFT; offset &= ~PAGE_MASK; while (len) { - info.page = page; - info.size = 0; + info->page = page; + info->size = 0; gnttab_foreach_grant_in_range(page, offset, len, xennet_make_one_txreq, - &info); + info); page++; offset = 0; - len -= info.size; + len -= info->size; } - - return info.tx; } /* @@ -568,7 +558,7 @@ static netdev_tx_t xennet_start_xmit(struct sk_buff *skb, struct net_device *dev { struct netfront_info *np = netdev_priv(dev); struct netfront_stats *tx_stats = this_cpu_ptr(np->tx_stats); - struct xen_netif_tx_request *tx, *first_tx; + struct xen_netif_tx_request *first_tx; unsigned int i; int notify; int slots; @@ -577,6 +567,7 @@ static netdev_tx_t xennet_start_xmit(struct sk_buff *skb, struct net_device *dev unsigned int len; unsigned long flags; struct netfront_queue *queue = NULL; + struct xennet_gnttab_make_txreq info = { }; unsigned int num_queues = dev->real_num_tx_queues; u16 queue_index; struct sk_buff *nskb; @@ -634,21 +625,24 @@ static netdev_tx_t xennet_start_xmit(struct sk_buff *skb, struct net_device *dev } /* First request for the linear area. */ - first_tx = tx = xennet_make_first_txreq(queue, skb, - page, offset, len); - offset += tx->size; + info.queue = queue; + info.skb = skb; + info.page = page; + first_tx = xennet_make_first_txreq(&info, offset, len); + offset += info.tx_local.size; if (offset == PAGE_SIZE) { page++; offset = 0; } - len -= tx->size; + len -= info.tx_local.size; if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */ - tx->flags |= XEN_NETTXF_csum_blank | XEN_NETTXF_data_validated; + first_tx->flags |= XEN_NETTXF_csum_blank | + XEN_NETTXF_data_validated; else if (skb->ip_summed == CHECKSUM_UNNECESSARY) /* remote but checksummed. */ - tx->flags |= XEN_NETTXF_data_validated; + first_tx->flags |= XEN_NETTXF_data_validated; /* Optional extra info after the first request. */ if (skb_shinfo(skb)->gso_size) { @@ -657,7 +651,7 @@ static netdev_tx_t xennet_start_xmit(struct sk_buff *skb, struct net_device *dev gso = (struct xen_netif_extra_info *) RING_GET_REQUEST(&queue->tx, queue->tx.req_prod_pvt++); - tx->flags |= XEN_NETTXF_extra_info; + first_tx->flags |= XEN_NETTXF_extra_info; gso->u.gso.size = skb_shinfo(skb)->gso_size; gso->u.gso.type = (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) ? @@ -671,12 +665,12 @@ static netdev_tx_t xennet_start_xmit(struct sk_buff *skb, struct net_device *dev } /* Requests for the rest of the linear area. */ - tx = xennet_make_txreqs(queue, tx, skb, page, offset, len); + xennet_make_txreqs(&info, page, offset, len); /* Requests for all the frags. */ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - tx = xennet_make_txreqs(queue, tx, skb, skb_frag_page(frag), + xennet_make_txreqs(&info, skb_frag_page(frag), skb_frag_off(frag), skb_frag_size(frag)); } From 8cd26248ee6957b7fefc5b77f7e21c3c0e7e5d56 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 29 Nov 2021 09:03:28 +0100 Subject: [PATCH 1881/5344] xen/netfront: disentangle tx_skb_freelist commit 21631d2d741a64a073e167c27769e73bc7844a2f upstream. The tx_skb_freelist elements are in a single linked list with the request id used as link reference. The per element link field is in a union with the skb pointer of an in use request. Move the link reference out of the union in order to enable a later reuse of it for requests which need a populated skb pointer. Rename add_id_to_freelist() and get_id_from_freelist() to add_id_to_list() and get_id_from_list() in order to prepare using those for other lists as well. Define ~0 as value to indicate the end of a list and place that value into the link for a request not being on the list. When freeing a skb zero the skb pointer in the request. Use a NULL value of the skb pointer instead of skb_entry_is_link() for deciding whether a request has a skb linked to it. Remove skb_entry_set_link() and open code it instead as it is really trivial now. Signed-off-by: Juergen Gross Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/xen-netfront.c | 61 ++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 36 deletions(-) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 145511afe173b..a06b9d5204d94 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -121,17 +121,11 @@ struct netfront_queue { /* * {tx,rx}_skbs store outstanding skbuffs. Free tx_skb entries - * are linked from tx_skb_freelist through skb_entry.link. - * - * NB. Freelist index entries are always going to be less than - * PAGE_OFFSET, whereas pointers to skbs will always be equal or - * greater than PAGE_OFFSET: we use this property to distinguish - * them. + * are linked from tx_skb_freelist through tx_link. */ - union skb_entry { - struct sk_buff *skb; - unsigned long link; - } tx_skbs[NET_TX_RING_SIZE]; + struct sk_buff *tx_skbs[NET_TX_RING_SIZE]; + unsigned short tx_link[NET_TX_RING_SIZE]; +#define TX_LINK_NONE 0xffff grant_ref_t gref_tx_head; grant_ref_t grant_tx_ref[NET_TX_RING_SIZE]; struct page *grant_tx_page[NET_TX_RING_SIZE]; @@ -169,33 +163,25 @@ struct netfront_rx_info { struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1]; }; -static void skb_entry_set_link(union skb_entry *list, unsigned short id) -{ - list->link = id; -} - -static int skb_entry_is_link(const union skb_entry *list) -{ - BUILD_BUG_ON(sizeof(list->skb) != sizeof(list->link)); - return (unsigned long)list->skb < PAGE_OFFSET; -} - /* * Access macros for acquiring freeing slots in tx_skbs[]. */ -static void add_id_to_freelist(unsigned *head, union skb_entry *list, - unsigned short id) +static void add_id_to_list(unsigned *head, unsigned short *list, + unsigned short id) { - skb_entry_set_link(&list[id], *head); + list[id] = *head; *head = id; } -static unsigned short get_id_from_freelist(unsigned *head, - union skb_entry *list) +static unsigned short get_id_from_list(unsigned *head, unsigned short *list) { unsigned int id = *head; - *head = list[id].link; + + if (id != TX_LINK_NONE) { + *head = list[id]; + list[id] = TX_LINK_NONE; + } return id; } @@ -394,7 +380,8 @@ static void xennet_tx_buf_gc(struct netfront_queue *queue) continue; id = txrsp.id; - skb = queue->tx_skbs[id].skb; + skb = queue->tx_skbs[id]; + queue->tx_skbs[id] = NULL; if (unlikely(gnttab_query_foreign_access( queue->grant_tx_ref[id]) != 0)) { pr_alert("%s: warning -- grant still in use by backend domain\n", @@ -407,7 +394,7 @@ static void xennet_tx_buf_gc(struct netfront_queue *queue) &queue->gref_tx_head, queue->grant_tx_ref[id]); queue->grant_tx_ref[id] = GRANT_INVALID_REF; queue->grant_tx_page[id] = NULL; - add_id_to_freelist(&queue->tx_skb_freelist, queue->tx_skbs, id); + add_id_to_list(&queue->tx_skb_freelist, queue->tx_link, id); dev_kfree_skb_irq(skb); } @@ -440,7 +427,7 @@ static void xennet_tx_setup_grant(unsigned long gfn, unsigned int offset, struct netfront_queue *queue = info->queue; struct sk_buff *skb = info->skb; - id = get_id_from_freelist(&queue->tx_skb_freelist, queue->tx_skbs); + id = get_id_from_list(&queue->tx_skb_freelist, queue->tx_link); tx = RING_GET_REQUEST(&queue->tx, queue->tx.req_prod_pvt++); ref = gnttab_claim_grant_reference(&queue->gref_tx_head); WARN_ON_ONCE(IS_ERR_VALUE((unsigned long)(int)ref)); @@ -448,7 +435,7 @@ static void xennet_tx_setup_grant(unsigned long gfn, unsigned int offset, gnttab_grant_foreign_access_ref(ref, queue->info->xbdev->otherend_id, gfn, GNTMAP_readonly); - queue->tx_skbs[id].skb = skb; + queue->tx_skbs[id] = skb; queue->grant_tx_page[id] = page; queue->grant_tx_ref[id] = ref; @@ -1129,17 +1116,18 @@ static void xennet_release_tx_bufs(struct netfront_queue *queue) for (i = 0; i < NET_TX_RING_SIZE; i++) { /* Skip over entries which are actually freelist references */ - if (skb_entry_is_link(&queue->tx_skbs[i])) + if (!queue->tx_skbs[i]) continue; - skb = queue->tx_skbs[i].skb; + skb = queue->tx_skbs[i]; + queue->tx_skbs[i] = NULL; get_page(queue->grant_tx_page[i]); gnttab_end_foreign_access(queue->grant_tx_ref[i], GNTMAP_readonly, (unsigned long)page_address(queue->grant_tx_page[i])); queue->grant_tx_page[i] = NULL; queue->grant_tx_ref[i] = GRANT_INVALID_REF; - add_id_to_freelist(&queue->tx_skb_freelist, queue->tx_skbs, i); + add_id_to_list(&queue->tx_skb_freelist, queue->tx_link, i); dev_kfree_skb_irq(skb); } } @@ -1621,13 +1609,14 @@ static int xennet_init_queue(struct netfront_queue *queue) snprintf(queue->name, sizeof(queue->name), "vif%s-q%u", devid, queue->id); - /* Initialise tx_skbs as a free chain containing every entry. */ + /* Initialise tx_skb_freelist as a free chain containing every entry. */ queue->tx_skb_freelist = 0; for (i = 0; i < NET_TX_RING_SIZE; i++) { - skb_entry_set_link(&queue->tx_skbs[i], i+1); + queue->tx_link[i] = i + 1; queue->grant_tx_ref[i] = GRANT_INVALID_REF; queue->grant_tx_page[i] = NULL; } + queue->tx_link[NET_TX_RING_SIZE - 1] = TX_LINK_NONE; /* Clear out rx_skbs */ for (i = 0; i < NET_RX_RING_SIZE; i++) { From f2673cf35d202359f1682858505049cfa357dca9 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 29 Nov 2021 09:04:35 +0100 Subject: [PATCH 1882/5344] xen/netfront: don't trust the backend response data blindly commit a884daa61a7d91650987e855464526aef219590f upstream. Today netfront will trust the backend to send only sane response data. In order to avoid privilege escalations or crashes in case of malicious backends verify the data to be within expected limits. Especially make sure that the response always references an outstanding request. Note that only the tx queue needs special id handling, as for the rx queue the id is equal to the index in the ring page. Introduce a new indicator for the device whether it is broken and let the device stop working when it is set. Set this indicator in case the backend sets any weird data. Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/xen-netfront.c | 80 +++++++++++++++++++++++++++++++++++--- 1 file changed, 75 insertions(+), 5 deletions(-) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index a06b9d5204d94..d6f44343213cc 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -126,10 +126,12 @@ struct netfront_queue { struct sk_buff *tx_skbs[NET_TX_RING_SIZE]; unsigned short tx_link[NET_TX_RING_SIZE]; #define TX_LINK_NONE 0xffff +#define TX_PENDING 0xfffe grant_ref_t gref_tx_head; grant_ref_t grant_tx_ref[NET_TX_RING_SIZE]; struct page *grant_tx_page[NET_TX_RING_SIZE]; unsigned tx_skb_freelist; + unsigned int tx_pend_queue; spinlock_t rx_lock ____cacheline_aligned_in_smp; struct xen_netif_rx_front_ring rx; @@ -155,6 +157,9 @@ struct netfront_info { struct netfront_stats __percpu *rx_stats; struct netfront_stats __percpu *tx_stats; + /* Is device behaving sane? */ + bool broken; + atomic_t rx_gso_checksum_fixup; }; @@ -337,7 +342,7 @@ static int xennet_open(struct net_device *dev) unsigned int i = 0; struct netfront_queue *queue = NULL; - if (!np->queues) + if (!np->queues || np->broken) return -ENODEV; for (i = 0; i < num_queues; ++i) { @@ -365,11 +370,17 @@ static void xennet_tx_buf_gc(struct netfront_queue *queue) unsigned short id; struct sk_buff *skb; bool more_to_do; + const struct device *dev = &queue->info->netdev->dev; BUG_ON(!netif_carrier_ok(queue->info->netdev)); do { prod = queue->tx.sring->rsp_prod; + if (RING_RESPONSE_PROD_OVERFLOW(&queue->tx, prod)) { + dev_alert(dev, "Illegal number of responses %u\n", + prod - queue->tx.rsp_cons); + goto err; + } rmb(); /* Ensure we see responses up to 'rp'. */ for (cons = queue->tx.rsp_cons; cons != prod; cons++) { @@ -379,14 +390,27 @@ static void xennet_tx_buf_gc(struct netfront_queue *queue) if (txrsp.status == XEN_NETIF_RSP_NULL) continue; - id = txrsp.id; + id = txrsp.id; + if (id >= RING_SIZE(&queue->tx)) { + dev_alert(dev, + "Response has incorrect id (%u)\n", + id); + goto err; + } + if (queue->tx_link[id] != TX_PENDING) { + dev_alert(dev, + "Response for inactive request\n"); + goto err; + } + + queue->tx_link[id] = TX_LINK_NONE; skb = queue->tx_skbs[id]; queue->tx_skbs[id] = NULL; if (unlikely(gnttab_query_foreign_access( queue->grant_tx_ref[id]) != 0)) { - pr_alert("%s: warning -- grant still in use by backend domain\n", - __func__); - BUG(); + dev_alert(dev, + "Grant still in use by backend domain\n"); + goto err; } gnttab_end_foreign_access_ref( queue->grant_tx_ref[id], GNTMAP_readonly); @@ -404,6 +428,12 @@ static void xennet_tx_buf_gc(struct netfront_queue *queue) } while (more_to_do); xennet_maybe_wake_tx(queue); + + return; + + err: + queue->info->broken = true; + dev_alert(dev, "Disabled for further use\n"); } struct xennet_gnttab_make_txreq { @@ -447,6 +477,12 @@ static void xennet_tx_setup_grant(unsigned long gfn, unsigned int offset, *tx = info->tx_local; + /* + * Put the request in the pending queue, it will be set to be pending + * when the producer index is about to be raised. + */ + add_id_to_list(&queue->tx_pend_queue, queue->tx_link, id); + info->tx = tx; info->size += info->tx_local.size; } @@ -539,6 +575,15 @@ static u16 xennet_select_queue(struct net_device *dev, struct sk_buff *skb, return queue_idx; } +static void xennet_mark_tx_pending(struct netfront_queue *queue) +{ + unsigned int i; + + while ((i = get_id_from_list(&queue->tx_pend_queue, queue->tx_link)) != + TX_LINK_NONE) + queue->tx_link[i] = TX_PENDING; +} + #define MAX_XEN_SKB_FRAGS (65536 / XEN_PAGE_SIZE + 1) static netdev_tx_t xennet_start_xmit(struct sk_buff *skb, struct net_device *dev) @@ -562,6 +607,8 @@ static netdev_tx_t xennet_start_xmit(struct sk_buff *skb, struct net_device *dev /* Drop the packet if no queues are set up */ if (num_queues < 1) goto drop; + if (unlikely(np->broken)) + goto drop; /* Determine which queue to transmit this SKB on */ queue_index = skb_get_queue_mapping(skb); queue = &np->queues[queue_index]; @@ -665,6 +712,8 @@ static netdev_tx_t xennet_start_xmit(struct sk_buff *skb, struct net_device *dev /* First request has the packet length. */ first_tx->size = skb->len; + xennet_mark_tx_pending(queue); + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&queue->tx, notify); if (notify) notify_remote_via_irq(queue->tx_irq); @@ -989,6 +1038,13 @@ static int xennet_poll(struct napi_struct *napi, int budget) skb_queue_head_init(&tmpq); rp = queue->rx.sring->rsp_prod; + if (RING_RESPONSE_PROD_OVERFLOW(&queue->rx, rp)) { + dev_alert(&dev->dev, "Illegal number of responses %u\n", + rp - queue->rx.rsp_cons); + queue->info->broken = true; + spin_unlock(&queue->rx_lock); + return 0; + } rmb(); /* Ensure we see queued responses up to 'rp'. */ i = queue->rx.rsp_cons; @@ -1207,6 +1263,9 @@ static irqreturn_t xennet_tx_interrupt(int irq, void *dev_id) struct netfront_queue *queue = dev_id; unsigned long flags; + if (queue->info->broken) + return IRQ_HANDLED; + spin_lock_irqsave(&queue->tx_lock, flags); xennet_tx_buf_gc(queue); spin_unlock_irqrestore(&queue->tx_lock, flags); @@ -1219,6 +1278,9 @@ static irqreturn_t xennet_rx_interrupt(int irq, void *dev_id) struct netfront_queue *queue = dev_id; struct net_device *dev = queue->info->netdev; + if (queue->info->broken) + return IRQ_HANDLED; + if (likely(netif_carrier_ok(dev) && RING_HAS_UNCONSUMED_RESPONSES(&queue->rx))) napi_schedule(&queue->napi); @@ -1240,6 +1302,10 @@ static void xennet_poll_controller(struct net_device *dev) struct netfront_info *info = netdev_priv(dev); unsigned int num_queues = dev->real_num_tx_queues; unsigned int i; + + if (info->broken) + return; + for (i = 0; i < num_queues; ++i) xennet_interrupt(0, &info->queues[i]); } @@ -1611,6 +1677,7 @@ static int xennet_init_queue(struct netfront_queue *queue) /* Initialise tx_skb_freelist as a free chain containing every entry. */ queue->tx_skb_freelist = 0; + queue->tx_pend_queue = TX_LINK_NONE; for (i = 0; i < NET_TX_RING_SIZE; i++) { queue->tx_link[i] = i + 1; queue->grant_tx_ref[i] = GRANT_INVALID_REF; @@ -1821,6 +1888,9 @@ static int talk_to_netback(struct xenbus_device *dev, if (info->queues) xennet_destroy_queues(info); + /* For the case of a reconnect reset the "broken" indicator. */ + info->broken = false; + err = xennet_create_queues(info, &num_queues); if (err < 0) { xenbus_dev_fatal(dev, err, "creating queues"); From e9b61ecc0085fa43e6cd587dd93906bcc29e7503 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 29 Nov 2021 09:10:33 +0100 Subject: [PATCH 1883/5344] tty: hvc: replace BUG_ON() with negative return value commit e679004dec37566f658a255157d3aed9d762a2b7 upstream. Xen frontends shouldn't BUG() in case of illegal data received from their backends. So replace the BUG_ON()s when reading illegal data from the ring page with negative return values. This is commit e679004dec37566f upstream. Reviewed-by: Jan Beulich Signed-off-by: Juergen Gross Link: https://lore.kernel.org/r/20210707091045.460-1-jgross@suse.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/hvc/hvc_xen.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/tty/hvc/hvc_xen.c b/drivers/tty/hvc/hvc_xen.c index 5ef08905fe05c..15da02aeee948 100644 --- a/drivers/tty/hvc/hvc_xen.c +++ b/drivers/tty/hvc/hvc_xen.c @@ -86,7 +86,11 @@ static int __write_console(struct xencons_info *xencons, cons = intf->out_cons; prod = intf->out_prod; mb(); /* update queue values before going on */ - BUG_ON((prod - cons) > sizeof(intf->out)); + + if ((prod - cons) > sizeof(intf->out)) { + pr_err_once("xencons: Illegal ring page indices"); + return -EINVAL; + } while ((sent < len) && ((prod - cons) < sizeof(intf->out))) intf->out[MASK_XENCONS_IDX(prod++, intf->out)] = data[sent++]; @@ -114,7 +118,10 @@ static int domU_write_console(uint32_t vtermno, const char *data, int len) */ while (len) { int sent = __write_console(cons, data, len); - + + if (sent < 0) + return sent; + data += sent; len -= sent; @@ -138,7 +145,11 @@ static int domU_read_console(uint32_t vtermno, char *buf, int len) cons = intf->in_cons; prod = intf->in_prod; mb(); /* get pointers before reading ring */ - BUG_ON((prod - cons) > sizeof(intf->in)); + + if ((prod - cons) > sizeof(intf->in)) { + pr_err_once("xencons: Illegal ring page indices"); + return -EINVAL; + } while (cons != prod && recv < len) buf[recv++] = intf->in[MASK_XENCONS_IDX(cons++, intf->in)]; From 174992336fea01e2c18d472c29db913a472e13b9 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 1 Dec 2021 09:23:36 +0100 Subject: [PATCH 1884/5344] Linux 5.4.163 Link: https://lore.kernel.org/r/20211129181707.392764191@linuxfoundation.org Tested-by: Shuah Khan Tested-by: Hulk Robot Tested-by: Florian Fainelli Tested-by: Jon Hunter Tested-by: Linux Kernel Functional Testing Tested-by: Sudip Mukherjee Tested-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e8b05f7d3b238..91d77df0128b4 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 PATCHLEVEL = 4 -SUBLEVEL = 162 +SUBLEVEL = 163 EXTRAVERSION = NAME = Kleptomaniac Octopus From bd9792138c7b4aa7dbe78e1d1d7a40fa8b0eb0df Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Tue, 16 Nov 2021 10:48:13 -0500 Subject: [PATCH 1885/5344] NFSv42: Fix pagecache invalidation after COPY/CLONE commit 3f015d89a47cd8855cd92f71fff770095bd885a1 upstream. The mechanism in use to allow the client to see the results of COPY/CLONE is to drop those pages from the pagecache. This forces the client to read those pages once more from the server. However, truncate_pagecache_range() zeros out partial pages instead of dropping them. Let us instead use invalidate_inode_pages2_range() with full-page offsets to ensure the client properly sees the results of COPY/CLONE operations. Cc: # v4.7+ Fixes: 2e72448b07dc ("NFS: Add COPY nfs operation") Signed-off-by: Benjamin Coddington Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/nfs/nfs42proc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 6b7c926824ae0..504812ea4bc29 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -295,8 +295,9 @@ static ssize_t _nfs42_proc_copy(struct file *src, goto out; } - truncate_pagecache_range(dst_inode, pos_dst, - pos_dst + res->write_res.count); + WARN_ON_ONCE(invalidate_inode_pages2_range(dst_inode->i_mapping, + pos_dst >> PAGE_SHIFT, + (pos_dst + res->write_res.count - 1) >> PAGE_SHIFT)); status = res->write_res.count; out: From ae6b6a55f232e43bd627ac30225786c2c39bfe37 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 5 Feb 2020 20:46:49 +0100 Subject: [PATCH 1886/5344] of: clk: Make self-contained MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 5df867145f8adad9e5cdf9d67db1fbc0f71351e9 upstream. Depending on include order: include/linux/of_clk.h:11:45: warning: ‘struct device_node’ declared inside parameter list will not be visible outside of this definition or declaration unsigned int of_clk_get_parent_count(struct device_node *np); ^~~~~~~~~~~ include/linux/of_clk.h:12:43: warning: ‘struct device_node’ declared inside parameter list will not be visible outside of this definition or declaration const char *of_clk_get_parent_name(struct device_node *np, int index); ^~~~~~~~~~~ include/linux/of_clk.h:13:31: warning: ‘struct of_device_id’ declared inside parameter list will not be visible outside of this definition or declaration void of_clk_init(const struct of_device_id *matches); ^~~~~~~~~~~~ Fix this by adding forward declarations for struct device_node and struct of_device_id. Signed-off-by: Geert Uytterhoeven Link: https://lkml.kernel.org/r/20200205194649.31309-1-geert+renesas@glider.be Signed-off-by: Stephen Boyd Signed-off-by: Greg Kroah-Hartman --- include/linux/of_clk.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/of_clk.h b/include/linux/of_clk.h index b27da9f164cbd..c86fcad23fc21 100644 --- a/include/linux/of_clk.h +++ b/include/linux/of_clk.h @@ -6,6 +6,9 @@ #ifndef __LINUX_OF_CLK_H #define __LINUX_OF_CLK_H +struct device_node; +struct of_device_id; + #if defined(CONFIG_COMMON_CLK) && defined(CONFIG_OF) unsigned int of_clk_get_parent_count(struct device_node *np); From 6b943ba76080ea93dcbda1d21111f694ab2ed098 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 27 Feb 2020 12:08:58 +0000 Subject: [PATCH 1887/5344] arm64: dts: mcbin: support 2W SFP modules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 05abc6a5dec2a8c123a50235ecd1ad8d75ffa7b4 upstream. Allow the SFP cages to be used with 2W SFP modules. Signed-off-by: Russell King Reviewed-by: Andrew Lunn Signed-off-by: Gregory CLEMENT Cc: Christian Lamparter Cc: 照山周一郎 Signed-off-by: Greg Kroah-Hartman --- arch/arm64/boot/dts/marvell/armada-8040-mcbin.dtsi | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/boot/dts/marvell/armada-8040-mcbin.dtsi b/arch/arm64/boot/dts/marvell/armada-8040-mcbin.dtsi index d250f4b2bfedb..bf443ca1fcf11 100644 --- a/arch/arm64/boot/dts/marvell/armada-8040-mcbin.dtsi +++ b/arch/arm64/boot/dts/marvell/armada-8040-mcbin.dtsi @@ -71,6 +71,7 @@ tx-fault-gpio = <&cp1_gpio1 26 GPIO_ACTIVE_HIGH>; pinctrl-names = "default"; pinctrl-0 = <&cp1_sfpp0_pins>; + maximum-power-milliwatt = <2000>; }; sfp_eth1: sfp-eth1 { @@ -83,6 +84,7 @@ tx-fault-gpio = <&cp0_gpio2 30 GPIO_ACTIVE_HIGH>; pinctrl-names = "default"; pinctrl-0 = <&cp1_sfpp1_pins &cp0_sfpp1_pins>; + maximum-power-milliwatt = <2000>; }; sfp_eth3: sfp-eth3 { @@ -95,6 +97,7 @@ tx-fault-gpio = <&cp0_gpio2 19 GPIO_ACTIVE_HIGH>; pinctrl-names = "default"; pinctrl-0 = <&cp0_sfp_1g_pins &cp1_sfp_1g_pins>; + maximum-power-milliwatt = <2000>; }; }; From 69f4d224e4bb0d0504483ff37c954793746730df Mon Sep 17 00:00:00 2001 From: Zhang Changzhong Date: Thu, 28 Oct 2021 22:38:27 +0800 Subject: [PATCH 1888/5344] can: j1939: j1939_tp_cmd_recv(): check the dst address of TP.CM_BAM commit 164051a6ab5445bd97f719f50b16db8b32174269 upstream. The TP.CM_BAM message must be sent to the global address [1], so add a check to drop TP.CM_BAM sent to a non-global address. Without this patch, the receiver will treat the following packets as normal RTS/CTS transport: 18EC0102#20090002FF002301 18EB0102#0100000000000000 18EB0102#020000FFFFFFFFFF [1] SAE-J1939-82 2015 A.3.3 Row 1. Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Link: https://lore.kernel.org/all/1635431907-15617-4-git-send-email-zhangchangzhong@huawei.com Cc: stable@vger.kernel.org Signed-off-by: Zhang Changzhong Acked-by: Oleksij Rempel Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- net/can/j1939/transport.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 811682e06951b..22f4b798d385b 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -2004,6 +2004,12 @@ static void j1939_tp_cmd_recv(struct j1939_priv *priv, struct sk_buff *skb) extd = J1939_ETP; /* fall through */ case J1939_TP_CMD_BAM: /* fall through */ + if (cmd == J1939_TP_CMD_BAM && !j1939_cb_is_broadcast(skcb)) { + netdev_err_once(priv->ndev, "%s: BAM to unicast (%02x), ignoring!\n", + __func__, skcb->addr.sa); + return; + } + fallthrough; case J1939_TP_CMD_RTS: /* fall through */ if (skcb->addr.type != extd) return; From 9fa7c614ee46784ca2aded7d0e68cbe5be84631b Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sat, 6 Nov 2021 00:18:56 +0100 Subject: [PATCH 1889/5344] gfs2: Fix length of holes reported at end-of-file [ Upstream commit f3506eee81d1f700d9ee2d2f4a88fddb669ec032 ] Fix the length of holes reported at the end of a file: the length is relative to the beginning of the extent, not the seek position which is rounded down to the filesystem block size. This bug went unnoticed for some time, but is now caught by the following assertion in iomap_iter_done(): WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos) Signed-off-by: Andreas Gruenbacher Signed-off-by: Sasha Levin --- fs/gfs2/bmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index aaec3c5b02028..dec5285a02e9d 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -940,7 +940,7 @@ static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, else if (height == ip->i_height) ret = gfs2_hole_size(inode, lblock, len, mp, iomap); else - iomap->length = size - pos; + iomap->length = size - iomap->offset; } else if (flags & IOMAP_WRITE) { u64 alloc_size; From 375d6573f0b57fc9176a001e1a53382b1fb073b6 Mon Sep 17 00:00:00 2001 From: Julian Braha Date: Mon, 8 Nov 2021 22:23:51 -0500 Subject: [PATCH 1890/5344] drm/sun4i: fix unmet dependency on RESET_CONTROLLER for PHY_SUN6I_MIPI_DPHY [ Upstream commit bb162bb2b4394108c8f055d1b115735331205e28 ] When PHY_SUN6I_MIPI_DPHY is selected, and RESET_CONTROLLER is not selected, Kbuild gives the following warning: WARNING: unmet direct dependencies detected for PHY_SUN6I_MIPI_DPHY Depends on [n]: (ARCH_SUNXI [=n] || COMPILE_TEST [=y]) && HAS_IOMEM [=y] && COMMON_CLK [=y] && RESET_CONTROLLER [=n] Selected by [y]: - DRM_SUN6I_DSI [=y] && HAS_IOMEM [=y] && DRM_SUN4I [=y] This is because DRM_SUN6I_DSI selects PHY_SUN6I_MIPI_DPHY without selecting or depending on RESET_CONTROLLER, despite PHY_SUN6I_MIPI_DPHY depending on RESET_CONTROLLER. These unmet dependency bugs were detected by Kismet, a static analysis tool for Kconfig. Please advise if this is not the appropriate solution. v2: Fixed indentation to match the rest of the file. Signed-off-by: Julian Braha Acked-by: Jernej Skrabec Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20211109032351.43322-1-julianbraha@gmail.com Signed-off-by: Sasha Levin --- drivers/gpu/drm/sun4i/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/sun4i/Kconfig b/drivers/gpu/drm/sun4i/Kconfig index 37e90e42943f6..0e2d304f0d83f 100644 --- a/drivers/gpu/drm/sun4i/Kconfig +++ b/drivers/gpu/drm/sun4i/Kconfig @@ -46,6 +46,7 @@ config DRM_SUN6I_DSI default MACH_SUN8I select CRC_CCITT select DRM_MIPI_DSI + select RESET_CONTROLLER select PHY_SUN6I_MIPI_DPHY help Choose this option if you want have an Allwinner SoC with From 4d6254bd5afa389d77c1524148627440a63631c6 Mon Sep 17 00:00:00 2001 From: Xing Song Date: Mon, 1 Nov 2021 10:46:57 +0800 Subject: [PATCH 1891/5344] mac80211: do not access the IV when it was stripped [ Upstream commit 77dfc2bc0bb4b8376ecd7a430f27a4a8fff6a5a0 ] ieee80211_get_keyid() will return false value if IV has been stripped, such as return 0 for IP/ARP frames due to LLC header, and return -EINVAL for disassociation frames due to its length... etc. Don't try to access it if it's not present. Signed-off-by: Xing Song Link: https://lore.kernel.org/r/20211101024657.143026-1-xing.song@mediatek.com Signed-off-by: Johannes Berg Signed-off-by: Sasha Levin --- net/mac80211/rx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index c7e6bf7c22c78..282bf336b15a4 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1918,7 +1918,8 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) int keyid = rx->sta->ptk_idx; sta_ptk = rcu_dereference(rx->sta->ptk[keyid]); - if (ieee80211_has_protected(fc)) { + if (ieee80211_has_protected(fc) && + !(status->flag & RX_FLAG_IV_STRIPPED)) { cs = rx->sta->cipher_scheme; keyid = ieee80211_get_keyid(rx->skb, cs); From 5badf9009b0740c9e3f320a51e93b9fd8099d0fa Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Sat, 13 Nov 2021 15:33:35 +0800 Subject: [PATCH 1892/5344] net/smc: Transfer remaining wait queue entries during fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 2153bd1e3d3dbf6a3403572084ef6ed31c53c5f0 ] The SMC fallback is incomplete currently. There may be some wait queue entries remaining in smc socket->wq, which should be removed to clcsocket->wq during the fallback. For example, in nginx/wrk benchmark, this issue causes an all-zeros test result: server: nginx -g 'daemon off;' client: smc_run wrk -c 1 -t 1 -d 5 http://11.200.15.93/index.html Running 5s test @ http://11.200.15.93/index.html 1 threads and 1 connections Thread Stats Avg Stdev Max ± Stdev Latency 0.00us 0.00us 0.00us -nan% Req/Sec 0.00 0.00 0.00 -nan% 0 requests in 5.00s, 0.00B read Requests/sec: 0.00 Transfer/sec: 0.00B The reason for this all-zeros result is that when wrk used SMC to replace TCP, it added an eppoll_entry into smc socket->wq and expected to be notified if epoll events like EPOLL_IN/ EPOLL_OUT occurred on the smc socket. However, once a fallback occurred, wrk switches to use clcsocket. Now it is clcsocket->wq instead of smc socket->wq which will be woken up. The eppoll_entry remaining in smc socket->wq does not work anymore and wrk stops the test. This patch fixes this issue by removing remaining wait queue entries from smc socket->wq to clcsocket->wq during the fallback. Link: https://www.spinics.net/lists/netdev/msg779769.html Signed-off-by: Wen Gu Reviewed-by: Tony Lu Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/smc/af_smc.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 5e1493f8deba7..00d3787387172 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -467,12 +467,26 @@ static void smc_link_save_peer_info(struct smc_link *link, static void smc_switch_to_fallback(struct smc_sock *smc) { + wait_queue_head_t *smc_wait = sk_sleep(&smc->sk); + wait_queue_head_t *clc_wait = sk_sleep(smc->clcsock->sk); + unsigned long flags; + smc->use_fallback = true; if (smc->sk.sk_socket && smc->sk.sk_socket->file) { smc->clcsock->file = smc->sk.sk_socket->file; smc->clcsock->file->private_data = smc->clcsock; smc->clcsock->wq.fasync_list = smc->sk.sk_socket->wq.fasync_list; + + /* There may be some entries remaining in + * smc socket->wq, which should be removed + * to clcsocket->wq during the fallback. + */ + spin_lock_irqsave(&smc_wait->lock, flags); + spin_lock(&clc_wait->lock); + list_splice_init(&smc_wait->head, &clc_wait->head); + spin_unlock(&clc_wait->lock); + spin_unlock_irqrestore(&smc_wait->lock, flags); } } From e83b88e46eea5bf3eb43c762d32d157d38285ca2 Mon Sep 17 00:00:00 2001 From: Zekun Shen Date: Sat, 13 Nov 2021 22:24:40 -0500 Subject: [PATCH 1893/5344] atlantic: Fix OOB read and write in hw_atl_utils_fw_rpc_wait [ Upstream commit b922f622592af76b57cbc566eaeccda0b31a3496 ] This bug report shows up when running our research tools. The reports is SOOB read, but it seems SOOB write is also possible a few lines below. In details, fw.len and sw.len are inputs coming from io. A len over the size of self->rpc triggers SOOB. The patch fixes the bugs by adding sanity checks. The bugs are triggerable with compromised/malfunctioning devices. They are potentially exploitable given they first leak up to 0xffff bytes and able to overwrite the region later. The patch is tested with QEMU emulater. This is NOT tested with a real device. Attached is the log we found by fuzzing. BUG: KASAN: slab-out-of-bounds in hw_atl_utils_fw_upload_dwords+0x393/0x3c0 [atlantic] Read of size 4 at addr ffff888016260b08 by task modprobe/213 CPU: 0 PID: 213 Comm: modprobe Not tainted 5.6.0 #1 Call Trace: dump_stack+0x76/0xa0 print_address_description.constprop.0+0x16/0x200 ? hw_atl_utils_fw_upload_dwords+0x393/0x3c0 [atlantic] ? hw_atl_utils_fw_upload_dwords+0x393/0x3c0 [atlantic] __kasan_report.cold+0x37/0x7c ? aq_hw_read_reg_bit+0x60/0x70 [atlantic] ? hw_atl_utils_fw_upload_dwords+0x393/0x3c0 [atlantic] kasan_report+0xe/0x20 hw_atl_utils_fw_upload_dwords+0x393/0x3c0 [atlantic] hw_atl_utils_fw_rpc_call+0x95/0x130 [atlantic] hw_atl_utils_fw_rpc_wait+0x176/0x210 [atlantic] hw_atl_utils_mpi_create+0x229/0x2e0 [atlantic] ? hw_atl_utils_fw_rpc_wait+0x210/0x210 [atlantic] ? hw_atl_utils_initfw+0x9f/0x1c8 [atlantic] hw_atl_utils_initfw+0x12a/0x1c8 [atlantic] aq_nic_ndev_register+0x88/0x650 [atlantic] ? aq_nic_ndev_init+0x235/0x3c0 [atlantic] aq_pci_probe+0x731/0x9b0 [atlantic] ? aq_pci_func_init+0xc0/0xc0 [atlantic] local_pci_probe+0xd3/0x160 pci_device_probe+0x23f/0x3e0 Reported-by: Brendan Dolan-Gavitt Signed-off-by: Zekun Shen Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- .../ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c index 873f9865f0d15..b7d70c33459fd 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c @@ -481,6 +481,11 @@ int hw_atl_utils_fw_rpc_wait(struct aq_hw_s *self, goto err_exit; if (fw.len == 0xFFFFU) { + if (sw.len > sizeof(self->rpc)) { + printk(KERN_INFO "Invalid sw len: %x\n", sw.len); + err = -EINVAL; + goto err_exit; + } err = hw_atl_utils_fw_rpc_call(self, sw.len); if (err < 0) goto err_exit; @@ -489,6 +494,11 @@ int hw_atl_utils_fw_rpc_wait(struct aq_hw_s *self, if (rpc) { if (fw.len) { + if (fw.len > sizeof(self->rpc)) { + printk(KERN_INFO "Invalid fw len: %x\n", fw.len); + err = -EINVAL; + goto err_exit; + } err = hw_atl_utils_fw_downld_dwords(self, self->rpc_addr, From e23ca89a2f1408263cdcec9941cb10ff5362a276 Mon Sep 17 00:00:00 2001 From: liuguoqiang Date: Mon, 15 Nov 2021 16:14:48 +0800 Subject: [PATCH 1894/5344] net: return correct error code [ Upstream commit 6def480181f15f6d9ec812bca8cbc62451ba314c ] When kmemdup called failed and register_net_sysctl return NULL, should return ENOMEM instead of ENOBUFS Signed-off-by: liuguoqiang Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/ipv4/devinet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 603a3495afa62..4a8ad46397c0e 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2585,7 +2585,7 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name, free: kfree(t); out: - return -ENOBUFS; + return -ENOMEM; } static void __devinet_sysctl_unregister(struct net *net, From fc8d4613cade4e8cb7079a60ae604b7a99b3c962 Mon Sep 17 00:00:00 2001 From: Slark Xiao Date: Mon, 8 Nov 2021 14:06:48 +0800 Subject: [PATCH 1895/5344] platform/x86: thinkpad_acpi: Fix WWAN device disabled issue after S3 deep [ Upstream commit 39f53292181081d35174a581a98441de5da22bc9 ] When WWAN device wake from S3 deep, under thinkpad platform, WWAN would be disabled. This disable status could be checked by command 'nmcli r wwan' or 'rfkill list'. Issue analysis as below: When host resume from S3 deep, thinkpad_acpi driver would call hotkey_resume() function. Finnaly, it will use wan_get_status to check the current status of WWAN device. During this resume progress, wan_get_status would always return off even WWAN boot up completely. In patch V2, Hans said 'sw_state should be unchanged after a suspend/resume. It's better to drop the tpacpi_rfk_update_swstate call all together from the resume path'. And it's confimed by Lenovo that GWAN is no longer available from WHL generation because the design does not match with current pin control. Signed-off-by: Slark Xiao Link: https://lore.kernel.org/r/20211108060648.8212-1-slark_xiao@163.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Signed-off-by: Sasha Levin --- drivers/platform/x86/thinkpad_acpi.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 3028d9f1ac59c..5d114088c88fb 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -1188,15 +1188,6 @@ static int tpacpi_rfk_update_swstate(const struct tpacpi_rfk *tp_rfk) return status; } -/* Query FW and update rfkill sw state for all rfkill switches */ -static void tpacpi_rfk_update_swstate_all(void) -{ - unsigned int i; - - for (i = 0; i < TPACPI_RFK_SW_MAX; i++) - tpacpi_rfk_update_swstate(tpacpi_rfkill_switches[i]); -} - /* * Sync the HW-blocking state of all rfkill switches, * do notice it causes the rfkill core to schedule uevents @@ -3135,9 +3126,6 @@ static void tpacpi_send_radiosw_update(void) if (wlsw == TPACPI_RFK_RADIO_OFF) tpacpi_rfk_update_hwblock_state(true); - /* Sync sw blocking state */ - tpacpi_rfk_update_swstate_all(); - /* Sync hw blocking state last if it is hw-unblocked */ if (wlsw == TPACPI_RFK_RADIO_ON) tpacpi_rfk_update_hwblock_state(false); From 637430982013f05ee3027f465d3bf76ddfae5b3f Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Thu, 14 Oct 2021 13:38:17 +0200 Subject: [PATCH 1896/5344] s390/setup: avoid using memblock_enforce_memory_limit [ Upstream commit 5dbc4cb4667457b0c53bcd7bff11500b3c362975 ] There is a difference in how architectures treat "mem=" option. For some that is an amount of online memory, for s390 and x86 this is the limiting max address. Some memblock api like memblock_enforce_memory_limit() take limit argument and explicitly treat it as the size of online memory, and use __find_max_addr to convert it to an actual max address. Current s390 usage: memblock_enforce_memory_limit(memblock_end_of_DRAM()); yields different results depending on presence of memory holes (offline memory blocks in between online memory). If there are no memory holes limit == max_addr in memblock_enforce_memory_limit() and it does trim online memory and reserved memory regions. With memory holes present it actually does nothing. Since we already use memblock_remove() explicitly to trim online memory regions to potential limit (think mem=, kdump, addressing limits, etc.) drop the usage of memblock_enforce_memory_limit() altogether. Trimming reserved regions should not be required, since we now use memblock_set_current_limit() to limit allocations and any explicit memory reservations above the limit is an actual problem we should not hide. Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik Signed-off-by: Heiko Carstens Signed-off-by: Sasha Levin --- arch/s390/kernel/setup.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index f661f176966f5..9a0316a067a11 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -841,9 +841,6 @@ static void __init setup_memory(void) storage_key_init_range(reg->base, reg->base + reg->size); } psw_set_key(PAGE_DEFAULT_KEY); - - /* Only cosmetics */ - memblock_enforce_memory_limit(memblock_end_of_DRAM()); } /* From d00c8b163777be6c98f57aee565a2366bd1174c0 Mon Sep 17 00:00:00 2001 From: Wang Yugui Date: Thu, 28 Oct 2021 06:32:54 +0800 Subject: [PATCH 1897/5344] btrfs: check-integrity: fix a warning on write caching disabled disk [ Upstream commit a91cf0ffbc244792e0b3ecf7d0fddb2f344b461f ] When a disk has write caching disabled, we skip submission of a bio with flush and sync requests before writing the superblock, since it's not needed. However when the integrity checker is enabled, this results in reports that there are metadata blocks referred by a superblock that were not properly flushed. So don't skip the bio submission only when the integrity checker is enabled for the sake of simplicity, since this is a debug tool and not meant for use in non-debug builds. fstests/btrfs/220 trigger a check-integrity warning like the following when CONFIG_BTRFS_FS_CHECK_INTEGRITY=y and the disk with WCE=0. btrfs: attempt to write superblock which references block M @5242880 (sdb2/5242880/0) which is not flushed out of disk's write cache (block flush_gen=1, dev->flush_gen=0)! ------------[ cut here ]------------ WARNING: CPU: 28 PID: 843680 at fs/btrfs/check-integrity.c:2196 btrfsic_process_written_superblock+0x22a/0x2a0 [btrfs] CPU: 28 PID: 843680 Comm: umount Not tainted 5.15.0-0.rc5.39.el8.x86_64 #1 Hardware name: Dell Inc. Precision T7610/0NK70N, BIOS A18 09/11/2019 RIP: 0010:btrfsic_process_written_superblock+0x22a/0x2a0 [btrfs] RSP: 0018:ffffb642afb47940 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000002 RCX: 0000000000000000 RDX: 00000000ffffffff RSI: ffff8b722fc97d00 RDI: ffff8b722fc97d00 RBP: ffff8b5601c00000 R08: 0000000000000000 R09: c0000000ffff7fff R10: 0000000000000001 R11: ffffb642afb476f8 R12: ffffffffffffffff R13: ffffb642afb47974 R14: ffff8b5499254c00 R15: 0000000000000003 FS: 00007f00a06d4080(0000) GS:ffff8b722fc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fff5cff5ff0 CR3: 00000001c0c2a006 CR4: 00000000001706e0 Call Trace: btrfsic_process_written_block+0x2f7/0x850 [btrfs] __btrfsic_submit_bio.part.19+0x310/0x330 [btrfs] ? bio_associate_blkg_from_css+0xa4/0x2c0 btrfsic_submit_bio+0x18/0x30 [btrfs] write_dev_supers+0x81/0x2a0 [btrfs] ? find_get_pages_range_tag+0x219/0x280 ? pagevec_lookup_range_tag+0x24/0x30 ? __filemap_fdatawait_range+0x6d/0xf0 ? __raw_callee_save___native_queued_spin_unlock+0x11/0x1e ? find_first_extent_bit+0x9b/0x160 [btrfs] ? __raw_callee_save___native_queued_spin_unlock+0x11/0x1e write_all_supers+0x1b3/0xa70 [btrfs] ? __raw_callee_save___native_queued_spin_unlock+0x11/0x1e btrfs_commit_transaction+0x59d/0xac0 [btrfs] close_ctree+0x11d/0x339 [btrfs] generic_shutdown_super+0x71/0x110 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 [btrfs] deactivate_locked_super+0x31/0x70 cleanup_mnt+0xb8/0x140 task_work_run+0x6d/0xb0 exit_to_user_mode_prepare+0x1f0/0x200 syscall_exit_to_user_mode+0x12/0x30 do_syscall_64+0x46/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f009f711dfb RSP: 002b:00007fff5cff7928 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 RAX: 0000000000000000 RBX: 000055b68c6c9970 RCX: 00007f009f711dfb RDX: 0000000000000001 RSI: 0000000000000000 RDI: 000055b68c6c9b50 RBP: 0000000000000000 R08: 000055b68c6ca900 R09: 00007f009f795580 R10: 0000000000000000 R11: 0000000000000246 R12: 000055b68c6c9b50 R13: 00007f00a04bf184 R14: 0000000000000000 R15: 00000000ffffffff ---[ end trace 2c4b82abcef9eec4 ]--- S-65536(sdb2/65536/1) --> M-1064960(sdb2/1064960/1) Reviewed-by: Filipe Manana Signed-off-by: Wang Yugui Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/disk-io.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1499531bc1511..f18c6d97932ed 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3636,11 +3636,23 @@ static void btrfs_end_empty_barrier(struct bio *bio) */ static void write_dev_flush(struct btrfs_device *device) { - struct request_queue *q = bdev_get_queue(device->bdev); struct bio *bio = device->flush_bio; +#ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY + /* + * When a disk has write caching disabled, we skip submission of a bio + * with flush and sync requests before writing the superblock, since + * it's not needed. However when the integrity checker is enabled, this + * results in reports that there are metadata blocks referred by a + * superblock that were not properly flushed. So don't skip the bio + * submission only when the integrity checker is enabled for the sake + * of simplicity, since this is a debug tool and not meant for use in + * non-debug builds. + */ + struct request_queue *q = bdev_get_queue(device->bdev); if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) return; +#endif bio_reset(bio); bio->bi_end_io = btrfs_end_empty_barrier; From d4aaa4c69ec09337ebedbc834a0b1789a61a7dd7 Mon Sep 17 00:00:00 2001 From: Manaf Meethalavalappu Pallikunhi Date: Wed, 3 Nov 2021 01:30:40 +0530 Subject: [PATCH 1898/5344] thermal: core: Reset previous low and high trip during thermal zone init [ Upstream commit 99b63316c39988039965693f5f43d8b4ccb1c86c ] During the suspend is in process, thermal_zone_device_update bails out thermal zone re-evaluation for any sensor trip violation without setting next valid trip to that sensor. It assumes during resume it will re-evaluate same thermal zone and update trip. But when it is in suspend temperature goes down and on resume path while updating thermal zone if temperature is less than previously violated trip, thermal zone set trip function evaluates the same previous high and previous low trip as new high and low trip. Since there is no change in high/low trip, it bails out from thermal zone set trip API without setting any trip. It leads to a case where sensor high trip or low trip is disabled forever even though thermal zone has a valid high or low trip. During thermal zone device init, reset thermal zone previous high and low trip. It resolves above mentioned scenario. Signed-off-by: Manaf Meethalavalappu Pallikunhi Reviewed-by: Thara Gopinath Signed-off-by: Rafael J. Wysocki Signed-off-by: Sasha Levin --- drivers/thermal/thermal_core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 20eab56b02cb9..f4490b8120176 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -460,6 +460,8 @@ static void thermal_zone_device_init(struct thermal_zone_device *tz) { struct thermal_instance *pos; tz->temperature = THERMAL_TEMP_INVALID; + tz->prev_low_trip = -INT_MAX; + tz->prev_high_trip = INT_MAX; list_for_each_entry(pos, &tz->thermal_instances, tz_node) pos->initialized = false; } From c426f286f8a7c0ad842fc65444f66c620486c39f Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 5 Nov 2021 17:10:47 -0500 Subject: [PATCH 1899/5344] scsi: iscsi: Unblock session then wake up error handler [ Upstream commit a0c2f8b6709a9a4af175497ca65f93804f57b248 ] We can race where iscsi_session_recovery_timedout() has woken up the error handler thread and it's now setting the devices to offline, and session_recovery_timedout()'s call to scsi_target_unblock() is also trying to set the device's state to transport-offline. We can then get a mix of states. For the case where we can't relogin we want the devices to be in transport-offline so when we have repaired the connection __iscsi_unblock_session() can set the state back to running. Set the device state then call into libiscsi to wake up the error handler. Link: https://lore.kernel.org/r/20211105221048.6541-2-michael.christie@oracle.com Reviewed-by: Lee Duncan Signed-off-by: Mike Christie Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/scsi_transport_iscsi.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 6f21cb75d95fd..f6cce0befa7de 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -1894,12 +1894,12 @@ static void session_recovery_timedout(struct work_struct *work) } spin_unlock_irqrestore(&session->lock, flags); - if (session->transport->session_recovery_timedout) - session->transport->session_recovery_timedout(session); - ISCSI_DBG_TRANS_SESSION(session, "Unblocking SCSI target\n"); scsi_target_unblock(&session->dev, SDEV_TRANSPORT_OFFLINE); ISCSI_DBG_TRANS_SESSION(session, "Completed unblocking SCSI target\n"); + + if (session->transport->session_recovery_timedout) + session->transport->session_recovery_timedout(session); } static void __iscsi_unblock_session(struct work_struct *work) From 3d8cd4ab9301e6be649592616e1b18cf02db2b69 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 12 Nov 2021 14:15:38 -0600 Subject: [PATCH 1900/5344] ata: ahci: Add Green Sardine vendor ID as board_ahci_mobile [ Upstream commit 1527f69204fe35f341cb599f1cb01bd02daf4374 ] AMD requires that the SATA controller be configured for devsleep in order for S0i3 entry to work properly. commit b1a9585cc396 ("ata: ahci: Enable DEVSLP by default on x86 with SLP_S0") sets up a kernel policy to enable devsleep on Intel mobile platforms that are using s0ix. Add the PCI ID for the SATA controller in Green Sardine platforms to extend this policy by default for AMD based systems using s0i3 as well. Cc: Nehal-bakulchandra Shah BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=214091 Signed-off-by: Mario Limonciello Signed-off-by: Damien Le Moal Signed-off-by: Sasha Levin --- drivers/ata/ahci.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 8beb418ce167b..6f572967b5552 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -420,6 +420,7 @@ static const struct pci_device_id ahci_pci_tbl[] = { /* AMD */ { PCI_VDEVICE(AMD, 0x7800), board_ahci }, /* AMD Hudson-2 */ { PCI_VDEVICE(AMD, 0x7900), board_ahci }, /* AMD CZ */ + { PCI_VDEVICE(AMD, 0x7901), board_ahci_mobile }, /* AMD Green Sardine */ /* AMD is using RAID class only for ahci controllers */ { PCI_VENDOR_ID_AMD, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_STORAGE_RAID << 8, 0xffffff, board_ahci }, From a2dd967898e740ec5660ee1c042fe17c1dd45e58 Mon Sep 17 00:00:00 2001 From: Teng Qi Date: Wed, 17 Nov 2021 11:44:53 +0800 Subject: [PATCH 1901/5344] ethernet: hisilicon: hns: hns_dsaf_misc: fix a possible array overflow in hns_dsaf_ge_srst_by_port() [ Upstream commit a66998e0fbf213d47d02813b9679426129d0d114 ] The if statement: if (port >= DSAF_GE_NUM) return; limits the value of port less than DSAF_GE_NUM (i.e., 8). However, if the value of port is 6 or 7, an array overflow could occur: port_rst_off = dsaf_dev->mac_cb[port]->port_rst_off; because the length of dsaf_dev->mac_cb is DSAF_MAX_PORT_NUM (i.e., 6). To fix this possible array overflow, we first check port and if it is greater than or equal to DSAF_MAX_PORT_NUM, the function returns. Reported-by: TOTE Robot Signed-off-by: Teng Qi Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c index ed3829ae4ef1b..580199fdd0c22 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c @@ -398,6 +398,10 @@ static void hns_dsaf_ge_srst_by_port(struct dsaf_device *dsaf_dev, u32 port, return; if (!HNS_DSAF_IS_DEBUG(dsaf_dev)) { + /* DSAF_MAX_PORT_NUM is 6, but DSAF_GE_NUM is 8. + We need check to prevent array overflow */ + if (port >= DSAF_MAX_PORT_NUM) + return; reg_val_1 = 0x1 << port; port_rst_off = dsaf_dev->mac_cb[port]->port_rst_off; /* there is difference between V1 and V2 in register.*/ From a009f5cacd73711238cc12207d27076336b59e07 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 18 Nov 2021 13:46:32 +0800 Subject: [PATCH 1902/5344] net: tulip: de4x5: fix the problem that the array 'lp->phy[8]' may be out of bound [ Upstream commit 61217be886b5f7402843677e4be7e7e83de9cb41 ] In line 5001, if all id in the array 'lp->phy[8]' is not 0, when the 'for' end, the 'k' is 8. At this time, the array 'lp->phy[8]' may be out of bound. Signed-off-by: zhangyue Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/dec/tulip/de4x5.c | 30 +++++++++++++++----------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/dec/tulip/de4x5.c b/drivers/net/ethernet/dec/tulip/de4x5.c index c813e6f2b371e..a80252973171f 100644 --- a/drivers/net/ethernet/dec/tulip/de4x5.c +++ b/drivers/net/ethernet/dec/tulip/de4x5.c @@ -4999,19 +4999,23 @@ mii_get_phy(struct net_device *dev) } if ((j == limit) && (i < DE4X5_MAX_MII)) { for (k=0; k < DE4X5_MAX_PHY && lp->phy[k].id; k++); - lp->phy[k].addr = i; - lp->phy[k].id = id; - lp->phy[k].spd.reg = GENERIC_REG; /* ANLPA register */ - lp->phy[k].spd.mask = GENERIC_MASK; /* 100Mb/s technologies */ - lp->phy[k].spd.value = GENERIC_VALUE; /* TX & T4, H/F Duplex */ - lp->mii_cnt++; - lp->active++; - printk("%s: Using generic MII device control. If the board doesn't operate,\nplease mail the following dump to the author:\n", dev->name); - j = de4x5_debug; - de4x5_debug |= DEBUG_MII; - de4x5_dbg_mii(dev, k); - de4x5_debug = j; - printk("\n"); + if (k < DE4X5_MAX_PHY) { + lp->phy[k].addr = i; + lp->phy[k].id = id; + lp->phy[k].spd.reg = GENERIC_REG; /* ANLPA register */ + lp->phy[k].spd.mask = GENERIC_MASK; /* 100Mb/s technologies */ + lp->phy[k].spd.value = GENERIC_VALUE; /* TX & T4, H/F Duplex */ + lp->mii_cnt++; + lp->active++; + printk("%s: Using generic MII device control. If the board doesn't operate,\nplease mail the following dump to the author:\n", dev->name); + j = de4x5_debug; + de4x5_debug |= DEBUG_MII; + de4x5_dbg_mii(dev, k); + de4x5_debug = j; + printk("\n"); + } else { + goto purgatory; + } } } purgatory: From 4d82b457624d704fc18d452f9278c305a9a35658 Mon Sep 17 00:00:00 2001 From: Teng Qi Date: Thu, 18 Nov 2021 15:01:18 +0800 Subject: [PATCH 1903/5344] net: ethernet: dec: tulip: de4x5: fix possible array overflows in type3_infoblock() [ Upstream commit 0fa68da72c3be09e06dd833258ee89c33374195f ] The definition of macro MOTO_SROM_BUG is: #define MOTO_SROM_BUG (lp->active == 8 && (get_unaligned_le32( dev->dev_addr) & 0x00ffffff) == 0x3e0008) and the if statement if (MOTO_SROM_BUG) lp->active = 0; using this macro indicates lp->active could be 8. If lp->active is 8 and the second comparison of this macro is false. lp->active will remain 8 in: lp->phy[lp->active].gep = (*p ? p : NULL); p += (2 * (*p) + 1); lp->phy[lp->active].rst = (*p ? p : NULL); p += (2 * (*p) + 1); lp->phy[lp->active].mc = get_unaligned_le16(p); p += 2; lp->phy[lp->active].ana = get_unaligned_le16(p); p += 2; lp->phy[lp->active].fdx = get_unaligned_le16(p); p += 2; lp->phy[lp->active].ttm = get_unaligned_le16(p); p += 2; lp->phy[lp->active].mci = *p; However, the length of array lp->phy is 8, so array overflows can occur. To fix these possible array overflows, we first check lp->active and then return -EINVAL if it is greater or equal to ARRAY_SIZE(lp->phy) (i.e. 8). Reported-by: TOTE Robot Signed-off-by: Teng Qi Reviewed-by: Arnd Bergmann Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/dec/tulip/de4x5.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/dec/tulip/de4x5.c b/drivers/net/ethernet/dec/tulip/de4x5.c index a80252973171f..c97fc0e384ca6 100644 --- a/drivers/net/ethernet/dec/tulip/de4x5.c +++ b/drivers/net/ethernet/dec/tulip/de4x5.c @@ -4708,6 +4708,10 @@ type3_infoblock(struct net_device *dev, u_char count, u_char *p) lp->ibn = 3; lp->active = *p++; if (MOTO_SROM_BUG) lp->active = 0; + /* if (MOTO_SROM_BUG) statement indicates lp->active could + * be 8 (i.e. the size of array lp->phy) */ + if (WARN_ON(lp->active >= ARRAY_SIZE(lp->phy))) + return -EINVAL; lp->phy[lp->active].gep = (*p ? p : NULL); p += (2 * (*p) + 1); lp->phy[lp->active].rst = (*p ? p : NULL); p += (2 * (*p) + 1); lp->phy[lp->active].mc = get_unaligned_le16(p); p += 2; From f5d583cbcbe6d59fb7e0e97e3c64818dbb1fe68c Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 17 Nov 2021 23:12:47 -0800 Subject: [PATCH 1904/5344] perf hist: Fix memory leak of a perf_hpp_fmt [ Upstream commit 0ca1f534a776cc7d42f2c33da4732b74ec2790cd ] perf_hpp__column_unregister() removes an entry from a list but doesn't free the memory causing a memory leak spotted by leak sanitizer. Add the free while at the same time reducing the scope of the function to static. Signed-off-by: Ian Rogers Reviewed-by: Kajol Jain Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/20211118071247.2140392-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Sasha Levin --- tools/perf/ui/hist.c | 28 ++++++++++++++-------------- tools/perf/util/hist.h | 1 - 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index f736755000616..9ae316445f04b 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -472,6 +472,18 @@ struct perf_hpp_list perf_hpp_list = { #undef __HPP_SORT_ACC_FN #undef __HPP_SORT_RAW_FN +static void fmt_free(struct perf_hpp_fmt *fmt) +{ + /* + * At this point fmt should be completely + * unhooked, if not it's a bug. + */ + BUG_ON(!list_empty(&fmt->list)); + BUG_ON(!list_empty(&fmt->sort_list)); + + if (fmt->free) + fmt->free(fmt); +} void perf_hpp__init(void) { @@ -535,9 +547,10 @@ void perf_hpp_list__prepend_sort_field(struct perf_hpp_list *list, list_add(&format->sort_list, &list->sorts); } -void perf_hpp__column_unregister(struct perf_hpp_fmt *format) +static void perf_hpp__column_unregister(struct perf_hpp_fmt *format) { list_del_init(&format->list); + fmt_free(format); } void perf_hpp__cancel_cumulate(void) @@ -609,19 +622,6 @@ void perf_hpp__append_sort_keys(struct perf_hpp_list *list) } -static void fmt_free(struct perf_hpp_fmt *fmt) -{ - /* - * At this point fmt should be completely - * unhooked, if not it's a bug. - */ - BUG_ON(!list_empty(&fmt->list)); - BUG_ON(!list_empty(&fmt->sort_list)); - - if (fmt->free) - fmt->free(fmt); -} - void perf_hpp__reset_output_field(struct perf_hpp_list *list) { struct perf_hpp_fmt *fmt, *tmp; diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 4792731307947..ecce30f086de7 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -361,7 +361,6 @@ enum { }; void perf_hpp__init(void); -void perf_hpp__column_unregister(struct perf_hpp_fmt *format); void perf_hpp__cancel_cumulate(void); void perf_hpp__setup_output_field(struct perf_hpp_list *list); void perf_hpp__reset_output_field(struct perf_hpp_list *list); From 35a2c1b00e6e6e0446825b2175a4018a58578284 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 17 Nov 2021 23:38:04 -0800 Subject: [PATCH 1905/5344] perf report: Fix memory leaks around perf_tip() [ Upstream commit d9fc706108c15f8bc2d4ccccf8e50f74830fabd9 ] perf_tip() may allocate memory or use a literal, this means memory wasn't freed if allocated. Change the API so that literals aren't used. At the same time add missing frees for system_path. These issues were spotted using leak sanitizer. Signed-off-by: Ian Rogers Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/20211118073804.2149974-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Sasha Levin --- tools/perf/builtin-report.c | 15 +++++++++------ tools/perf/util/util.c | 14 +++++++------- tools/perf/util/util.h | 2 +- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index d3c0b04e2e22b..dc228bdf2bbc2 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -569,14 +569,17 @@ static int report__browse_hists(struct report *rep) int ret; struct perf_session *session = rep->session; struct evlist *evlist = session->evlist; - const char *help = perf_tip(system_path(TIPDIR)); + char *help = NULL, *path = NULL; - if (help == NULL) { + path = system_path(TIPDIR); + if (perf_tip(&help, path) || help == NULL) { /* fallback for people who don't install perf ;-) */ - help = perf_tip(DOCDIR); - if (help == NULL) - help = "Cannot load tips.txt file, please install perf!"; + free(path); + path = system_path(DOCDIR); + if (perf_tip(&help, path) || help == NULL) + help = strdup("Cannot load tips.txt file, please install perf!"); } + free(path); switch (use_browser) { case 1: @@ -598,7 +601,7 @@ static int report__browse_hists(struct report *rep) ret = perf_evlist__tty_browse_hists(evlist, rep, help); break; } - + free(help); return ret; } diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index ae56c766eda16..b3c1ae288b478 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -343,32 +343,32 @@ fetch_kernel_version(unsigned int *puint, char *str, return 0; } -const char *perf_tip(const char *dirpath) +int perf_tip(char **strp, const char *dirpath) { struct strlist *tips; struct str_node *node; - char *tip = NULL; struct strlist_config conf = { .dirname = dirpath, .file_only = true, }; + int ret = 0; + *strp = NULL; tips = strlist__new("tips.txt", &conf); if (tips == NULL) - return errno == ENOENT ? NULL : - "Tip: check path of tips.txt or get more memory! ;-p"; + return -errno; if (strlist__nr_entries(tips) == 0) goto out; node = strlist__entry(tips, random() % strlist__nr_entries(tips)); - if (asprintf(&tip, "Tip: %s", node->s) < 0) - tip = (char *)"Tip: get more memory! ;-)"; + if (asprintf(strp, "Tip: %s", node->s) < 0) + ret = -ENOMEM; out: strlist__delete(tips); - return tip; + return ret; } char *perf_exe(char *buf, int len) diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index 9969b8b46f7c3..e4a7e1cafc70a 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -37,7 +37,7 @@ int fetch_kernel_version(unsigned int *puint, #define KVER_FMT "%d.%d.%d" #define KVER_PARAM(x) KVER_VERSION(x), KVER_PATCHLEVEL(x), KVER_SUBLEVEL(x) -const char *perf_tip(const char *dirpath); +int perf_tip(char **strp, const char *dirpath); #ifndef HAVE_SCHED_GETCPU_SUPPORT int sched_getcpu(void); From 7de16af22d6d70a70b0e4bd26ee9ce6da43bb711 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Mon, 22 Nov 2021 20:32:53 +0800 Subject: [PATCH 1906/5344] net/smc: Avoid warning of possible recursive locking [ Upstream commit 7a61432dc81375be06b02f0061247d3efbdfce3a ] Possible recursive locking is detected by lockdep when SMC falls back to TCP. The corresponding warnings are as follows: ============================================ WARNING: possible recursive locking detected 5.16.0-rc1+ #18 Tainted: G E -------------------------------------------- wrk/1391 is trying to acquire lock: ffff975246c8e7d8 (&ei->socket.wq.wait){..-.}-{3:3}, at: smc_switch_to_fallback+0x109/0x250 [smc] but task is already holding lock: ffff975246c8f918 (&ei->socket.wq.wait){..-.}-{3:3}, at: smc_switch_to_fallback+0xfe/0x250 [smc] other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&ei->socket.wq.wait); lock(&ei->socket.wq.wait); *** DEADLOCK *** May be due to missing lock nesting notation 2 locks held by wrk/1391: #0: ffff975246040130 (sk_lock-AF_SMC){+.+.}-{0:0}, at: smc_connect+0x43/0x150 [smc] #1: ffff975246c8f918 (&ei->socket.wq.wait){..-.}-{3:3}, at: smc_switch_to_fallback+0xfe/0x250 [smc] stack backtrace: Call Trace: dump_stack_lvl+0x56/0x7b __lock_acquire+0x951/0x11f0 lock_acquire+0x27a/0x320 ? smc_switch_to_fallback+0x109/0x250 [smc] ? smc_switch_to_fallback+0xfe/0x250 [smc] _raw_spin_lock_irq+0x3b/0x80 ? smc_switch_to_fallback+0x109/0x250 [smc] smc_switch_to_fallback+0x109/0x250 [smc] smc_connect_fallback+0xe/0x30 [smc] __smc_connect+0xcf/0x1090 [smc] ? mark_held_locks+0x61/0x80 ? __local_bh_enable_ip+0x77/0xe0 ? lockdep_hardirqs_on+0xbf/0x130 ? smc_connect+0x12a/0x150 [smc] smc_connect+0x12a/0x150 [smc] __sys_connect+0x8a/0xc0 ? syscall_enter_from_user_mode+0x20/0x70 __x64_sys_connect+0x16/0x20 do_syscall_64+0x34/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae The nested locking in smc_switch_to_fallback() is considered to possibly cause a deadlock because smc_wait->lock and clc_wait->lock are the same type of lock. But actually it is safe so far since there is no other place trying to obtain smc_wait->lock when clc_wait->lock is held. So the patch replaces spin_lock() with spin_lock_nested() to avoid false report by lockdep. Link: https://lkml.org/lkml/2021/11/19/962 Fixes: 2153bd1e3d3d ("Transfer remaining wait queue entries during fallback") Reported-by: syzbot+e979d3597f48262cb4ee@syzkaller.appspotmail.com Signed-off-by: Wen Gu Acked-by: Tony Lu Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/smc/af_smc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 00d3787387172..fa3b20e5f4608 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -483,7 +483,7 @@ static void smc_switch_to_fallback(struct smc_sock *smc) * to clcsocket->wq during the fallback. */ spin_lock_irqsave(&smc_wait->lock, flags); - spin_lock(&clc_wait->lock); + spin_lock_nested(&clc_wait->lock, SINGLE_DEPTH_NESTING); list_splice_init(&smc_wait->head, &clc_wait->head); spin_unlock(&clc_wait->lock); spin_unlock_irqrestore(&smc_wait->lock, flags); From 2bf190f71b2fd47c2c69443bf4dbcb066414b335 Mon Sep 17 00:00:00 2001 From: Stephen Suryaputra Date: Tue, 30 Nov 2021 11:26:37 -0500 Subject: [PATCH 1907/5344] vrf: Reset IPCB/IP6CB when processing outbound pkts in vrf dev xmit commit ee201011c1e1563c114a55c86eb164b236f18e84 upstream. IPCB/IP6CB need to be initialized when processing outbound v4 or v6 pkts in the codepath of vrf device xmit function so that leftover garbage doesn't cause futher code that uses the CB to incorrectly process the pkt. One occasion of the issue might occur when MPLS route uses the vrf device as the outgoing device such as when the route is added using "ip -f mpls route add