aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/net/ipvlan/ipvtap.c
blob: 5dea2063dbc878d1f45a56c7cfea470f820ec601 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
#include <linux/etherdevice.h>
#include "ipvlan.h"
#include <linux/if_vlan.h>
#include <linux/if_tap.h>
#include <linux/interrupt.h>
#include <linux/nsproxy.h>
#include <linux/compat.h>
#include <linux/if_tun.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/cache.h>
#include <linux/sched.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/cdev.h>
#include <linux/idr.h>
#include <linux/fs.h>
#include <linux/uio.h>

#include <net/net_namespace.h>
#include <net/rtnetlink.h>
#include <net/sock.h>
#include <linux/virtio_net.h>

/* Offload feature bits assigned to tap_dev.tap_features for a new ipvtap link:
 * hardware checksum plus the TSO variants (IPv4, IPv6, ECN).
 */
#define TUN_OFFLOADS (NETIF_F_HW_CSUM | \
		      NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_TSO_ECN)

/* Character-device state shared by every ipvtap link.  Both objects are
 * handed to tap_create_cdev() at module init and to tap_destroy_cdev() at
 * exit; MAJOR(ipvtap_major) combined with a per-link minor forms the dev_t
 * of each tap node.
 */
static dev_t ipvtap_major;
static struct cdev ipvtap_cdev;

/* Namespace callback for ipvtap_class.
 *
 * @d: a class device whose parent is the struct device embedded in a
 *     net_device.
 *
 * Returns the network namespace (dev_net()) of that parent net_device.
 */
static const void *ipvtap_net_namespace(struct device *d)
{
	return dev_net(to_net_dev(d->parent));
}

/* Device class under which the per-link tap character devices are created.
 * The class is namespace-aware: it uses the net namespace type operations
 * and resolves a device's namespace through ipvtap_net_namespace().
 */
static struct class ipvtap_class = {
	.owner		= THIS_MODULE,
	.name		= "ipvtap",
	.namespace	= ipvtap_net_namespace,
	.ns_type	= &net_ns_type_operations,
};

/* Private data of an ipvtap net_device (sized via rtnl_link_ops.priv_size).
 *
 * NOTE(review): @vlan is presumably required to stay the first member so
 * that the ipvlan core can treat netdev_priv(dev) as a struct ipvl_dev --
 * confirm against ipvlan.h before reordering.
 */
struct ipvtap_dev {
	struct ipvl_dev vlan;	/* ipvlan state for this link */
	struct tap_dev	  tap;	/* tap state; callbacks recover the container from it */
};

/* tap_dev.count_tx_dropped callback: bump the per-CPU tx_drps counter of the
 * ipvlan device that embeds @tap.
 */
static void ipvtap_count_tx_dropped(struct tap_dev *tap)
{
	struct ipvtap_dev *ipvtap;

	ipvtap = container_of(tap, struct ipvtap_dev, tap);
	this_cpu_inc(ipvtap->vlan.pcpu_stats->tx_drps);
}

/* tap_dev.count_rx_dropped callback: report a dropped receive to the ipvlan
 * device that embeds @tap.
 *
 * The three trailing zero arguments are presumably (len, success, mcast),
 * i.e. "zero-length, unsuccessful, not multicast" -- confirm against the
 * ipvlan_count_rx() prototype in ipvlan.h.
 */
static void ipvtap_count_rx_dropped(struct tap_dev *tap)
{
	struct ipvtap_dev *ipvtap = container_of(tap, struct ipvtap_dev, tap);

	ipvlan_count_rx(&ipvtap->vlan, 0, 0, 0);
}

/* tap_dev.update_features callback.
 *
 * @tap:      tap state embedded in a struct ipvtap_dev
 * @features: new feature mask
 *
 * Stores @features in the owning ipvlan device's sfeatures field and then
 * asks the networking core to re-evaluate that net_device's features.
 */
static void ipvtap_update_features(struct tap_dev *tap,
				   netdev_features_t features)
{
	struct ipvtap_dev *ipvtap = container_of(tap, struct ipvtap_dev, tap);

	ipvtap->vlan.sfeatures = features;
	netdev_update_features(ipvtap->vlan.dev);
}

static int ipvtap_newlink(struct net *src_net, struct net_device *dev,
			  struct nlattr *tb[], struct nlattr *data[],
			  struct netlink_ext_ack *extack)
{
	struct ipvtap_dev *vlantap = netdev_priv(dev);
	int err;

	INIT_LIST_HEAD(&vlantap->tap.queue_list);

	/* Since macvlan supports all offloads by default, make
	 * tap support all offloads also.
	 */
	vlantap->tap.tap_features = TUN_OFFLOADS;
	vlantap->tap.count_tx_dropped = ipvtap_count_tx_dropped;
	vlantap->tap.update_features =	ipvtap_update_features;
	vlantap->tap.count_rx_dropped = ipvtap_count_rx_dropped;

	err = netdev_rx_handler_register(dev, tap_handle_frame, &vlantap->tap);
	if (err)
		return err;

	/* Don't put anything that may fail after macvlan_common_newlink
	 * because we can't undo what it does.
	 */
	err =  ipvlan_link_new(src_net, dev, tb, data, extack);
	if (err) {
		netdev_rx_handler_unregister(dev);
		return err;
	}

	vlantap->tap.dev = vlantap->vlan.dev;

	return err;
}

/* rtnl_link_ops.dellink handler: detach the rx handler, tear down the tap
 * queues, then hand the device to the ipvlan link-delete path.
 */
static void ipvtap_dellink(struct net_device *dev,
			   struct list_head *head)
{
	struct ipvtap_dev *ipvtap = netdev_priv(dev);

	netdev_rx_handler_unregister(dev);
	tap_del_queues(&ipvtap->tap);
	ipvlan_link_delete(dev, head);
}

/* rtnl_link_ops.setup handler: apply the common ipvlan link setup, then give
 * the device a real transmit queue by clearing IFF_NO_QUEUE and setting
 * tx_queue_len to TUN_READQ_SIZE.
 */
static void ipvtap_setup(struct net_device *dev)
{
	ipvlan_link_setup(dev);

	dev->priv_flags &= ~IFF_NO_QUEUE;
	dev->tx_queue_len = TUN_READQ_SIZE;
}

/* rtnetlink operations for links of kind "ipvtap".  priv_size reserves room
 * for a struct ipvtap_dev in each net_device's private area.
 */
static struct rtnl_link_ops ipvtap_link_ops __read_mostly = {
	.kind		= "ipvtap",
	.priv_size	= sizeof(struct ipvtap_dev),
	.setup		= ipvtap_setup,
	.newlink	= ipvtap_newlink,
	.dellink	= ipvtap_dellink,
};

static int ipvtap_device_event(struct notifier_block *unused,
			       unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct ipvtap_dev *vlantap;
	struct device *classdev;
	dev_t devt;
	int err;
	char tap_name[IFNAMSIZ];

	if (dev->rtnl_link_ops != &ipvtap_link_ops)
		return NOTIFY_DONE;

	snprintf(tap_name, IFNAMSIZ, "tap%d", dev->ifindex);
	vlantap = netdev_priv(dev);

	switch (event) {
	case NETDEV_REGISTER:
		/* Create the device node here after the network device has
		 * been registered but before register_netdevice has
		 * finished running.
		 */
		err = tap_get_minor(ipvtap_major, &vlantap->tap);
		if (err)
			return notifier_from_errno(err);

		devt = MKDEV(MAJOR(ipvtap_major), vlantap->tap.minor);
		classdev = device_create(&ipvtap_class, &dev->dev, devt,
					 dev, tap_name);
		if (IS_ERR(classdev)) {
			tap_free_minor(ipvtap_major, &vlantap->tap);
			return notifier_from_errno(PTR_ERR(classdev));
		}
		err = sysfs_create_link(&dev->dev.kobj, &classdev->kobj,
					tap_name);
		if (err)
			return notifier_from_errno(err);
		break;
	case NETDEV_UNREGISTER:
		/* vlan->minor == 0 if NETDEV_REGISTER above failed */
		if (vlantap->tap.minor == 0)
			break;
		sysfs_remove_link(&dev->dev.kobj, tap_name);
		devt = MKDEV(MAJOR(ipvtap_major), vlantap->tap.minor);
		device_destroy(&ipvtap_class, devt);
		tap_free_minor(ipvtap_major, &vlantap->tap);
		break;
	case NETDEV_CHANGE_TX_QUEUE_LEN:
		if (tap_queue_resize(&vlantap->tap))
			return NOTIFY_BAD;
		break;
	}

	return NOTIFY_DONE;
}

/* Netdevice notifier that routes events to ipvtap_device_event(); it is
 * registered in ipvtap_init() and unregistered in ipvtap_exit().
 */
static struct notifier_block ipvtap_notifier_block __read_mostly = {
	.notifier_call	= ipvtap_device_event,
};

/* Module entry point.
 *
 * Sets up, in order: the tap character device region, the "ipvtap" device
 * class, the netdevice notifier and the rtnetlink link ops.  If any step
 * fails, the steps already completed are undone in reverse order and the
 * error is returned.
 *
 * NOTE(review): this function is not annotated __init -- consider adding it.
 */
static int ipvtap_init(void)
{
	int ret;

	ret = tap_create_cdev(&ipvtap_cdev, &ipvtap_major, "ipvtap");
	if (ret)
		return ret;

	ret = class_register(&ipvtap_class);
	if (ret)
		goto err_destroy_cdev;

	ret = register_netdevice_notifier(&ipvtap_notifier_block);
	if (ret)
		goto err_unregister_class;

	ret = ipvlan_link_register(&ipvtap_link_ops);
	if (ret)
		goto err_unregister_notifier;

	return 0;

err_unregister_notifier:
	unregister_netdevice_notifier(&ipvtap_notifier_block);
err_unregister_class:
	class_unregister(&ipvtap_class);
err_destroy_cdev:
	tap_destroy_cdev(ipvtap_major, &ipvtap_cdev);
	return ret;
}
module_init(ipvtap_init);

/* Module exit point: tears down everything ipvtap_init() set up, in the
 * reverse order of its setup (link ops, notifier, class, character device).
 */
static void ipvtap_exit(void)
{
	/* NOTE(review): init registers the ops via ipvlan_link_register();
	 * presumably that wraps rtnl_link_register(), making this the matching
	 * unregister call - confirm against the ipvlan core.
	 */
	rtnl_link_unregister(&ipvtap_link_ops);
	unregister_netdevice_notifier(&ipvtap_notifier_block);
	class_unregister(&ipvtap_class);
	/* Last, release the character device created first in ipvtap_init(). */
	tap_destroy_cdev(ipvtap_major, &ipvtap_cdev);
}
module_exit(ipvtap_exit);
/* Module metadata: rtnl link alias for the "ipvtap" kind, author, license. */
MODULE_ALIAS_RTNL_LINK("ipvtap");
MODULE_AUTHOR("Sainath Grandhi <sainath.grandhi@intel.com>");
MODULE_LICENSE("GPL");
the value set in the policy. For policy capabilities that are set in the loaded policy but unknown to the kernel, log the policy capability index, since this is the only information presently available in the policy. Sample output with a policy created with a new capability defined that is not known to the kernel: SELinux: policy capability network_peer_controls=1 SELinux: policy capability open_perms=1 SELinux: policy capability extended_socket_class=1 SELinux: policy capability always_check_network=0 SELinux: policy capability cgroup_seclabel=0 SELinux: unknown policy capability 5 Resolves: https://github.com/SELinuxProject/selinux-kernel/issues/32 Signed-off-by: Stephen Smalley <sds@tycho.nsa.gov> Signed-off-by: Paul Moore <paul@paul-moore.com> 2017-05-23selinux: do not check open permission on socketsStephen Smalley1-3/+7 open permission is currently only defined for files in the kernel (COMMON_FILE_PERMS rather than COMMON_FILE_SOCK_PERMS). Construction of an artificial test case that tries to open a socket via /proc/pid/fd will generate a recvfrom avc denial because recvfrom and open happen to map to the same permission bit in socket vs file classes. open of a socket via /proc/pid/fd is not supported by the kernel regardless and will ultimately return ENXIO. But we hit the permission check first and can thus produce these odd/misleading denials. Omit the open check when operating on a socket. Signed-off-by: Stephen Smalley <sds@tycho.nsa.gov> Signed-off-by: Paul Moore <paul@paul-moore.com> 2017-05-23selinux: add a map permission check for mmapStephen Smalley2-1/+13 Add a map permission check on mmap so that we can distinguish memory mapped access (since it has different implications for revocation). 
When a file is opened and then read or written via syscalls like read(2)/write(2), we revalidate access on each read/write operation via selinux_file_permission() and therefore can revoke access if the process context, the file context, or the policy changes in such a manner that access is no longer allowed. When a file is opened and then memory mapped via mmap(2) and then subsequently read or written directly in memory, we presently have no way to revalidate or revoke access. The purpose of a separate map permission check on mmap(2) is to permit policy to prohibit memory mapping of specific files for which we need to ensure that every access is revalidated, particularly useful for scenarios where we expect the file to be relabeled at runtime in order to reflect state changes (e.g. cross-domain solution, assured pipeline without data copying). Signed-off-by: Stephen Smalley <sds@tycho.nsa.gov> Signed-off-by: Paul Moore <paul@paul-moore.com> 2017-05-23selinux: only invoke capabilities and selinux for CAP_MAC_ADMIN checksStephen Smalley1-8/+15 SELinux uses CAP_MAC_ADMIN to control the ability to get or set a raw, uninterpreted security context unknown to the currently loaded security policy. When performing these checks, we only want to perform a base capabilities check and a SELinux permission check. If any other modules that implement a capable hook are stacked with SELinux, we do not want to require them to also have to authorize CAP_MAC_ADMIN, since it may have different implications for their security model. Rework the CAP_MAC_ADMIN checks within SELinux to only invoke the capabilities module and the SELinux permission checking. Signed-off-by: Stephen Smalley <sds@tycho.nsa.gov> Signed-off-by: Paul Moore <paul@paul-moore.com> 2017-05-23selinux: Return an error code only as a constant in sidtab_insert()Markus Elfring1-17/+10 * Return an error code without storing it in an intermediate variable. 
* Delete the local variable "rc" and the jump label "out" which became unnecessary with this refactoring. Signed-off-by: Markus Elfring <elfring@users.sourceforge.net> Signed-off-by: Paul Moore <paul@paul-moore.com> 2017-05-23selinux: Return directly after a failed memory allocation in policydb_index()Markus Elfring1-10/+5 Replace five goto statements (and previous variable assignments) by direct returns after a memory allocation failure in this function. Signed-off-by: Markus Elfring <elfring@users.sourceforge.net> Signed-off-by: Paul Moore <paul@paul-moore.com> 2017-05-23selinux: Use task_alloc hook rather than task_create hookTetsuo Handa1-2/+3 This patch is a preparation for getting rid of task_create hook because task_alloc hook which can do what task_create hook can do was revived. Creating a new thread is unlikely prohibited by security policy, for fork()/execve()/exit() is fundamental of how processes are managed in Unix. If a program is known to create a new thread, it is likely that permission to create a new thread is given to that program. Therefore, a situation where security_task_create() returns an error is likely that the program was exploited and lost control. Even if SELinux failed to check permission to create a thread at security_task_create(), SELinux can later check it at security_task_alloc(). Since the new thread is not yet visible from the rest of the system, nobody can do bad things using the new thread. What we waste will be limited to some initialization steps such as dup_task_struct(), copy_creds() and audit_alloc() in copy_process(). We can tolerate these overhead for unlikely situation. Therefore, this patch changes SELinux to use task_alloc hook rather than task_create hook so that we can remove task_create hook. 
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> Acked-by: Stephen Smalley <sds@tycho.nsa.gov> Signed-off-by: Paul Moore <paul@paul-moore.com> 2017-05-21Linux 4.12-rc2v4.12-rc2Linus Torvalds1-1/+1 2017-05-21x86: fix 32-bit case of __get_user_asm_u64()Linus Torvalds1-3/+3 The code to fetch a 64-bit value from user space was entirely buggered, and has been since the code was merged in early 2016 in commit b2f680380ddf ("x86/mm/32: Add support for 64-bit __get_user() on 32-bit kernels"). Happily the buggered routine is almost certainly entirely unused, since the normal way to access user space memory is just with the non-inlined "get_user()", and the inlined version didn't even historically exist. The normal "get_user()" case is handled by external hand-written asm in arch/x86/lib/getuser.S that doesn't have either of these issues. There were two independent bugs in __get_user_asm_u64(): - it still did the STAC/CLAC user space access marking, even though that is now done by the wrapper macros, see commit 11f1a4b9755f ("x86: reorganize SMAP handling in user space accesses"). This didn't result in a semantic error, it just means that the inlined optimized version was hugely less efficient than the allegedly slower standard version, since the CLAC/STAC overhead is quite high on modern Intel CPU's. - the double register %eax/%edx was marked as an output, but the %eax part of it was touched early in the asm, and could thus clobber other inputs to the asm that gcc didn't expect it to touch. In particular, that meant that the generated code could look like this: mov (%eax),%eax mov 0x4(%eax),%edx where the load of %edx obviously was _supposed_ to be from the 32-bit word that followed the source of %eax, but because %eax was overwritten by the first instruction, the source of %edx was basically random garbage. 
The fixes are trivial: remove the extraneous STAC/CLAC entries, and mark the 64-bit output as early-clobber to let gcc know that no inputs should alias with the output register. Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Benjamin LaHaise <bcrl@kvack.org> Cc: Ingo Molnar <mingo@kernel.org> Cc: stable@kernel.org # v4.8+ Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2017-05-21Clean up x86 unsafe_get/put_user() type handlingLinus Torvalds1-2/+3 Al noticed that unsafe_put_user() had type problems, and fixed them in commit a7cc722fff0b ("fix unsafe_put_user()"), which made me look more at those functions. It turns out that unsafe_get_user() had a type issue too: it limited the largest size of the type it could handle to "unsigned long". Which is fine with the current users, but doesn't match our existing normal get_user() semantics, which can also handle "u64" even when that does not fit in a long. While at it, also clean up the type cast in unsafe_put_user(). We actually want to just make it an assignment to the expected type of the pointer, because we actually do want warnings from types that don't convert silently. And it makes the code more readable by not having that one very long and complex line. [ This patch might become stable material if we ever end up back-porting any new users of the unsafe uaccess code, but as things stand now this doesn't matter for any current existing uses. ] Cc: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2017-05-21osf_wait4(): fix infoleakAl Viro1-2/+4 failing sys_wait4() won't fill struct rusage... Cc: stable@vger.kernel.org Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> 2017-05-21fix unsafe_put_user()Al Viro1-1/+1 __put_user_size() relies upon its first argument having the same type as what the second one points to; the only other user makes sure of that and unsafe_put_user() should do the same. 
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> 2017-05-20nvmet: release the sq ref on rdma read errorsVijay Immanuel3-0/+8 On rdma read errors, release the sq ref that was taken when the req was initialized. This avoids a hang in nvmet_sq_destroy() when the queue is being freed. Signed-off-by: Vijay Immanuel <vijayi@attalasystems.com> Reviewed-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jens Axboe <axboe@fb.com> 2017-05-20nvmet-fc: remove target cpu scheduling flagJames Smart4-15/+3 Remove NVMET_FCTGTFEAT_NEEDS_CMD_CPUSCHED. It's unnecessary. Signed-off-by: James Smart <james.smart@broadcom.com> Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de> Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jens Axboe <axboe@fb.com> 2017-05-20nvme-fc: stop queues on error detectionJames Smart1-0/+4 Per the recommendation by Sagi on: http://lists.infradead.org/pipermail/linux-nvme/2017-April/009261.html Rather than waiting for reset work thread to stop queues and abort the ios, immediately stop the queues on error detection. Reset thread will restop the queues (as it's called on other paths), but it does not appear to have a side effect. Signed-off-by: James Smart <james.smart@broadcom.com> Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de> Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jens Axboe <axboe@fb.com> 2017-05-20nvme-fc: require target or discovery role for fc-nvme targetsJames Smart1-0/+6 In order to create an association, the remoteport must be serving either a target role or a discovery role. Signed-off-by: James Smart <james.smart@broadcom.com> Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de> Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jens Axboe <axboe@fb.com> 2017-05-20nvme-fc: correct port role bitsJames Smart1-2/+2 FC Port roles is a bit mask, not individual values. Correct nvme definitions to unique bits. 
Signed-off-by: James Smart <james.smart@broadcom.com> Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de> Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jens Axboe <axboe@fb.com> 2017-05-20nvme: unmap CMB and remove sysfs file in reset pathJon Derrick1-1/+6 CMB doesn't get unmapped until removal while getting remapped on every reset. Add the unmapping and sysfs file removal to the reset path in nvme_pci_disable to match the mapping path in nvme_pci_enable. Fixes: 202021c1a ("nvme : Add sysfs entry for NVMe CMBs when appropriate") Signed-off-by: Jon Derrick <jonathan.derrick@intel.com> Acked-by: Keith Busch <keith.busch@intel.com> Reviewed-By: Stephen Bates <sbates@raithlin.com> Cc: <stable@vger.kernel.org> # 4.9+ Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jens Axboe <axboe@fb.com> 2017-05-19KVM: x86: prevent uninitialized variable warning in check_svme()Radim Krčmář1-1/+1 get_msr() of MSR_EFER is currently always going to succeed, but static checker doesn't see that far. Don't complicate stuff and just use 0 for the fallback -- it means that the feature is not present. Reported-by: Dan Carpenter <dan.carpenter@oracle.com> Reviewed-by: Paolo Bonzini <pbonzini@redhat.com> Reviewed-by: David Hildenbrand <david@redhat.com> Signed-off-by: Radim Krčmář <rkrcmar@redhat.com> 2017-05-19KVM: x86/vPMU: fix undefined shift in intel_pmu_refresh()Radim Krčmář1-1/+1 Static analysis noticed that pmu->nr_arch_gp_counters can be 32 (INTEL_PMC_MAX_GENERIC) and therefore cannot be used to shift 'int'. I didn't add BUILD_BUG_ON for it as we have a better checker. 
Reported-by: Dan Carpenter <dan.carpenter@oracle.com> Fixes: 25462f7f5295 ("KVM: x86/vPMU: Define kvm_pmu_ops to support vPMU function dispatch") Reviewed-by: Paolo Bonzini <pbonzini@redhat.com> Reviewed-by: David Hildenbrand <david@redhat.com> Signed-off-by: Radim Krčmář <rkrcmar@redhat.com> 2017-05-19KVM: x86: zero base3 of unusable segmentsRadim Krčmář1-0/+2 Static checker noticed that base3 could be used uninitialized if the segment was not present (useable). Random stack values probably would not pass VMCS entry checks. Reported-by: Dan Carpenter <dan.carpenter@oracle.com> Fixes: 1aa366163b8b ("KVM: x86 emulator: consolidate segment accessors") Reviewed-by: Paolo Bonzini <pbonzini@redhat.com> Reviewed-by: David Hildenbrand <david@redhat.com> Signed-off-by: Radim Krčmář <rkrcmar@redhat.com> 2017-05-19KVM: X86: Fix read out-of-bounds vulnerability in kvm pio emulationWanpeng Li1-9/+15 Huawei folks reported a read out-of-bounds vulnerability in kvm pio emulation. - "inb" instruction to access PIT Mod/Command register (ioport 0x43, write only, a read should be ignored) in guest can get a random number. - "rep insb" instruction to access PIT register port 0x43 can control memcpy() in emulator_pio_in_emulated() to copy max 0x400 bytes but only read 1 bytes, which will disclose the unimportant kernel memory in host but no crash. The similar test program below can reproduce the read out-of-bounds vulnerability: void hexdump(void *mem, unsigned int len) { unsigned int i, j; for(i = 0; i < len + ((len % HEXDUMP_COLS) ? 
(HEXDUMP_COLS - len % HEXDUMP_COLS) : 0); i++) { /* print offset */ if(i % HEXDUMP_COLS == 0) { printf("0x%06x: ", i); } /* print hex data */ if(i < len) { printf("%02x ", 0xFF & ((char*)mem)[i]); } else /* end of block, just aligning for ASCII dump */ { printf(" "); } /* print ASCII dump */ if(i % HEXDUMP_COLS == (HEXDUMP_COLS - 1)) { for(j = i - (HEXDUMP_COLS - 1); j <= i; j++) { if(j >= len) /* end of block, not really printing */ { putchar(' '); } else if(isprint(((char*)mem)[j])) /* printable char */ { putchar(0xFF & ((char*)mem)[j]); } else /* other char */ { putchar('.'); } } putchar('\n'); } } } int main(void) { int i; if (iopl(3)) { err(1, "set iopl unsuccessfully\n"); return -1; } static char buf[0x40]; /* test ioport 0x40,0x41,0x42,0x43,0x44,0x45 */ memset(buf, 0xab, sizeof(buf)); asm volatile("push %rdi;"); asm volatile("mov %0, %%rdi;"::"q"(buf)); asm volatile ("mov $0x40, %rdx;"); asm volatile ("in %dx,%al;"); asm volatile ("stosb;"); asm volatile ("mov $0x41, %rdx;"); asm volatile ("in %dx,%al;"); asm volatile ("stosb;"); asm volatile ("mov $0x42, %rdx;"); asm volatile ("in %dx,%al;"); asm volatile ("stosb;"); asm volatile ("mov $0x43, %rdx;"); asm volatile ("in %dx,%al;"); asm volatile ("stosb;"); asm volatile ("mov $0x44, %rdx;"); asm volatile ("in %dx,%al;"); asm volatile ("stosb;"); asm volatile ("mov $0x45, %rdx;"); asm volatile ("in %dx,%al;"); asm volatile ("stosb;"); asm volatile ("pop %rdi;"); hexdump(buf, 0x40); printf("\n"); /* ins port 0x40 */ memset(buf, 0xab, sizeof(buf)); asm volatile("push %rdi;"); asm volatile("mov %0, %%rdi;"::"q"(buf)); asm volatile ("mov $0x20, %rcx;"); asm volatile ("mov $0x40, %rdx;"); asm volatile ("rep insb;"); asm volatile ("pop %rdi;"); hexdump(buf, 0x40); printf("\n"); /* ins port 0x43 */ memset(buf, 0xab, sizeof(buf)); asm volatile("push %rdi;"); asm volatile("mov %0, %%rdi;"::"q"(buf)); asm volatile ("mov $0x20, %rcx;"); asm volatile ("mov $0x43, %rdx;"); asm volatile ("rep insb;"); asm volatile ("pop 
%rdi;"); hexdump(buf, 0x40); printf("\n"); return 0; } The vcpu->arch.pio_data buffer is used by both in and out instruction emulation and is not cleared after use, so some random data is left over in the buffer. A guest read of port 0x43 is ignored since the port is write only; however, the function kernel_pio() can't distinguish this ignored read from a successful read of data from the device's ioport. No new data fills the buffer from port 0x43, yet emulator_pio_in_emulated() will copy the stale data in the buffer to the guest unconditionally. This patch fixes it by clearing the buffer before in instruction emulation to avoid granting the guest the stale data in the buffer. In addition, string I/O is not supported for in-kernel devices, so there is no iteration to read the ioport %RCX times for string I/O. The function kernel_pio() just reads one round and then copies io size * %RCX bytes to the guest unconditionally; it actually copies the one round of ioport data together with other random data left over in the vcpu->arch.pio_data buffer to the guest. This patch fixes it by introducing string I/O support for in-kernel devices in order to grant the right ioport data to the guest. Before the patch: 0x000000: fe 38 93 93 ff ff ab ab .8...... 0x000008: ab ab ab ab ab ab ab ab ........ 0x000010: ab ab ab ab ab ab ab ab ........ 0x000018: ab ab ab ab ab ab ab ab ........ 0x000020: ab ab ab ab ab ab ab ab ........ 0x000028: ab ab ab ab ab ab ab ab ........ 0x000030: ab ab ab ab ab ab ab ab ........ 0x000038: ab ab ab ab ab ab ab ab ........ 0x000000: f6 00 00 00 00 00 00 00 ........ 0x000008: 00 00 00 00 00 00 00 00 ........ 0x000010: 00 00 00 00 4d 51 30 30 ....MQ00 0x000018: 30 30 20 33 20 20 20 20 00 3 0x000020: ab ab ab ab ab ab ab ab ........ 0x000028: ab ab ab ab ab ab ab ab ........ 0x000030: ab ab ab ab ab ab ab ab ........ 0x000038: ab ab ab ab ab ab ab ab ........ 0x000000: f6 00 00 00 00 00 00 00 ........ 0x000008: 00 00 00 00 00 00 00 00 ........ 
0x000010: 00 00 00 00 4d 51 30 30 ....MQ00 0x000018: 30 30 20 33 20 20 20 20 00 3 0x000020: ab ab ab ab ab ab ab ab ........ 0x000028: ab ab ab ab ab ab ab ab ........ 0x000030: ab ab ab ab ab ab ab ab ........ 0x000038: ab ab ab ab ab ab ab ab ........ After the patch: 0x000000: 1e 02 f8 00 ff ff ab ab ........ 0x000008: ab ab ab ab ab ab ab ab ........ 0x000010: ab ab ab ab ab ab ab ab ........ 0x000018: ab ab ab ab ab ab ab ab ........ 0x000020: ab ab ab ab ab ab ab ab ........ 0x000028: ab ab ab ab ab ab ab ab ........ 0x000030: ab ab ab ab ab ab ab ab ........ 0x000038: ab ab ab ab ab ab ab ab ........ 0x000000: d2 e2 d2 df d2 db d2 d7 ........ 0x000008: d2 d3 d2 cf d2 cb d2 c7 ........ 0x000010: d2 c4 d2 c0 d2 bc d2 b8 ........ 0x000018: d2 b4 d2 b0 d2 ac d2 a8 ........ 0x000020: ab ab ab ab ab ab ab ab ........ 0x000028: ab ab ab ab ab ab ab ab ........ 0x000030: ab ab ab ab ab ab ab ab ........ 0x000038: ab ab ab ab ab ab ab ab ........ 0x000000: 00 00 00 00 00 00 00 00 ........ 0x000008: 00 00 00 00 00 00 00 00 ........ 0x000010: 00 00 00 00 00 00 00 00 ........ 0x000018: 00 00 00 00 00 00 00 00 ........ 0x000020: ab ab ab ab ab ab ab ab ........ 0x000028: ab ab ab ab ab ab ab ab ........ 0x000030: ab ab ab ab ab ab ab ab ........ 0x000038: ab ab ab ab ab ab ab ab ........ 
Reported-by: Moguofang <moguofang@huawei.com> Cc: Paolo Bonzini <pbonzini@redhat.com> Cc: Radim Krčmář <rkrcmar@redhat.com> Cc: Moguofang <moguofang@huawei.com> Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com> Cc: stable@vger.kernel.org Signed-off-by: Radim Krčmář <rkrcmar@redhat.com> 2017-05-19KVM: x86: Fix potential preemption when get the current kvmclock timestampWanpeng Li1-1/+9 BUG: using __this_cpu_read() in preemptible [00000000] code: qemu-system-x86/2809 caller is __this_cpu_preempt_check+0x13/0x20 CPU: 2 PID: 2809 Comm: qemu-system-x86 Not tainted 4.11.0+ #13 Call Trace: dump_stack+0x99/0xce check_preemption_disabled+0xf5/0x100 __this_cpu_preempt_check+0x13/0x20 get_kvmclock_ns+0x6f/0x110 [kvm] get_time_ref_counter+0x5d/0x80 [kvm] kvm_hv_process_stimers+0x2a1/0x8a0 [kvm] ? kvm_hv_process_stimers+0x2a1/0x8a0 [kvm] ? kvm_arch_vcpu_ioctl_run+0xac9/0x1ce0 [kvm] kvm_arch_vcpu_ioctl_run+0x5bf/0x1ce0 [kvm] kvm_vcpu_ioctl+0x384/0x7b0 [kvm] ? kvm_vcpu_ioctl+0x384/0x7b0 [kvm] ? __fget+0xf3/0x210 do_vfs_ioctl+0xa4/0x700 ? __fget+0x114/0x210 SyS_ioctl+0x79/0x90 entry_SYSCALL_64_fastpath+0x23/0xc2 RIP: 0033:0x7f9d164ed357 ? __this_cpu_preempt_check+0x13/0x20 This can be reproduced by run kvm-unit-tests/hyperv_stimer.flat w/ CONFIG_PREEMPT and CONFIG_DEBUG_PREEMPT enabled. Safe access to per-CPU data requires a couple of constraints, though: the thread working with the data cannot be preempted and it cannot be migrated while it manipulates per-CPU variables. If the thread is preempted, the thread that replaces it could try to work with the same variables; migration to another CPU could also cause confusion. However there is no preemption disable when reads host per-CPU tsc rate to calculate the current kvmclock timestamp. This patch fixes it by utilizing get_cpu/put_cpu pair to guarantee both __this_cpu_read() and rdtsc() are not preempted. 
Cc: Paolo Bonzini <pbonzini@redhat.com> Cc: Radim Krčmář <rkrcmar@redhat.com> Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com> Reviewed-by: Paolo Bonzini <pbonzini@redhat.com> Cc: stable@vger.kernel.org Signed-off-by: Radim Krčmář <rkrcmar@redhat.com> 2017-05-19blktrace: fix integer parseShaohua Li1-2/+2 sscanf is a very poor way to parse integer. For example, I input "discard" for act_mask, it gets 0xd and completely messes up. Using correct API to do integer parse. This patch also makes attributes accept any base of integer. Signed-off-by: Shaohua Li <shli@fb.com> Signed-off-by: Jens Axboe <axboe@fb.com> 2017-05-19i2c: designware: don't infer timings described by ACPI from clock rateArd Biesheuvel1-8/+10 Commit bd698d24b1b57 ("i2c: designware: Get selected speed mode sda-hold-time via ACPI") updated the logic that reads the timing parameters for various I2C bus rates from the DSDT, to only read the timing parameters for the currently selected mode. This causes a WARN_ON() splat on platforms that legally omit the clock frequency from the ACPI description, because in the new situation, the core I2C designware driver still accesses the fields in the driver struct that we no longer populate, and proceeds to calculate them from the clock frequency. Since the clock frequency is unspecified, the driver complains loudly using a WARN_ON(). So revert back to the old situation, where the struct fields for all timings are populated, but retain the new logic which chooses the SDA hold time from the timing mode that is currently in use. 
Fixes: bd698d24b1b57 ("i2c: designware: Get selected speed mode ...") Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Reported-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com> Acked-by: Jarkko Nikula <jarkko.nikula@linux.intel.com> Signed-off-by: Wolfram Sang <wsa@the-dreams.de> 2017-05-19arm64: dts: rockchip: fix include referenceArnd Bergmann1-1/+1 The way we handle include paths for DT has changed a bit, which broke a file that had an unconventional way to reference a common header file: arch/arm64/boot/dts/rockchip/rk3399-gru-kevin.dts:47:10: fatal error: include/dt-bindings/input/linux-event-codes.h: No such file or directory This removes the leading "include/" from the path name, which fixes it. Fixes: d5d332d3f7e8 ("devicetree: Move include prefixes from arch to separate directory") Signed-off-by: Arnd Bergmann <arnd@arndb.de> 2017-05-19watchdog: bcm281xx: Fix use of uninitialized spinlock.Eric Anholt1-1/+2 The bcm_kona_wdt_set_resolution_reg() call takes the spinlock, so initialize it earlier. Fixes a warning at boot with lock debugging enabled. Fixes: 6adb730dc208 ("watchdog: bcm281xx: Watchdog Driver") Signed-off-by: Eric Anholt <eric@anholt.net> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Reviewed-by: Guenter Roeck <linux@roeck-us.net> Signed-off-by: Guenter Roeck <linux@roeck-us.net> Signed-off-by: Wim Van Sebroeck <wim@iguana.be> 2017-05-19watchdog: zx2967: remove redundant dev_err call in zx2967_wdt_probe()Wei Yongjun1-3/+1 There is a error message within devm_ioremap_resource already, so remove the dev_err call to avoid redundant error message. 
Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com> Reviewed-by: Guenter Roeck <linux@roeck-us.net> Signed-off-by: Guenter Roeck <linux@roeck-us.net> Signed-off-by: Wim Van Sebroeck <wim@iguana.be> 2017-05-19iTCO_wdt: all versions count down twicePaolo Bonzini2-13/+11 The ICH9 is listed as having TCO v2, and indeed the behavior in the datasheet corresponds to v2 (for example the NO_REBOOT flag is accessible via the 16KiB-aligned Root Complex Base Address). However, the TCO counts twice just like in v1; the documentation of the SECOND_TO_STS bit says: "ICH9 sets this bit to 1 to indicate that the TIMEOUT bit had been (or is currently) set and a second timeout occurred before the TCO_RLD register was written. If this bit is set and the NO_REBOOT config bit is 0, then the ICH9 will reboot the system after the second timeout. The same can be found in the BayTrail (Atom E3800) datasheet, and even HOWTOs around the Internet say that it will reboot after _twice_ the specified heartbeat. I did not find the Apollo Lake datasheet, but because v4/v5 has a SECOND_TO_STS bit just like the previous version I'm enabling this for Apollo Lake as well. Cc: linux-watchdog@vger.kernel.org Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> Reviewed-by: Guenter Roeck <linux@roeck-us.net> Signed-off-by: Guenter Roeck <linux@roeck-us.net> Signed-off-by: Wim Van Sebroeck <wim@iguana.be> 2017-05-19firmware: ti_sci: fix strncat length checkArnd Bergmann1-1/+2 gcc-7 notices that the length we pass to strncat is wrong: drivers/firmware/ti_sci.c: In function 'ti_sci_probe': drivers/firmware/ti_sci.c:204:32: error: specified bound 50 equals the size of the destination [-Werror=stringop-overflow=] Instead of the total length, we must pass the length of the remaining space here. 
Fixes: aa276781a64a ("firmware: Add basic support for TI System Control Interface (TI-SCI) protocol") Cc: stable@vger.kernel.org Acked-by: Nishanth Menon <nm@ti.com> Acked-by: Santosh Shilimkar <ssantosh@kernel.org> Signed-off-by: Arnd Bergmann <arnd@arndb.de> 2017-05-19ARM: remove duplicate 'const' annotations'Arnd Bergmann6-6/+6 gcc-7 warns about some declarations that are more 'const' than necessary: arch/arm/mach-at91/pm.c:338:34: error: duplicate 'const' declaration specifier [-Werror=duplicate-decl-specifier] static const struct of_device_id const ramc_ids[] __initconst = { arch/arm/mach-bcm/bcm_kona_smc.c:36:34: error: duplicate 'const' declaration specifier [-Werror=duplicate-decl-specifier] static const struct of_device_id const bcm_kona_smc_ids[] __initconst = { arch/arm/mach-spear/time.c:207:34: error: duplicate 'const' declaration specifier [-Werror=duplicate-decl-specifier] static const struct of_device_id const timer_of_match[] __initconst = { arch/arm/mach-omap2/prm_common.c:714:34: error: duplicate 'const' declaration specifier [-Werror=duplicate-decl-specifier] static const struct of_device_id const omap_prcm_dt_match_table[] __initconst = { arch/arm/mach-omap2/vc.c:562:35: error: duplicate 'const' declaration specifier [-Werror=duplicate-decl-specifier] static const struct i2c_init_data const omap4_i2c_timing_data[] __initconst = { The ones in arch/arm were apparently all introduced accidentally by one commit that correctly marked a lot of variables as __initconst. 
Fixes: 19c233b79d1a ("ARM: appropriate __init annotation for const data") Acked-by: Alexandre Belloni <alexandre.belloni@free-electrons.com> Acked-by: Tony Lindgren <tony@atomide.com> Acked-by: Nicolas Pitre <nico@linaro.org> Acked-by: Florian Fainelli <f.fainelli@gmail.com> Acked-by: Viresh Kumar <viresh.kumar@linaro.org> Acked-by: Krzysztof Hałasa <khalasa@piap.pl> Signed-off-by: Arnd Bergmann <arnd@arndb.de> 2017-05-19arm64: defconfig: enable options needed for QCom DB410c boardRob Herring1-0/+7 Enable Qualcomm drivers needed to boot Dragonboard 410c with HDMI. This enables support for clocks, regulators, and USB PHY. Cc: Bjorn Andersson <bjorn.andersson@linaro.org> Cc: John Stultz <john.stultz@linaro.org> Signed-off-by: Rob Herring <robh@kernel.org> [Olof: Turned off _RPM configs per follow-up email] Signed-off-by: Olof Johansson <olof@lixom.net> 2017-05-18arm64: defconfig: sync with savedefconfigRob Herring1-61/+42 Sync the defconfig with savedefconfig as config options change/move over time. Generated with the following commands: make defconfig make savedefconfig cp defconfig arch/arm64/configs/defconfig Signed-off-by: Rob Herring <robh@kernel.org> Signed-off-by: Olof Johansson <olof@lixom.net>