From e646d060aec4c266783429669567226c1099b65f Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 10 Aug 2016 14:25:28 -0400 Subject: [PATCH 1/9] locking/mutex: Add waiter parameter to mutex_optimistic_spin() This patch adds a new waiter parameter to the mutex_optimistic_spin() function to prepare it to be used by a waiter-spinner that doesn't need to go into the OSQ as there can only be one waiter-spinner which is the head of the waiting queue. Signed-off-by: Waiman Long [imre: Applied from https://lkml.org/lkml/2016/8/10/507] Signed-off-by: Imre Deak --- kernel/locking/mutex.c | 62 ++++++++++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 20 deletions(-) diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index a70b90db3909c..3bcbbd1857ccc 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -273,11 +273,16 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) /* * Atomically try to take the lock when it is available + * + * For waiter-spinner, the count needs to be set to -1 first which will be + * cleared to 0 later on if the list becomes empty. For regular spinner, + * the count will be set to 0 as the the woken waiter will set it to -1, + * if necessary. */ -static inline bool mutex_try_to_acquire(struct mutex *lock) +static inline bool mutex_try_to_acquire(struct mutex *lock, int waiter) { return !mutex_is_locked(lock) && - (atomic_cmpxchg_acquire(&lock->count, 1, 0) == 1); + (atomic_cmpxchg_acquire(&lock->count, 1, waiter ? -1 : 0) == 1); } /* @@ -302,22 +307,37 @@ static inline bool mutex_try_to_acquire(struct mutex *lock) * * Returns true when the lock was taken, otherwise false, indicating * that we need to jump to the slowpath and sleep. + * + * The waiter flag is set to true if the spinner is a waiter in the wait + * queue. The waiter-spinner will spin on the lock directly and concurrently + * with the spinner at the head of the OSQ, if present. */ static bool mutex_optimistic_spin(struct mutex *lock, - struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) + struct ww_acquire_ctx *ww_ctx, + const bool use_ww_ctx, int waiter) { struct task_struct *task = current; + bool acquired = false; - if (!mutex_can_spin_on_owner(lock)) - goto done; + if (!waiter) { + /* + * The purpose of the mutex_can_spin_on_owner() function is + * to eliminate the overhead of osq_lock() and osq_unlock() + * in case spinning isn't possible. As a waiter-spinner + * is not going to take OSQ lock anyway, there is no need + * to call mutex_can_spin_on_owner(). + */ + if (!mutex_can_spin_on_owner(lock)) + goto done; - /* - * In order to avoid a stampede of mutex spinners trying to - * acquire the mutex all at once, the spinners need to take a - * MCS (queued) lock first before spinning on the owner field. - */ - if (!osq_lock(&lock->osq)) - goto done; + /* + * In order to avoid a stampede of mutex spinners trying to + * acquire the mutex all at once, the spinners need to take a + * MCS (queued) lock first before spinning on the owner field. + */ + if (!osq_lock(&lock->osq)) + goto done; + } while (true) { struct task_struct *owner; @@ -347,7 +367,7 @@ static bool mutex_optimistic_spin(struct mutex *lock, break; /* Try to acquire the mutex if it is unlocked. 
*/ - if (mutex_try_to_acquire(lock)) { + if (mutex_try_to_acquire(lock, waiter)) { lock_acquired(&lock->dep_map, ip); if (use_ww_ctx) { @@ -358,8 +378,8 @@ static bool mutex_optimistic_spin(struct mutex *lock, } mutex_set_owner(lock); - osq_unlock(&lock->osq); - return true; + acquired = true; + break; } /* @@ -380,14 +400,15 @@ static bool mutex_optimistic_spin(struct mutex *lock, cpu_relax_lowlatency(); } - osq_unlock(&lock->osq); + if (!waiter) + osq_unlock(&lock->osq); done: /* * If we fell out of the spin path because of need_resched(), * reschedule now, before we try-lock the mutex. This avoids getting * scheduled out right after we obtained the mutex. */ - if (need_resched()) { + if (!acquired && need_resched()) { /* * We _should_ have TASK_RUNNING here, but just in case * we do not, make it so, otherwise we might get stuck. @@ -396,11 +417,12 @@ static bool mutex_optimistic_spin(struct mutex *lock, schedule_preempt_disabled(); } - return false; + return acquired; } #else static bool mutex_optimistic_spin(struct mutex *lock, - struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) + struct ww_acquire_ctx *ww_ctx, + const bool use_ww_ctx, int waiter) { return false; } @@ -520,7 +542,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, preempt_disable(); mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); - if (mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) { + if (mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, false)) { /* got the lock, yay! */ preempt_enable(); return 0; From d23adb3ff8010dfb621cd51de668910978fc96ee Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 10 Aug 2016 14:25:29 -0400 Subject: [PATCH 2/9] locking/mutex: Enable optimistic spinning of woken task in wait queue Ding Tianhong reported a live-lock situation where a constant stream of incoming optimistic spinners blocked a task in the wait list from getting the mutex. This patch attempts to alleviate this live-lock condition by enabling the woken task in the wait queue to enter into an optimistic spinning loop itself in parallel with the regular spinners in the OSQ. This help to reduce the live-locking chance. Running the AIM7 benchmarks on a 4-socket E7-4820 v3 system (with ext4 filesystem), the additional spinning of the waiter-spinning improved performance for the following workloads at high user count: Workload % Improvement -------- ------------- alltests 3.9% disk 3.4% fserver 2.0% long 3.8% new_fserver 10.5% The other workloads were about the same as before. 
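In outline, the reworked wait loop behaves as in the sketch below. This is a simplified pseudo-C summary of the change to __mutex_lock_common() (signal handling, ww_ctx handling and debug hooks omitted); the authoritative version is the diff that follows:

    while (!acquired) {
            /*
             * Try-lock first; -1 marks the mutex contended so that the
             * unlock path takes the slowpath and wakes the next waiter.
             */
            if (atomic_read(&lock->count) >= 0 &&
                atomic_xchg_acquire(&lock->count, -1) == 1)
                    break;

            /* no luck: go to sleep, as before */
            spin_unlock_mutex(&lock->wait_lock, flags);
            schedule_preempt_disabled();

            /*
             * New: once woken, spin on the owner in parallel with the
             * spinners in the OSQ instead of immediately going back to sleep.
             */
            acquired = mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, true);
            spin_lock_mutex(&lock->wait_lock, flags);
    }

The woken waiter passes waiter=true, so mutex_optimistic_spin() skips the OSQ and spins on the lock word directly.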
Signed-off-by: Waiman Long [imre: Applied from https://lkml.org/lkml/2016/8/10/98] Signed-off-by: Imre Deak --- kernel/locking/mutex.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 3bcbbd1857ccc..15b521ac2b9e9 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -531,6 +531,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct task_struct *task = current; struct mutex_waiter waiter; unsigned long flags; + bool acquired = false; /* True if the lock is acquired */ int ret; if (use_ww_ctx) { @@ -567,7 +568,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, lock_contended(&lock->dep_map, ip); - for (;;) { + while (!acquired) { /* * Lets try to take the lock again - this is needed even if * we get here for the first time (shortly after failing to @@ -602,6 +603,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, /* didn't get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); schedule_preempt_disabled(); + + /* + * Optimistically spinning on the mutex without the wait lock. + */ + acquired = mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, + true); spin_lock_mutex(&lock->wait_lock, flags); } __set_task_state(task, TASK_RUNNING); @@ -612,6 +619,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, atomic_set(&lock->count, 0); debug_mutex_free_waiter(&waiter); + if (acquired) + goto unlock; + skip_wait: /* got the lock - cleanup and rejoice! */ lock_acquired(&lock->dep_map, ip); @@ -622,6 +632,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, ww_mutex_set_context_slowpath(ww, ww_ctx); } +unlock: spin_unlock_mutex(&lock->wait_lock, flags); preempt_enable(); return 0; From d98ac5652a0b67b581f15b23a75ed498848cd931 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 10 Aug 2016 14:25:30 -0400 Subject: [PATCH 3/9] locking/mutex: Ensure forward progress of waiter-spinner As both an optimistic spinner and a waiter-spinner (a woken task from the wait queue spinning) can be spinning on the lock at the same time, we cannot ensure forward progress for the waiter-spinner. So it is possible for the waiter-spinner to be starved of getting the lock, though not likely. This patch adds a flag to indicate that a waiter-spinner is spinning and hence has priority over the acquisition of the lock. A waiter-spinner sets this flag while spinning. An optimistic spinner will check this flag and yield if set. This essentially makes the waiter-spinner jump to the head of the optimistic spinning queue to acquire the lock. There will be no increase in size for the mutex structure for 64-bit architectures as there is an existing 4-byte hole. For 32-bit architectures, there will be a size increase of 4 bytes. 
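The hand-off between the two kinds of spinners is roughly the following (simplified sketch of mutex_optimistic_spin(); the exact code, including the relax_cpu label, is in the diff below):

    if (waiter)
            lock->waiter_spinning = true;   /* waiter claims priority */

    for (;;) {
            /* regular OSQ spinners back off while a waiter is spinning */
            if (!waiter && READ_ONCE(lock->waiter_spinning)) {
                    if (need_resched())
                            break;
                    cpu_relax_lowlatency();
                    continue;
            }
            /* ... otherwise spin on the owner / try to acquire as before ... */
    }

    if (waiter)
            lock->waiter_spinning = false;  /* let OSQ spinners compete again */

Only one waiter-spinner can be active at a time (the task at the head of the wait queue), which is why a plain store paired with READ_ONCE() is sufficient here.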
Signed-off-by: Waiman Long [imre: Applied from https://lkml.org/lkml/2016/8/10/506 Fixed 0day reported compile error due to calling cpu_relax_lowlatency() in the for loop increment expression] Signed-off-by: Imre Deak --- include/linux/mutex.h | 1 + kernel/locking/mutex.c | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 2cb7531e7d7a6..f8e91ade629ea 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -57,6 +57,7 @@ struct mutex { #endif #ifdef CONFIG_MUTEX_SPIN_ON_OWNER struct optimistic_spin_queue osq; /* Spinner MCS lock */ + int waiter_spinning; #endif #ifdef CONFIG_DEBUG_MUTEXES void *magic; diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 15b521ac2b9e9..8b4395f9000e5 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -55,6 +55,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) mutex_clear_owner(lock); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER osq_lock_init(&lock->osq); + lock->waiter_spinning = false; #endif debug_mutex_init(lock, name, key); @@ -337,6 +338,12 @@ static bool mutex_optimistic_spin(struct mutex *lock, */ if (!osq_lock(&lock->osq)) goto done; + } else { + /* + * Turn on the waiter spinning flag to discourage the spinner + * from getting the lock. + */ + lock->waiter_spinning = true; } while (true) { @@ -358,6 +365,17 @@ static bool mutex_optimistic_spin(struct mutex *lock, break; } + /* + * For regular opt-spinner, it waits until the waiter_spinning + * flag isn't set. This will ensure forward progress for + * the waiter spinner. + */ + if (!waiter && READ_ONCE(lock->waiter_spinning)) { + if (need_resched()) + break; + goto relax_cpu; + } + /* * If there's an owner, wait for it to either * release the lock or go to sleep. @@ -391,6 +409,7 @@ static bool mutex_optimistic_spin(struct mutex *lock, if (!owner && (need_resched() || rt_task(task))) break; +relax_cpu: /* * The cpu_relax() call is a compiler barrier which forces * everything in this loop to be re-loaded. We don't need @@ -402,6 +421,8 @@ static bool mutex_optimistic_spin(struct mutex *lock, if (!waiter) osq_unlock(&lock->osq); + else + lock->waiter_spinning = false; done: /* * If we fell out of the spin path because of need_resched(), From a16b35f0f4502854237791f755b5f14992f0356b Mon Sep 17 00:00:00 2001 From: Jason Low Date: Wed, 10 Aug 2016 11:44:08 -0700 Subject: [PATCH 4/9] locking/mutex: Prevent lock starvation when spinning is disabled Imre reported an issue where threads are getting starved when trying to acquire a mutex. Threads acquiring a mutex can get arbitrarily delayed sleeping on a mutex because other threads can continually steal the lock in the fastpath and/or through optimistic spinning. Waiman has developed patches that allow waiters to return to optimistic spinning, thus reducing the probability that starvation occurs. However, Imre still sees this starvation problem in the workloads when optimistic spinning is disabled. This patch adds an additional boolean to the mutex that gets used in the CONFIG_SMP && !CONFIG_MUTEX_SPIN_ON_OWNER cases. The flag signifies whether or not other threads need to yield to a waiter and gets set when a waiter spends too much time waiting for the mutex. The threshold is currently set to 16 wakeups, and once the wakeup threshold is exceeded, other threads must yield to the top waiter. The flag gets cleared immediately after the top waiter acquires the mutex. 
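In outline, the mechanism looks like the sketch below (simplified; the full version, including the stubs for the other config combinations, is in the diff that follows):

    #define MUTEX_WAKEUP_THRESHOLD  16

    /* called by a waiter each time it wakes up without getting the lock */
    static inline void do_yield_to_waiter(struct mutex *lock, int *wakeups)
    {
            if (++(*wakeups) >= MUTEX_WAKEUP_THRESHOLD)
                    lock->yield_to_waiter = true;
    }

    /* fastpath callers defer to the slowpath while the flag is set */
    if (!need_yield_to_waiter(lock))
            __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
    else
            __mutex_lock_slowpath(&lock->count);
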
This prevents waiters from getting starved without sacrificing much much performance, as lock stealing is still allowed and only temporarily disabled when it is detected that a waiter has been waiting for too long. Reported-by: Imre Deak Signed-off-by: Jason Low [imre: Applied from https://lkml.org/lkml/2016/8/10/376 Rebased on preceding starvation fix patches from Waiman Long, s/enabled/disabled/ in subject, fixed 0day reported compile errors: for !CONFIG_SMP, misplaced break in __mutex_lock_common, using wrong ww_mutex ptr instead of the base mutex ptr] Signed-off-by: Imre Deak --- include/linux/mutex.h | 2 + kernel/locking/mutex.c | 88 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 78 insertions(+), 12 deletions(-) diff --git a/include/linux/mutex.h b/include/linux/mutex.h index f8e91ade629ea..ae6ac5e3e2f4c 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -58,6 +58,8 @@ struct mutex { #ifdef CONFIG_MUTEX_SPIN_ON_OWNER struct optimistic_spin_queue osq; /* Spinner MCS lock */ int waiter_spinning; +#elif defined(CONFIG_SMP) + bool yield_to_waiter; #endif #ifdef CONFIG_DEBUG_MUTEXES void *magic; diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 8b4395f9000e5..f9f18361d7abf 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -56,6 +56,8 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) #ifdef CONFIG_MUTEX_SPIN_ON_OWNER osq_lock_init(&lock->osq); lock->waiter_spinning = false; +#elif defined(CONFIG_SMP) + lock->yield_to_waiter = false; #endif debug_mutex_init(lock, name, key); @@ -72,6 +74,9 @@ EXPORT_SYMBOL(__mutex_init); */ __visible void __sched __mutex_lock_slowpath(atomic_t *lock_count); + +static inline bool need_yield_to_waiter(struct mutex *lock); + /** * mutex_lock - acquire the mutex * @lock: the mutex to be acquired @@ -100,7 +105,10 @@ void __sched mutex_lock(struct mutex *lock) * The locking fastpath is the 1->0 transition from * 'unlocked' into 'locked' state. 
*/ - __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); + if (!need_yield_to_waiter(lock)) + __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); + else + __mutex_lock_slowpath(&lock->count); mutex_set_owner(lock); } @@ -440,6 +448,7 @@ static bool mutex_optimistic_spin(struct mutex *lock, return acquired; } + #else static bool mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, @@ -449,6 +458,46 @@ static bool mutex_optimistic_spin(struct mutex *lock, } #endif +#if !defined(CONFIG_MUTEX_SPIN_ON_OWNER) && defined(CONFIG_SMP) +#define MUTEX_WAKEUP_THRESHOLD 16 + +static inline void do_yield_to_waiter(struct mutex *lock, int *wakeups) +{ + *wakeups += 1; + + if (*wakeups < MUTEX_WAKEUP_THRESHOLD) + return; + + if (lock->yield_to_waiter != true) + lock->yield_to_waiter = true; +} + +static inline void clear_yield_to_waiter(struct mutex *lock) +{ + lock->yield_to_waiter = false; +} + +static inline bool need_yield_to_waiter(struct mutex *lock) +{ + return lock->yield_to_waiter; +} +#else +static inline void do_yield_to_waiter(struct mutex *lock, int *wakeups) +{ + return; +} + +static inline void clear_yield_to_waiter(struct mutex *lock) +{ + return; +} + +static inline bool need_yield_to_waiter(struct mutex *lock) +{ + return false; +} +#endif + __visible __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); @@ -554,6 +603,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, unsigned long flags; bool acquired = false; /* True if the lock is acquired */ int ret; + int wakeups = 0; if (use_ww_ctx) { struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); @@ -576,7 +626,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * Once more, try to acquire the lock. Only try-lock the mutex if * it is unlocked to reduce unnecessary xchg() operations. */ - if (!mutex_is_locked(lock) && + if (!need_yield_to_waiter(lock) && !mutex_is_locked(lock) && (atomic_xchg_acquire(&lock->count, 0) == 1)) goto skip_wait; @@ -600,9 +650,14 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * other waiters. We only attempt the xchg if the count is * non-negative in order to avoid unnecessary xchg operations: */ - if (atomic_read(&lock->count) >= 0 && - (atomic_xchg_acquire(&lock->count, -1) == 1)) + if ((!need_yield_to_waiter(lock) || wakeups > 1) && + atomic_read(&lock->count) >= 0 && + (atomic_xchg_acquire(&lock->count, -1) == 1)) { + if (wakeups > 1) + clear_yield_to_waiter(lock); + break; + } /* * got a signal? 
(This code gets eliminated in the @@ -631,6 +686,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, acquired = mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, true); spin_lock_mutex(&lock->wait_lock, flags); + do_yield_to_waiter(lock, &wakeups); } __set_task_state(task, TASK_RUNNING); @@ -843,10 +899,13 @@ __mutex_lock_interruptible_slowpath(struct mutex *lock); */ int __sched mutex_lock_interruptible(struct mutex *lock) { - int ret; + int ret = 1; might_sleep(); - ret = __mutex_fastpath_lock_retval(&lock->count); + + if (!need_yield_to_waiter(lock)) + ret = __mutex_fastpath_lock_retval(&lock->count); + if (likely(!ret)) { mutex_set_owner(lock); return 0; @@ -858,10 +917,13 @@ EXPORT_SYMBOL(mutex_lock_interruptible); int __sched mutex_lock_killable(struct mutex *lock) { - int ret; + int ret = 1; might_sleep(); - ret = __mutex_fastpath_lock_retval(&lock->count); + + if (!need_yield_to_waiter(lock)) + ret = __mutex_fastpath_lock_retval(&lock->count); + if (likely(!ret)) { mutex_set_owner(lock); return 0; @@ -971,11 +1033,12 @@ EXPORT_SYMBOL(mutex_trylock); int __sched __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - int ret; + int ret = 1; might_sleep(); - ret = __mutex_fastpath_lock_retval(&lock->base.count); + if (!need_yield_to_waiter(&lock->base)) + ret = __mutex_fastpath_lock_retval(&lock->base.count); if (likely(!ret)) { ww_mutex_set_context_fastpath(lock, ctx); @@ -989,11 +1052,12 @@ EXPORT_SYMBOL(__ww_mutex_lock); int __sched __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - int ret; + int ret = 1; might_sleep(); - ret = __mutex_fastpath_lock_retval(&lock->base.count); + if (!need_yield_to_waiter(&lock->base)) + ret = __mutex_fastpath_lock_retval(&lock->base.count); if (likely(!ret)) { ww_mutex_set_context_fastpath(lock, ctx); From afc11cca351fb7ac8868b348ecd346077ede72e0 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Sat, 15 Oct 2016 09:54:01 +0100 Subject: [PATCH 5/9] mm/vmalloc: Replace opencoded 4-level page walkers Rather than open-code the intricacies of walking the 4-level page tables, use the generic page table walker apply_to_page_range() instead. The important change is that it now cleans up after an unsuccessful insertion and propagates the correct error. The current failure may lead to a WARN if we encounter ENOMEM in one vmap_pte_range() and later retry with the same page range. WARNING: CPU: 0 PID: 605 at mm/vmalloc.c:136 vmap_page_range_noflush+0x2c1/0x340 i.e. 
WARN_ON(!pte_none(*pte)) v2: Don't convert the vunmap code over to apply_to_page_range() as it may try to allocate during atomic sections, such as exiting a task: [ 9.837563] [] pte_alloc_one_kernel+0x10/0x20 [ 9.837568] [] __pte_alloc_kernel+0x16/0xa0 [ 9.837572] [] apply_to_page_range+0x3f6/0x460 [ 9.837576] [] free_unmap_vmap_area_noflush+0x28/0x40 [ 9.837579] [] remove_vm_area+0x4d/0x60 [ 9.837582] [] __vunmap+0x29/0x130 [ 9.837585] [] vfree+0x3d/0x90 [ 9.837589] [] put_task_stack+0x76/0x130 References: https://bugs.freedesktop.org/show_bug.cgi?id=98269 Signed-off-by: Chris Wilson Cc: Andrew Morton Cc: David Rientjes Cc: Vladimir Davydov Cc: Johannes Weiner Cc: Mel Gorman Cc: Wang Xiaoqiang Cc: Jerome Marchand Cc: Joonsoo Kim Cc: linux-mm@kvack.org --- mm/vmalloc.c | 93 +++++++++++++++------------------------------------- 1 file changed, 27 insertions(+), 66 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f2481cb4e6b21..7e945c63c7efb 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -117,63 +117,27 @@ static void vunmap_page_range(unsigned long addr, unsigned long end) } while (pgd++, addr = next, addr != end); } -static int vmap_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, pgprot_t prot, struct page **pages, int *nr) -{ - pte_t *pte; - - /* - * nr is a running index into the array which helps higher level - * callers keep track of where we're up to. - */ - - pte = pte_alloc_kernel(pmd, addr); - if (!pte) - return -ENOMEM; - do { - struct page *page = pages[*nr]; - - if (WARN_ON(!pte_none(*pte))) - return -EBUSY; - if (WARN_ON(!page)) - return -ENOMEM; - set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); - (*nr)++; - } while (pte++, addr += PAGE_SIZE, addr != end); - return 0; -} +struct vmap_page { + pgprot_t prot; + struct page **pages; + unsigned long count; +}; -static int vmap_pmd_range(pud_t *pud, unsigned long addr, - unsigned long end, pgprot_t prot, struct page **pages, int *nr) +static int vmap_page(pte_t *pte, pgtable_t token, + unsigned long addr, void *data) { - pmd_t *pmd; - unsigned long next; - - pmd = pmd_alloc(&init_mm, pud, addr); - if (!pmd) - return -ENOMEM; - do { - next = pmd_addr_end(addr, end); - if (vmap_pte_range(pmd, addr, next, prot, pages, nr)) - return -ENOMEM; - } while (pmd++, addr = next, addr != end); - return 0; -} + struct vmap_page *v = data; + struct page *page; -static int vmap_pud_range(pgd_t *pgd, unsigned long addr, - unsigned long end, pgprot_t prot, struct page **pages, int *nr) -{ - pud_t *pud; - unsigned long next; + if (WARN_ON(!pte_none(*pte))) + return -EBUSY; - pud = pud_alloc(&init_mm, pgd, addr); - if (!pud) + page = v->pages[v->count]; + if (WARN_ON(!page)) return -ENOMEM; - do { - next = pud_addr_end(addr, end); - if (vmap_pmd_range(pud, addr, next, prot, pages, nr)) - return -ENOMEM; - } while (pud++, addr = next, addr != end); + + set_pte_at(&init_mm, addr, pte, mk_pte(page, v->prot)); + v->count++; return 0; } @@ -186,22 +150,19 @@ static int vmap_pud_range(pgd_t *pgd, unsigned long addr, static int vmap_page_range_noflush(unsigned long start, unsigned long end, pgprot_t prot, struct page **pages) { - pgd_t *pgd; - unsigned long next; - unsigned long addr = start; - int err = 0; - int nr = 0; + struct vmap_page v = { prot, pages }; + int err; - BUG_ON(addr >= end); - pgd = pgd_offset_k(addr); - do { - next = pgd_addr_end(addr, end); - err = vmap_pud_range(pgd, addr, next, prot, pages, &nr); - if (err) - return err; - } while (pgd++, addr = next, addr != end); + if ((end - start) >> PAGE_SHIFT > 
INT_MAX) + return -EINVAL; + + err = apply_to_page_range(&init_mm, start, end - start, vmap_page, &v); + if (unlikely(err)) { + vunmap_page_range(start, start + (v.count << PAGE_SHIFT)); + return err; + } - return nr; + return v.count; } static int vmap_page_range(unsigned long start, unsigned long end, From 8ac4058cc437f33cf5b273d22fa0856bb05827bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Wed, 19 Oct 2016 21:02:04 +0300 Subject: [PATCH 6/9] rtc: cmos: Don't enable interrupts in the middle of the interrupt handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using spin_lock_irq()/spin_unlock_irq() from within the interrupt handler is a no-no. Let's save/restore the flags to avoid turning on interrupts prematurely. We hit this in a bunch of our CI systems, but for whatever reason I wasn't able to reproduce on my own machine, so this fix is just based on the backtrace. [ 202.634918] WARNING: CPU: 0 PID: 0 at kernel/locking/lockdep.c:2729 trace_hardirqs_on_caller+0x113/0x1b0 [ 202.634919] DEBUG_LOCKS_WARN_ON(current->hardirq_context) [ 202.634929] Modules linked in: snd_hda_intel i915 x86_pkg_temp_thermal intel_powerclamp coretemp crct10dif_pclmul crc32_pclmul ghash_clmulni_intel lpc_ich snd_hda_codec_realtek snd_hda_codec_generic snd_hda_codec_hdmi snd_hda_codec snd_hwdep i2c_designware_platform i2c_designware_core snd_hda_core mei_me mei snd_pcm r8169 mii sdhci_acpi sdhci mmc_core i2c_hid [last unloaded: i915] [ 202.634930] CPU: 0 PID: 0 Comm: swapper/0 Tainted: G U 4.9.0-rc1-CI-CI_DRM_1734+ #1 [ 202.634931] Hardware name: GIGABYTE M4HM87P-00/M4HM87P-00, BIOS F6 12/10/2014 [ 202.634933] ffff88011ea03d68 ffffffff8142dce5 ffff88011ea03db8 0000000000000000 [ 202.634934] ffff88011ea03da8 ffffffff8107e496 00000aa900000002 ffffffff81e249a0 [ 202.634935] ffffffff81815637 ffffffff82e7c280 0000000000000000 0000000000000004 [ 202.634936] Call Trace: [ 202.634939] [ 202.634939] [] dump_stack+0x67/0x92 [ 202.634941] [] __warn+0xc6/0xe0 [ 202.634944] [] ? _raw_spin_unlock_irq+0x27/0x50 [ 202.634945] [] warn_slowpath_fmt+0x4a/0x50 [ 202.634946] [] trace_hardirqs_on_caller+0x113/0x1b0 [ 202.634948] [] trace_hardirqs_on+0xd/0x10 [ 202.634949] [] _raw_spin_unlock_irq+0x27/0x50 [ 202.634951] [] rtc_handler+0x32/0xa0 [ 202.634954] [] acpi_ev_fixed_event_detect+0xd4/0xfb [ 202.634956] [] acpi_ev_sci_xrupt_handler+0xf/0x2d [ 202.634957] [] acpi_irq+0x11/0x2c [ 202.634960] [] __handle_irq_event_percpu+0x58/0x370 [ 202.634961] [] handle_irq_event_percpu+0x1e/0x50 [ 202.634962] [] handle_irq_event+0x34/0x60 [ 202.634963] [] handle_fasteoi_irq+0xa6/0x170 [ 202.634966] [] handle_irq+0x15/0x20 [ 202.634967] [] do_IRQ+0x68/0x130 [ 202.634968] [] common_interrupt+0x89/0x89 [ 202.634970] [ 202.634970] [] ? mwait_idle+0x93/0x210 [ 202.634971] [] ? 
mwait_idle+0x8a/0x210 [ 202.634972] [] arch_cpu_idle+0xa/0x10 [ 202.634973] [] default_idle_call+0x1e/0x30 [ 202.634974] [] cpu_startup_entry+0x17c/0x1f0 [ 202.634976] [] rest_init+0x127/0x130 [ 202.634978] [] start_kernel+0x3f6/0x403 [ 202.634980] [] x86_64_start_reservations+0x2a/0x2c [ 202.634981] [] x86_64_start_kernel+0x173/0x186 [ 202.634982] ---[ end trace 293c99618fa08d34 ]--- Cc: Gabriele Mazzotta Cc: Alexandre Belloni Fixes: 983bf1256edb ("rtc: cmos: Clear ACPI-driven alarms upon resume") Signed-off-by: Ville Syrjälä Signed-off-by: Chris Wilson Link: http://patchwork.freedesktop.org/patch/msgid/1476900124-14405-1-git-send-email-ville.syrjala@linux.intel.com --- drivers/rtc/rtc-cmos.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index dd3d59806ffa0..19cd49ad92dc3 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -996,8 +996,9 @@ static u32 rtc_handler(void *context) struct cmos_rtc *cmos = dev_get_drvdata(dev); unsigned char rtc_control = 0; unsigned char rtc_intr; + unsigned long flags; - spin_lock_irq(&rtc_lock); + spin_lock_irqsave(&rtc_lock, flags); if (cmos_rtc.suspend_ctrl) rtc_control = CMOS_READ(RTC_CONTROL); if (rtc_control & RTC_AIE) { @@ -1006,7 +1007,7 @@ static u32 rtc_handler(void *context) rtc_intr = CMOS_READ(RTC_INTR_FLAGS); rtc_update_irq(cmos->rtc, 1, rtc_intr); } - spin_unlock_irq(&rtc_lock); + spin_unlock_irqrestore(&rtc_lock, flags); pm_wakeup_event(dev, 0); acpi_clear_event(ACPI_EVENT_RTC); From 15bdcbcc55853f70237aca4baa4152f8396799d4 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 20 Oct 2016 12:02:36 +0100 Subject: [PATCH 7/9] kbuild: Disable PIE by default debian/ubuntu gcc changed it defaults to enable PIE breaking the kernel build. 
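For reference, one quick way to tell whether a given compiler is affected is to check the __PIE__ predefine, which gcc sets whenever -fPIE/-fpie is in effect (including toolchains built with --enable-default-pie). The file name below is only an example:

    /* pie-check.c: report whether this compiler builds PIE by default */
    #include <stdio.h>

    int main(void)
    {
    #ifdef __PIE__
            printf("PIE enabled by default (__PIE__ = %d)\n", __PIE__);
    #else
            printf("PIE not enabled by default\n");
    #endif
            return 0;
    }
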
https://patchwork.ozlabs.org/patch/616621/ --- Makefile | 5 +++++ arch/x86/Makefile | 1 + arch/x86/entry/vdso/Makefile | 2 ++ 3 files changed, 8 insertions(+) diff --git a/Makefile b/Makefile index a2650f9c6a259..870e32fbfec07 100644 --- a/Makefile +++ b/Makefile @@ -641,6 +641,11 @@ endif # Tell gcc to never replace conditional load with a non-conditional one KBUILD_CFLAGS += $(call cc-option,--param=allow-store-data-races=0) +# force no-pie for distro compilers that enable pie by default +KBUILD_CFLAGS += $(call cc-option, -fno-pie) +KBUILD_CFLAGS += $(call cc-option, -no-pie) +KBUILD_AFLAGS += $(call cc-option, -fno-pie) + include scripts/Makefile.gcc-plugins ifdef CONFIG_READABLE_ASM diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 2d449337a3605..7ee0422b97d34 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -27,6 +27,7 @@ REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -D__KERNEL__ \ -mno-mmx -mno-sse \ $(call cc-option, -ffreestanding) \ $(call cc-option, -fno-stack-protector) \ + $(call cc-option, -fno-pie) \ $(call cc-option, -mpreferred-stack-boundary=2) export REALMODE_CFLAGS diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index d5409660f5de6..d9a46a8e845d9 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -73,6 +73,7 @@ $(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \ -fno-omit-frame-pointer -foptimize-sibling-calls \ + $(call cc-option, -fno-pie) \ -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO $(vobjs): KBUILD_CFLAGS := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) @@ -147,6 +148,7 @@ KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic +KBUILD_CFLAGS_32 += $(call cc-option, -no-pie) KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector) KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) KBUILD_CFLAGS_32 += -fno-omit-frame-pointer From 7fc9e0c9ebb67261bc86264eb7592a9e8d12e7fe Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 17 Nov 2016 20:31:45 +0000 Subject: [PATCH 8/9] drm-intel-nightly: 2016y-11m-17d-20h-31m-28s UTC integration manifest --- integration-manifest | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 integration-manifest diff --git a/integration-manifest b/integration-manifest new file mode 100644 index 0000000000000..5785d41010805 --- /dev/null +++ b/integration-manifest @@ -0,0 +1,20 @@ +origin/drm-intel-fixes bc9db5ad3253c8e17969bd802c47b73e63f125ab + drm/i915: Assume non-DP++ port if dvo_port is HDMI and there's no AUX ch specified in the VBT +drm-upstream/drm-fixes 51a4c38a5511c0027c54d330f7dd2239f6c95b82 + Merge branch 'mediatek-drm-fixes-2016-11-11' of https://github.com/ckhu-mediatek/linux.git-tags into drm-fixes +origin/drm-intel-next-fixes 105f1a65b04a8f4f7abec11b200b1fb54f3d4b46 + drm/i915: Fix conflict resolution from backmerge of v4.8-rc8 to drm-next +origin/drm-intel-next-queued 6288c79ea519f6ca3a54d87974b48ca6a8b986af + drm/i915: Add a few more sanity checks for stolen handling +drm-upstream/drm-next b7c0e47d98249c2ddf21ea197b651093c6aaee00 + Merge tag 'drm-vc4-next-2016-11-16' of https://github.com/anholt/linux into drm-next 
+sound-upstream/for-next 43e575fabbaa1cc9e25dd0bb31eb6657ac7092ec + ALSA: core: Fix kernel-doc warnings +sound-upstream/for-linus 6ff1a25318ebf688ef9593fe09cd449f6fb4ad31 + ALSA: usb-audio: Fix use-after-free of usb_device at disconnect +origin/topic/drm-fixes c6935931c1894ff857616ff8549b61236a19148f + Linux 4.8-rc5 +origin/topic/drm-misc 27bfa74b9029a4205b16ec8c3a7090287e6e9995 + drm: also move DSI panels to the front of the connector list +origin/topic/core-for-CI 15bdcbcc55853f70237aca4baa4152f8396799d4 + kbuild: Disable PIE by default From cc31ead1995c679e7b751be0711054543dca8746 Mon Sep 17 00:00:00 2001 From: Jike Song Date: Fri, 18 Nov 2016 14:33:29 +0800 Subject: [PATCH 9/9] drm/i915/gvt: add vfio/mdev support to kvmgt KVMGT requires vfio/mdev to mediate device opreations such as MMIO read/write. KVMGT provides a intel_vgpu_ops to mdev framework, whenever device operation happens, the framework calls into methods provided by us. There are several types of userspace API for a vfio device, among which intel vgpu maintains compatibility with 'vfio_pci'. v2: - fix inject_msi return type; - return error instead of 0 if handle invalid - fix a build warning Signed-off-by: Kevin Tian Signed-off-by: Xiaoguang Chen Signed-off-by: Jike Song Signed-off-by: Zhenyu Wang --- drivers/gpu/drm/i915/Kconfig | 1 + drivers/gpu/drm/i915/gvt/gvt.h | 6 +- drivers/gpu/drm/i915/gvt/kvmgt.c | 862 +++++++++++++++++++++++++++++-- 3 files changed, 829 insertions(+), 40 deletions(-) diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig index 5ddde7349fbde..183f5dc1c3f22 100644 --- a/drivers/gpu/drm/i915/Kconfig +++ b/drivers/gpu/drm/i915/Kconfig @@ -116,6 +116,7 @@ config DRM_I915_GVT_KVMGT tristate "Enable KVM/VFIO support for Intel GVT-g" depends on DRM_I915_GVT depends on KVM + depends on VFIO_MDEV && VFIO_MDEV_DEVICE default n help Choose this option if you want to enable KVMGT support for diff --git a/drivers/gpu/drm/i915/gvt/gvt.h b/drivers/gpu/drm/i915/gvt/gvt.h index 3d4223e8ebe3a..b9d4a8db077be 100644 --- a/drivers/gpu/drm/i915/gvt/gvt.h +++ b/drivers/gpu/drm/i915/gvt/gvt.h @@ -164,15 +164,15 @@ struct intel_vgpu { #if IS_ENABLED(CONFIG_DRM_I915_GVT_KVMGT) struct { - struct device *mdev; + struct mdev_device *mdev; struct vfio_region *region; int num_regions; struct eventfd_ctx *intx_trigger; struct eventfd_ctx *msi_trigger; struct rb_root cache; struct mutex cache_lock; - void *vfio_group; - struct notifier_block iommu_notifier; + struct kvm *kvm; + struct work_struct release_work; } vdev; #endif }; diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index dc03650331574..82f5314e5114c 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -39,24 +39,13 @@ #include #include #include +#include #include "i915_drv.h" #include "gvt.h" -static inline long kvmgt_pin_pages(struct device *dev, unsigned long *user_pfn, - long npage, int prot, unsigned long *phys_pfn) -{ - return 0; -} -static inline long kvmgt_unpin_pages(struct device *dev, unsigned long *pfn, - long npage) -{ - return 0; -} - static const struct intel_gvt_ops *intel_gvt_ops; - /* helper macros copied from vfio-pci */ #define VFIO_PCI_OFFSET_SHIFT 40 #define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT) @@ -90,6 +79,15 @@ struct gvt_dma { kvm_pfn_t pfn; }; +static inline bool handle_valid(unsigned long handle) +{ + return !!(handle & ~0xff); +} + +static int kvmgt_guest_init(struct mdev_device *mdev); +static void intel_vgpu_release_work(struct 
work_struct *work); +static bool kvmgt_guest_exit(struct kvmgt_guest_info *info); + static struct gvt_dma *__gvt_cache_find(struct intel_vgpu *vgpu, gfn_t gfn) { struct rb_node *node = vgpu->vdev.cache.rb_node; @@ -167,9 +165,9 @@ static void __gvt_cache_remove_entry(struct intel_vgpu *vgpu, static void gvt_cache_remove(struct intel_vgpu *vgpu, gfn_t gfn) { - struct device *dev = vgpu->vdev.mdev; + struct device *dev = &vgpu->vdev.mdev->dev; struct gvt_dma *this; - unsigned long pfn; + unsigned long g1; mutex_lock(&vgpu->vdev.cache_lock); this = __gvt_cache_find(vgpu, gfn); @@ -178,8 +176,8 @@ static void gvt_cache_remove(struct intel_vgpu *vgpu, gfn_t gfn) return; } - pfn = this->pfn; - WARN_ON((kvmgt_unpin_pages(dev, &pfn, 1) != 1)); + g1 = gfn; + WARN_ON((vfio_unpin_pages(dev, &g1, 1) != 1)); __gvt_cache_remove_entry(vgpu, this); mutex_unlock(&vgpu->vdev.cache_lock); } @@ -194,15 +192,15 @@ static void gvt_cache_destroy(struct intel_vgpu *vgpu) { struct gvt_dma *dma; struct rb_node *node = NULL; - struct device *dev = vgpu->vdev.mdev; - unsigned long pfn; + struct device *dev = &vgpu->vdev.mdev->dev; + unsigned long g1; mutex_lock(&vgpu->vdev.cache_lock); while ((node = rb_first(&vgpu->vdev.cache))) { dma = rb_entry(node, struct gvt_dma, node); - pfn = dma->pfn; + g1 = dma->gfn; - kvmgt_unpin_pages(dev, &pfn, 1); + vfio_unpin_pages(dev, &g1, 1); __gvt_cache_remove_entry(vgpu, dma); } mutex_unlock(&vgpu->vdev.cache_lock); @@ -226,7 +224,53 @@ static struct intel_vgpu_type *intel_gvt_find_vgpu_type(struct intel_gvt *gvt, return NULL; } +static ssize_t available_instance_show(struct kobject *kobj, struct device *dev, + char *buf) +{ + struct intel_vgpu_type *type; + unsigned int num = 0; + void *gvt = kdev_to_i915(dev)->gvt; + + type = intel_gvt_find_vgpu_type(gvt, kobject_name(kobj)); + if (!type) + num = 0; + else + num = type->avail_instance; + + return sprintf(buf, "%u\n", num); +} + +static ssize_t device_api_show(struct kobject *kobj, struct device *dev, + char *buf) +{ + return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING); +} + +static ssize_t description_show(struct kobject *kobj, struct device *dev, + char *buf) +{ + struct intel_vgpu_type *type; + void *gvt = kdev_to_i915(dev)->gvt; + + type = intel_gvt_find_vgpu_type(gvt, kobject_name(kobj)); + if (!type) + return 0; + + return sprintf(buf, "low_gm_size: %dMB\nhigh_gm_size: %dMB\n" + "fence: %d\n", + BYTES_TO_MB(type->low_gm_size), + BYTES_TO_MB(type->high_gm_size), + type->fence); +} + +static MDEV_TYPE_ATTR_RO(available_instance); +static MDEV_TYPE_ATTR_RO(device_api); +static MDEV_TYPE_ATTR_RO(description); + static struct attribute *type_attrs[] = { + &mdev_type_attr_available_instance.attr, + &mdev_type_attr_device_api.attr, + &mdev_type_attr_description.attr, NULL, }; @@ -322,7 +366,7 @@ static void kvmgt_protect_table_add(struct kvmgt_guest_info *info, gfn_t gfn) if (kvmgt_gfn_is_write_protected(info, gfn)) return; - p = kmalloc(sizeof(struct kvmgt_pgfn), GFP_ATOMIC); + p = kzalloc(sizeof(struct kvmgt_pgfn), GFP_ATOMIC); if (WARN(!p, "gfn: 0x%llx\n", gfn)) return; @@ -342,6 +386,646 @@ static void kvmgt_protect_table_del(struct kvmgt_guest_info *info, } } +static int intel_vgpu_create(struct kobject *kobj, struct mdev_device *mdev) +{ + struct intel_vgpu *vgpu; + struct intel_vgpu_type *type; + struct device *pdev; + void *gvt; + + pdev = mdev->parent->dev; + gvt = kdev_to_i915(pdev)->gvt; + + type = intel_gvt_find_vgpu_type(gvt, kobject_name(kobj)); + if (!type) { + gvt_err("failed to find type %s to create\n", + 
kobject_name(kobj)); + return -EINVAL; + } + + vgpu = intel_gvt_ops->vgpu_create(gvt, type); + if (IS_ERR_OR_NULL(vgpu)) { + gvt_err("create intel vgpu failed\n"); + return -EINVAL; + } + + INIT_WORK(&vgpu->vdev.release_work, intel_vgpu_release_work); + + vgpu->vdev.mdev = mdev; + mdev_set_drvdata(mdev, vgpu); + + gvt_dbg_core("intel_vgpu_create succeeded for mdev: %s\n", + dev_name(&mdev->dev)); + return 0; +} + +static int intel_vgpu_remove(struct mdev_device *mdev) +{ + struct intel_vgpu *vgpu = mdev_get_drvdata(mdev); + + if (handle_valid(vgpu->handle)) + return -EBUSY; + + intel_gvt_ops->vgpu_destroy(vgpu); + return 0; +} + +static int intel_vgpu_open(struct mdev_device *mdev) +{ + return kvmgt_guest_init(mdev); +} + +static void __intel_vgpu_release(struct intel_vgpu *vgpu) +{ + struct kvmgt_guest_info *info; + + if (!handle_valid(vgpu->handle)) + return; + + info = (struct kvmgt_guest_info *)vgpu->handle; + kvmgt_guest_exit(info); + vgpu->handle = 0; +} + +static void intel_vgpu_release(struct mdev_device *mdev) +{ + struct intel_vgpu *vgpu = mdev_get_drvdata(mdev); + + __intel_vgpu_release(vgpu); +} + +static void intel_vgpu_release_work(struct work_struct *work) +{ + struct intel_vgpu *vgpu = container_of(work, struct intel_vgpu, + vdev.release_work); + __intel_vgpu_release(vgpu); +} + +static uint64_t intel_vgpu_get_bar0_addr(struct intel_vgpu *vgpu) +{ + u32 start_lo, start_hi; + u32 mem_type; + int pos = PCI_BASE_ADDRESS_0; + + start_lo = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + pos)) & + PCI_BASE_ADDRESS_MEM_MASK; + mem_type = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + pos)) & + PCI_BASE_ADDRESS_MEM_TYPE_MASK; + + switch (mem_type) { + case PCI_BASE_ADDRESS_MEM_TYPE_64: + start_hi = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + + pos + 4)); + break; + case PCI_BASE_ADDRESS_MEM_TYPE_32: + case PCI_BASE_ADDRESS_MEM_TYPE_1M: + /* 1M mem BAR treated as 32-bit BAR */ + default: + /* mem unknown type treated as 32-bit BAR */ + start_hi = 0; + break; + } + + return ((u64)start_hi << 32) | start_lo; +} + +static ssize_t intel_vgpu_rw(struct mdev_device *mdev, char *buf, + size_t count, loff_t *ppos, bool is_write) +{ + struct intel_vgpu *vgpu = mdev_get_drvdata(mdev); + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + uint64_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + int ret = -EINVAL; + + + if (index >= VFIO_PCI_NUM_REGIONS) { + gvt_err("invalid index: %u\n", index); + return -EINVAL; + } + + switch (index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + if (is_write) + ret = intel_gvt_ops->emulate_cfg_write(vgpu, pos, + buf, count); + else + ret = intel_gvt_ops->emulate_cfg_read(vgpu, pos, + buf, count); + break; + case VFIO_PCI_BAR0_REGION_INDEX: + case VFIO_PCI_BAR1_REGION_INDEX: + if (is_write) { + uint64_t bar0_start = intel_vgpu_get_bar0_addr(vgpu); + + ret = intel_gvt_ops->emulate_mmio_write(vgpu, + bar0_start + pos, buf, count); + } else { + uint64_t bar0_start = intel_vgpu_get_bar0_addr(vgpu); + + ret = intel_gvt_ops->emulate_mmio_read(vgpu, + bar0_start + pos, buf, count); + } + break; + case VFIO_PCI_BAR2_REGION_INDEX: + case VFIO_PCI_BAR3_REGION_INDEX: + case VFIO_PCI_BAR4_REGION_INDEX: + case VFIO_PCI_BAR5_REGION_INDEX: + case VFIO_PCI_VGA_REGION_INDEX: + case VFIO_PCI_ROM_REGION_INDEX: + default: + gvt_err("unsupported region: %u\n", index); + } + + return ret == 0 ? 
count : ret; +} + +static ssize_t intel_vgpu_read(struct mdev_device *mdev, char __user *buf, + size_t count, loff_t *ppos) +{ + unsigned int done = 0; + int ret; + + while (count) { + size_t filled; + + if (count >= 4 && !(*ppos % 4)) { + u32 val; + + ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val), + ppos, false); + if (ret <= 0) + goto read_err; + + if (copy_to_user(buf, &val, sizeof(val))) + goto read_err; + + filled = 4; + } else if (count >= 2 && !(*ppos % 2)) { + u16 val; + + ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val), + ppos, false); + if (ret <= 0) + goto read_err; + + if (copy_to_user(buf, &val, sizeof(val))) + goto read_err; + + filled = 2; + } else { + u8 val; + + ret = intel_vgpu_rw(mdev, &val, sizeof(val), ppos, + false); + if (ret <= 0) + goto read_err; + + if (copy_to_user(buf, &val, sizeof(val))) + goto read_err; + + filled = 1; + } + + count -= filled; + done += filled; + *ppos += filled; + buf += filled; + } + + return done; + +read_err: + return -EFAULT; +} + +static ssize_t intel_vgpu_write(struct mdev_device *mdev, + const char __user *buf, + size_t count, loff_t *ppos) +{ + unsigned int done = 0; + int ret; + + while (count) { + size_t filled; + + if (count >= 4 && !(*ppos % 4)) { + u32 val; + + if (copy_from_user(&val, buf, sizeof(val))) + goto write_err; + + ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val), + ppos, true); + if (ret <= 0) + goto write_err; + + filled = 4; + } else if (count >= 2 && !(*ppos % 2)) { + u16 val; + + if (copy_from_user(&val, buf, sizeof(val))) + goto write_err; + + ret = intel_vgpu_rw(mdev, (char *)&val, + sizeof(val), ppos, true); + if (ret <= 0) + goto write_err; + + filled = 2; + } else { + u8 val; + + if (copy_from_user(&val, buf, sizeof(val))) + goto write_err; + + ret = intel_vgpu_rw(mdev, &val, sizeof(val), + ppos, true); + if (ret <= 0) + goto write_err; + + filled = 1; + } + + count -= filled; + done += filled; + *ppos += filled; + buf += filled; + } + + return done; +write_err: + return -EFAULT; +} + +static int intel_vgpu_mmap(struct mdev_device *mdev, struct vm_area_struct *vma) +{ + unsigned int index; + u64 virtaddr; + unsigned long req_size, pgoff = 0; + pgprot_t pg_prot; + struct intel_vgpu *vgpu = mdev_get_drvdata(mdev); + + index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); + if (index >= VFIO_PCI_ROM_REGION_INDEX) + return -EINVAL; + + if (vma->vm_end < vma->vm_start) + return -EINVAL; + if ((vma->vm_flags & VM_SHARED) == 0) + return -EINVAL; + if (index != VFIO_PCI_BAR2_REGION_INDEX) + return -EINVAL; + + pg_prot = vma->vm_page_prot; + virtaddr = vma->vm_start; + req_size = vma->vm_end - vma->vm_start; + pgoff = vgpu_aperture_pa_base(vgpu) >> PAGE_SHIFT; + + return remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot); +} + +static int intel_vgpu_get_irq_count(struct intel_vgpu *vgpu, int type) +{ + if (type == VFIO_PCI_INTX_IRQ_INDEX || type == VFIO_PCI_MSI_IRQ_INDEX) + return 1; + + return 0; +} + +static int intel_vgpu_set_intx_mask(struct intel_vgpu *vgpu, unsigned index, + unsigned start, unsigned count, uint32_t flags, + void *data) +{ + return 0; +} + +static int intel_vgpu_set_intx_unmask(struct intel_vgpu *vgpu, + unsigned int index, unsigned int start, unsigned int count, + uint32_t flags, void *data) +{ + return 0; +} + +static int intel_vgpu_set_intx_trigger(struct intel_vgpu *vgpu, + unsigned int index, unsigned int start, unsigned int count, + uint32_t flags, void *data) +{ + return 0; +} + +static int intel_vgpu_set_msi_trigger(struct intel_vgpu *vgpu, + unsigned int 
index, unsigned int start, unsigned int count, + uint32_t flags, void *data) +{ + struct eventfd_ctx *trigger; + + if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { + int fd = *(int *)data; + + trigger = eventfd_ctx_fdget(fd); + if (IS_ERR(trigger)) { + gvt_err("eventfd_ctx_fdget failed\n"); + return PTR_ERR(trigger); + } + vgpu->vdev.msi_trigger = trigger; + } + + return 0; +} + +static int intel_vgpu_set_irqs(struct intel_vgpu *vgpu, uint32_t flags, + unsigned int index, unsigned int start, unsigned int count, + void *data) +{ + int (*func)(struct intel_vgpu *vgpu, unsigned index, + unsigned start, unsigned count, uint32_t flags, + void *data) = NULL; + + switch (index) { + case VFIO_PCI_INTX_IRQ_INDEX: + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { + case VFIO_IRQ_SET_ACTION_MASK: + func = intel_vgpu_set_intx_mask; + break; + case VFIO_IRQ_SET_ACTION_UNMASK: + func = intel_vgpu_set_intx_unmask; + break; + case VFIO_IRQ_SET_ACTION_TRIGGER: + func = intel_vgpu_set_intx_trigger; + break; + } + break; + case VFIO_PCI_MSI_IRQ_INDEX: + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { + case VFIO_IRQ_SET_ACTION_MASK: + case VFIO_IRQ_SET_ACTION_UNMASK: + /* XXX Need masking support exported */ + break; + case VFIO_IRQ_SET_ACTION_TRIGGER: + func = intel_vgpu_set_msi_trigger; + break; + } + break; + } + + if (!func) + return -ENOTTY; + + return func(vgpu, index, start, count, flags, data); +} + +static long intel_vgpu_ioctl(struct mdev_device *mdev, unsigned int cmd, + unsigned long arg) +{ + struct intel_vgpu *vgpu = mdev_get_drvdata(mdev); + unsigned long minsz; + + gvt_dbg_core("vgpu%d ioctl, cmd: %d\n", vgpu->id, cmd); + + if (cmd == VFIO_DEVICE_GET_INFO) { + struct vfio_device_info info; + + minsz = offsetofend(struct vfio_device_info, num_irqs); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + info.flags = VFIO_DEVICE_FLAGS_PCI; + info.flags |= VFIO_DEVICE_FLAGS_RESET; + info.num_regions = VFIO_PCI_NUM_REGIONS; + info.num_irqs = VFIO_PCI_NUM_IRQS; + + return copy_to_user((void __user *)arg, &info, minsz) ? 
+ -EFAULT : 0; + + } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { + struct vfio_region_info info; + struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; + int i, ret; + struct vfio_region_info_cap_sparse_mmap *sparse = NULL; + size_t size; + int nr_areas = 1; + int cap_type_id; + + minsz = offsetofend(struct vfio_region_info, offset); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + switch (info.index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = INTEL_GVT_MAX_CFG_SPACE_SZ; + info.flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + break; + case VFIO_PCI_BAR0_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = vgpu->cfg_space.bar[info.index].size; + if (!info.size) { + info.flags = 0; + break; + } + + info.flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + break; + case VFIO_PCI_BAR1_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = 0; + info.flags = 0; + break; + case VFIO_PCI_BAR2_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.flags = VFIO_REGION_INFO_FLAG_CAPS | + VFIO_REGION_INFO_FLAG_MMAP | + VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + info.size = gvt_aperture_sz(vgpu->gvt); + + size = sizeof(*sparse) + + (nr_areas * sizeof(*sparse->areas)); + sparse = kzalloc(size, GFP_KERNEL); + if (!sparse) + return -ENOMEM; + + sparse->nr_areas = nr_areas; + cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; + sparse->areas[0].offset = + PAGE_ALIGN(vgpu_aperture_offset(vgpu)); + sparse->areas[0].size = vgpu_aperture_sz(vgpu); + if (!caps.buf) { + kfree(caps.buf); + caps.buf = NULL; + caps.size = 0; + } + break; + + case VFIO_PCI_BAR3_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = 0; + + info.flags = 0; + gvt_dbg_core("get region info bar:%d\n", info.index); + break; + + case VFIO_PCI_ROM_REGION_INDEX: + case VFIO_PCI_VGA_REGION_INDEX: + gvt_dbg_core("get region info index:%d\n", info.index); + break; + default: + { + struct vfio_region_info_cap_type cap_type; + + if (info.index >= VFIO_PCI_NUM_REGIONS + + vgpu->vdev.num_regions) + return -EINVAL; + + i = info.index - VFIO_PCI_NUM_REGIONS; + + info.offset = + VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = vgpu->vdev.region[i].size; + info.flags = vgpu->vdev.region[i].flags; + + cap_type.type = vgpu->vdev.region[i].type; + cap_type.subtype = vgpu->vdev.region[i].subtype; + + ret = vfio_info_add_capability(&caps, + VFIO_REGION_INFO_CAP_TYPE, + &cap_type); + if (ret) + return ret; + } + } + + if ((info.flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) { + switch (cap_type_id) { + case VFIO_REGION_INFO_CAP_SPARSE_MMAP: + ret = vfio_info_add_capability(&caps, + VFIO_REGION_INFO_CAP_SPARSE_MMAP, + sparse); + kfree(sparse); + if (ret) + return ret; + break; + default: + return -EINVAL; + } + } + + if (caps.size) { + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + info.cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user((void __user *)arg + + sizeof(info), caps.buf, + caps.size)) { + kfree(caps.buf); + return -EFAULT; + } + info.cap_offset = sizeof(info); + } + + kfree(caps.buf); + } + + return copy_to_user((void __user *)arg, &info, minsz) ? 
+ -EFAULT : 0; + } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { + struct vfio_irq_info info; + + minsz = offsetofend(struct vfio_irq_info, count); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS) + return -EINVAL; + + switch (info.index) { + case VFIO_PCI_INTX_IRQ_INDEX: + case VFIO_PCI_MSI_IRQ_INDEX: + break; + default: + return -EINVAL; + } + + info.flags = VFIO_IRQ_INFO_EVENTFD; + + info.count = intel_vgpu_get_irq_count(vgpu, info.index); + + if (info.index == VFIO_PCI_INTX_IRQ_INDEX) + info.flags |= (VFIO_IRQ_INFO_MASKABLE | + VFIO_IRQ_INFO_AUTOMASKED); + else + info.flags |= VFIO_IRQ_INFO_NORESIZE; + + return copy_to_user((void __user *)arg, &info, minsz) ? + -EFAULT : 0; + } else if (cmd == VFIO_DEVICE_SET_IRQS) { + struct vfio_irq_set hdr; + u8 *data = NULL; + int ret = 0; + size_t data_size = 0; + + minsz = offsetofend(struct vfio_irq_set, count); + + if (copy_from_user(&hdr, (void __user *)arg, minsz)) + return -EFAULT; + + if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) { + int max = intel_vgpu_get_irq_count(vgpu, hdr.index); + + ret = vfio_set_irqs_validate_and_prepare(&hdr, max, + VFIO_PCI_NUM_IRQS, &data_size); + if (ret) { + gvt_err("intel:vfio_set_irqs_validate_and_prepare failed\n"); + return -EINVAL; + } + if (data_size) { + data = memdup_user((void __user *)(arg + minsz), + data_size); + if (IS_ERR(data)) + return PTR_ERR(data); + } + } + + ret = intel_vgpu_set_irqs(vgpu, hdr.flags, hdr.index, + hdr.start, hdr.count, data); + kfree(data); + + return ret; + } else if (cmd == VFIO_DEVICE_RESET) { + intel_gvt_ops->vgpu_reset(vgpu); + return 0; + } + + return 0; +} + +static const struct parent_ops intel_vgpu_ops = { + .supported_type_groups = intel_vgpu_type_groups, + .create = intel_vgpu_create, + .remove = intel_vgpu_remove, + + .open = intel_vgpu_open, + .release = intel_vgpu_release, + + .read = intel_vgpu_read, + .write = intel_vgpu_write, + .mmap = intel_vgpu_mmap, + .ioctl = intel_vgpu_ioctl, +}; + static int kvmgt_host_init(struct device *dev, void *gvt, const void *ops) { if (!intel_gvt_init_vgpu_type_groups(gvt)) @@ -349,22 +1033,28 @@ static int kvmgt_host_init(struct device *dev, void *gvt, const void *ops) intel_gvt_ops = ops; - /* MDEV is not yet available */ - return -ENODEV; + return mdev_register_device(dev, &intel_vgpu_ops); } static void kvmgt_host_exit(struct device *dev, void *gvt) { intel_gvt_cleanup_vgpu_type_groups(gvt); + mdev_unregister_device(dev); } static int kvmgt_write_protect_add(unsigned long handle, u64 gfn) { - struct kvmgt_guest_info *info = (struct kvmgt_guest_info *)handle; - struct kvm *kvm = info->kvm; + struct kvmgt_guest_info *info; + struct kvm *kvm; struct kvm_memory_slot *slot; int idx; + if (!handle_valid(handle)) + return -ESRCH; + + info = (struct kvmgt_guest_info *)handle; + kvm = info->kvm; + idx = srcu_read_lock(&kvm->srcu); slot = gfn_to_memslot(kvm, gfn); @@ -384,11 +1074,17 @@ static int kvmgt_write_protect_add(unsigned long handle, u64 gfn) static int kvmgt_write_protect_remove(unsigned long handle, u64 gfn) { - struct kvmgt_guest_info *info = (struct kvmgt_guest_info *)handle; - struct kvm *kvm = info->kvm; + struct kvmgt_guest_info *info; + struct kvm *kvm; struct kvm_memory_slot *slot; int idx; + if (!handle_valid(handle)) + return 0; + + info = (struct kvmgt_guest_info *)handle; + kvm = info->kvm; + idx = srcu_read_lock(&kvm->srcu); slot = gfn_to_memslot(kvm, gfn); @@ -476,6 +1172,85 @@ static int kvmgt_detect_host(void) return 
kvmgt_check_guest() ? -ENODEV : 0; } +static bool __kvmgt_vgpu_exist(struct intel_vgpu *vgpu, struct kvm *kvm) +{ + struct intel_vgpu *itr; + struct kvmgt_guest_info *info; + int id; + bool ret = false; + + mutex_lock(&vgpu->gvt->lock); + for_each_active_vgpu(vgpu->gvt, itr, id) { + if (!handle_valid(itr->handle)) + continue; + + info = (struct kvmgt_guest_info *)itr->handle; + if (kvm && kvm == info->kvm) { + ret = true; + goto out; + } + } +out: + mutex_unlock(&vgpu->gvt->lock); + return ret; +} + +static int kvmgt_guest_init(struct mdev_device *mdev) +{ + struct kvmgt_guest_info *info; + struct intel_vgpu *vgpu; + struct kvm *kvm; + + vgpu = mdev_get_drvdata(mdev); + if (handle_valid(vgpu->handle)) + return -EEXIST; + + kvm = vgpu->vdev.kvm; + if (!kvm || kvm->mm != current->mm) { + gvt_err("KVM is required to use Intel vGPU\n"); + return -ESRCH; + } + + if (__kvmgt_vgpu_exist(vgpu, kvm)) + return -EEXIST; + + info = vzalloc(sizeof(struct kvmgt_guest_info)); + if (!info) + return -ENOMEM; + + vgpu->handle = (unsigned long)info; + info->vgpu = vgpu; + info->kvm = kvm; + + kvmgt_protect_table_init(info); + gvt_cache_init(vgpu); + + info->track_node.track_write = kvmgt_page_track_write; + info->track_node.track_flush_slot = kvmgt_page_track_flush_slot; + kvm_page_track_register_notifier(kvm, &info->track_node); + + return 0; +} + +static bool kvmgt_guest_exit(struct kvmgt_guest_info *info) +{ + struct intel_vgpu *vgpu; + + if (!info) { + gvt_err("kvmgt_guest_info invalid\n"); + return false; + } + + vgpu = info->vgpu; + + kvm_page_track_unregister_notifier(info->kvm, &info->track_node); + kvmgt_protect_table_destroy(info); + gvt_cache_destroy(vgpu); + vfree(info); + + return true; +} + static int kvmgt_attach_vgpu(void *vgpu, unsigned long *handle) { /* nothing to do here */ @@ -489,30 +1264,40 @@ static void kvmgt_detach_vgpu(unsigned long handle) static int kvmgt_inject_msi(unsigned long handle, u32 addr, u16 data) { - struct kvmgt_guest_info *info = (struct kvmgt_guest_info *)handle; - struct intel_vgpu *vgpu = info->vgpu; + struct kvmgt_guest_info *info; + struct intel_vgpu *vgpu; - if (vgpu->vdev.msi_trigger) - return eventfd_signal(vgpu->vdev.msi_trigger, 1) == 1; + if (!handle_valid(handle)) + return -ESRCH; - return false; + info = (struct kvmgt_guest_info *)handle; + vgpu = info->vgpu; + + if (eventfd_signal(vgpu->vdev.msi_trigger, 1) == 1) + return 0; + + return -EFAULT; } static unsigned long kvmgt_gfn_to_pfn(unsigned long handle, unsigned long gfn) { unsigned long pfn; - struct kvmgt_guest_info *info = (struct kvmgt_guest_info *)handle; + struct kvmgt_guest_info *info; int rc; + if (!handle_valid(handle)) + return INTEL_GVT_INVALID_ADDR; + + info = (struct kvmgt_guest_info *)handle; pfn = gvt_cache_find(info->vgpu, gfn); if (pfn != 0) return pfn; - rc = kvmgt_pin_pages(info->vgpu->vdev.mdev, &gfn, 1, - IOMMU_READ | IOMMU_WRITE, &pfn); + rc = vfio_pin_pages(&info->vgpu->vdev.mdev->dev, &gfn, 1, + IOMMU_READ | IOMMU_WRITE, &pfn); if (rc != 1) { gvt_err("vfio_pin_pages failed for gfn: 0x%lx\n", gfn); - return 0; + return INTEL_GVT_INVALID_ADDR; } gvt_cache_add(info->vgpu, gfn, pfn); @@ -525,7 +1310,7 @@ static void *kvmgt_gpa_to_hva(unsigned long handle, unsigned long gpa) gfn_t gfn = gpa_to_gfn(gpa); pfn = kvmgt_gfn_to_pfn(handle, gfn); - if (!pfn) + if (pfn == INTEL_GVT_INVALID_ADDR) return NULL; return (char *)pfn_to_kaddr(pfn) + offset_in_page(gpa); @@ -536,6 +1321,9 @@ static int kvmgt_rw_gpa(unsigned long handle, unsigned long gpa, { void *hva = NULL; + if 
(!handle_valid(handle)) + return -ESRCH; + hva = kvmgt_gpa_to_hva(handle, gpa); if (!hva) return -EFAULT;