1 files changed, 180 insertions, 124 deletions
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index e17bbe20119..7857430943e 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -683,7 +683,6 @@ static void notify_ring(struct drm_device *dev,
 
 	wake_up_all(&ring->irq_queue);
 	if (i915_enable_hangcheck) {
-		dev_priv->gpu_error.hangcheck_count = 0;
 		mod_timer(&dev_priv->gpu_error.hangcheck_timer,
 			  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
 	}
@@ -1656,7 +1655,7 @@ static u32 capture_pinned_bo(struct drm_i915_error_buffer *err,
 	struct drm_i915_gem_object *obj;
 	int i = 0;
 
-	list_for_each_entry(obj, head, gtt_list) {
+	list_for_each_entry(obj, head, global_list) {
 		if (obj->pin_count == 0)
 			continue;
 
@@ -1798,7 +1797,7 @@ static void i915_gem_record_active_context(struct intel_ring_buffer *ring,
 	if (ring->id != RCS || !error->ccid)
 		return;
 
-	list_for_each_entry(obj, &dev_priv->mm.bound_list, gtt_list) {
+	list_for_each_entry(obj, &dev_priv->mm.bound_list, global_list) {
 		if ((error->ccid & PAGE_MASK) == obj->gtt_offset) {
 			ering->ctx = i915_error_object_create_sized(dev_priv,
 								    obj, 1);
@@ -1935,7 +1934,7 @@ static void i915_capture_error_state(struct drm_device *dev)
 	list_for_each_entry(obj, &dev_priv->mm.active_list, mm_list)
 		i++;
 	error->active_bo_count = i;
-	list_for_each_entry(obj, &dev_priv->mm.bound_list, gtt_list)
+	list_for_each_entry(obj, &dev_priv->mm.bound_list, global_list)
 		if (obj->pin_count)
 			i++;
 	error->pinned_bo_count = i - error->active_bo_count;
@@ -2315,38 +2314,28 @@ ring_last_seqno(struct intel_ring_buffer *ring)
 			  struct drm_i915_gem_request, list)->seqno;
 }
 
-static bool i915_hangcheck_ring_idle(struct intel_ring_buffer *ring,
-				     u32 ring_seqno, bool *err)
+static bool
+ring_idle(struct intel_ring_buffer *ring, u32 seqno)
 {
-	if (list_empty(&ring->request_list) ||
-	    i915_seqno_passed(ring_seqno, ring_last_seqno(ring))) {
-		/* Issue a wake-up to catch stuck h/w. */
-		if (waitqueue_active(&ring->irq_queue)) {
-			DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
-				  ring->name);
-			wake_up_all(&ring->irq_queue);
-			*err = true;
-		}
-		return true;
-	}
-	return false;
+	return (list_empty(&ring->request_list) ||
+		i915_seqno_passed(seqno, ring_last_seqno(ring)));
 }
 
-static bool semaphore_passed(struct intel_ring_buffer *ring)
+static struct intel_ring_buffer *
+semaphore_waits_for(struct intel_ring_buffer *ring, u32 *seqno)
 {
 	struct drm_i915_private *dev_priv = ring->dev->dev_private;
-	u32 acthd = intel_ring_get_active_head(ring) & HEAD_ADDR;
-	struct intel_ring_buffer *signaller;
-	u32 cmd, ipehr, acthd_min;
+	u32 cmd, ipehr, acthd, acthd_min;
 
 	ipehr = I915_READ(RING_IPEHR(ring->mmio_base));
 	if ((ipehr & ~(0x3 << 16)) !=
 	    (MI_SEMAPHORE_MBOX | MI_SEMAPHORE_COMPARE | MI_SEMAPHORE_REGISTER))
-		return false;
+		return NULL;
 
 	/* ACTHD is likely pointing to the dword after the actual command,
 	 * so scan backwards until we find the MBOX.
 	 */
+	acthd = intel_ring_get_active_head(ring) & HEAD_ADDR;
 	acthd_min = max((int)acthd - 3 * 4, 0);
 	do {
 		cmd = ioread32(ring->virtual_start + acthd);
@@ -2355,128 +2344,216 @@ static bool semaphore_passed(struct intel_ring_buffer *ring)
 
 		acthd -= 4;
 		if (acthd < acthd_min)
-			return false;
+			return NULL;
 	} while (1);
 
-	signaller = &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3];
-	return i915_seqno_passed(signaller->get_seqno(signaller, false),
-				 ioread32(ring->virtual_start+acthd+4)+1);
+	*seqno = ioread32(ring->virtual_start+acthd+4)+1;
+	return &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3];
 }
 
-static bool kick_ring(struct intel_ring_buffer *ring)
+static int semaphore_passed(struct intel_ring_buffer *ring)
 {
-	struct drm_device *dev = ring->dev;
-	struct drm_i915_private *dev_priv = dev->dev_private;
-	u32 tmp = I915_READ_CTL(ring);
-	if (tmp & RING_WAIT) {
-		DRM_ERROR("Kicking stuck wait on %s\n",
-			  ring->name);
-		I915_WRITE_CTL(ring, tmp);
-		return true;
-	}
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
+	struct intel_ring_buffer *signaller;
+	u32 seqno, ctl;
 
-	if (INTEL_INFO(dev)->gen >= 6 &&
-	    tmp & RING_WAIT_SEMAPHORE &&
-	    semaphore_passed(ring)) {
-		DRM_ERROR("Kicking stuck semaphore on %s\n",
-			  ring->name);
-		I915_WRITE_CTL(ring, tmp);
-		return true;
-	}
-	return false;
+	ring->hangcheck.deadlock = true;
+
+	signaller = semaphore_waits_for(ring, &seqno);
+	if (signaller == NULL || signaller->hangcheck.deadlock)
+		return -1;
+
+	/* cursory check for an unkickable deadlock */
+	ctl = I915_READ_CTL(signaller);
+	if (ctl & RING_WAIT_SEMAPHORE && semaphore_passed(signaller) < 0)
+		return -1;
+
+	return i915_seqno_passed(signaller->get_seqno(signaller, false), seqno);
 }
 
-static bool i915_hangcheck_ring_hung(struct intel_ring_buffer *ring)
+static void semaphore_clear_deadlocks(struct drm_i915_private *dev_priv)
 {
-	if (IS_GEN2(ring->dev))
-		return false;
+	struct intel_ring_buffer *ring;
+	int i;
 
-	/* Is the chip hanging on a WAIT_FOR_EVENT?
-	 * If so we can simply poke the RB_WAIT bit
-	 * and break the hang. This should work on
-	 * all but the second generation chipsets.
-	 */
-	return !kick_ring(ring);
+	for_each_ring(ring, dev_priv, i)
+		ring->hangcheck.deadlock = false;
 }
 
-static bool i915_hangcheck_hung(struct drm_device *dev)
+static enum intel_ring_hangcheck_action
+ring_stuck(struct intel_ring_buffer *ring, u32 acthd)
 {
-	drm_i915_private_t *dev_priv = dev->dev_private;
+	struct drm_device *dev = ring->dev;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	u32 tmp;
 
-	if (dev_priv->gpu_error.hangcheck_count++ > 1) {
-		bool hung = true;
-		struct intel_ring_buffer *ring;
-		int i;
+	if (ring->hangcheck.acthd != acthd)
+		return active;
 
-		DRM_ERROR("Hangcheck timer elapsed... GPU hung\n");
-		i915_handle_error(dev, true);
+	if (IS_GEN2(dev))
+		return hung;
 
-		for_each_ring(ring, dev_priv, i)
-			hung &= i915_hangcheck_ring_hung(ring);
+	/* Is the chip hanging on a WAIT_FOR_EVENT?
+	 * If so we can simply poke the RB_WAIT bit
+	 * and break the hang. This should work on
+	 * all but the second generation chipsets.
+	 */
+	tmp = I915_READ_CTL(ring);
+	if (tmp & RING_WAIT) {
+		DRM_ERROR("Kicking stuck wait on %s\n",
+			  ring->name);
+		I915_WRITE_CTL(ring, tmp);
+		return kick;
+	}
 
-		return hung;
+	if (INTEL_INFO(dev)->gen >= 6 && tmp & RING_WAIT_SEMAPHORE) {
+		switch (semaphore_passed(ring)) {
+		default:
+			return hung;
+		case 1:
+			DRM_ERROR("Kicking stuck semaphore on %s\n",
+				  ring->name);
+			I915_WRITE_CTL(ring, tmp);
+			return kick;
+		case 0:
+			return wait;
+		}
 	}
 
-	return false;
+	return hung;
 }
 
 /**
  * This is called when the chip hasn't reported back with completed
- * batchbuffers in a long time. The first time this is called we simply record
- * ACTHD. If ACTHD hasn't changed by the time the hangcheck timer elapses
- * again, we assume the chip is wedged and try to fix it.
+ * batchbuffers in a long time. We keep track per ring seqno progress and
+ * if there are no progress, hangcheck score for that ring is increased.
+ * Further, acthd is inspected to see if the ring is stuck. On stuck case
+ * we kick the ring. If we see no progress on three subsequent calls
+ * we assume chip is wedged and try to fix it by resetting the chip.
  */
 void i915_hangcheck_elapsed(unsigned long data)
 {
 	struct drm_device *dev = (struct drm_device *)data;
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	struct intel_ring_buffer *ring;
-	bool err = false, idle;
 	int i;
-	u32 seqno[I915_NUM_RINGS];
-	bool work_done;
+	int busy_count = 0, rings_hung = 0;
+	bool stuck[I915_NUM_RINGS] = { 0 };
+#define BUSY 1
+#define KICK 5
+#define HUNG 20
+#define FIRE 30
 
 	if (!i915_enable_hangcheck)
 		return;
 
-	idle = true;
 	for_each_ring(ring, dev_priv, i) {
-		seqno[i] = ring->get_seqno(ring, false);
-		idle &= i915_hangcheck_ring_idle(ring, seqno[i], &err);
-	}
-
-	/* If all work is done then ACTHD clearly hasn't advanced. */
-	if (idle) {
-		if (err) {
-			if (i915_hangcheck_hung(dev))
-				return;
-
-			goto repeat;
+		u32 seqno, acthd;
+		bool busy = true;
+
+		semaphore_clear_deadlocks(dev_priv);
+
+		seqno = ring->get_seqno(ring, false);
+		acthd = intel_ring_get_active_head(ring);
+
+		if (ring->hangcheck.seqno == seqno) {
+			if (ring_idle(ring, seqno)) {
+				if (waitqueue_active(&ring->irq_queue)) {
+					/* Issue a wake-up to catch stuck h/w. */
+					DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
+						  ring->name);
+					wake_up_all(&ring->irq_queue);
+					ring->hangcheck.score += HUNG;
+				} else
+					busy = false;
+			} else {
+				int score;
+
+				/* We always increment the hangcheck score
+				 * if the ring is busy and still processing
+				 * the same request, so that no single request
+				 * can run indefinitely (such as a chain of
+				 * batches). The only time we do not increment
+				 * the hangcheck score on this ring, if this
+				 * ring is in a legitimate wait for another
+				 * ring. In that case the waiting ring is a
+				 * victim and we want to be sure we catch the
+				 * right culprit. Then every time we do kick
+				 * the ring, add a small increment to the
+				 * score so that we can catch a batch that is
+				 * being repeatedly kicked and so responsible
+				 * for stalling the machine.
+				 */
+				ring->hangcheck.action = ring_stuck(ring,
+								    acthd);
+
+				switch (ring->hangcheck.action) {
+				case wait:
+					score = 0;
+					break;
+				case active:
+					score = BUSY;
+					break;
+				case kick:
+					score = KICK;
+					break;
+				case hung:
+					score = HUNG;
+					stuck[i] = true;
+					break;
+				}
+				ring->hangcheck.score += score;
+			}
+		} else {
+			/* Gradually reduce the count so that we catch DoS
+			 * attempts across multiple batches.
+			 */
+			if (ring->hangcheck.score > 0)
+				ring->hangcheck.score--;
 		}
 
-		dev_priv->gpu_error.hangcheck_count = 0;
-		return;
+		ring->hangcheck.seqno = seqno;
+		ring->hangcheck.acthd = acthd;
+		busy_count += busy;
 	}
 
-	work_done = false;
 	for_each_ring(ring, dev_priv, i) {
-		if (ring->hangcheck.seqno != seqno[i]) {
-			work_done = true;
-			ring->hangcheck.seqno = seqno[i];
+		if (ring->hangcheck.score > FIRE) {
+			DRM_ERROR("%s on %s\n",
+				  stuck[i] ? "stuck" : "no progress",
+				  ring->name);
+			rings_hung++;
 		}
 	}
 
-	if (!work_done) {
-		if (i915_hangcheck_hung(dev))
-			return;
-	} else {
-		dev_priv->gpu_error.hangcheck_count = 0;
-	}
+	if (rings_hung)
+		return i915_handle_error(dev, true);
 
-repeat:
-	/* Reset timer case chip hangs without another request being added */
-	mod_timer(&dev_priv->gpu_error.hangcheck_timer,
-		  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
+	if (busy_count)
+		/* Reset timer case chip hangs without another request
+		 * being added */
+		mod_timer(&dev_priv->gpu_error.hangcheck_timer,
+			  round_jiffies_up(jiffies +
+					   DRM_I915_HANGCHECK_JIFFIES));
+}
+
+static void ibx_irq_preinstall(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+
+	if (HAS_PCH_NOP(dev))
+		return;
+
+	/* south display irq */
+	I915_WRITE(SDEIMR, 0xffffffff);
+	/*
+	 * SDEIER is also touched by the interrupt handler to work around missed
+	 * PCH interrupts. Hence we can't update it after the interrupt handler
+	 * is enabled - instead we unconditionally enable all PCH interrupt
+	 * sources here, but then only unmask them as needed with SDEIMR.
+	 */
+	I915_WRITE(SDEIER, 0xffffffff);
+	POSTING_READ(SDEIER);
 }
 
 /* drm_dma.h hooks
@@ -2500,16 +2577,7 @@ static void ironlake_irq_preinstall(struct drm_device *dev)
 	I915_WRITE(GTIER, 0x0);
 	POSTING_READ(GTIER);
 
-	/* south display irq */
-	I915_WRITE(SDEIMR, 0xffffffff);
-	/*
-	 * SDEIER is also touched by the interrupt handler to work around missed
-	 * PCH interrupts. Hence we can't update it after the interrupt handler
-	 * is enabled - instead we unconditionally enable all PCH interrupt
-	 * sources here, but then only unmask them as needed with SDEIMR.
-	 */
-	I915_WRITE(SDEIER, 0xffffffff);
-	POSTING_READ(SDEIER);
+	ibx_irq_preinstall(dev);
 }
 
 static void ivybridge_irq_preinstall(struct drm_device *dev)
@@ -2536,19 +2604,7 @@ static void ivybridge_irq_preinstall(struct drm_device *dev)
 	I915_WRITE(GEN6_PMIER, 0x0);
 	POSTING_READ(GEN6_PMIER);
 
-	if (HAS_PCH_NOP(dev))
-		return;
-
-	/* south display irq */
-	I915_WRITE(SDEIMR, 0xffffffff);
-	/*
-	 * SDEIER is also touched by the interrupt handler to work around missed
-	 * PCH interrupts. Hence we can't update it after the interrupt handler
-	 * is enabled - instead we unconditionally enable all PCH interrupt
-	 * sources here, but then only unmask them as needed with SDEIMR.
-	 */
-	I915_WRITE(SDEIER, 0xffffffff);
-	POSTING_READ(SDEIER);
+	ibx_irq_preinstall(dev);
 }
 
 static void valleyview_irq_preinstall(struct drm_device *dev)