about summary refs log tree commit diff
path: root/drivers/gpu/drm/i915/i915_irq.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/i915/i915_irq.c')
-rw-r--r-- drivers/gpu/drm/i915/i915_irq.c 304
1 files changed, 180 insertions, 124 deletions
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index e17bbe20119..7857430943e 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -683,7 +683,6 @@ static void notify_ring(struct drm_device *dev,
wake_up_all(&ring->irq_queue);
if (i915_enable_hangcheck) {
- dev_priv->gpu_error.hangcheck_count = 0;
mod_timer(&dev_priv->gpu_error.hangcheck_timer,
round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
}
@@ -1656,7 +1655,7 @@ static u32 capture_pinned_bo(struct drm_i915_error_buffer *err,
struct drm_i915_gem_object *obj;
int i = 0;
- list_for_each_entry(obj, head, gtt_list) {
+ list_for_each_entry(obj, head, global_list) {
if (obj->pin_count == 0)
continue;
@@ -1798,7 +1797,7 @@ static void i915_gem_record_active_context(struct intel_ring_buffer *ring,
if (ring->id != RCS || !error->ccid)
return;
- list_for_each_entry(obj, &dev_priv->mm.bound_list, gtt_list) {
+ list_for_each_entry(obj, &dev_priv->mm.bound_list, global_list) {
if ((error->ccid & PAGE_MASK) == obj->gtt_offset) {
ering->ctx = i915_error_object_create_sized(dev_priv,
obj, 1);
@@ -1935,7 +1934,7 @@ static void i915_capture_error_state(struct drm_device *dev)
list_for_each_entry(obj, &dev_priv->mm.active_list, mm_list)
i++;
error->active_bo_count = i;
- list_for_each_entry(obj, &dev_priv->mm.bound_list, gtt_list)
+ list_for_each_entry(obj, &dev_priv->mm.bound_list, global_list)
if (obj->pin_count)
i++;
error->pinned_bo_count = i - error->active_bo_count;
@@ -2315,38 +2314,28 @@ ring_last_seqno(struct intel_ring_buffer *ring)
struct drm_i915_gem_request, list)->seqno;
}
-static bool i915_hangcheck_ring_idle(struct intel_ring_buffer *ring,
- u32 ring_seqno, bool *err)
+static bool
+ring_idle(struct intel_ring_buffer *ring, u32 seqno)
{
- if (list_empty(&ring->request_list) ||
- i915_seqno_passed(ring_seqno, ring_last_seqno(ring))) {
- /* Issue a wake-up to catch stuck h/w. */
- if (waitqueue_active(&ring->irq_queue)) {
- DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
- ring->name);
- wake_up_all(&ring->irq_queue);
- *err = true;
- }
- return true;
- }
- return false;
+ return (list_empty(&ring->request_list) ||
+ i915_seqno_passed(seqno, ring_last_seqno(ring)));
}
-static bool semaphore_passed(struct intel_ring_buffer *ring)
+static struct intel_ring_buffer *
+semaphore_waits_for(struct intel_ring_buffer *ring, u32 *seqno)
{
struct drm_i915_private *dev_priv = ring->dev->dev_private;
- u32 acthd = intel_ring_get_active_head(ring) & HEAD_ADDR;
- struct intel_ring_buffer *signaller;
- u32 cmd, ipehr, acthd_min;
+ u32 cmd, ipehr, acthd, acthd_min;
ipehr = I915_READ(RING_IPEHR(ring->mmio_base));
if ((ipehr & ~(0x3 << 16)) !=
(MI_SEMAPHORE_MBOX | MI_SEMAPHORE_COMPARE | MI_SEMAPHORE_REGISTER))
- return false;
+ return NULL;
/* ACTHD is likely pointing to the dword after the actual command,
* so scan backwards until we find the MBOX.
*/
+ acthd = intel_ring_get_active_head(ring) & HEAD_ADDR;
acthd_min = max((int)acthd - 3 * 4, 0);
do {
cmd = ioread32(ring->virtual_start + acthd);
@@ -2355,128 +2344,216 @@ static bool semaphore_passed(struct intel_ring_buffer *ring)
acthd -= 4;
if (acthd < acthd_min)
- return false;
+ return NULL;
} while (1);
- signaller = &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3];
- return i915_seqno_passed(signaller->get_seqno(signaller, false),
- ioread32(ring->virtual_start+acthd+4)+1);
+ *seqno = ioread32(ring->virtual_start+acthd+4)+1;
+ return &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3];
}
-static bool kick_ring(struct intel_ring_buffer *ring)
+static int semaphore_passed(struct intel_ring_buffer *ring)
{
- struct drm_device *dev = ring->dev;
- struct drm_i915_private *dev_priv = dev->dev_private;
- u32 tmp = I915_READ_CTL(ring);
- if (tmp & RING_WAIT) {
- DRM_ERROR("Kicking stuck wait on %s\n",
- ring->name);
- I915_WRITE_CTL(ring, tmp);
- return true;
- }
+ struct drm_i915_private *dev_priv = ring->dev->dev_private;
+ struct intel_ring_buffer *signaller;
+ u32 seqno, ctl;
- if (INTEL_INFO(dev)->gen >= 6 &&
- tmp & RING_WAIT_SEMAPHORE &&
- semaphore_passed(ring)) {
- DRM_ERROR("Kicking stuck semaphore on %s\n",
- ring->name);
- I915_WRITE_CTL(ring, tmp);
- return true;
- }
- return false;
+ ring->hangcheck.deadlock = true;
+
+ signaller = semaphore_waits_for(ring, &seqno);
+ if (signaller == NULL || signaller->hangcheck.deadlock)
+ return -1;
+
+ /* cursory check for an unkickable deadlock */
+ ctl = I915_READ_CTL(signaller);
+ if (ctl & RING_WAIT_SEMAPHORE && semaphore_passed(signaller) < 0)
+ return -1;
+
+ return i915_seqno_passed(signaller->get_seqno(signaller, false), seqno);
}
-static bool i915_hangcheck_ring_hung(struct intel_ring_buffer *ring)
+static void semaphore_clear_deadlocks(struct drm_i915_private *dev_priv)
{
- if (IS_GEN2(ring->dev))
- return false;
+ struct intel_ring_buffer *ring;
+ int i;
- /* Is the chip hanging on a WAIT_FOR_EVENT?
- * If so we can simply poke the RB_WAIT bit
- * and break the hang. This should work on
- * all but the second generation chipsets.
- */
- return !kick_ring(ring);
+ for_each_ring(ring, dev_priv, i)
+ ring->hangcheck.deadlock = false;
}
-static bool i915_hangcheck_hung(struct drm_device *dev)
+static enum intel_ring_hangcheck_action
+ring_stuck(struct intel_ring_buffer *ring, u32 acthd)
{
- drm_i915_private_t *dev_priv = dev->dev_private;
+ struct drm_device *dev = ring->dev;
+ struct drm_i915_private *dev_priv = dev->dev_private;
+ u32 tmp;
- if (dev_priv->gpu_error.hangcheck_count++ > 1) {
- bool hung = true;
- struct intel_ring_buffer *ring;
- int i;
+ if (ring->hangcheck.acthd != acthd)
+ return active;
- DRM_ERROR("Hangcheck timer elapsed... GPU hung\n");
- i915_handle_error(dev, true);
+ if (IS_GEN2(dev))
+ return hung;
- for_each_ring(ring, dev_priv, i)
- hung &= i915_hangcheck_ring_hung(ring);
+ /* Is the chip hanging on a WAIT_FOR_EVENT?
+ * If so we can simply poke the RB_WAIT bit
+ * and break the hang. This should work on
+ * all but the second generation chipsets.
+ */
+ tmp = I915_READ_CTL(ring);
+ if (tmp & RING_WAIT) {
+ DRM_ERROR("Kicking stuck wait on %s\n",
+ ring->name);
+ I915_WRITE_CTL(ring, tmp);
+ return kick;
+ }
- return hung;
+ if (INTEL_INFO(dev)->gen >= 6 && tmp & RING_WAIT_SEMAPHORE) {
+ switch (semaphore_passed(ring)) {
+ default:
+ return hung;
+ case 1:
+ DRM_ERROR("Kicking stuck semaphore on %s\n",
+ ring->name);
+ I915_WRITE_CTL(ring, tmp);
+ return kick;
+ case 0:
+ return wait;
+ }
}
- return false;
+ return hung;
}
/**
* This is called when the chip hasn't reported back with completed
- * batchbuffers in a long time. The first time this is called we simply record
- * ACTHD. If ACTHD hasn't changed by the time the hangcheck timer elapses
- * again, we assume the chip is wedged and try to fix it.
+ * batchbuffers in a long time. We keep track of per-ring seqno progress and
+ * if there is no progress, the hangcheck score for that ring is increased.
+ * Further, acthd is inspected to see if the ring is stuck. In the stuck case
+ * we kick the ring. If we see no progress on three subsequent calls
+ * we assume the chip is wedged and try to fix it by resetting the chip.
*/
void i915_hangcheck_elapsed(unsigned long data)
{
struct drm_device *dev = (struct drm_device *)data;
drm_i915_private_t *dev_priv = dev->dev_private;
struct intel_ring_buffer *ring;
- bool err = false, idle;
int i;
- u32 seqno[I915_NUM_RINGS];
- bool work_done;
+ int busy_count = 0, rings_hung = 0;
+ bool stuck[I915_NUM_RINGS] = { 0 };
+#define BUSY 1
+#define KICK 5
+#define HUNG 20
+#define FIRE 30
if (!i915_enable_hangcheck)
return;
- idle = true;
for_each_ring(ring, dev_priv, i) {
- seqno[i] = ring->get_seqno(ring, false);
- idle &= i915_hangcheck_ring_idle(ring, seqno[i], &err);
- }
-
- /* If all work is done then ACTHD clearly hasn't advanced. */
- if (idle) {
- if (err) {
- if (i915_hangcheck_hung(dev))
- return;
-
- goto repeat;
+ u32 seqno, acthd;
+ bool busy = true;
+
+ semaphore_clear_deadlocks(dev_priv);
+
+ seqno = ring->get_seqno(ring, false);
+ acthd = intel_ring_get_active_head(ring);
+
+ if (ring->hangcheck.seqno == seqno) {
+ if (ring_idle(ring, seqno)) {
+ if (waitqueue_active(&ring->irq_queue)) {
+ /* Issue a wake-up to catch stuck h/w. */
+ DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
+ ring->name);
+ wake_up_all(&ring->irq_queue);
+ ring->hangcheck.score += HUNG;
+ } else
+ busy = false;
+ } else {
+ int score;
+
+ /* We always increment the hangcheck score
+ * if the ring is busy and still processing
+ * the same request, so that no single request
+ * can run indefinitely (such as a chain of
+ * batches). The only time we do not increment
+ * the hangcheck score on this ring is if this
+ * ring is in a legitimate wait for another
+ * ring. In that case the waiting ring is a
+ * victim and we want to be sure we catch the
+ * right culprit. Then every time we do kick
+ * the ring, add a small increment to the
+ * score so that we can catch a batch that is
+ * being repeatedly kicked and so responsible
+ * for stalling the machine.
+ */
+ ring->hangcheck.action = ring_stuck(ring,
+ acthd);
+
+ switch (ring->hangcheck.action) {
+ case wait:
+ score = 0;
+ break;
+ case active:
+ score = BUSY;
+ break;
+ case kick:
+ score = KICK;
+ break;
+ case hung:
+ score = HUNG;
+ stuck[i] = true;
+ break;
+ }
+ ring->hangcheck.score += score;
+ }
+ } else {
+ /* Gradually reduce the count so that we catch DoS
+ * attempts across multiple batches.
+ */
+ if (ring->hangcheck.score > 0)
+ ring->hangcheck.score--;
}
- dev_priv->gpu_error.hangcheck_count = 0;
- return;
+ ring->hangcheck.seqno = seqno;
+ ring->hangcheck.acthd = acthd;
+ busy_count += busy;
}
- work_done = false;
for_each_ring(ring, dev_priv, i) {
- if (ring->hangcheck.seqno != seqno[i]) {
- work_done = true;
- ring->hangcheck.seqno = seqno[i];
+ if (ring->hangcheck.score > FIRE) {
+ DRM_ERROR("%s on %s\n",
+ stuck[i] ? "stuck" : "no progress",
+ ring->name);
+ rings_hung++;
}
}
- if (!work_done) {
- if (i915_hangcheck_hung(dev))
- return;
- } else {
- dev_priv->gpu_error.hangcheck_count = 0;
- }
+ if (rings_hung)
+ return i915_handle_error(dev, true);
-repeat:
- /* Reset timer case chip hangs without another request being added */
- mod_timer(&dev_priv->gpu_error.hangcheck_timer,
- round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
+ if (busy_count)
+ /* Reset timer in case chip hangs without another request
+ * being added */
+ mod_timer(&dev_priv->gpu_error.hangcheck_timer,
+ round_jiffies_up(jiffies +
+ DRM_I915_HANGCHECK_JIFFIES));
+}
+
+static void ibx_irq_preinstall(struct drm_device *dev)
+{
+ struct drm_i915_private *dev_priv = dev->dev_private;
+
+ if (HAS_PCH_NOP(dev))
+ return;
+
+ /* south display irq */
+ I915_WRITE(SDEIMR, 0xffffffff);
+ /*
+ * SDEIER is also touched by the interrupt handler to work around missed
+ * PCH interrupts. Hence we can't update it after the interrupt handler
+ * is enabled - instead we unconditionally enable all PCH interrupt
+ * sources here, but then only unmask them as needed with SDEIMR.
+ */
+ I915_WRITE(SDEIER, 0xffffffff);
+ POSTING_READ(SDEIER);
}
/* drm_dma.h hooks
@@ -2500,16 +2577,7 @@ static void ironlake_irq_preinstall(struct drm_device *dev)
I915_WRITE(GTIER, 0x0);
POSTING_READ(GTIER);
- /* south display irq */
- I915_WRITE(SDEIMR, 0xffffffff);
- /*
- * SDEIER is also touched by the interrupt handler to work around missed
- * PCH interrupts. Hence we can't update it after the interrupt handler
- * is enabled - instead we unconditionally enable all PCH interrupt
- * sources here, but then only unmask them as needed with SDEIMR.
- */
- I915_WRITE(SDEIER, 0xffffffff);
- POSTING_READ(SDEIER);
+ ibx_irq_preinstall(dev);
}
static void ivybridge_irq_preinstall(struct drm_device *dev)
@@ -2536,19 +2604,7 @@ static void ivybridge_irq_preinstall(struct drm_device *dev)
I915_WRITE(GEN6_PMIER, 0x0);
POSTING_READ(GEN6_PMIER);
- if (HAS_PCH_NOP(dev))
- return;
-
- /* south display irq */
- I915_WRITE(SDEIMR, 0xffffffff);
- /*
- * SDEIER is also touched by the interrupt handler to work around missed
- * PCH interrupts. Hence we can't update it after the interrupt handler
- * is enabled - instead we unconditionally enable all PCH interrupt
- * sources here, but then only unmask them as needed with SDEIMR.
- */
- I915_WRITE(SDEIER, 0xffffffff);
- POSTING_READ(SDEIER);
+ ibx_irq_preinstall(dev);
}
static void valleyview_irq_preinstall(struct drm_device *dev)