diff options
Diffstat (limited to 'drivers/gpu/drm/i915/i915_irq.c')
-rw-r--r-- | drivers/gpu/drm/i915/i915_irq.c | 304 |
1 files changed, 180 insertions, 124 deletions
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index e17bbe20119..7857430943e 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -683,7 +683,6 @@ static void notify_ring(struct drm_device *dev, wake_up_all(&ring->irq_queue); if (i915_enable_hangcheck) { - dev_priv->gpu_error.hangcheck_count = 0; mod_timer(&dev_priv->gpu_error.hangcheck_timer, round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES)); } @@ -1656,7 +1655,7 @@ static u32 capture_pinned_bo(struct drm_i915_error_buffer *err, struct drm_i915_gem_object *obj; int i = 0; - list_for_each_entry(obj, head, gtt_list) { + list_for_each_entry(obj, head, global_list) { if (obj->pin_count == 0) continue; @@ -1798,7 +1797,7 @@ static void i915_gem_record_active_context(struct intel_ring_buffer *ring, if (ring->id != RCS || !error->ccid) return; - list_for_each_entry(obj, &dev_priv->mm.bound_list, gtt_list) { + list_for_each_entry(obj, &dev_priv->mm.bound_list, global_list) { if ((error->ccid & PAGE_MASK) == obj->gtt_offset) { ering->ctx = i915_error_object_create_sized(dev_priv, obj, 1); @@ -1935,7 +1934,7 @@ static void i915_capture_error_state(struct drm_device *dev) list_for_each_entry(obj, &dev_priv->mm.active_list, mm_list) i++; error->active_bo_count = i; - list_for_each_entry(obj, &dev_priv->mm.bound_list, gtt_list) + list_for_each_entry(obj, &dev_priv->mm.bound_list, global_list) if (obj->pin_count) i++; error->pinned_bo_count = i - error->active_bo_count; @@ -2315,38 +2314,28 @@ ring_last_seqno(struct intel_ring_buffer *ring) struct drm_i915_gem_request, list)->seqno; } -static bool i915_hangcheck_ring_idle(struct intel_ring_buffer *ring, - u32 ring_seqno, bool *err) +static bool +ring_idle(struct intel_ring_buffer *ring, u32 seqno) { - if (list_empty(&ring->request_list) || - i915_seqno_passed(ring_seqno, ring_last_seqno(ring))) { - /* Issue a wake-up to catch stuck h/w. */ - if (waitqueue_active(&ring->irq_queue)) { - DRM_ERROR("Hangcheck timer elapsed... %s idle\n", - ring->name); - wake_up_all(&ring->irq_queue); - *err = true; - } - return true; - } - return false; + return (list_empty(&ring->request_list) || + i915_seqno_passed(seqno, ring_last_seqno(ring))); } -static bool semaphore_passed(struct intel_ring_buffer *ring) +static struct intel_ring_buffer * +semaphore_waits_for(struct intel_ring_buffer *ring, u32 *seqno) { struct drm_i915_private *dev_priv = ring->dev->dev_private; - u32 acthd = intel_ring_get_active_head(ring) & HEAD_ADDR; - struct intel_ring_buffer *signaller; - u32 cmd, ipehr, acthd_min; + u32 cmd, ipehr, acthd, acthd_min; ipehr = I915_READ(RING_IPEHR(ring->mmio_base)); if ((ipehr & ~(0x3 << 16)) != (MI_SEMAPHORE_MBOX | MI_SEMAPHORE_COMPARE | MI_SEMAPHORE_REGISTER)) - return false; + return NULL; /* ACTHD is likely pointing to the dword after the actual command, * so scan backwards until we find the MBOX. */ + acthd = intel_ring_get_active_head(ring) & HEAD_ADDR; acthd_min = max((int)acthd - 3 * 4, 0); do { cmd = ioread32(ring->virtual_start + acthd); @@ -2355,128 +2344,216 @@ static bool semaphore_passed(struct intel_ring_buffer *ring) acthd -= 4; if (acthd < acthd_min) - return false; + return NULL; } while (1); - signaller = &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3]; - return i915_seqno_passed(signaller->get_seqno(signaller, false), - ioread32(ring->virtual_start+acthd+4)+1); + *seqno = ioread32(ring->virtual_start+acthd+4)+1; + return &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3]; } -static bool kick_ring(struct intel_ring_buffer *ring) +static int semaphore_passed(struct intel_ring_buffer *ring) { - struct drm_device *dev = ring->dev; - struct drm_i915_private *dev_priv = dev->dev_private; - u32 tmp = I915_READ_CTL(ring); - if (tmp & RING_WAIT) { - DRM_ERROR("Kicking stuck wait on %s\n", - ring->name); - I915_WRITE_CTL(ring, tmp); - return true; - } + struct drm_i915_private *dev_priv = ring->dev->dev_private; + struct intel_ring_buffer *signaller; + u32 seqno, ctl; - if (INTEL_INFO(dev)->gen >= 6 && - tmp & RING_WAIT_SEMAPHORE && - semaphore_passed(ring)) { - DRM_ERROR("Kicking stuck semaphore on %s\n", - ring->name); - I915_WRITE_CTL(ring, tmp); - return true; - } - return false; + ring->hangcheck.deadlock = true; + + signaller = semaphore_waits_for(ring, &seqno); + if (signaller == NULL || signaller->hangcheck.deadlock) + return -1; + + /* cursory check for an unkickable deadlock */ + ctl = I915_READ_CTL(signaller); + if (ctl & RING_WAIT_SEMAPHORE && semaphore_passed(signaller) < 0) + return -1; + + return i915_seqno_passed(signaller->get_seqno(signaller, false), seqno); } -static bool i915_hangcheck_ring_hung(struct intel_ring_buffer *ring) +static void semaphore_clear_deadlocks(struct drm_i915_private *dev_priv) { - if (IS_GEN2(ring->dev)) - return false; + struct intel_ring_buffer *ring; + int i; - /* Is the chip hanging on a WAIT_FOR_EVENT? - * If so we can simply poke the RB_WAIT bit - * and break the hang. This should work on - * all but the second generation chipsets. - */ - return !kick_ring(ring); + for_each_ring(ring, dev_priv, i) + ring->hangcheck.deadlock = false; } -static bool i915_hangcheck_hung(struct drm_device *dev) +static enum intel_ring_hangcheck_action +ring_stuck(struct intel_ring_buffer *ring, u32 acthd) { - drm_i915_private_t *dev_priv = dev->dev_private; + struct drm_device *dev = ring->dev; + struct drm_i915_private *dev_priv = dev->dev_private; + u32 tmp; - if (dev_priv->gpu_error.hangcheck_count++ > 1) { - bool hung = true; - struct intel_ring_buffer *ring; - int i; + if (ring->hangcheck.acthd != acthd) + return active; - DRM_ERROR("Hangcheck timer elapsed... GPU hung\n"); - i915_handle_error(dev, true); + if (IS_GEN2(dev)) + return hung; - for_each_ring(ring, dev_priv, i) - hung &= i915_hangcheck_ring_hung(ring); + /* Is the chip hanging on a WAIT_FOR_EVENT? + * If so we can simply poke the RB_WAIT bit + * and break the hang. This should work on + * all but the second generation chipsets. + */ + tmp = I915_READ_CTL(ring); + if (tmp & RING_WAIT) { + DRM_ERROR("Kicking stuck wait on %s\n", + ring->name); + I915_WRITE_CTL(ring, tmp); + return kick; + } - return hung; + if (INTEL_INFO(dev)->gen >= 6 && tmp & RING_WAIT_SEMAPHORE) { + switch (semaphore_passed(ring)) { + default: + return hung; + case 1: + DRM_ERROR("Kicking stuck semaphore on %s\n", + ring->name); + I915_WRITE_CTL(ring, tmp); + return kick; + case 0: + return wait; + } } - return false; + return hung; } /** * This is called when the chip hasn't reported back with completed - * batchbuffers in a long time. The first time this is called we simply record - * ACTHD. If ACTHD hasn't changed by the time the hangcheck timer elapses - * again, we assume the chip is wedged and try to fix it. + * batchbuffers in a long time. We keep track per ring seqno progress and + * if there are no progress, hangcheck score for that ring is increased. + * Further, acthd is inspected to see if the ring is stuck. On stuck case + * we kick the ring. If we see no progress on three subsequent calls + * we assume chip is wedged and try to fix it by resetting the chip. */ void i915_hangcheck_elapsed(unsigned long data) { struct drm_device *dev = (struct drm_device *)data; drm_i915_private_t *dev_priv = dev->dev_private; struct intel_ring_buffer *ring; - bool err = false, idle; int i; - u32 seqno[I915_NUM_RINGS]; - bool work_done; + int busy_count = 0, rings_hung = 0; + bool stuck[I915_NUM_RINGS] = { 0 }; +#define BUSY 1 +#define KICK 5 +#define HUNG 20 +#define FIRE 30 if (!i915_enable_hangcheck) return; - idle = true; for_each_ring(ring, dev_priv, i) { - seqno[i] = ring->get_seqno(ring, false); - idle &= i915_hangcheck_ring_idle(ring, seqno[i], &err); - } - - /* If all work is done then ACTHD clearly hasn't advanced. */ - if (idle) { - if (err) { - if (i915_hangcheck_hung(dev)) - return; - - goto repeat; + u32 seqno, acthd; + bool busy = true; + + semaphore_clear_deadlocks(dev_priv); + + seqno = ring->get_seqno(ring, false); + acthd = intel_ring_get_active_head(ring); + + if (ring->hangcheck.seqno == seqno) { + if (ring_idle(ring, seqno)) { + if (waitqueue_active(&ring->irq_queue)) { + /* Issue a wake-up to catch stuck h/w. */ + DRM_ERROR("Hangcheck timer elapsed... %s idle\n", + ring->name); + wake_up_all(&ring->irq_queue); + ring->hangcheck.score += HUNG; + } else + busy = false; + } else { + int score; + + /* We always increment the hangcheck score + * if the ring is busy and still processing + * the same request, so that no single request + * can run indefinitely (such as a chain of + * batches). The only time we do not increment + * the hangcheck score on this ring, if this + * ring is in a legitimate wait for another + * ring. In that case the waiting ring is a + * victim and we want to be sure we catch the + * right culprit. Then every time we do kick + * the ring, add a small increment to the + * score so that we can catch a batch that is + * being repeatedly kicked and so responsible + * for stalling the machine. + */ + ring->hangcheck.action = ring_stuck(ring, + acthd); + + switch (ring->hangcheck.action) { + case wait: + score = 0; + break; + case active: + score = BUSY; + break; + case kick: + score = KICK; + break; + case hung: + score = HUNG; + stuck[i] = true; + break; + } + ring->hangcheck.score += score; + } + } else { + /* Gradually reduce the count so that we catch DoS + * attempts across multiple batches. + */ + if (ring->hangcheck.score > 0) + ring->hangcheck.score--; } - dev_priv->gpu_error.hangcheck_count = 0; - return; + ring->hangcheck.seqno = seqno; + ring->hangcheck.acthd = acthd; + busy_count += busy; } - work_done = false; for_each_ring(ring, dev_priv, i) { - if (ring->hangcheck.seqno != seqno[i]) { - work_done = true; - ring->hangcheck.seqno = seqno[i]; + if (ring->hangcheck.score > FIRE) { + DRM_ERROR("%s on %s\n", + stuck[i] ? "stuck" : "no progress", + ring->name); + rings_hung++; } } - if (!work_done) { - if (i915_hangcheck_hung(dev)) - return; - } else { - dev_priv->gpu_error.hangcheck_count = 0; - } + if (rings_hung) + return i915_handle_error(dev, true); -repeat: - /* Reset timer case chip hangs without another request being added */ - mod_timer(&dev_priv->gpu_error.hangcheck_timer, - round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES)); + if (busy_count) + /* Reset timer case chip hangs without another request + * being added */ + mod_timer(&dev_priv->gpu_error.hangcheck_timer, + round_jiffies_up(jiffies + + DRM_I915_HANGCHECK_JIFFIES)); +} + +static void ibx_irq_preinstall(struct drm_device *dev) +{ + struct drm_i915_private *dev_priv = dev->dev_private; + + if (HAS_PCH_NOP(dev)) + return; + + /* south display irq */ + I915_WRITE(SDEIMR, 0xffffffff); + /* + * SDEIER is also touched by the interrupt handler to work around missed + * PCH interrupts. Hence we can't update it after the interrupt handler + * is enabled - instead we unconditionally enable all PCH interrupt + * sources here, but then only unmask them as needed with SDEIMR. + */ + I915_WRITE(SDEIER, 0xffffffff); + POSTING_READ(SDEIER); } /* drm_dma.h hooks @@ -2500,16 +2577,7 @@ static void ironlake_irq_preinstall(struct drm_device *dev) I915_WRITE(GTIER, 0x0); POSTING_READ(GTIER); - /* south display irq */ - I915_WRITE(SDEIMR, 0xffffffff); - /* - * SDEIER is also touched by the interrupt handler to work around missed - * PCH interrupts. Hence we can't update it after the interrupt handler - * is enabled - instead we unconditionally enable all PCH interrupt - * sources here, but then only unmask them as needed with SDEIMR. - */ - I915_WRITE(SDEIER, 0xffffffff); - POSTING_READ(SDEIER); + ibx_irq_preinstall(dev); } static void ivybridge_irq_preinstall(struct drm_device *dev) @@ -2536,19 +2604,7 @@ static void ivybridge_irq_preinstall(struct drm_device *dev) I915_WRITE(GEN6_PMIER, 0x0); POSTING_READ(GEN6_PMIER); - if (HAS_PCH_NOP(dev)) - return; - - /* south display irq */ - I915_WRITE(SDEIMR, 0xffffffff); - /* - * SDEIER is also touched by the interrupt handler to work around missed - * PCH interrupts. Hence we can't update it after the interrupt handler - * is enabled - instead we unconditionally enable all PCH interrupt - * sources here, but then only unmask them as needed with SDEIMR. - */ - I915_WRITE(SDEIER, 0xffffffff); - POSTING_READ(SDEIER); + ibx_irq_preinstall(dev); } static void valleyview_irq_preinstall(struct drm_device *dev) |