@@ -624,22 +624,21 @@ void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 }
 
 /*
- * Tell whether there are active queues or groups with differentiated weights.
+ * Tell whether there are active queues with different weights or
+ * active groups.
  */
-static bool bfq_differentiated_weights(struct bfq_data *bfqd)
+static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd)
 {
         /*
-         * For weights to differ, at least one of the trees must contain
+         * For queue weights to differ, queue_weights_tree must contain
          * at least two nodes.
          */
         return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
                 (bfqd->queue_weights_tree.rb_node->rb_left ||
                  bfqd->queue_weights_tree.rb_node->rb_right)
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
                ) ||
-               (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) &&
-                (bfqd->group_weights_tree.rb_node->rb_left ||
-                 bfqd->group_weights_tree.rb_node->rb_right)
+               (bfqd->num_active_groups > 0
 #endif
                );
 }
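
For illustration only, and not part of the patch: a minimal, self-contained C sketch of the logic the new helper implements. The toy_ names are hypothetical stand-ins. The point is that each node of queue_weights_tree is one distinct-weight counter, so the root having any child already means at least two different queue weights are active, while group asymmetry is reduced to a plain counter of active groups.

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

struct rb_node { struct rb_node *rb_left, *rb_right; };
struct rb_root { struct rb_node *rb_node; };

/* hypothetical, trimmed-down stand-in for struct bfq_data */
struct toy_bfq_data {
        struct rb_root queue_weights_tree; /* one node per distinct active weight */
        unsigned int num_active_groups;    /* replaces the old group_weights_tree */
};

static bool toy_varied_queue_weights_or_active_groups(struct toy_bfq_data *d)
{
        /* at least two distinct weights iff the tree root has a child */
        return (d->queue_weights_tree.rb_node &&
                (d->queue_weights_tree.rb_node->rb_left ||
                 d->queue_weights_tree.rb_node->rb_right)) ||
               d->num_active_groups > 0;
}

int main(void)
{
        struct rb_node only = { NULL, NULL };
        struct toy_bfq_data d = { .queue_weights_tree = { &only },
                                  .num_active_groups = 0 };

        assert(!toy_varied_queue_weights_or_active_groups(&d)); /* one weight, no groups */
        d.num_active_groups = 1;          /* any active group makes the scenario asymmetric */
        assert(toy_varied_queue_weights_or_active_groups(&d));
        return 0;
}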
@@ -657,26 +656,25 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd)
  * 3) all active groups at the same level in the groups tree have the same
  *    number of children.
  *
- * Unfortunately, keeping the necessary state for evaluating exactly the
- * above symmetry conditions would be quite complex and time-consuming.
- * Therefore this function evaluates, instead, the following stronger
- * sub-conditions, for which it is much easier to maintain the needed
- * state:
+ * Unfortunately, keeping the necessary state for evaluating exactly
+ * the last two symmetry sub-conditions above would be quite complex
+ * and time consuming. Therefore this function evaluates, instead,
+ * only the following stronger two sub-conditions, for which it is
+ * much easier to maintain the needed state:
  * 1) all active queues have the same weight,
- * 2) all active groups have the same weight,
- * 3) all active groups have at most one active child each.
- * In particular, the last two conditions are always true if hierarchical
- * support and the cgroups interface are not enabled, thus no state needs
- * to be maintained in this case.
+ * 2) there are no active groups.
+ * In particular, the last condition is always true if hierarchical
+ * support or the cgroups interface are not enabled, thus no state
+ * needs to be maintained in this case.
  */
 static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
 {
-        return !bfq_differentiated_weights(bfqd);
+        return !bfq_varied_queue_weights_or_active_groups(bfqd);
 }
 
 /*
  * If the weight-counter tree passed as input contains no counter for
- * the weight of the input entity, then add that counter; otherwise just
+ * the weight of the input queue, then add that counter; otherwise just
  * increment the existing counter.
  *
  * Note that weight-counter trees contain few nodes in mostly symmetric
@@ -687,25 +685,25 @@ static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
  * In most scenarios, the rate at which nodes are created/destroyed
  * should be low too.
  */
-void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
+void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
                           struct rb_root *root)
 {
+        struct bfq_entity *entity = &bfqq->entity;
         struct rb_node **new = &(root->rb_node), *parent = NULL;
 
         /*
-         * Do not insert if the entity is already associated with a
+         * Do not insert if the queue is already associated with a
          * counter, which happens if:
-         * 1) the entity is associated with a queue,
-         * 2) a request arrival has caused the queue to become both
+         * 1) a request arrival has caused the queue to become both
          *    non-weight-raised, and hence change its weight, and
          *    backlogged; in this respect, each of the two events
          *    causes an invocation of this function,
-         * 3) this is the invocation of this function caused by the
+         * 2) this is the invocation of this function caused by the
          *    second event. This second invocation is actually useless,
          *    and we handle this fact by exiting immediately. More
          *    efficient or clearer solutions might possibly be adopted.
          */
-        if (entity->weight_counter)
+        if (bfqq->weight_counter)
                 return;
 
         while (*new) {
@@ -715,7 +713,7 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
                 parent = *new;
 
                 if (entity->weight == __counter->weight) {
-                        entity->weight_counter = __counter;
+                        bfqq->weight_counter = __counter;
                         goto inc_counter;
                 }
                 if (entity->weight < __counter->weight)
@@ -724,66 +722,67 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
                         new = &((*new)->rb_right);
         }
 
-        entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
-                                         GFP_ATOMIC);
+        bfqq->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
+                                       GFP_ATOMIC);
 
         /*
          * In the unlucky event of an allocation failure, we just
-         * exit. This will cause the weight of entity to not be
-         * considered in bfq_differentiated_weights, which, in its
-         * turn, causes the scenario to be deemed wrongly symmetric in
-         * case entity's weight would have been the only weight making
-         * the scenario asymmetric. On the bright side, no unbalance
-         * will however occur when entity becomes inactive again (the
-         * invocation of this function is triggered by an activation
-         * of entity). In fact, bfq_weights_tree_remove does nothing
-         * if !entity->weight_counter.
+         * exit. This will cause the weight of queue to not be
+         * considered in bfq_varied_queue_weights_or_active_groups,
+         * which, in its turn, causes the scenario to be deemed
+         * wrongly symmetric in case bfqq's weight would have been
+         * the only weight making the scenario asymmetric. On the
+         * bright side, no unbalance will however occur when bfqq
+         * becomes inactive again (the invocation of this function
+         * is triggered by an activation of queue). In fact,
+         * bfq_weights_tree_remove does nothing if
+         * !bfqq->weight_counter.
          */
-        if (unlikely(!entity->weight_counter))
+        if (unlikely(!bfqq->weight_counter))
                 return;
 
-        entity->weight_counter->weight = entity->weight;
-        rb_link_node(&entity->weight_counter->weights_node, parent, new);
-        rb_insert_color(&entity->weight_counter->weights_node, root);
+        bfqq->weight_counter->weight = entity->weight;
+        rb_link_node(&bfqq->weight_counter->weights_node, parent, new);
+        rb_insert_color(&bfqq->weight_counter->weights_node, root);
 
 inc_counter:
-        entity->weight_counter->num_active++;
+        bfqq->weight_counter->num_active++;
 }
 
 /*
- * Decrement the weight counter associated with the entity, and, if the
+ * Decrement the weight counter associated with the queue, and, if the
  * counter reaches 0, remove the counter from the tree.
  * See the comments to the function bfq_weights_tree_add() for considerations
  * about overhead.
  */
 void __bfq_weights_tree_remove(struct bfq_data *bfqd,
-                               struct bfq_entity *entity,
+                               struct bfq_queue *bfqq,
                                struct rb_root *root)
 {
-        if (!entity->weight_counter)
+        if (!bfqq->weight_counter)
                 return;
 
-        entity->weight_counter->num_active--;
-        if (entity->weight_counter->num_active > 0)
+        bfqq->weight_counter->num_active--;
+        if (bfqq->weight_counter->num_active > 0)
                 goto reset_entity_pointer;
 
-        rb_erase(&entity->weight_counter->weights_node, root);
-        kfree(entity->weight_counter);
+        rb_erase(&bfqq->weight_counter->weights_node, root);
+        kfree(bfqq->weight_counter);
 
 reset_entity_pointer:
-        entity->weight_counter = NULL;
+        bfqq->weight_counter = NULL;
 }
 
 /*
- * Invoke __bfq_weights_tree_remove on bfqq and all its inactive
- * parent entities.
+ * Invoke __bfq_weights_tree_remove on bfqq and decrement the number
+ * of active groups for each queue's inactive parent entity.
  */
 void bfq_weights_tree_remove(struct bfq_data *bfqd,
                              struct bfq_queue *bfqq)
 {
         struct bfq_entity *entity = bfqq->entity.parent;
 
-        __bfq_weights_tree_remove(bfqd, &bfqq->entity,
+        __bfq_weights_tree_remove(bfqd, bfqq,
                                   &bfqd->queue_weights_tree);
 
         for_each_entity(entity) {
@@ -797,17 +796,13 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd,
                          * next_in_service for details on why
                          * in_service_entity must be checked too).
                          *
-                         * As a consequence, the weight of entity is
-                         * not to be removed. In addition, if entity
-                         * is active, then its parent entities are
-                         * active as well, and thus their weights are
-                         * not to be removed either. In the end, this
-                         * loop must stop here.
+                         * As a consequence, its parent entities are
+                         * active as well, and thus this loop must
+                         * stop here.
                          */
                         break;
                 }
-                __bfq_weights_tree_remove(bfqd, entity,
-                                          &bfqd->group_weights_tree);
+                bfqd->num_active_groups--;
         }
 }
 
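
For context, a reconstruction that is not shown in this excerpt: the weight counter manipulated by the functions above is a small node keyed by weight, roughly as below. The fields are inferred from how they are accessed here (weight, num_active, weights_node), so treat the exact types as an assumption and check them against block/bfq-iosched.h. With the patch, the counter pointer conceptually moves from bfq_entity to bfq_queue, so only queues ever sit in queue_weights_tree, and groups are tracked through bfqd->num_active_groups instead of a second tree.

/*
 * Sketch of the counter node used by bfq_weights_tree_add() and
 * __bfq_weights_tree_remove(); field types are an assumption.
 */
struct bfq_weight_counter {
        unsigned int weight;            /* weight this counter refers to */
        unsigned int num_active;        /* nr of active queues with this weight */
        struct rb_node weights_node;    /* node in the per-device rbtree */
};

The add/remove pair is expected to be driven by the busy/idle transitions of a queue (the callers live in bfq-wf2q.c, outside this excerpt), which is also where the matching num_active_groups increment belongs in the full patch.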
@@ -3506,9 +3501,11 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
          * symmetric scenario where:
          * (i)  each of these processes must get the same throughput as
          *      the others;
-         * (ii) all these processes have the same I/O pattern
-                (either sequential or random).
-         * In fact, in such a scenario, the drive will tend to treat
+         * (ii) the I/O of each process has the same properties, in
+         *      terms of locality (sequential or random), direction
+         *      (reads or writes), request sizes, greediness
+         *      (from I/O-bound to sporadic), and so on.
+         * In fact, in such a scenario, the drive tends to treat
          * the requests of each of these processes in about the same
          * way as the requests of the others, and thus to provide
          * each of these processes with about the same throughput
@@ -3517,18 +3514,50 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
          * certainly needed to guarantee that bfqq receives its
          * assigned fraction of the device throughput (see [1] for
          * details).
+         * The problem is that idling may significantly reduce
+         * throughput with certain combinations of types of I/O and
+         * devices. An important example is sync random I/O, on flash
+         * storage with command queueing. So, unless bfqq falls in the
+         * above cases where idling also boosts throughput, it would
+         * be important to check conditions (i) and (ii) accurately,
+         * so as to avoid idling when not strictly needed for service
+         * guarantees.
+         *
+         * Unfortunately, it is extremely difficult to thoroughly
+         * check condition (ii). And, in case there are active groups,
+         * it becomes very difficult to check condition (i) too. In
+         * fact, if there are active groups, then, for condition (i)
+         * to become false, it is enough that an active group contains
+         * more active processes or sub-groups than some other active
+         * group. We address this issue with the following bi-modal
+         * behavior, implemented in the function
+         * bfq_symmetric_scenario().
          *
-         * We address this issue by controlling, actually, only the
-         * symmetry sub-condition (i), i.e., provided that
-         * sub-condition (i) holds, idling is not performed,
-         * regardless of whether sub-condition (ii) holds. In other
-         * words, only if sub-condition (i) holds, then idling is
+         * If there are active groups, then the scenario is tagged as
+         * asymmetric, conservatively, without checking any of the
+         * conditions (i) and (ii). So the device is idled for bfqq.
+         * This behavior matches also the fact that groups are created
+         * exactly if controlling I/O (to preserve bandwidth and
+         * latency guarantees) is a primary concern.
+         *
+         * On the opposite end, if there are no active groups, then
+         * only condition (i) is actually controlled, i.e., provided
+         * that condition (i) holds, idling is not performed,
+         * regardless of whether condition (ii) holds. In other words,
+         * only if condition (i) does not hold, then idling is
          * allowed, and the device tends to be prevented from queueing
-         * many requests, possibly of several processes. The reason
-         * for not controlling also sub-condition (ii) is that we
-         * exploit preemption to preserve guarantees in case of
-         * symmetric scenarios, even if (ii) does not hold, as
-         * explained in the next two paragraphs.
+         * many requests, possibly of several processes. Since there
+         * are no active groups, then, to control condition (i) it is
+         * enough to check whether all active queues have the same
+         * weight.
+         *
+         * Not checking condition (ii) evidently exposes bfqq to the
+         * risk of getting less throughput than its fair share.
+         * However, for queues with the same weight, a further
+         * mechanism, preemption, mitigates or even eliminates this
+         * problem. And it does so without consequences on overall
+         * throughput. This mechanism and its benefits are explained
+         * in the next three paragraphs.
          *
          * Even if a queue, say Q, is expired when it remains idle, Q
          * can still preempt the new in-service queue if the next
@@ -3542,11 +3571,7 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
          * idling allows the internal queues of the device to contain
          * many requests, and thus to reorder requests, we can rather
          * safely assume that the internal scheduler still preserves a
-         * minimum of mid-term fairness. The motivation for using
-         * preemption instead of idling is that, by not idling,
-         * service guarantees are preserved without minimally
-         * sacrificing throughput. In other words, both a high
-         * throughput and its desired distribution are obtained.
+         * minimum of mid-term fairness.
          *
          * More precisely, this preemption-based, idleless approach
          * provides fairness in terms of IOPS, and not sectors per
@@ -3565,27 +3590,27 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
          * 1024/8 times as high as the service received by the other
          * queue.
          *
-         * On the other hand, device idling is performed, and thus
-         * pure sector-domain guarantees are provided, for the
-         * following queues, which are likely to need stronger
-         * throughput guarantees: weight-raised queues, and queues
-         * with a higher weight than other queues. When such queues
-         * are active, sub-condition (i) is false, which triggers
-         * device idling.
+         * The motivation for using preemption instead of idling (for
+         * queues with the same weight) is that, by not idling,
+         * service guarantees are preserved (completely or at least in
+         * part) without minimally sacrificing throughput. And, if
+         * there is no active group, then the primary expectation for
+         * this device is probably a high throughput.
          *
-         * According to the above considerations, the next variable is
-         * true (only) if sub-condition (i) holds. To compute the
-         * value of this variable, we not only use the return value of
-         * the function bfq_symmetric_scenario(), but also check
-         * whether bfqq is being weight-raised, because
-         * bfq_symmetric_scenario() does not take into account also
-         * weight-raised queues (see comments on
-         * bfq_weights_tree_add()). In particular, if bfqq is being
-         * weight-raised, it is important to idle only if there are
-         * other, non-weight-raised queues that may steal throughput
-         * to bfqq. Actually, we should be even more precise, and
-         * differentiate between interactive weight raising and
-         * soft real-time weight raising.
+         * We are now left only with explaining the additional
+         * compound condition that is checked below for deciding
+         * whether the scenario is asymmetric. To explain this
+         * compound condition, we need to add that the function
+         * bfq_symmetric_scenario checks the weights of only
+         * non-weight-raised queues, for efficiency reasons (see
+         * comments on bfq_weights_tree_add()). Then the fact that
+         * bfqq is weight-raised is checked explicitly here. More
+         * precisely, the compound condition below takes into account
+         * also the fact that, even if bfqq is being weight-raised,
+         * the scenario is still symmetric if all active queues happen
+         * to be weight-raised. Actually, we should be even more
+         * precise here, and differentiate between interactive weight
+         * raising and soft real-time weight raising.
          *
          * As a side note, it is worth considering that the above
          * device-idling countermeasures may however fail in the
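
To make the "compound condition" discussed in the comment above concrete, here is a hedged reading of what the check in bfq_better_to_idle() boils down to after this patch. The field names wr_coeff, wr_busy_queues and busy_queues are assumptions about struct bfq_queue and struct bfq_data, and the exact expression should be verified against the code that follows this comment in the file.

        asymmetric_scenario =
                /* bfqq is weight-raised while some other active queue is not... */
                (bfqq->wr_coeff > 1 &&
                 bfqd->wr_busy_queues < bfqd->busy_queues) ||
                /* ...or queue weights differ, or at least one group is active */
                !bfq_symmetric_scenario(bfqd);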
@@ -5392,7 +5417,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
         bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
 
         bfqd->queue_weights_tree = RB_ROOT;
-        bfqd->group_weights_tree = RB_ROOT;
+        bfqd->num_active_groups = 0;
 
         INIT_LIST_HEAD(&bfqd->active_list);
         INIT_LIST_HEAD(&bfqd->idle_list);
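
A closing note on the lifecycle of the new counter as it appears in this excerpt: it is initialised to zero here, and decremented in bfq_weights_tree_remove() above for each inactive parent entity of a queue. The matching increment belongs to the part of the patch that touches bfq-wf2q.c and is not shown here; conceptually it sits where an entity that represents a group becomes active, along the lines of the sketch below, whose placement and guard are assumptions rather than quotes from the patch.

        /* Sketch only: in the activation path, with bfqd at hand, count a
         * group entity (one for which bfq_entity_to_bfqq() returns NULL)
         * as it becomes active. Placement and guard are assumptions. */
        if (!bfq_entity_to_bfqq(entity))        /* entity represents a group */
                bfqd->num_active_groups++;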