@@ -302,7 +302,8 @@ class KafkaConsumer(six.Iterator):
302
302
'sasl_plain_password' : None ,
303
303
'sasl_kerberos_service_name' : 'kafka' ,
304
304
'sasl_kerberos_domain_name' : None ,
305
- 'sasl_oauth_token_provider' : None
305
+ 'sasl_oauth_token_provider' : None ,
306
+ 'legacy_iterator' : False , # enable to revert to < 1.4.7 iterator
306
307
}
307
308
DEFAULT_SESSION_TIMEOUT_MS_0_9 = 30000
308
309
@@ -597,7 +598,7 @@ def partitions_for_topic(self, topic):
597
598
partitions = cluster .partitions_for_topic (topic )
598
599
return partitions
599
600
600
- def poll (self , timeout_ms = 0 , max_records = None ):
601
+ def poll (self , timeout_ms = 0 , max_records = None , update_offsets = True ):
601
602
"""Fetch data from assigned topics / partitions.
602
603
603
604
Records are fetched and returned in batches by topic-partition.
@@ -621,6 +622,12 @@ def poll(self, timeout_ms=0, max_records=None):
621
622
dict: Topic to list of records since the last fetch for the
622
623
subscribed list of topics and partitions.
623
624
"""
625
+ # Note: update_offsets is an internal-use only argument. It is used to
626
+ # support the python iterator interface, which wraps consumer.poll()
627
+ # and requires that the partition offsets tracked by the fetcher are not
628
+ # updated until the iterator returns each record to the user. As such,
629
+ # the argument is not documented, and library users should not rely on
630
+ # it remaining stable in future releases.
624
631
assert timeout_ms >= 0 , 'Timeout must not be negative'
625
632
if max_records is None :
626
633
max_records = self .config ['max_poll_records' ]
@@ -631,7 +638,7 @@ def poll(self, timeout_ms=0, max_records=None):
631
638
start = time .time ()
632
639
remaining = timeout_ms
633
640
while True :
634
- records = self ._poll_once (remaining , max_records )
641
+ records = self ._poll_once (remaining , max_records , update_offsets = update_offsets )
635
642
if records :
636
643
return records
637
644
@@ -641,7 +648,7 @@ def poll(self, timeout_ms=0, max_records=None):
641
648
if remaining <= 0 :
642
649
return {}
643
650
644
- def _poll_once (self , timeout_ms , max_records ):
651
+ def _poll_once (self , timeout_ms , max_records , update_offsets = True ):
645
652
"""Do one round of polling. In addition to checking for new data, this does
646
653
any needed heart-beating, auto-commits, and offset updates.
647
654
@@ -660,7 +667,7 @@ def _poll_once(self, timeout_ms, max_records):
660
667
661
668
# If data is available already, e.g. from a previous network client
662
669
# poll() call to commit, then just return it immediately
663
- records , partial = self ._fetcher .fetched_records (max_records )
670
+ records , partial = self ._fetcher .fetched_records (max_records , update_offsets = update_offsets )
664
671
if records :
665
672
# Before returning the fetched records, we can send off the
666
673
# next round of fetches and avoid block waiting for their
@@ -680,7 +687,7 @@ def _poll_once(self, timeout_ms, max_records):
680
687
if self ._coordinator .need_rejoin ():
681
688
return {}
682
689
683
- records , _ = self ._fetcher .fetched_records (max_records )
690
+ records , _ = self ._fetcher .fetched_records (max_records , update_offsets = update_offsets )
684
691
return records
685
692
686
693
def position (self , partition ):
@@ -743,6 +750,9 @@ def pause(self, *partitions):
743
750
for partition in partitions :
744
751
log .debug ("Pausing partition %s" , partition )
745
752
self ._subscription .pause (partition )
753
+ # Because the iterator checks is_fetchable() on each iteration
754
+ # we expect pauses to get handled automatically and therefore
755
+ # we do not need to reset the full iterator (forcing a full refetch)
746
756
747
757
def paused (self ):
748
758
"""Get the partitions that were previously paused using
@@ -790,6 +800,8 @@ def seek(self, partition, offset):
790
800
assert partition in self ._subscription .assigned_partitions (), 'Unassigned partition'
791
801
log .debug ("Seeking to offset %s for partition %s" , offset , partition )
792
802
self ._subscription .assignment [partition ].seek (offset )
803
+ if not self .config ['legacy_iterator' ]:
804
+ self ._iterator = None
793
805
794
806
def seek_to_beginning (self , * partitions ):
795
807
"""Seek to the oldest available offset for partitions.
@@ -814,6 +826,8 @@ def seek_to_beginning(self, *partitions):
814
826
for tp in partitions :
815
827
log .debug ("Seeking to beginning of partition %s" , tp )
816
828
self ._subscription .need_offset_reset (tp , OffsetResetStrategy .EARLIEST )
829
+ if not self .config ['legacy_iterator' ]:
830
+ self ._iterator = None
817
831
818
832
def seek_to_end (self , * partitions ):
819
833
"""Seek to the most recent available offset for partitions.
@@ -838,6 +852,8 @@ def seek_to_end(self, *partitions):
838
852
for tp in partitions :
839
853
log .debug ("Seeking to end of partition %s" , tp )
840
854
self ._subscription .need_offset_reset (tp , OffsetResetStrategy .LATEST )
855
+ if not self .config ['legacy_iterator' ]:
856
+ self ._iterator = None
841
857
842
858
def subscribe (self , topics = (), pattern = None , listener = None ):
843
859
"""Subscribe to a list of topics, or a topic regex pattern.
@@ -913,6 +929,8 @@ def unsubscribe(self):
913
929
self ._client .cluster .need_all_topic_metadata = False
914
930
self ._client .set_topics ([])
915
931
log .debug ("Unsubscribed all topics or patterns and assigned partitions" )
932
+ if not self .config ['legacy_iterator' ]:
933
+ self ._iterator = None
916
934
917
935
def metrics (self , raw = False ):
918
936
"""Get metrics on consumer performance.
@@ -1075,6 +1093,25 @@ def _update_fetch_positions(self, partitions):
1075
1093
# Then, do any offset lookups in case some positions are not known
1076
1094
self ._fetcher .update_fetch_positions (partitions )
1077
1095
1096
def _message_generator_v2(self):
    """Yield the records of a single poll() call, one record at a time.

    The poll is issued with ``update_offsets=False``; instead, the
    position of each partition is advanced to ``record.offset + 1``
    immediately before the corresponding record is yielded.

    Yields:
        Each record returned by poll(), grouped by topic-partition in
        the order poll() returned them.
    """
    # Clamp at zero: the deadline may elapse between the caller's check of
    # self._consumer_timeout and this computation, and poll() asserts that
    # timeout_ms is not negative.
    timeout_ms = 1000 * max(0, self._consumer_timeout - time.time())
    record_map = self.poll(timeout_ms=timeout_ms, update_offsets=False)
    for tp, records in six.iteritems(record_map):
        # Generators are stateful, and it is possible that the tp / records
        # here may become stale during iteration -- i.e., we seek to a
        # different offset, pause consumption, or lose assignment.
        for record in records:
            # is_fetchable(tp) should handle assignment changes and offset
            # resets; for all other changes (e.g., seeks) we'll rely on the
            # outer function destroying the existing iterator/generator
            # via self._iterator = None
            if not self._subscription.is_fetchable(tp):
                log.debug("Not returning fetched records for partition %s"
                          " since it is no longer fetchable", tp)
                break
            # Advance the tracked position only now that the record is
            # actually being handed to the caller.
            self._subscription.assignment[tp].position = record.offset + 1
            yield record
1114
+
1078
1115
def _message_generator (self ):
1079
1116
assert self .assignment () or self .subscription () is not None , 'No topic subscription or manual partition assignment'
1080
1117
while time .time () < self ._consumer_timeout :
@@ -1127,6 +1164,26 @@ def __iter__(self): # pylint: disable=non-iterator-returned
1127
1164
return self
1128
1165
1129
1166
def __next__(self):
    """Return the next record, dispatching on the 'legacy_iterator' setting.

    With the heartbeat thread running in the background there should be
    no need for a separate iterator implementation; the legacy path is
    kept available for a few releases just in case.
    """
    use_legacy = self.config['legacy_iterator']
    # Only the selected implementation is looked up and invoked.
    advance = self.next_v1 if use_legacy else self.next_v2
    return advance()
1174
+
1175
def next_v2(self):
    """Return the next record from the poll()-backed generator.

    Calls self._set_consumer_timeout() and then keeps trying until the
    current time reaches self._consumer_timeout. A new generator is
    created whenever self._iterator is unset; an exhausted generator is
    discarded and replaced on the next pass.

    Raises:
        StopIteration: once the consumer timeout has been reached without
            a record being returned.
    """
    self._set_consumer_timeout()
    while True:
        if not time.time() < self._consumer_timeout:
            # Deadline reached without producing a record.
            raise StopIteration()
        current = self._iterator
        if not current:
            current = self._iterator = self._message_generator_v2()
        try:
            record = next(current)
        except StopIteration:
            # Generator drained: drop it so a fresh one is built next pass.
            self._iterator = None
            continue
        return record
1185
+
1186
+ def next_v1 (self ):
1130
1187
if not self ._iterator :
1131
1188
self ._iterator = self ._message_generator ()
1132
1189
0 commit comments