@@ -1539,16 +1539,27 @@ async def update_crawl_state(
15391539 print (f"status.stopReason: { status .stopReason } " , flush = True )
15401540
15411541 print (f"stats.size initial: { stats .size } " , flush = True )
1542+ print (f"status.filesAdded: { status .filesAdded } " , flush = True )
15421543 print (f"status.filesAddedSize: { status .filesAddedSize } " , flush = True )
15431544
15441545 # need to add size of previously completed WACZ files as well!
1545- # TODO: This sometimes results in the crawl's stats.size being
1546- # twice as large as expected when pausing crawls, as stats.size
1547- # is not necessarily decremented once WACZ files are uploaded
1548- # This then can have a downstream effects on the storage quota check
1549- stats .size += status .filesAddedSize
1550-
1551- print (f"stats.size after adding filesAddedSize: { stats .size } " , flush = True )
1546+ # TODO: Fix this so that it works as expected with pausing
1547+ # - The if clause here is close to a solution, except that pauses after
1548+ # the first still show a smaller-than-expected size, because files
1549+ # added before the crawl was resumed are no longer counted.
1550+ # - What seems to be needed here is a way of still counting files
1551+ # added prior to the current pause without double-counting files
1552+ # that are currently being uploaded.
1553+ # - Alternatively, the crawler could decrement the size of a crawl by
1554+ # the size of the WACZs that have been uploaded, so that the logic
1555+ # here in the operator can stay simpler.
1556+ if status .stopReason not in PAUSED_STATES :
1557+ stats .size += status .filesAddedSize
1558+ print (f"stats.size after adding filesAddedSize: { stats .size } " , flush = True )
1559+ else :
1560+ print (
1561+ "not adding filesAddedSize to stats.size, crawl is pausing" , flush = True
1562+ )
15521563
15531564 # update status
15541565 status .pagesDone = stats .done
0 commit comments