Describe the bug
While downloading tweets through the academic API, the `collect_results` call stops partway with an error.
To Reproduce
Steps (and code snippet) to reproduce the behavior:
The code is part of a bigger system, so I can't post a simple standalone example. I also can't reproduce it in a fresh snippet, because I have already hit the monthly tweet cap of the archive search 😕
```python
import json
from typing import List, Any, Dict

from arrow import Arrow
from searchtweets import collect_results, load_credentials, gen_request_parameters

from app.application.scrap_service import ScrapService
from app.domain.raw_json_twitter_response import RawJsonTwitterResponse
from app.util.log_util import get_logger

logger = get_logger('twitter_scrap')


class OfficialTwitterScrapService(ScrapService):
    _config_file: str
    _premium_search_args: Dict[str, Any]

    def __init__(self, config_file: str):
        self._config_file = config_file
        self._premium_search_args = load_credentials(self._config_file,
                                                     yaml_key="search_tweets_premium",
                                                     env_overwrite=False)

    def scrap(
            self,
            query: str,
            since: Arrow,
            until: Arrow
    ) -> List[RawJsonTwitterResponse]:
        logger.info(
            f'run scrap query :: {query}'
            f' | since :: {since.isoformat()}'
            f' | until :: {until.isoformat()}'
        )
        query = gen_request_parameters(
            query=query,
            granularity=None,
            results_per_call=100,
            start_time=self._get_string_time_from_arrow(since),
            end_time=self._get_string_time_from_arrow(until),
            expansions='attachments.poll_ids,attachments.media_keys,author_id,'
                       'entities.mentions.username,geo.place_id,in_reply_to_user_id,'
                       'referenced_tweets.id,referenced_tweets.id.author_id',
            media_fields='duration_ms,height,media_key,preview_image_url,type,url,width,'
                         'public_metrics,alt_text',
            place_fields='contained_within,country,country_code,full_name,geo,id,name,place_type',
            tweet_fields='attachments,author_id,context_annotations,conversation_id,created_at,'
                         'entities,geo,id,in_reply_to_user_id,lang,public_metrics,'
                         'possibly_sensitive,referenced_tweets,reply_settings,source,'
                         'text,withheld',
            user_fields='created_at,description,entities,id,location,name,pinned_tweet_id,'
                        'profile_image_url,protected,public_metrics,url,username,verified,withheld'
        )
        tweets = collect_results(
            query,
            max_tweets=10_000_000,
            result_stream_args=self._premium_search_args
        )
        return [RawJsonTwitterResponse(json.dumps(it)) for it in tweets]

    @staticmethod
    def _get_string_time_from_arrow(time: Arrow) -> str:
        return time.isoformat()[:-9]
```
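
As a temporary workaround on my side I'm considering monkey-patching `searchtweets.result_stream.expand_payload` so that a payload whose expansion raises `KeyError` is passed through unexpanded instead of killing the whole stream. A minimal sketch (my own guess, untested against a real stream, and it loses the expanded geo data for affected tweets):

```python
import searchtweets.result_stream as result_stream

_original_expand_payload = result_stream.expand_payload


def _safe_expand_payload(payload):
    # Fall back to the unexpanded payload instead of aborting the stream,
    # e.g. when a 'geo' object has coordinates but no 'place_id' key.
    try:
        return _original_expand_payload(payload)
    except KeyError:
        return payload


# formatted_output() resolves expand_payload through the module globals,
# so rebinding the name here changes what it calls.
result_stream.expand_payload = _safe_expand_payload
```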
Expected behavior
I expect to be able to scrape tweets without the run aborting on an error.
Environment
- Ubuntu 20.20
- Docker image `python:3.8`
Additional context
Error log:

```
2021-12-05 09:24:42,927 [searchtweets.result_stream ] INFO paging; total requests read so far: 103
2021-12-05 09:24:44,929 [searchtweets.result_stream ] DEBUG sending request
2021-12-05 09:24:45,971 [urllib3.connectionpool ] DEBUG https://api.twitter.com:443 "GET /2/tweets/search/all?query=%28%22%23covid%22+OR+%22%23COVID-19%22+OR+%22%23Covid19%22+OR+%22%23doros%C5%82o%C5%9B%C4%87%22+OR+%22%23generacjaX%22+OR+%22%23generacjaY%22+OR+%22%23generacjaZ%22+OR+%22%23genX%22+OR+%22%23genY%22+OR+%22%23genZ%22+OR+%22%23koronawirus%22+OR+%22%23koronawiruspolska%22+OR+%22%23liceum%22+OR+%22%23lockdown%22+OR+%22%23matura%22+OR+%22%23matura2020%22+OR+%22%23matura2021%22+OR+%22%23matura2022%22+OR+%22%23millenialsi%22+OR+%22%23m%C5%82odzi%22+OR+%22%23pandemia%22+OR+%22%23pierwszami%C5%82o%C5%9B%C4%87%22+OR+%22%23pierwszapraca%22+OR+%22%23praca2020%22+OR+%22%23praca2021%22+OR+%22%23praca2022%22+OR+%22%23pracazdalna%22+OR+%22%23praktyki%22+OR+%22%23rekrutacja2020%22+OR+%22%23rekrutacja2021%22+OR+%22%23rekrutacja2022%22+OR+%22%23siedznadupie%22+OR+%22%23solidarno%C5%9B%C4%87%22+OR+%22%23sta%C5%BC%22+OR+%22%23strajkkobiet%22+OR+%22%23studia2020%22+OR+%22%23studia2021%22+OR+%22%23studia2022%22+OR+%22%23studiazdalne%22+OR+%22%23zdalne%22+OR+%22%23zdalnenauczanie%22+OR+%22%23zostanwdomu%22%29+lang%3Apl&start_time=2020-04-13T00%3A00%3A00Z&end_time=2020-04-14T00%3A00%3A00Z&max_results=100&tweet.fields=attachments%2Cauthor_id%2Ccontext_annotations%2Cconversation_id%2Ccreated_at%2Centities%2Cgeo%2Cid%2Cin_reply_to_user_id%2Clang%2Cpublic_metrics%2Cpossibly_sensitive%2Creferenced_tweets%2Creply_settings%2Csource%2Ctext%2Cwithheld&user.fields=created_at%2Cdescription%2Centities%2Cid%2Clocation%2Cname%2Cpinned_tweet_id%2Cprofile_image_url%2Cprotected%2Cpublic_metrics%2Curl%2Cusername%2Cverified%2Cwithheld&media.fields=duration_ms%2Cheight%2Cmedia_key%2Cpreview_image_url%2Ctype%2Curl%2Cwidth%2Cpublic_metrics%2Calt_text&place.fields=contained_within%2Ccountry%2Ccountry_code%2Cfull_name%2Cgeo%2Cid%2Cname%2Cplace_type&expansions=attachments.poll_ids%2Cattachments.media_keys%2Cauthor_id%2Centities.mentions.username%2Cgeo.place_id%2Cin_reply_to_user_id%2Creferenced_tweets.id%2Creferenced_tweets.id.author_id&next_token=b26v89c19zqg8o3fo77h5m9ag2pb6dnxq7h6w432p5myl HTTP/1.1" 200 60232
Traceback (most recent call last):
File "app/main.py", line 41, in <module>
worker_loop()
File "app/main.py", line 34, in worker_loop
single_work()
File "app/main.py", line 28, in single_work
get_worker_service().run()
File "/app/app/application/worker_service.py", line 53, in run
raw_responses = self._scrap_service.scrap(
File "/app/app/infrastructure/official_twitter_scrap_service.py", line 54, in scrap
tweets = collect_results(
File "/root/.cache/pypoetry/virtualenvs/swps-tweet-infrastructure-9TtSrW0h-py3.8/lib/python3.8/site-packages/searchtweets/result_stream.py", line 467, in collect_results
return list(rs.stream())
File "/root/.cache/pypoetry/virtualenvs/swps-tweet-infrastructure-9TtSrW0h-py3.8/lib/python3.8/site-packages/searchtweets/result_stream.py", line 361, in stream
yield from self.formatted_output()
File "/root/.cache/pypoetry/virtualenvs/swps-tweet-infrastructure-9TtSrW0h-py3.8/lib/python3.8/site-packages/searchtweets/result_stream.py", line 288, in formatted_output
includes_tweets[included_id] = expand_payload(included_tweet)
File "/root/.cache/pypoetry/virtualenvs/swps-tweet-infrastructure-9TtSrW0h-py3.8/lib/python3.8/site-packages/searchtweets/result_stream.py", line 270, in expand_payload
place_id = payload["geo"]['place_id']
KeyError: 'place_id'
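```

The last request returns 200, so this looks like a parsing problem in `expand_payload` rather than an API error: it indexes `payload["geo"]["place_id"]` unconditionally, while (as far as I understand) a tweet's `geo` object can carry only exact `coordinates` and no `place_id`. A minimal sketch of the payload shape that I suspect triggers it (illustrative values, not taken from my data):

```python
# Hypothetical included tweet: 'geo' is present but holds only a point,
# with no 'place_id' key.
included_tweet = {
    "id": "1360000000000000000",
    "text": "...",
    "geo": {
        "coordinates": {"type": "Point", "coordinates": [21.01, 52.23]},
    },
}

included_tweet["geo"]["place_id"]  # KeyError: 'place_id', as in the traceback
```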