Problem in geo parsing in v2 API #143

Open
@markowanga

Description

Describe the bug
While downloading tweets with the academic API, collect_results stops with an error.

To Reproduce
Steps (and code snippet) to reproduce the behavior:
The code is part of a bigger system, so I can't provide a simple standalone snippet. I also can't reproduce the error in a fresh snippet, because doing so would run into the monthly tweet cap of the archive 😕

import json
from typing import List, Any, Dict

from arrow import Arrow
from searchtweets import collect_results, load_credentials, gen_request_parameters

from app.application.scrap_service import ScrapService
from app.domain.raw_json_twitter_response import RawJsonTwitterResponse
from app.util.log_util import get_logger

logger = get_logger('twitter_scrap')


class OfficialTwitterScrapService(ScrapService):
    _config_file: str
    _premium_search_args: Dict[str, Any]

    def __init__(self, config_file: str):
        self._config_file = config_file
        self._premium_search_args = load_credentials(self._config_file,
                                                     yaml_key="search_tweets_premium",
                                                     env_overwrite=False)

    def scrap(
            self,
            query: str,
            since: Arrow,
            until: Arrow
    ) -> List[RawJsonTwitterResponse]:
        logger.info(
            f'run scrap query :: {query}'
            f' | since :: {since.isoformat()}'
            f' | until :: {until.isoformat()}'
        )
        query = gen_request_parameters(
            query=query,
            granularity=None,
            results_per_call=100,
            start_time=self._get_string_time_from_arrow(since),
            end_time=self._get_string_time_from_arrow(until),
            expansions='attachments.poll_ids,attachments.media_keys,author_id,'
                       'entities.mentions.username,geo.place_id,in_reply_to_user_id,'
                       'referenced_tweets.id,referenced_tweets.id.author_id',
            media_fields='duration_ms,height,media_key,preview_image_url,type,url,width,'
                         'public_metrics,alt_text',
            place_fields='contained_within,country,country_code,full_name,geo,id,name,place_type',
            tweet_fields='attachments,author_id,context_annotations,conversation_id,created_at,'
                         'entities,geo,id,in_reply_to_user_id,lang,public_metrics,'
                         'possibly_sensitive,referenced_tweets,reply_settings,source,'
                         'text,withheld',
            user_fields='created_at,description,entities,id,location,name,pinned_tweet_id,'
                        'profile_image_url,protected,public_metrics,url,username,verified,withheld'
        )
        tweets = collect_results(
            query,
            max_tweets=10_000_000,
            result_stream_args=self._premium_search_args
        )
        return [RawJsonTwitterResponse(json.dumps(it)) for it in tweets]

    @staticmethod
    def _get_string_time_from_arrow(time: Arrow) -> str:
        # Arrow's isoformat() returns e.g. '2020-04-13T00:00:00+00:00';
        # slicing off the last 9 characters leaves minute precision
        # ('2020-04-13T00:00'), which the API accepts for start_time/end_time.
        return time.isoformat()[:-9]
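
For context, the failure reduces to the parsing step alone: a v2 geo object can carry coordinates without a place_id, and that is exactly the lookup that fails in the traceback below. A minimal sketch with a hypothetical payload, not taken from my data:

payload = {
    'id': '1467400000000000000',  # hypothetical tweet id
    'text': 'example tweet',
    # v2 geo object with exact coordinates but no tagged place
    'geo': {'coordinates': {'type': 'Point', 'coordinates': [21.01, 52.23]}},
}
place_id = payload['geo']['place_id']  # raises KeyError: 'place_id'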

Expected behavior
The scrape should run to completion without raising an error.

Environment

  • Ubuntu 20.20
  • Docker image python:3.8

Additional context
Error log

2021-12-05 09:24:42,927 [searchtweets.result_stream    ] INFO     paging; total requests read so far: 103
2021-12-05 09:24:44,929 [searchtweets.result_stream    ] DEBUG    sending request
2021-12-05 09:24:45,971 [urllib3.connectionpool        ] DEBUG    https://api.twitter.com:443 "GET /2/tweets/search/all?query=%28%22%23covid%22+OR+%22%23COVID-19%22+OR+%22%23Covid19%22+OR+%22%23doros%C5%82o%C5%9B%C4%87%22+OR+%22%23generacjaX%22+OR+%22%23generacjaY%22+OR+%22%23generacjaZ%22+OR+%22%23genX%22+OR+%22%23genY%22+OR+%22%23genZ%22+OR+%22%23koronawirus%22+OR+%22%23koronawiruspolska%22+OR+%22%23liceum%22+OR+%22%23lockdown%22+OR+%22%23matura%22+OR+%22%23matura2020%22+OR+%22%23matura2021%22+OR+%22%23matura2022%22+OR+%22%23millenialsi%22+OR+%22%23m%C5%82odzi%22+OR+%22%23pandemia%22+OR+%22%23pierwszami%C5%82o%C5%9B%C4%87%22+OR+%22%23pierwszapraca%22+OR+%22%23praca2020%22+OR+%22%23praca2021%22+OR+%22%23praca2022%22+OR+%22%23pracazdalna%22+OR+%22%23praktyki%22+OR+%22%23rekrutacja2020%22+OR+%22%23rekrutacja2021%22+OR+%22%23rekrutacja2022%22+OR+%22%23siedznadupie%22+OR+%22%23solidarno%C5%9B%C4%87%22+OR+%22%23sta%C5%BC%22+OR+%22%23strajkkobiet%22+OR+%22%23studia2020%22+OR+%22%23studia2021%22+OR+%22%23studia2022%22+OR+%22%23studiazdalne%22+OR+%22%23zdalne%22+OR+%22%23zdalnenauczanie%22+OR+%22%23zostanwdomu%22%29+lang%3Apl&start_time=2020-04-13T00%3A00%3A00Z&end_time=2020-04-14T00%3A00%3A00Z&max_results=100&tweet.fields=attachments%2Cauthor_id%2Ccontext_annotations%2Cconversation_id%2Ccreated_at%2Centities%2Cgeo%2Cid%2Cin_reply_to_user_id%2Clang%2Cpublic_metrics%2Cpossibly_sensitive%2Creferenced_tweets%2Creply_settings%2Csource%2Ctext%2Cwithheld&user.fields=created_at%2Cdescription%2Centities%2Cid%2Clocation%2Cname%2Cpinned_tweet_id%2Cprofile_image_url%2Cprotected%2Cpublic_metrics%2Curl%2Cusername%2Cverified%2Cwithheld&media.fields=duration_ms%2Cheight%2Cmedia_key%2Cpreview_image_url%2Ctype%2Curl%2Cwidth%2Cpublic_metrics%2Calt_text&place.fields=contained_within%2Ccountry%2Ccountry_code%2Cfull_name%2Cgeo%2Cid%2Cname%2Cplace_type&expansions=attachments.poll_ids%2Cattachments.media_keys%2Cauthor_id%2Centities.mentions.username%2Cgeo.place_id%2Cin_reply_to_user_id%2Creferenced_tweets.id%2Creferenced_tweets.id.author_id&next_token=b26v89c19zqg8o3fo77h5m9ag2pb6dnxq7h6w432p5myl HTTP/1.1" 200 60232
Traceback (most recent call last):
  File "app/main.py", line 41, in <module>
    worker_loop()
  File "app/main.py", line 34, in worker_loop
    single_work()
  File "app/main.py", line 28, in single_work
    get_worker_service().run()
  File "/app/app/application/worker_service.py", line 53, in run
    raw_responses = self._scrap_service.scrap(
  File "/app/app/infrastructure/official_twitter_scrap_service.py", line 54, in scrap
    tweets = collect_results(
  File "/root/.cache/pypoetry/virtualenvs/swps-tweet-infrastructure-9TtSrW0h-py3.8/lib/python3.8/site-packages/searchtweets/result_stream.py", line 467, in collect_results
    return list(rs.stream())
  File "/root/.cache/pypoetry/virtualenvs/swps-tweet-infrastructure-9TtSrW0h-py3.8/lib/python3.8/site-packages/searchtweets/result_stream.py", line 361, in stream
    yield from self.formatted_output()
  File "/root/.cache/pypoetry/virtualenvs/swps-tweet-infrastructure-9TtSrW0h-py3.8/lib/python3.8/site-packages/searchtweets/result_stream.py", line 288, in formatted_output
    includes_tweets[included_id] = expand_payload(included_tweet)
  File "/root/.cache/pypoetry/virtualenvs/swps-tweet-infrastructure-9TtSrW0h-py3.8/lib/python3.8/site-packages/searchtweets/result_stream.py", line 270, in expand_payload
    place_id = payload["geo"]['place_id']
KeyError: 'place_id'
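
A defensive lookup would avoid the crash. A minimal sketch of the guard I would expect around that line in expand_payload (a hypothetical helper based on my reading of the traceback, not a proposed final patch for result_stream.py):

def merge_place_into_geo(payload, included_places):
    # A tweet's geo object may contain only coordinates and no
    # tagged place; .get() lets such tweets pass through untouched
    # instead of raising KeyError: 'place_id'.
    geo = payload.get('geo', {})
    place_id = geo.get('place_id')
    if place_id is not None and place_id in included_places:
        payload['geo'] = {**geo, **included_places[place_id]}
    return payload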
