def parse_xml(xml):
    """Parse one page XML string and yield at most one article dict.

    Nothing is yielded when the page has a ``redirect`` child or when its
    title matches ``WIKIPEDIA_NAMESPACES``; this is what lets the function
    act as a combined map-and-filter step under ``FlatMap``.

    Args:
        xml: Serialized XML of a single page element.

    Yields:
        A dict with keys ``article_id``, ``article_title`` and ``wikitext``
        (the text of the first ``text`` element under ``revision``).

    Raises:
        KeyError: If the page has no ``title``, ``revision`` or ``id`` child.
    """
    page = etree.fromstring(xml)
    children = {el.tag: el for el in page}
    if ('redirect' in children
            or WIKIPEDIA_NAMESPACES.match(children['title'].text)):
        # A bare return ends the generator. Raising StopIteration here is
        # converted into RuntimeError by PEP 479 (Python 3.7+).
        return

    text_elements = children['revision'].iter('text')
    article_id = children['id'].text
    article_title = children['title'].text

    # next() with a default works on Python 2 and 3 and keeps the old
    # "yield nothing when there is no text element" outcome without
    # leaking StopIteration out of the generator body.
    first_text = next(text_elements, None)
    if first_text is None:
        return

    yield {
        'article_id': article_id,
        'article_title': article_title,
        'wikitext': first_text.text,
    }


def parse_wikitext(content):
    """Add a ``text`` entry derived from ``content['wikitext']``.

    The wikitext is parsed with ``mwparserfromhell`` and the parse result is
    passed through ``_strip_code``; the outcome is stored under the ``text``
    key of the same dict, which is mutated in place.

    Args:
        content: Article dict containing a ``wikitext`` entry.

    Yields:
        The same ``content`` dict, now including ``text``.
    """
    parsed_md = mwparserfromhell.parse(content['wikitext'])
    content['text'] = _strip_code(parsed_md)
    yield content


p = apache_beam.Pipeline(argv=pipeline_args)
value = p | apache_beam.Read(
    'Read XML', custom_sources.XmlFileSource('page', gcs_path))
value = value | apache_beam.FlatMap('Parse XML and filter', parse_xml)
# parse_wikitext is a generator function, so it has to be applied with
# FlatMap; Map would emit the generator object itself as the element.
value = value | apache_beam.FlatMap('Wikitext to text', parse_wikitext)
...
def analyze_entities(content):
    """Annotate an article's text and yield one row dict per entity.

    Calls ``language.annotate_text`` on ``content['text']`` requesting both
    entities and document sentiment, then emits one dict per returned
    entity. Nothing is yielded when the response has no ``entities`` entry.

    Args:
        content: Article dict with at least ``text`` and ``article_id``.

    Yields:
        A dict combining article-level fields (including the document
        sentiment polarity, or ``None`` when absent) with the entity name.
    """
    analysis = language.annotate_text(
        content['text'],
        extract_entities=True,
        extract_document_sentiment=True)
    sentiment = analysis.get('documentSentiment', {})
    for entity in analysis.get('entities', []):
        yield {
            'article_id': content['article_id'],
            # ... (further fields elided in the original listing)
            'article_sentiment_polarity': sentiment.get('polarity'),
            'entity_name': entity['name'],
        }


value = value | apache_beam.FlatMap('Entities', analyze_entities)
value = value | apache_beam.Write(
    'Dump metadata to BigQuery',
    apache_beam.io.BigQuerySink(
        destination_table,
        schema=', '.join([
            'article_id:STRING',
            # ... (further columns elided in the original listing)
            'article_sentiment_polarity:FLOAT',
            'entity_name:STRING',
        ]),
        # ... (remaining sink options elided in the original listing)
    ))
-- Five most frequent entity_name values across the whole table, with the
-- number of rows for each (exposed as num_articles).
SELECT
  TOP(entity_name, 5) AS entity_name,
  COUNT(*) AS num_articles
FROM
  [nl-wikipedia:nl_wikipedia.nl_wikipedia];
-- Five most frequent entity_name values among rows whose entity_type is
-- CONSUMER_GOOD, with the number of rows for each (exposed as num_articles).
SELECT
  TOP(entity_name, 5) AS entity_name,
  COUNT(*) AS num_articles
FROM
  [nl-wikipedia:nl_wikipedia.nl_wikipedia]
WHERE
  entity_type = 'CONSUMER_GOOD';
-- Five CONSUMER_GOOD entities with the highest summed article sentiment
-- polarity, counting only rows where the entity's salience exceeds 0.5.
SELECT
  entity_name,
  SUM(article_sentiment_polarity) AS sentiment
FROM
  [nl-wikipedia:nl_wikipedia.nl_wikipedia]
WHERE
  entity_type = 'CONSUMER_GOOD'
  AND entity_salience > 0.5
GROUP BY
  entity_name
ORDER BY
  sentiment DESC
LIMIT 5
-- Five most frequent CONSUMER_GOOD entities found in articles that also
-- contain an entity whose name includes "Android", excluding the
-- Android-named entities themselves.
SELECT
  TOP(entity_name, 5) AS entity_name,
  COUNT(*) AS num_articles
FROM
  [nl_wikipedia.nl_wikipedia]
WHERE
  article_id IN (
    SELECT
      article_id
    FROM
      [nl_wikipedia.nl_wikipedia]
    WHERE
      entity_name LIKE '%Android%')
  AND entity_name NOT LIKE '%Android%'
  AND entity_type = 'CONSUMER_GOOD'
0 件のコメント :
コメントを投稿