From 61a3db98fc15e6f2ce77cecf0bbbd3f5877cfc8c Mon Sep 17 00:00:00 2001 From: Yu-Han Liu Date: Wed, 6 Sep 2017 09:26:48 -0700 Subject: [PATCH 01/11] first version of classify_text tutorial --- language/classify_text/README.md | 80 +++++++ .../classify_text/classify_text_tutorial.py | 221 ++++++++++++++++++ .../classify_text_tutorial_test.py | 40 ++++ language/classify_text/index.json | 1 + language/classify_text/requirements.txt | 1 + .../classify_text/resources/query_text.txt | 1 + .../classify_text/resources/texts/android.txt | 1 + .../classify_text/resources/texts/eclipse.txt | 1 + .../resources/texts/eclipse_of_the_sun.txt | 1 + .../classify_text/resources/texts/gcp.txt | 1 + .../classify_text/resources/texts/google.txt | 1 + language/cloud-client/v1/snippets.py | 2 +- 12 files changed, 350 insertions(+), 1 deletion(-) create mode 100644 language/classify_text/README.md create mode 100644 language/classify_text/classify_text_tutorial.py create mode 100644 language/classify_text/classify_text_tutorial_test.py create mode 100644 language/classify_text/index.json create mode 100644 language/classify_text/requirements.txt create mode 100644 language/classify_text/resources/query_text.txt create mode 100644 language/classify_text/resources/texts/android.txt create mode 100644 language/classify_text/resources/texts/eclipse.txt create mode 100644 language/classify_text/resources/texts/eclipse_of_the_sun.txt create mode 100644 language/classify_text/resources/texts/gcp.txt create mode 100644 language/classify_text/resources/texts/google.txt diff --git a/language/classify_text/README.md b/language/classify_text/README.md new file mode 100644 index 00000000000..85d293b35ab --- /dev/null +++ b/language/classify_text/README.md @@ -0,0 +1,80 @@ +# Introduction + +This sample contains the code referenced in the +[Text Classification Tutorial](http://cloud.google.com/natural-language/docs/classify-text-tutorial) within the Google Cloud Natural Language API Documentation. A full walkthrough of this sample is located within the documentation. + +This sample shows how one can use the text classification feature of the Natural Language API to find similar texts based on a query. + +## Prerequisites + +Set up your +[Cloud Natural Language API project](https://cloud.google.com/natural-language/docs/getting-started#set_up_a_project) +, which includes: + +* Enabling the Natural Language API +* Setting up a service account +* Ensuring you've properly set up your `GOOGLE_APPLICATION_CREDENTIALS` for proper + authentication to the service. + +## Download the Code + +``` +$ git clone https://github.com/GoogleCloudPlatform/python-docs-samples.git +$ cd python-docs-samples/language/classify_text +``` + +## Run the Code + +Open a sample folder, create a virtualenv, install dependencies, and run the sample: + +``` +$ virtualenv env +$ source env/bin/activate +(env)$ pip install -r requirements.txt +``` + +### Usage + +This sample is organized as a script runnable from the command line. It can perform the following tasks: + + * Classifies multiple text files and write the result to an "index" file. + * Processes input query text to find similar text files. + * Processes input query category label to find similar text files. + +## Classify text + +``` +python classify_text_tutorial.py classify "$(cat resources/query_text.txt)" +``` + +Note that the text needs to be sufficiently long for the API to return a non-empty +response. 
+ +## Index mulitple text files + +``` +python classify_text_tutorial.py index resources/texts +``` + +By default this creates a file `index.json`, which you can specify by passing in the optional `--index_file` argument. + +## Query with a category label + +The indexed text files can be queried with any of the category labels listed on the [Categories](https://cloud.google.com/natural-language/docs/categories) page. + +``` +python classify_text_tutorial.py query-category index.json "/Internet & Telecom/Mobile & Wireless" +``` + +## Query with text + +The indexed text files can be queried with another text that might not have been indexec. + +``` +python classify_text_tutorial.py query index.json "$(cat resources/query_text.txt)" +``` + + + + + diff --git a/language/classify_text/classify_text_tutorial.py b/language/classify_text/classify_text_tutorial.py new file mode 100644 index 00000000000..668f36cb8fd --- /dev/null +++ b/language/classify_text/classify_text_tutorial.py @@ -0,0 +1,221 @@ +# Copyright 2017, Google, Inc. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# [START classify_text_tutorial] +"""Demonstrates how to use the classify_text method.""" + +# [START classify_text_tutorial_import] +import argparse +import json +import os +import numpy as np + +from google.cloud import language_v1beta2 +from google.cloud.language_v1beta2 import types +from google.cloud.language_v1beta2 import enums +# [END classify_text_tutorial_import] + + +# [START def_classify] +def classify(text, verbose=True): + """Classify the input text into categories. """ + + language_client = language_v1beta2.LanguageServiceClient() + + document = types.Document( + content=text, + type=enums.Document.Type.HTML) + categories = language_client.classify_text(document).categories + + result = {} + + for category in categories: + # Turn the categories into a dictionary. + result[category.name] = category.confidence + + if verbose: + print(u'=' * 20) + print(u'{:<16}: {}'.format('category', category.name)) + print(u'{:<16}: {}'.format('confidence', category.confidence)) + + return result +# [END def_classify] + + +# [START def_index] +def index(path, index_file): + """Classify each text file in the directory and write + the results to the index_file. """ + + result = {} + for filename in os.listdir(path): + file_path = os.path.join(path, filename) + + if not os.path.isfile(file_path): + continue + + with open(file_path, 'r') as f: + text = f.read() + categories = classify(text, verbose=False) + + result[filename] = categories + + with open(index_file, 'w') as f: + json.dump(result, f) + + print('Texts indexed in file: {}'.format(index_file)) + return result +# [END def_index] + + +# [START def_query] +def query(index_file, text, n_top=3): + """Find the indexed files that are the most similar to + the query text. """ + + with open(index_file, 'r') as f: + index = json.load(f) + + # Get the categories of the query text. 
+ query_categories = classify(text, verbose=False) + + similarities = [] + for filename, categories in index.iteritems(): + similarities.append((filename, similarity(query_categories, categories))) + + similarities = sorted(similarities, key=lambda p: p[1], reverse=True) + + print('=' * 20) + print('Query: {}\n'.format(text)) + for category, confidence in query_categories.iteritems(): + print('\tCategory: {}, confidence: {}'.format(category, confidence)) + print('\nMost similar {} indexed texts:'.format(n_top)) + for filename, sim in similarities[:n_top]: + print('\tFilename: {}'.format(filename)) + print('\tSimilarity: {}'.format(sim)) + print('\n') + + return similarities +# [END def_query] + + +# [START def_query_category] +def query_category(index_file, category_string, n_top=3): + """Find the indexed files that are the most similar to + the query label. + + The list of all available labels: + https://cloud.google.com/natural-language/docs/categories""" + + with open(index_file, 'r') as f: + index = json.load(f) + + # Make the category_string into a dict + query_categories = {category_string: 1.0} + + similarities = [] + for filename, categories in index.iteritems(): + similarities.append((filename, similarity(query_categories, categories))) + + similarities = sorted(similarities, key=lambda p: p[1], reverse=True) + + print('=' * 20) + print('Query: {}\n'.format(category_string)) + print('\nMost similar {} indexed texts:'.format(n_top)) + for filename, sim in similarities[:n_top]: + print('\tFilename: {}'.format(filename)) + print('\tSimilarity: {}'.format(sim)) + print('\n') + + return similarities +# [END def_query_category] + + +# [START def_similarity] +def similarity(categories1, categories2): + """Cosine similarity of the categories treated as sparse vectors.""" + + def split_labels(categories): + _categories = {} + for name, confidence in categories.iteritems(): + labels = [label for label in name.split('/') if label] + for label in labels: + _categories[label] = confidence + + return _categories + + def norm(categories): + if len(categories) == 0: + return 0.0 + return np.linalg.norm(categories.values()) + + categories1 = split_labels(categories1) + categories2 = split_labels(categories2) + + norm1 = norm(categories1) + norm2 = norm(categories2) + + # Return the smallest possible similarity if either categories is empty. + if norm1 == 0 or norm2 == 0: + return 0.0 + + dot = 0.0 + for label, confidence in categories1.iteritems(): + dot += confidence * categories2.get(label, 0.0) + + return dot / (norm1 * norm2) +# [END def_similarity] + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + subparsers = parser.add_subparsers(dest='command') + classify_parser = subparsers.add_parser( + 'classify', help=classify.__doc__) + classify_parser.add_argument( + 'text', help='The text to be classified. 
' + 'The text needs to have at least 20 tokens.') + index_parser = subparsers.add_parser( + 'index', help=index.__doc__) + index_parser.add_argument( + 'path', help='The directory that contains ' + 'text files to be indexed.') + index_parser.add_argument( + '--index_file', help='Filename for the output JSON.', + default='index.json') + query_parser = subparsers.add_parser( + 'query', help=query.__doc__) + query_parser.add_argument( + 'index_file', help='Path to the index JSON file.') + query_parser.add_argument( + 'text', help='Query text.') + query_category_parser = subparsers.add_parser( + 'query-category', help=query_category.__doc__) + query_category_parser.add_argument( + 'index_file', help='Path to the index JSON file.') + query_category_parser.add_argument( + 'category', help='Query category.') + + args = parser.parse_args() + + if args.command == 'classify': + classify(args.text) + if args.command == 'index': + index(args.path, args.index_file) + if args.command == 'query': + query(args.index_file, args.text) + if args.command == 'query-category': + query_category(args.index_file, args.category) +# [END classify_text_tutorial] diff --git a/language/classify_text/classify_text_tutorial_test.py b/language/classify_text/classify_text_tutorial_test.py new file mode 100644 index 00000000000..6808ef33544 --- /dev/null +++ b/language/classify_text/classify_text_tutorial_test.py @@ -0,0 +1,40 @@ +# Copyright 2016, Google, Inc. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import pytest +import re + +from classify_text_tutorial import classify +from classify_text_tutorial import similarity + +RESOURCES = os.path.join(os.path.dirname(__file__), 'resources') + + +def test_classify(capsys): + with open(os.path.join(RESOURCES, 'query_text.txt'), 'r') as f: + text = f.read() + classify(text) + out, err = capsys.readouterr() + assert 'category' in out + + +def test_similarity(): + empty_categories = {} + categories1 = {'/a/b/c': 1.0, '/d/e': 1.0} + categories2 = {'/a/b': 1.0} + + assert similarity(empty_categories, categories1) == 0.0 + assert similarity(categories1, categories1) > 0.99 + assert similarity(categories1, categories2) > 0 + assert similarity(categories1, categories2) < 1 diff --git a/language/classify_text/index.json b/language/classify_text/index.json new file mode 100644 index 00000000000..e45f8c02353 --- /dev/null +++ b/language/classify_text/index.json @@ -0,0 +1 @@ +{"android.txt": {"/Computers & Electronics": 0.800000011920929, "/Internet & Telecom/Mobile & Wireless/Mobile Apps & Add-Ons": 0.6499999761581421}, "google.txt": {"/Internet & Telecom": 0.5799999833106995, "/Business & Industrial": 0.5400000214576721}, "eclipse_of_the_sun.txt": {"/Books & Literature": 0.8999999761581421}, "eclipse.txt": {"/Science/Astronomy": 0.9800000190734863}, "gcp.txt": {"/Internet & Telecom/Web Services": 0.9700000286102295}} \ No newline at end of file diff --git a/language/classify_text/requirements.txt b/language/classify_text/requirements.txt new file mode 100644 index 00000000000..743bbe79212 --- /dev/null +++ b/language/classify_text/requirements.txt @@ -0,0 +1 @@ +google-cloud-language==0.27.0 diff --git a/language/classify_text/resources/query_text.txt b/language/classify_text/resources/query_text.txt new file mode 100644 index 00000000000..304727304d1 --- /dev/null +++ b/language/classify_text/resources/query_text.txt @@ -0,0 +1 @@ +Google Home enables users to speak voice commands to interact with services through the Home's intelligent personal assistant called Google Assistant. A large number of services, both in-house and third-party, are integrated, allowing users to listen to music, look at videos or photos, or receive news updates entirely by voice. diff --git a/language/classify_text/resources/texts/android.txt b/language/classify_text/resources/texts/android.txt new file mode 100644 index 00000000000..29dc1449c55 --- /dev/null +++ b/language/classify_text/resources/texts/android.txt @@ -0,0 +1 @@ +Android is a mobile operating system developed by Google, based on the Linux kernel and designed primarily for touchscreen mobile devices such as smartphones and tablets. diff --git a/language/classify_text/resources/texts/eclipse.txt b/language/classify_text/resources/texts/eclipse.txt new file mode 100644 index 00000000000..5d16217e520 --- /dev/null +++ b/language/classify_text/resources/texts/eclipse.txt @@ -0,0 +1 @@ +A solar eclipse (as seen from the planet Earth) is a type of eclipse that occurs when the Moon passes between the Sun and Earth, and when the Moon fully or partially blocks (occults) the Sun. diff --git a/language/classify_text/resources/texts/eclipse_of_the_sun.txt b/language/classify_text/resources/texts/eclipse_of_the_sun.txt new file mode 100644 index 00000000000..7236fc9d806 --- /dev/null +++ b/language/classify_text/resources/texts/eclipse_of_the_sun.txt @@ -0,0 +1 @@ +Eclipse of the Sun is the debut novel by English author Phil Whitaker. 
It won the 1997 John Llewellyn Rhys Prize a Betty Trask Award in 1998, and was shortlisted for the 1997 Whitbread First Novel Award. diff --git a/language/classify_text/resources/texts/gcp.txt b/language/classify_text/resources/texts/gcp.txt new file mode 100644 index 00000000000..1ed09b2c758 --- /dev/null +++ b/language/classify_text/resources/texts/gcp.txt @@ -0,0 +1 @@ +Google Cloud Platform, offered by Google, is a suite of cloud computing services that runs on the same infrastructure that Google uses internally for its end-user products, such as Google Search and YouTube. Alongside a set of management tools, it provides a series of modular cloud services including computing, data storage, data analytics and machine learning. diff --git a/language/classify_text/resources/texts/google.txt b/language/classify_text/resources/texts/google.txt new file mode 100644 index 00000000000..06828635931 --- /dev/null +++ b/language/classify_text/resources/texts/google.txt @@ -0,0 +1 @@ +Google is an American multinational technology company that specializes in Internet-related services and products. These include online advertising technologies, search, cloud computing, software, and hardware. diff --git a/language/cloud-client/v1/snippets.py b/language/cloud-client/v1/snippets.py index e13fc7dd6c8..b86dad7468a 100644 --- a/language/cloud-client/v1/snippets.py +++ b/language/cloud-client/v1/snippets.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -# Copyright 2016 Google, Inc. +# Copyright 2017 Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From dd951dc6625ca8df594d83fc7b3c8301bfc551f1 Mon Sep 17 00:00:00 2001 From: Yu-Han Liu Date: Wed, 6 Sep 2017 10:10:36 -0700 Subject: [PATCH 02/11] addressing comments --- language/classify_text/README.md | 2 +- .../classify_text/classify_text_tutorial.py | 116 +++++++++++------- .../classify_text_tutorial_test.py | 2 - 3 files changed, 71 insertions(+), 49 deletions(-) diff --git a/language/classify_text/README.md b/language/classify_text/README.md index 85d293b35ab..37a18c2f457 100644 --- a/language/classify_text/README.md +++ b/language/classify_text/README.md @@ -68,7 +68,7 @@ python classify_text_tutorial.py query-category index.json "/Internet & Telecom/ ## Query with text -The indexed text files can be queried with another text that might not have been indexec. +The indexed text files can be queried with another text that might not have been indexed. ``` python classify_text_tutorial.py query index.json "$(cat resources/query_text.txt)" diff --git a/language/classify_text/classify_text_tutorial.py b/language/classify_text/classify_text_tutorial.py index 668f36cb8fd..c610c5ae163 100644 --- a/language/classify_text/classify_text_tutorial.py +++ b/language/classify_text/classify_text_tutorial.py @@ -18,11 +18,12 @@ import argparse import json import os -import numpy as np from google.cloud import language_v1beta2 -from google.cloud.language_v1beta2 import types from google.cloud.language_v1beta2 import enums +from google.cloud.language_v1beta2 import types + +import numpy as np # [END classify_text_tutorial_import] @@ -40,10 +41,14 @@ def classify(text, verbose=True): result = {} for category in categories: - # Turn the categories into a dictionary. + # Turn the categories into a dictionary of the form: + # {category.name: category.confidence}, so that they can + # be treated as a sparse vector. 
result[category.name] = category.confidence - if verbose: + if verbose: + print(text) + for category in categories: print(u'=' * 20) print(u'{:<16}: {}'.format('category', category.name)) print(u'{:<16}: {}'.format('confidence', category.confidence)) @@ -55,7 +60,8 @@ def classify(text, verbose=True): # [START def_index] def index(path, index_file): """Classify each text file in the directory and write - the results to the index_file. """ + the results to the index_file. + """ result = {} for filename in os.listdir(path): @@ -81,7 +87,8 @@ def index(path, index_file): # [START def_query] def query(index_file, text, n_top=3): """Find the indexed files that are the most similar to - the query text. """ + the query text. + """ with open(index_file, 'r') as f: index = json.load(f) @@ -91,7 +98,8 @@ def query(index_file, text, n_top=3): similarities = [] for filename, categories in index.iteritems(): - similarities.append((filename, similarity(query_categories, categories))) + similarities.append( + (filename, similarity(query_categories, categories))) similarities = sorted(similarities, key=lambda p: p[1], reverse=True) @@ -109,43 +117,29 @@ def query(index_file, text, n_top=3): # [END def_query] -# [START def_query_category] -def query_category(index_file, category_string, n_top=3): - """Find the indexed files that are the most similar to - the query label. - - The list of all available labels: - https://cloud.google.com/natural-language/docs/categories""" - - with open(index_file, 'r') as f: - index = json.load(f) - - # Make the category_string into a dict - query_categories = {category_string: 1.0} - - similarities = [] - for filename, categories in index.iteritems(): - similarities.append((filename, similarity(query_categories, categories))) - - similarities = sorted(similarities, key=lambda p: p[1], reverse=True) - - print('=' * 20) - print('Query: {}\n'.format(category_string)) - print('\nMost similar {} indexed texts:'.format(n_top)) - for filename, sim in similarities[:n_top]: - print('\tFilename: {}'.format(filename)) - print('\tSimilarity: {}'.format(sim)) - print('\n') - - return similarities -# [END def_query_category] - - # [START def_similarity] def similarity(categories1, categories2): """Cosine similarity of the categories treated as sparse vectors.""" def split_labels(categories): + """The category labels are of the form "/a/b/c" up to three levels, + for example "/Computers & Electronics/Software", and these labels + are used as keys in the categories dictionary, whose values are + confidence scores. + + The split_labels function splits the keys into individual levels + while duplicating the confidence score, which allows a natural + boost in how we calculate similarity when more levels are in common. + + Example: + If we have + + x = {"/a/b/c": 0.5} + y = {"/a/b": 0.5} + z = {"/a": 0.5} + + Then x and y are considered more similar than y and z. + """ _categories = {} for name, confidence in categories.iteritems(): labels = [label for label in name.split('/') if label] @@ -154,16 +148,11 @@ def split_labels(categories): return _categories - def norm(categories): - if len(categories) == 0: - return 0.0 - return np.linalg.norm(categories.values()) - categories1 = split_labels(categories1) categories2 = split_labels(categories2) - norm1 = norm(categories1) - norm2 = norm(categories2) + norm1 = np.linalg.norm(categories1.values()) + norm2 = np.linalg.norm(categories2.values()) # Return the smallest possible similarity if either categories is empty. 
if norm1 == 0 or norm2 == 0: @@ -177,6 +166,41 @@ def norm(categories): # [END def_similarity] +# [START def_query_category] +def query_category(index_file, category_string, n_top=3): + """Find the indexed files that are the most similar to + the query label. + + The list of all available labels: + https://cloud.google.com/natural-language/docs/categories + """ + + with open(index_file, 'r') as f: + index = json.load(f) + + # Make the category_string into a dictionary so that it is + # of the same format as what we get by calling classify. + query_categories = {category_string: 1.0} + + similarities = [] + for filename, categories in index.iteritems(): + similarities.append( + (filename, similarity(query_categories, categories))) + + similarities = sorted(similarities, key=lambda p: p[1], reverse=True) + + print('=' * 20) + print('Query: {}\n'.format(category_string)) + print('\nMost similar {} indexed texts:'.format(n_top)) + for filename, sim in similarities[:n_top]: + print('\tFilename: {}'.format(filename)) + print('\tSimilarity: {}'.format(sim)) + print('\n') + + return similarities +# [END def_query_category] + + if __name__ == '__main__': parser = argparse.ArgumentParser( description=__doc__, diff --git a/language/classify_text/classify_text_tutorial_test.py b/language/classify_text/classify_text_tutorial_test.py index 6808ef33544..91f61eee5e4 100644 --- a/language/classify_text/classify_text_tutorial_test.py +++ b/language/classify_text/classify_text_tutorial_test.py @@ -12,8 +12,6 @@ # limitations under the License. import os -import pytest -import re from classify_text_tutorial import classify from classify_text_tutorial import similarity From f863a3bc569cebbd093403a4a3d8d3702c4037b6 Mon Sep 17 00:00:00 2001 From: Yu-Han Liu Date: Wed, 13 Sep 2017 08:47:30 -0700 Subject: [PATCH 03/11] classify text tutorial --- language/classify_text/README.md | 2 +- .../classify_text/classify_text_tutorial.py | 116 +++++++++--------- .../classify_text_tutorial_test.py | 9 +- language/classify_text/index.json | 1 - .../{query_text.txt => query_text1.txt} | 0 .../classify_text/resources/query_text2.txt | 1 + .../classify_text/resources/query_text3.txt | 1 + .../resources/texts/cat_in_the_hat.txt | 1 + .../resources/texts/cloud_computing.txt | 1 + .../classify_text/resources/texts/email.txt | 1 + .../classify_text/resources/texts/gmail.txt | 1 + .../resources/texts/harry_potter.txt | 1 + .../classify_text/resources/texts/matilda.txt | 1 + .../resources/texts/mobile_phone.txt | 1 + .../classify_text/resources/texts/mr_fox.txt | 1 + .../resources/texts/wireless.txt | 1 + 16 files changed, 81 insertions(+), 58 deletions(-) delete mode 100644 language/classify_text/index.json rename language/classify_text/resources/{query_text.txt => query_text1.txt} (100%) create mode 100644 language/classify_text/resources/query_text2.txt create mode 100644 language/classify_text/resources/query_text3.txt create mode 100644 language/classify_text/resources/texts/cat_in_the_hat.txt create mode 100644 language/classify_text/resources/texts/cloud_computing.txt create mode 100644 language/classify_text/resources/texts/email.txt create mode 100644 language/classify_text/resources/texts/gmail.txt create mode 100644 language/classify_text/resources/texts/harry_potter.txt create mode 100644 language/classify_text/resources/texts/matilda.txt create mode 100644 language/classify_text/resources/texts/mobile_phone.txt create mode 100644 language/classify_text/resources/texts/mr_fox.txt create mode 100644 
language/classify_text/resources/texts/wireless.txt diff --git a/language/classify_text/README.md b/language/classify_text/README.md index 37a18c2f457..1bd3c9eadd4 100644 --- a/language/classify_text/README.md +++ b/language/classify_text/README.md @@ -71,7 +71,7 @@ python classify_text_tutorial.py query-category index.json "/Internet & Telecom/ The indexed text files can be queried with another text that might not have been indexed. ``` -python classify_text_tutorial.py query index.json "$(cat resources/query_text.txt)" +python classify_text_tutorial.py query index.json "$(cat resources/query_text1.txt)" ``` diff --git a/language/classify_text/classify_text_tutorial.py b/language/classify_text/classify_text_tutorial.py index c610c5ae163..0f15df86cac 100644 --- a/language/classify_text/classify_text_tutorial.py +++ b/language/classify_text/classify_text_tutorial.py @@ -12,7 +12,7 @@ # limitations under the License. # [START classify_text_tutorial] -"""Demonstrates how to use the classify_text method.""" +"""Using the classify_text method to cluster texts.""" # [START classify_text_tutorial_import] import argparse @@ -35,7 +35,7 @@ def classify(text, verbose=True): document = types.Document( content=text, - type=enums.Document.Type.HTML) + type=enums.Document.Type.PLAIN_TEXT) categories = language_client.classify_text(document).categories result = {} @@ -70,11 +70,14 @@ def index(path, index_file): if not os.path.isfile(file_path): continue - with open(file_path, 'r') as f: - text = f.read() - categories = classify(text, verbose=False) + try: + with open(file_path, 'r') as f: + text = f.read() + categories = classify(text, verbose=False) - result[filename] = categories + result[filename] = categories + except: + print('Failed to process {}'.format(file_path)) with open(index_file, 'w') as f: json.dump(result, f) @@ -84,6 +87,58 @@ def index(path, index_file): # [END def_index] +# [START def_split_labels] +def split_labels(categories): + """The category labels are of the form "/a/b/c" up to three levels, + for example "/Computers & Electronics/Software", and these labels + are used as keys in the categories dictionary, whose values are + confidence scores. + + The split_labels function splits the keys into individual levels + while duplicating the confidence score, which allows a natural + boost in how we calculate similarity when more levels are in common. + + Example: + If we have + + x = {"/a/b/c": 0.5} + y = {"/a/b": 0.5} + z = {"/a": 0.5} + + Then x and y are considered more similar than y and z. + """ + _categories = {} + for name, confidence in categories.iteritems(): + labels = [label for label in name.split('/') if label] + for label in labels: + _categories[label] = confidence + + return _categories +# [END def_split_labels] + + +# [START def_similarity] +def similarity(categories1, categories2): + """Cosine similarity of the categories treated as sparse vectors.""" + categories1 = split_labels(categories1) + categories2 = split_labels(categories2) + + norm1 = np.linalg.norm(categories1.values()) + norm2 = np.linalg.norm(categories2.values()) + + # Return the smallest possible similarity if either categories is empty. + if norm1 == 0 or norm2 == 0: + return 0.0 + + # Compute the cosine similarity. 
+ dot = 0.0 + for label, confidence in categories1.iteritems(): + dot += confidence * categories2.get(label, 0.0) + + return dot / (norm1 * norm2) +# [END def_similarity] + + # [START def_query] def query(index_file, text, n_top=3): """Find the indexed files that are the most similar to @@ -117,55 +172,6 @@ def query(index_file, text, n_top=3): # [END def_query] -# [START def_similarity] -def similarity(categories1, categories2): - """Cosine similarity of the categories treated as sparse vectors.""" - - def split_labels(categories): - """The category labels are of the form "/a/b/c" up to three levels, - for example "/Computers & Electronics/Software", and these labels - are used as keys in the categories dictionary, whose values are - confidence scores. - - The split_labels function splits the keys into individual levels - while duplicating the confidence score, which allows a natural - boost in how we calculate similarity when more levels are in common. - - Example: - If we have - - x = {"/a/b/c": 0.5} - y = {"/a/b": 0.5} - z = {"/a": 0.5} - - Then x and y are considered more similar than y and z. - """ - _categories = {} - for name, confidence in categories.iteritems(): - labels = [label for label in name.split('/') if label] - for label in labels: - _categories[label] = confidence - - return _categories - - categories1 = split_labels(categories1) - categories2 = split_labels(categories2) - - norm1 = np.linalg.norm(categories1.values()) - norm2 = np.linalg.norm(categories2.values()) - - # Return the smallest possible similarity if either categories is empty. - if norm1 == 0 or norm2 == 0: - return 0.0 - - dot = 0.0 - for label, confidence in categories1.iteritems(): - dot += confidence * categories2.get(label, 0.0) - - return dot / (norm1 * norm2) -# [END def_similarity] - - # [START def_query_category] def query_category(index_file, category_string, n_top=3): """Find the indexed files that are the most similar to diff --git a/language/classify_text/classify_text_tutorial_test.py b/language/classify_text/classify_text_tutorial_test.py index 91f61eee5e4..e3b3bfdbf6c 100644 --- a/language/classify_text/classify_text_tutorial_test.py +++ b/language/classify_text/classify_text_tutorial_test.py @@ -15,18 +15,25 @@ from classify_text_tutorial import classify from classify_text_tutorial import similarity +from classify_text_tutorial import split_labels RESOURCES = os.path.join(os.path.dirname(__file__), 'resources') def test_classify(capsys): - with open(os.path.join(RESOURCES, 'query_text.txt'), 'r') as f: + with open(os.path.join(RESOURCES, 'query_text1.txt'), 'r') as f: text = f.read() classify(text) out, err = capsys.readouterr() assert 'category' in out +def test_split_labels(): + categories = {'/a/b/c': 1.0} + split_categories = {'a': 1.0, 'b': 1.0, 'c': 1.0} + assert split_labels(categories) == split_categories + + def test_similarity(): empty_categories = {} categories1 = {'/a/b/c': 1.0, '/d/e': 1.0} diff --git a/language/classify_text/index.json b/language/classify_text/index.json deleted file mode 100644 index e45f8c02353..00000000000 --- a/language/classify_text/index.json +++ /dev/null @@ -1 +0,0 @@ -{"android.txt": {"/Computers & Electronics": 0.800000011920929, "/Internet & Telecom/Mobile & Wireless/Mobile Apps & Add-Ons": 0.6499999761581421}, "google.txt": {"/Internet & Telecom": 0.5799999833106995, "/Business & Industrial": 0.5400000214576721}, "eclipse_of_the_sun.txt": {"/Books & Literature": 0.8999999761581421}, "eclipse.txt": {"/Science/Astronomy": 0.9800000190734863}, 
"gcp.txt": {"/Internet & Telecom/Web Services": 0.9700000286102295}} \ No newline at end of file diff --git a/language/classify_text/resources/query_text.txt b/language/classify_text/resources/query_text1.txt similarity index 100% rename from language/classify_text/resources/query_text.txt rename to language/classify_text/resources/query_text1.txt diff --git a/language/classify_text/resources/query_text2.txt b/language/classify_text/resources/query_text2.txt new file mode 100644 index 00000000000..eef573c6007 --- /dev/null +++ b/language/classify_text/resources/query_text2.txt @@ -0,0 +1 @@ +The Hitchhiker's Guide to the Galaxy is the first of five books in the Hitchhiker's Guide to the Galaxy comedy science fiction "trilogy" by Douglas Adams (with the sixth written by Eoin Colfer). \ No newline at end of file diff --git a/language/classify_text/resources/query_text3.txt b/language/classify_text/resources/query_text3.txt new file mode 100644 index 00000000000..1337d3c6477 --- /dev/null +++ b/language/classify_text/resources/query_text3.txt @@ -0,0 +1 @@ +Goodnight Moon is an American children's picture book written by Margaret Wise Brown and illustrated by Clement Hurd. It was published on September 3, 1947, and is a highly acclaimed example of a bedtime story. \ No newline at end of file diff --git a/language/classify_text/resources/texts/cat_in_the_hat.txt b/language/classify_text/resources/texts/cat_in_the_hat.txt new file mode 100644 index 00000000000..bb5a853c694 --- /dev/null +++ b/language/classify_text/resources/texts/cat_in_the_hat.txt @@ -0,0 +1 @@ +The Cat in the Hat is a children's book written and illustrated by Theodor Geisel under the pen name Dr. Seuss and first published in 1957. The story centers on a tall anthropomorphic cat, who wears a red and white-striped hat and a red bow tie. \ No newline at end of file diff --git a/language/classify_text/resources/texts/cloud_computing.txt b/language/classify_text/resources/texts/cloud_computing.txt new file mode 100644 index 00000000000..88172adf1f4 --- /dev/null +++ b/language/classify_text/resources/texts/cloud_computing.txt @@ -0,0 +1 @@ +Cloud computing is a computing-infrastructure and software model for enabling ubiquitous access to shared pools of configurable resources (such as computer networks, servers, storage, applications and services), which can be rapidly provisioned with minimal management effort, often over the Internet. \ No newline at end of file diff --git a/language/classify_text/resources/texts/email.txt b/language/classify_text/resources/texts/email.txt new file mode 100644 index 00000000000..3d430527b75 --- /dev/null +++ b/language/classify_text/resources/texts/email.txt @@ -0,0 +1 @@ +Electronic mail (email or e-mail) is a method of exchanging messages between people using electronics. Email first entered substantial use in the 1960s and by the mid-1970s had taken the form now recognized as email. \ No newline at end of file diff --git a/language/classify_text/resources/texts/gmail.txt b/language/classify_text/resources/texts/gmail.txt new file mode 100644 index 00000000000..89c9704b117 --- /dev/null +++ b/language/classify_text/resources/texts/gmail.txt @@ -0,0 +1 @@ +Gmail is a free, advertising-supported email service developed by Google. Users can access Gmail on the web and through mobile apps for Android and iOS, as well as through third-party programs that synchronize email content through POP or IMAP protocols. 
\ No newline at end of file diff --git a/language/classify_text/resources/texts/harry_potter.txt b/language/classify_text/resources/texts/harry_potter.txt new file mode 100644 index 00000000000..339c10af05a --- /dev/null +++ b/language/classify_text/resources/texts/harry_potter.txt @@ -0,0 +1 @@ +Harry Potter is a series of fantasy novels written by British author J. K. Rowling. The novels chronicle the life of a young wizard, Harry Potter, and his friends Hermione Granger and Ron Weasley, all of whom are students at Hogwarts School of Witchcraft and Wizardry. \ No newline at end of file diff --git a/language/classify_text/resources/texts/matilda.txt b/language/classify_text/resources/texts/matilda.txt new file mode 100644 index 00000000000..e1539d7ee88 --- /dev/null +++ b/language/classify_text/resources/texts/matilda.txt @@ -0,0 +1 @@ +Matilda is a book by British writer Roald Dahl. Matilda won the Children's Book Award in 1999. It was published in 1988 by Jonathan Cape in London, with 232 pages and illustrations by Quentin Blake. \ No newline at end of file diff --git a/language/classify_text/resources/texts/mobile_phone.txt b/language/classify_text/resources/texts/mobile_phone.txt new file mode 100644 index 00000000000..725e22ef3a9 --- /dev/null +++ b/language/classify_text/resources/texts/mobile_phone.txt @@ -0,0 +1 @@ +A mobile phone is a portable device that can make and receive calls over a radio frequency link while the user is moving within a telephone service area. The radio frequency link establishes a connection to the switching systems of a mobile phone operator, which provides access to the public switched telephone network (PSTN). \ No newline at end of file diff --git a/language/classify_text/resources/texts/mr_fox.txt b/language/classify_text/resources/texts/mr_fox.txt new file mode 100644 index 00000000000..354feced2af --- /dev/null +++ b/language/classify_text/resources/texts/mr_fox.txt @@ -0,0 +1 @@ +Fantastic Mr Fox is a children's novel written by British author Roald Dahl. It was published in 1970, by George Allen & Unwin in the UK and Alfred A. Knopf in the U.S., with illustrations by Donald Chaffin. \ No newline at end of file diff --git a/language/classify_text/resources/texts/wireless.txt b/language/classify_text/resources/texts/wireless.txt new file mode 100644 index 00000000000..d742331c464 --- /dev/null +++ b/language/classify_text/resources/texts/wireless.txt @@ -0,0 +1 @@ +Wireless communication, or sometimes simply wireless, is the transfer of information or power between two or more points that are not connected by an electrical conductor. The most common wireless technologies use radio waves. 
\ No newline at end of file From 2f033dcf1c999da263ce0b18a7755d437ae3dea5 Mon Sep 17 00:00:00 2001 From: Yu-Han Liu Date: Sun, 17 Sep 2017 20:13:28 -0700 Subject: [PATCH 04/11] update client version --- language/classify_text/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/language/classify_text/requirements.txt b/language/classify_text/requirements.txt index 743bbe79212..afc8ed0adf2 100644 --- a/language/classify_text/requirements.txt +++ b/language/classify_text/requirements.txt @@ -1 +1 @@ -google-cloud-language==0.27.0 +google-cloud-language==0.29.0 From 7e2334966d2999e3534ff966aec7766166d7590f Mon Sep 17 00:00:00 2001 From: Yu-Han Liu Date: Sun, 17 Sep 2017 20:17:05 -0700 Subject: [PATCH 05/11] year first written --- language/cloud-client/v1/snippets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/language/cloud-client/v1/snippets.py b/language/cloud-client/v1/snippets.py index b86dad7468a..e13fc7dd6c8 100644 --- a/language/cloud-client/v1/snippets.py +++ b/language/cloud-client/v1/snippets.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -# Copyright 2017 Google, Inc. +# Copyright 2016 Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 4a1ec6635c8f7effc119e359f6527244bdd4d178 Mon Sep 17 00:00:00 2001 From: Yu-Han Liu Date: Mon, 18 Sep 2017 10:32:42 -0700 Subject: [PATCH 06/11] use auto generated README --- language/classify_text/README.md | 80 ------------------- .../classify_text/classify_text_tutorial.py | 11 ++- 2 files changed, 9 insertions(+), 82 deletions(-) delete mode 100644 language/classify_text/README.md diff --git a/language/classify_text/README.md b/language/classify_text/README.md deleted file mode 100644 index 1bd3c9eadd4..00000000000 --- a/language/classify_text/README.md +++ /dev/null @@ -1,80 +0,0 @@ -# Introduction - -This sample contains the code referenced in the -[Text Classification Tutorial](http://cloud.google.com/natural-language/docs/classify-text-tutorial) within the Google Cloud Natural Language API Documentation. A full walkthrough of this sample is located within the documentation. - -This sample shows how one can use the text classification feature of the Natural Language API to find similar texts based on a query. - -## Prerequisites - -Set up your -[Cloud Natural Language API project](https://cloud.google.com/natural-language/docs/getting-started#set_up_a_project) -, which includes: - -* Enabling the Natural Language API -* Setting up a service account -* Ensuring you've properly set up your `GOOGLE_APPLICATION_CREDENTIALS` for proper - authentication to the service. - -## Download the Code - -``` -$ git clone https://github.com/GoogleCloudPlatform/python-docs-samples.git -$ cd python-docs-samples/language/classify_text -``` - -## Run the Code - -Open a sample folder, create a virtualenv, install dependencies, and run the sample: - -``` -$ virtualenv env -$ source env/bin/activate -(env)$ pip install -r requirements.txt -``` - -### Usage - -This sample is organized as a script runnable from the command line. It can perform the following tasks: - - * Classifies multiple text files and write the result to an "index" file. - * Processes input query text to find similar text files. - * Processes input query category label to find similar text files. 
- -## Classify text - -``` -python classify_text_tutorial.py classify "$(cat resources/query_text.txt)" -``` - -Note that the text needs to be sufficiently long for the API to return a non-empty -response. - -## Index mulitple text files - -``` -python classify_text_tutorial.py index resources/texts -``` - -By default this creates a file `index.json`, which you can specify by passing in the optional `--index_file` argument. - -## Query with a category label - -The indexed text files can be queried with any of the category labels listed on the [Categories](https://cloud.google.com/natural-language/docs/categories) page. - -``` -python classify_text_tutorial.py query-category index.json "/Internet & Telecom/Mobile & Wireless" -``` - -## Query with text - -The indexed text files can be queried with another text that might not have been indexed. - -``` -python classify_text_tutorial.py query index.json "$(cat resources/query_text1.txt)" -``` - - - - - diff --git a/language/classify_text/classify_text_tutorial.py b/language/classify_text/classify_text_tutorial.py index 0f15df86cac..2a683cd6d8f 100644 --- a/language/classify_text/classify_text_tutorial.py +++ b/language/classify_text/classify_text_tutorial.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python + # Copyright 2017, Google, Inc. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +14,12 @@ # limitations under the License. # [START classify_text_tutorial] -"""Using the classify_text method to cluster texts.""" +"""Using the classify_text method to find content categories of text files, +Then use the content category labels to compare text similarity. + +For more information, see the tutorial page at +https://cloud.google.com/natural-language/docs/classify-text-tutorial. +""" # [START classify_text_tutorial_import] import argparse @@ -59,7 +66,7 @@ def classify(text, verbose=True): # [START def_index] def index(path, index_file): - """Classify each text file in the directory and write + """Classify each text file in a directory and write the results to the index_file. """ From 86c5294792907250e19c8354bf0366ead1b87af2 Mon Sep 17 00:00:00 2001 From: Yu-Han Liu Date: Mon, 18 Sep 2017 10:33:44 -0700 Subject: [PATCH 07/11] add README.rst.in and README.rst --- language/classify_text/README.rst | 126 +++++++++++++++++++++++++++ language/classify_text/README.rst.in | 26 ++++++ 2 files changed, 152 insertions(+) create mode 100644 language/classify_text/README.rst create mode 100644 language/classify_text/README.rst.in diff --git a/language/classify_text/README.rst b/language/classify_text/README.rst new file mode 100644 index 00000000000..0a61591bc22 --- /dev/null +++ b/language/classify_text/README.rst @@ -0,0 +1,126 @@ +.. This file is automatically generated. Do not edit this file directly. + +Google Cloud Natural Language API Python Samples +=============================================================================== + +This directory contains samples for Google Cloud Natural Language API. The `Google Cloud Natural Language API`_ provides natural language understanding technologies to developers. + +This tutorial demostrates how to use the `classify_text` method to classify content category of text files, and use the result to compare texts by their similarity to each other. See the `tutorial page`_ for details about this sample. + +.. _tutorial page: https://cloud.google.com/natural-language/docs/classify-text-tutorial + + + + +.. 
_Google Cloud Natural Language API: https://cloud.google.com/natural-language/docs/ + +Setup +------------------------------------------------------------------------------- + + +Authentication +++++++++++++++ + +Authentication is typically done through `Application Default Credentials`_, +which means you do not have to change the code to authenticate as long as +your environment has credentials. You have a few options for setting up +authentication: + +#. When running locally, use the `Google Cloud SDK`_ + + .. code-block:: bash + + gcloud auth application-default login + + +#. When running on App Engine or Compute Engine, credentials are already + set-up. However, you may need to configure your Compute Engine instance + with `additional scopes`_. + +#. You can create a `Service Account key file`_. This file can be used to + authenticate to Google Cloud Platform services from any environment. To use + the file, set the ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable to + the path to the key file, for example: + + .. code-block:: bash + + export GOOGLE_APPLICATION_CREDENTIALS=/path/to/service_account.json + +.. _Application Default Credentials: https://cloud.google.com/docs/authentication#getting_credentials_for_server-centric_flow +.. _additional scopes: https://cloud.google.com/compute/docs/authentication#using +.. _Service Account key file: https://developers.google.com/identity/protocols/OAuth2ServiceAccount#creatinganaccount + +Install Dependencies +++++++++++++++++++++ + +#. Install `pip`_ and `virtualenv`_ if you do not already have them. + +#. Create a virtualenv. Samples are compatible with Python 2.7 and 3.4+. + + .. code-block:: bash + + $ virtualenv env + $ source env/bin/activate + +#. Install the dependencies needed to run the samples. + + .. code-block:: bash + + $ pip install -r requirements.txt + +.. _pip: https://pip.pypa.io/ +.. _virtualenv: https://virtualenv.pypa.io/ + +Samples +------------------------------------------------------------------------------- + +Classify Text Tutorial ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + + + +To run this sample: + +.. code-block:: bash + + $ python classify_text_tutorial.py + + usage: classify_text_tutorial.py [-h] + {classify,index,query,query-category} ... + + Using the classify_text method to cluster texts. + + positional arguments: + {classify,index,query,query-category} + classify Classify the input text into categories. + index Classify each text file in a directory and write the + results to the index_file. + query Find the indexed files that are the most similar to + the query text. + query-category Find the indexed files that are the most similar to + the query label. The list of all available labels: + https://cloud.google.com/natural- + language/docs/categories + + optional arguments: + -h, --help show this help message and exit + + + + +The client library +------------------------------------------------------------------------------- + +This sample uses the `Google Cloud Client Library for Python`_. +You can read the documentation for more details on API usage and use GitHub +to `browse the source`_ and `report issues`_. + +.. _Google Cloud Client Library for Python: + https://googlecloudplatform.github.io/google-cloud-python/ +.. _browse the source: + https://github.com/GoogleCloudPlatform/google-cloud-python +.. _report issues: + https://github.com/GoogleCloudPlatform/google-cloud-python/issues + + +.. 
_Google Cloud SDK: https://cloud.google.com/sdk/ \ No newline at end of file diff --git a/language/classify_text/README.rst.in b/language/classify_text/README.rst.in new file mode 100644 index 00000000000..42e8f061a5d --- /dev/null +++ b/language/classify_text/README.rst.in @@ -0,0 +1,26 @@ +# This file is used to generate README.rst + +product: + name: Google Cloud Natural Language API + short_name: Cloud Natural Language API + url: https://cloud.google.com/natural-language/docs/ + description: > + The `Google Cloud Natural Language API`_ provides natural language + understanding technologies to developers. + + + This tutorial demostrates how to use the `classify_text` method to classify content category of text files, and use the result to compare texts by their similarity to each other. See the `tutorial page`_ for details about this sample. + + + .. _tutorial page: https://cloud.google.com/natural-language/docs/classify-text-tutorial + +setup: +- auth +- install_deps + +samples: +- name: Classify Text Tutorial + file: classify_text_tutorial.py + show_help: true + +cloud_client_library: true From 3091fee359016fb4c6de2e28c235cc3300fe816d Mon Sep 17 00:00:00 2001 From: Yu-Han Liu Date: Mon, 18 Sep 2017 11:05:06 -0700 Subject: [PATCH 08/11] addressing review comments --- .../classify_text/classify_text_tutorial.py | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/language/classify_text/classify_text_tutorial.py b/language/classify_text/classify_text_tutorial.py index 2a683cd6d8f..841e198c67c 100644 --- a/language/classify_text/classify_text_tutorial.py +++ b/language/classify_text/classify_text_tutorial.py @@ -23,14 +23,15 @@ # [START classify_text_tutorial_import] import argparse +import io import json import os +import numpy +import six from google.cloud import language_v1beta2 from google.cloud.language_v1beta2 import enums from google.cloud.language_v1beta2 import types - -import numpy as np # [END classify_text_tutorial_import] @@ -43,7 +44,8 @@ def classify(text, verbose=True): document = types.Document( content=text, type=enums.Document.Type.PLAIN_TEXT) - categories = language_client.classify_text(document).categories + response = language_client.classify_text(document) + categories = response.categories result = {} @@ -78,7 +80,7 @@ def index(path, index_file): continue try: - with open(file_path, 'r') as f: + with io.open(file_path, 'r') as f: text = f.read() categories = classify(text, verbose=False) @@ -86,8 +88,8 @@ def index(path, index_file): except: print('Failed to process {}'.format(file_path)) - with open(index_file, 'w') as f: - json.dump(result, f) + with io.open(index_file, 'w') as f: + f.write(unicode(json.dumps(result))) print('Texts indexed in file: {}'.format(index_file)) return result @@ -115,7 +117,7 @@ def split_labels(categories): Then x and y are considered more similar than y and z. """ _categories = {} - for name, confidence in categories.iteritems(): + for name, confidence in six.iteritems(categories): labels = [label for label in name.split('/') if label] for label in labels: _categories[label] = confidence @@ -130,8 +132,8 @@ def similarity(categories1, categories2): categories1 = split_labels(categories1) categories2 = split_labels(categories2) - norm1 = np.linalg.norm(categories1.values()) - norm2 = np.linalg.norm(categories2.values()) + norm1 = numpy.linalg.norm(categories1.values()) + norm2 = numpy.linalg.norm(categories2.values()) # Return the smallest possible similarity if either categories is empty. 
if norm1 == 0 or norm2 == 0: @@ -139,7 +141,7 @@ def similarity(categories1, categories2): # Compute the cosine similarity. dot = 0.0 - for label, confidence in categories1.iteritems(): + for label, confidence in six.iteritems(categories1): dot += confidence * categories2.get(label, 0.0) return dot / (norm1 * norm2) @@ -152,14 +154,14 @@ def query(index_file, text, n_top=3): the query text. """ - with open(index_file, 'r') as f: + with io.open(index_file, 'r') as f: index = json.load(f) # Get the categories of the query text. query_categories = classify(text, verbose=False) similarities = [] - for filename, categories in index.iteritems(): + for filename, categories in six.iteritems(index): similarities.append( (filename, similarity(query_categories, categories))) @@ -167,7 +169,7 @@ def query(index_file, text, n_top=3): print('=' * 20) print('Query: {}\n'.format(text)) - for category, confidence in query_categories.iteritems(): + for category, confidence in six.iteritems(query_categories): print('\tCategory: {}, confidence: {}'.format(category, confidence)) print('\nMost similar {} indexed texts:'.format(n_top)) for filename, sim in similarities[:n_top]: @@ -188,7 +190,7 @@ def query_category(index_file, category_string, n_top=3): https://cloud.google.com/natural-language/docs/categories """ - with open(index_file, 'r') as f: + with io.open(index_file, 'r') as f: index = json.load(f) # Make the category_string into a dictionary so that it is @@ -196,7 +198,7 @@ def query_category(index_file, category_string, n_top=3): query_categories = {category_string: 1.0} similarities = [] - for filename, categories in index.iteritems(): + for filename, categories in six.iteritems(index): similarities.append( (filename, similarity(query_categories, categories))) From bf52056b07feb2e95006416334f6416f9b2e6fea Mon Sep 17 00:00:00 2001 From: Yu-Han Liu Date: Mon, 18 Sep 2017 13:32:17 -0700 Subject: [PATCH 09/11] add tests for index and query --- .../classify_text_tutorial_test.py | 62 ++++++++++++++++--- 1 file changed, 53 insertions(+), 9 deletions(-) diff --git a/language/classify_text/classify_text_tutorial_test.py b/language/classify_text/classify_text_tutorial_test.py index e3b3bfdbf6c..eede80ff0fa 100644 --- a/language/classify_text/classify_text_tutorial_test.py +++ b/language/classify_text/classify_text_tutorial_test.py @@ -11,27 +11,70 @@ # See the License for the specific language governing permissions and # limitations under the License. +import classify_text_tutorial import os +import pytest -from classify_text_tutorial import classify -from classify_text_tutorial import similarity -from classify_text_tutorial import split_labels +OUTPUT = 'index.json' RESOURCES = os.path.join(os.path.dirname(__file__), 'resources') +QUERY_TEXT = """Google Home enables users to speak voice commands to interact +with services through the Home\'s intelligent personal assistant called +Google Assistant. 
A large number of services, both in-house and third-party, +are integrated, allowing users to listen to music, look at videos or photos, +or receive news updates entirely by voice.""" +QUERY_CATEGORY = '/Computers & Electronics/Software' + + +@pytest.fixture(scope='session') +def index_file(tmpdir_factory): + temp_file = tmpdir_factory.mktemp('tmp').join(OUTPUT) + temp_out = temp_file.strpath + classify_text_tutorial.index(os.path.join(RESOURCES, 'texts'), temp_out) + return temp_file def test_classify(capsys): with open(os.path.join(RESOURCES, 'query_text1.txt'), 'r') as f: text = f.read() - classify(text) + classify_text_tutorial.classify(text) out, err = capsys.readouterr() assert 'category' in out +def test_index(capsys, tmpdir): + temp_dir = tmpdir.mkdir('tmp') + temp_out = temp_dir.join(OUTPUT).strpath + + classify_text_tutorial.index(os.path.join(RESOURCES, 'texts'), temp_out) + out, err = capsys.readouterr() + + assert OUTPUT in out + assert len(temp_dir.listdir()) == 1 + + +def test_query_text(capsys, index_file): + temp_out = index_file.strpath + + classify_text_tutorial.query(temp_out, QUERY_TEXT) + out, err = capsys.readouterr() + + assert 'Filename: cloud_computing.txt' in out + + +def test_query_category(capsys, index_file): + temp_out = index_file.strpath + + classify_text_tutorial.query_category(temp_out, QUERY_CATEGORY) + out, err = capsys.readouterr() + + assert 'Filename: cloud_computing.txt' in out + + def test_split_labels(): categories = {'/a/b/c': 1.0} split_categories = {'a': 1.0, 'b': 1.0, 'c': 1.0} - assert split_labels(categories) == split_categories + assert classify_text_tutorial.split_labels(categories) == split_categories def test_similarity(): @@ -39,7 +82,8 @@ def test_similarity(): categories1 = {'/a/b/c': 1.0, '/d/e': 1.0} categories2 = {'/a/b': 1.0} - assert similarity(empty_categories, categories1) == 0.0 - assert similarity(categories1, categories1) > 0.99 - assert similarity(categories1, categories2) > 0 - assert similarity(categories1, categories2) < 1 + assert classify_text_tutorial.similarity( + empty_categories, categories1) == 0.0 + assert classify_text_tutorial.similarity(categories1, categories1) > 0.99 + assert classify_text_tutorial.similarity(categories1, categories2) > 0 + assert classify_text_tutorial.similarity(categories1, categories2) < 1 From 3853a1b99d795788d3b52b01fb4c171ccbb2751d Mon Sep 17 00:00:00 2001 From: Yu-Han Liu Date: Mon, 18 Sep 2017 13:34:20 -0700 Subject: [PATCH 10/11] import order --- language/classify_text/classify_text_tutorial.py | 5 +++-- language/classify_text/classify_text_tutorial_test.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/language/classify_text/classify_text_tutorial.py b/language/classify_text/classify_text_tutorial.py index 841e198c67c..08a03e98212 100644 --- a/language/classify_text/classify_text_tutorial.py +++ b/language/classify_text/classify_text_tutorial.py @@ -26,12 +26,13 @@ import io import json import os -import numpy -import six from google.cloud import language_v1beta2 from google.cloud.language_v1beta2 import enums from google.cloud.language_v1beta2 import types + +import numpy +import six # [END classify_text_tutorial_import] diff --git a/language/classify_text/classify_text_tutorial_test.py b/language/classify_text/classify_text_tutorial_test.py index eede80ff0fa..305cf53fede 100644 --- a/language/classify_text/classify_text_tutorial_test.py +++ b/language/classify_text/classify_text_tutorial_test.py @@ -11,8 +11,9 @@ # See the License for the specific language 
governing permissions and
 # limitations under the License.
 
-import classify_text_tutorial
 import os
+
+import classify_text_tutorial
 import pytest
 
 

From 0f1a77e1beaf2d41cf986101c05fc7e357d4b2b2 Mon Sep 17 00:00:00 2001
From: Yu-Han Liu
Date: Mon, 18 Sep 2017 21:09:06 -0700
Subject: [PATCH 11/11] add numpy to requirements

---
 language/classify_text/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/language/classify_text/requirements.txt b/language/classify_text/requirements.txt
index afc8ed0adf2..10069f1801e 100644
--- a/language/classify_text/requirements.txt
+++ b/language/classify_text/requirements.txt
@@ -1 +1,2 @@
 google-cloud-language==0.29.0
+numpy==1.13.1
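
The patch series above comes down to one technique: represent each text by its Natural Language API content categories, split every `/a/b/c` label into its individual levels, and rank indexed files by cosine similarity against the query's categories. The sketch below restates that logic outside the patch so it can be read and run on its own. It is an illustration only: it targets Python 3, uses just the standard library instead of numpy and the API client, and the filenames, category names, and confidence scores in it are made up rather than taken from real `classify_text` responses.

```python
import math


def split_labels(categories):
    """Flatten {"/a/b/c": confidence} into {"a": confidence, "b": confidence, "c": confidence}."""
    split = {}
    for name, confidence in categories.items():
        for label in name.split('/'):
            if label:
                split[label] = confidence
    return split


def similarity(categories1, categories2):
    """Cosine similarity of two category dicts treated as sparse vectors."""
    c1, c2 = split_labels(categories1), split_labels(categories2)
    norm1 = math.sqrt(sum(v * v for v in c1.values()))
    norm2 = math.sqrt(sum(v * v for v in c2.values()))
    if norm1 == 0 or norm2 == 0:
        # An empty category list is treated as maximally dissimilar.
        return 0.0
    dot = sum(conf * c2.get(label, 0.0) for label, conf in c1.items())
    return dot / (norm1 * norm2)


# Made-up stand-ins for what classify() would return for a query and an index.
query_categories = {'/Internet & Telecom/Web Services': 0.9}
index = {
    'cloud_computing.txt': {'/Internet & Telecom/Web Services': 0.97},
    'matilda.txt': {'/Books & Literature': 0.96},
}

ranked = sorted(
    ((filename, similarity(query_categories, categories))
     for filename, categories in index.items()),
    key=lambda pair: pair[1], reverse=True)

for filename, score in ranked:
    print('{}: {:.3f}'.format(filename, score))
```

Splitting the labels is what gives partial credit when only the top levels of a category path match, so a file tagged `/Internet & Telecom/Web Services` still scores against a query tagged only `/Internet & Telecom`; this is the behavior described in the `split_labels` docstring added in the patches above.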