diff --git a/.cursorignore b/.cursorignore new file mode 100644 index 00000000..7664704b --- /dev/null +++ b/.cursorignore @@ -0,0 +1 @@ +*.bak \ No newline at end of file diff --git a/.data/divvy.jsonl.gz b/.data/divvy.jsonl.gz new file mode 100644 index 00000000..ba93c40f Binary files /dev/null and b/.data/divvy.jsonl.gz differ diff --git a/.data/wikipedia.jsonl.gz b/.data/wikipedia.jsonl.gz new file mode 100644 index 00000000..3c2c9f43 Binary files /dev/null and b/.data/wikipedia.jsonl.gz differ diff --git a/.gitignore b/.gitignore index 1e099b85..a2fe8681 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,4 @@ www/transform/.gradle logstash/.gradle logstash/build theme.css +*.jsonl \ No newline at end of file diff --git a/data-extraction-report.txt b/data-extraction-report.txt new file mode 100644 index 00000000..ad1dd9d5 --- /dev/null +++ b/data-extraction-report.txt @@ -0,0 +1,12 @@ +Data Extraction Report +==================== + +Successfully extracted: +total 792864 +-rw-r--r--@ 1 rmdemp staff 349M May 14 15:28 divvy.jsonl +-rw-r--r--@ 1 rmdemp staff 34M May 14 15:28 divvy.jsonl.gz +-rw-r--r--@ 1 rmdemp staff 145K May 14 15:29 wikipedia.jsonl +-rw-r--r--@ 1 rmdemp staff 38K May 14 15:29 wikipedia.jsonl.gz + +Freebase index status: +The freebase index could not be loaded because it requires a synonyms file at /usr/share/elasticsearch/config/analysis/first_name.synonyms.txt which is missing in our container setup. diff --git a/docker-compose-elasticsearch.yml b/docker-compose-elasticsearch.yml.bak similarity index 100% rename from docker-compose-elasticsearch.yml rename to docker-compose-elasticsearch.yml.bak diff --git a/docker-compose.yml b/docker-compose.yml index 942a48b5..47498b46 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,84 +1,61 @@ -version: '2' +version: '3' services: - elastic1: - extends: - file: docker-compose-elasticsearch.yml - service: elasticsearch + elasticsearch: + image: docker.elastic.co/elasticsearch/elasticsearch:8.18.0 + container_name: elasticsearch environment: - ES_NODENAME: elastic1 - ports: - - "9200:9200" - - "9300:9300" - elastic2: - extends: - file: docker-compose-elasticsearch.yml - service: elasticsearch - environment: - ES_NODENAME: elastic2 - ports: - - "9201:9200" - - "9301:9300" - depends_on: - - elastic1 - elastic3: - extends: - file: docker-compose-elasticsearch.yml - service: elasticsearch - environment: - ES_NODENAME: elastic3 - ports: - - "9202:9200" - - "9302:9300" - depends_on: - - elastic1 - inquisitor: - image: spantree/elasticsearch-inquisitor - environment: - ELASTICSEARCH_URL: "http://elastic1:9200" - ports: - - "9400:80" - exercises: - image: nginx:1.11-alpine + - discovery.type=single-node + - xpack.security.enabled=false + - "ES_JAVA_OPTS=-Xms4g -Xmx4g" + ulimits: + memlock: + soft: -1 + hard: -1 volumes: - - "./exercises:/usr/share/nginx/html" - - "./exercises/nginx:/etc/nginx" - ports: - - "9500:80" - logstash: - image: logstash:2.4 + - elasticsearch-data:/usr/share/elasticsearch/data + ports: + - 9200:9200 + - 9300:9300 + networks: + - elastic + + elasticsearch-2: + image: elasticsearch:2.4.5 + platform: linux/amd64 + container_name: elasticsearch-2 + environment: + - bootstrap.memory_lock=true + - "ES_JAVA_OPTS=-Xms1g -Xmx1g" + - ES_NODENAME=elasticsearch-2 + ulimits: + memlock: + soft: -1 + hard: -1 volumes: - - "./logstash:/config-dir" - - "./data:/usr/local/share/data" - command: ["logstash", "-f", "/config-dir/logstash.conf"] - ports: - - "3333:3333" - - "5044:5044" - - "9600:9600" - depends_on: - - 
elastic1 - - elastic2 - - elastic3 + - elasticsearch-2-data:/usr/share/elasticsearch/data + - ./es2-config/elasticsearch.yml:/usr/share/elasticsearch/config/elasticsearch.yml + ports: + - 9201:9200 + - 9301:9300 + networks: + - elastic + kibana: - build: kibana + image: docker.elastic.co/kibana/kibana:8.18.0 + container_name: kibana ports: - - "5601:5601" + - 5601:5601 environment: - ELASTICSEARCH_URL: "http://elastic1:9200" - volumes: - - "./kibana/config:/opt/kibana/config" + - ELASTICSEARCH_HOSTS=http://elasticsearch:9200 depends_on: - - elastic1 - - elastic2 - - elastic3 - slides: - build: slides - ports: - - "9000:9000" - - "35729:35729" # to enable live reloading - environment: - ELASTICSEARCH_URL: http://localhost:9200 - KIBANA_URL: http://localhost:5601 - INQUISITOR_URL: http://localhost:9400 - EXERCISES_URL: http://localhost:9500 - volumes: - - ./slides/slides:/usr/src/slides/slides + - elasticsearch + networks: + - elastic + +volumes: + elasticsearch-data: + elasticsearch-2-data: + +networks: + elastic: + driver: bridge \ No newline at end of file diff --git a/docker-compose.yml.bak b/docker-compose.yml.bak new file mode 100644 index 00000000..3fdd96b7 --- /dev/null +++ b/docker-compose.yml.bak @@ -0,0 +1,92 @@ +version: '2' +services: + elastic1: + platform: linux/amd64 + extends: + file: docker-compose-elasticsearch.yml + service: elasticsearch + environment: + ES_NODENAME: elastic1 + ports: + - "9200:9200" + - "9300:9300" + elastic2: + platform: linux/amd64 + extends: + file: docker-compose-elasticsearch.yml + service: elasticsearch + environment: + ES_NODENAME: elastic2 + ports: + - "9201:9200" + - "9301:9300" + depends_on: + - elastic1 + elastic3: + platform: linux/amd64 + extends: + file: docker-compose-elasticsearch.yml + service: elasticsearch + environment: + ES_NODENAME: elastic3 + ports: + - "9202:9200" + - "9302:9300" + depends_on: + - elastic1 + inquisitor: + platform: linux/amd64 + image: spantree/elasticsearch-inquisitor + environment: + ELASTICSEARCH_URL: "http://elastic1:9200" + ports: + - "9400:80" + exercises: + platform: linux/amd64 + image: nginx:1.11-alpine + volumes: + - "./exercises:/usr/share/nginx/html" + - "./exercises/nginx:/etc/nginx" + ports: + - "9500:80" + logstash: + platform: linux/amd64 + image: logstash:2.4 + volumes: + - "./logstash:/config-dir" + - "./data:/usr/local/share/data" + command: [ "logstash", "-f", "/config-dir/logstash.conf" ] + ports: + - "3333:3333" + - "5044:5044" + - "9600:9600" + depends_on: + - elastic1 + - elastic2 + - elastic3 + kibana: + platform: linux/amd64 + build: kibana + ports: + - "5601:5601" + environment: + ELASTICSEARCH_URL: "http://elastic1:9200" + volumes: + - "./kibana/config:/opt/kibana/config" + depends_on: + - elastic1 + - elastic2 + - elastic3 + slides: + platform: linux/amd64 + build: slides + ports: + - "9000:9000" + - "35729:35729" # to enable live reloading + environment: + ELASTICSEARCH_URL: http://localhost:9200 + KIBANA_URL: http://localhost:5601 + INQUISITOR_URL: http://localhost:9400 + EXERCISES_URL: http://localhost:9500 + volumes: + - ./slides/slides:/usr/src/slides/slides diff --git a/es2-config/elasticsearch.yml b/es2-config/elasticsearch.yml new file mode 100644 index 00000000..0836cb25 --- /dev/null +++ b/es2-config/elasticsearch.yml @@ -0,0 +1,40 @@ +network: + host: 0.0.0.0 +node: + name: ${ES_NODENAME} +index: + number_of_replicas: 0 +http: + max_content_length: 500mb +bootstrap: + mlockall: true +indices: + fielddata: + cache: + size: 25% +discovery: + zen: + ping: + unicast: + hosts: 
["localhost"] + multicast: + enabled: false + minimum_master_nodes: 1 +repositories: + url: + allowed_urls: ["https://elasticsearch-sample-data.s3.amazonaws.com/*"] +script: + engine: + groovy: + inline: + aggs: true + mapping: true + search: true + update: true + plugin: true + indexed: + aggs: true + mapping: true + search: true + update: true + plugin: true \ No newline at end of file diff --git a/exercises/aggregations.sense b/exercises/aggregations.sense index 0d9cc703..a1726907 100644 --- a/exercises/aggregations.sense +++ b/exercises/aggregations.sense @@ -3,16 +3,35 @@ # Query Divvy trips. # -GET /divvy/trip/_search +GET /divvy/_search +{ + "query": { + "term": { + "_index": "divvy" + } + } +} # Query Divvy stations # -GET /divvy/station/_search +GET /divvy/_search +{ + "query": { + "term": { + "_index": "divvy" + } + } +} # Get terms aggregation for gender of rider. # -GET /divvy/trip/_search +GET /divvy/_search { + "query": { + "term": { + "_index": "divvy" + } + }, "aggs": { "genders": { "terms": { @@ -25,8 +44,13 @@ GET /divvy/trip/_search # Get statistics for trip duration. # -GET /divvy/trip/_search +GET /divvy/_search { + "query": { + "term": { + "_index": "divvy" + } + }, "aggs": { "trip_duration_stats": { "stats": { @@ -39,8 +63,13 @@ GET /divvy/trip/_search # Get extended statistics for trip duration. # -GET /divvy/trip/_search +GET /divvy/_search { + "query": { + "term": { + "_index": "divvy" + } + }, "aggs": { "trip_duration_stats": { "extended_stats": { @@ -53,8 +82,13 @@ GET /divvy/trip/_search # Get trip duration percentiles. # -GET /divvy/trip/_search +GET /divvy/_search { + "query": { + "term": { + "_index": "divvy" + } + }, "aggs": { "trip_length": { "percentiles": { @@ -72,10 +106,12 @@ GET /divvy/trip/_search # Get trip duration stats by gender. # -GET /divvy/trip/_search +GET /divvy/_search { "query": { - "match_all": {} + "term": { + "_index": "divvy" + } }, "aggs": { "gender": { @@ -97,8 +133,13 @@ GET /divvy/trip/_search # Aggregate based on distance to Spantree's office. # -GET /divvy/station/_search +GET /divvy/_search { + "query": { + "term": { + "_index": "divvy" + } + }, "aggs": { "spantree_dist": { "geo_distance": { @@ -140,8 +181,13 @@ GET /divvy/station/_search # Get trip duration histogram. # -GET /divvy/trip/_search +GET /divvy/_search { + "query": { + "term": { + "_index": "divvy" + } + }, "aggs": { "trip_length": { "histogram": { @@ -156,13 +202,18 @@ GET /divvy/trip/_search # Get bike trips over time. We can also create histograms by date. # -GET /divvy/trip/_search +GET /divvy/_search { + "query": { + "term": { + "_index": "divvy" + } + }, "aggs": { "trips_over_time": { "date_histogram": { "field": "start_time", - "interval": "week" + "fixed_interval": "week" } } }, @@ -171,8 +222,13 @@ GET /divvy/trip/_search # Get oldest two movies by genre. 
# -GET /freebase/film/_search +GET /freebase/_search { + "query": { + "term": { + "_index": "freebase" + } + }, "aggs": { "top_genres": { "terms": { @@ -190,7 +246,7 @@ GET /freebase/film/_search } ], "_source": { - "include": [ + "includes": [ "name", "initial_release_date" ] @@ -206,8 +262,13 @@ GET /freebase/film/_search # Get top 3 directors by genre # -GET /freebase/film/_search +GET /freebase/_search { + "query": { + "term": { + "_index": "freebase" + } + }, "aggs": { "top_genres": { "terms": { @@ -217,7 +278,7 @@ GET /freebase/film/_search "aggs": { "top_directors": { "terms": { - "field": "directed_by.raw", + "field": "directed_by.keyword", "size": 3 } } diff --git a/exercises/getting-started.sense b/exercises/getting-started.sense index 1fc74579..926f27ef 100644 --- a/exercises/getting-started.sense +++ b/exercises/getting-started.sense @@ -7,12 +7,12 @@ GET / # Index a single document. Now, we will insert a single document -# into Elasticsearch. Note that we don't need to create an index or -# type, it gets created automatically if it doesn't already exist. +# into Elasticsearch. Note that we don't need to create an index +# as it gets created automatically if it doesn't already exist. # Elasticsearch will also try to guess the types for document fields # based on the initial JSON payload. # -PUT /getting-started/locations/frontera_grill +PUT /getting-started/_doc/frontera_grill { "name": "Frontera Grill", "url": "http://en.wikipedia.org/wiki/Frontera_Grill", @@ -27,7 +27,7 @@ PUT /getting-started/locations/frontera_grill # Fetch our document. You can retrieve a single document by its ID # with a simple HTTP GET request. # -GET /getting-started/locations/frontera_grill +GET /getting-started/_doc/frontera_grill # Finding all documents. We can also execute a request to get all # documents in this index. At this point, there should only be one. @@ -35,6 +35,6 @@ GET /getting-started/locations/frontera_grill GET /getting-started/_search # Review the mappings. We can also peek at the mappings Elasticsearch -# automatically generated for the location document type. +# automatically generated for the index. # -GET /getting-started/locations/_mapping +GET /getting-started/_mapping diff --git a/exercises/indexing.sense b/exercises/indexing.sense index 1439c429..b10931b7 100644 --- a/exercises/indexing.sense +++ b/exercises/indexing.sense @@ -3,7 +3,7 @@ # Index John Doe with an assigned ID. # -POST /spantree/people/ +POST /spantree/_doc/ { "name": "Johnny Noname" } @@ -18,7 +18,7 @@ GET /spantree/_mapping # Index Cedric with a known ID. # -PUT /spantree/people/cedric +PUT /spantree/_doc/cedric { "name": "Cedric Hurst", "title": "Principal" @@ -26,11 +26,11 @@ PUT /spantree/people/cedric # Make Sure Cedric is there. # -GET /spantree/people/cedric +GET /spantree/_doc/cedric # Add more information about Cedric. # -POST /spantree/people/cedric/_update +POST /spantree/_update/cedric { "doc": { "git_commits": 2560 @@ -39,15 +39,16 @@ POST /spantree/people/cedric/_update # Add one more git commit for Cedric. # -POST /spantree/people/cedric/_update +POST /spantree/_update/cedric { - "script": "ctx._source.git_commits += 1", - "lang": "groovy" + "script": { + "source": "ctx._source.git_commits += 1" + } } # Upsert Kevin. # -POST /spantree/people/kevin/_update +POST /spantree/_update/kevin { "doc": { "git_commits": 1912 @@ -61,14 +62,14 @@ POST /spantree/people/kevin/_update # Make sure Kevin is still there. # -GET /spantree/people/kevin +GET /spantree/_doc/kevin # Add everyone else. 
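# The _bulk body is newline-delimited JSON: each action line is immediately
# followed by its document. With mapping types gone, the action metadata only
# needs an _id; the target index comes from the request URL.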
# POST /spantree/_bulk -{"index":{"_id":"gary","_type": "people"}} +{"index":{"_id":"gary"}} {"name":"Gary Turovsky","title":"Senior Software Engineer","git_commits": 611} -{"index":{"_id":"jonathan","_type": "people"}} +{"index":{"_id":"jonathan"}} {"name":"Jonathan Freeman","title":"Software Engineer","git_commits": 186} # Review the whole list. diff --git a/exercises/more-like-this.sense b/exercises/more-like-this.sense index 4664c5b0..b4c69850 100644 --- a/exercises/more-like-this.sense +++ b/exercises/more-like-this.sense @@ -16,7 +16,6 @@ POST /wikipedia/_search "like" : [ { "_index" : "wikipedia", - "_type" : "locations", "_id" : "northwestern_university_settlement_house" } ], @@ -31,14 +30,13 @@ POST /wikipedia/_search # query. POST /wikipedia/_search { - "fields" : ["about"], + "_source" : ["about"], "query": { "more_like_this" : { "fields" : ["name", "about", "description"], "like" : [ { "_index" : "wikipedia", - "_type" : "locations", "_id" : "northwestern_university_settlement_house" } ], diff --git a/exercises/paging-and-sorting.sense b/exercises/paging-and-sorting.sense index c90bfcb3..44059eca 100644 --- a/exercises/paging-and-sorting.sense +++ b/exercises/paging-and-sorting.sense @@ -6,7 +6,7 @@ # GET /wikipedia/_search { - "fields": ["name", "coordinates"], + "_source": ["name", "coordinates"], "query": { "bool": { "must": [ @@ -26,7 +26,7 @@ GET /wikipedia/_search # GET /wikipedia/_search { - "fields": ["name", "coordinates"], + "_source": ["name", "coordinates"], "query": { "bool": { "must": [ @@ -38,41 +38,71 @@ GET /wikipedia/_search "from": 10 } -# Start a scan query. For frequently-changing data sets, it is often +# Start a point in time query. For frequently-changing data sets, it is often # difficult to keep search results consistent across pages. For # example, if a user is sorting results by freshness, a search result # once appeared in position 10 may be in position 11 by the time the # second page is requested. Elasticsearch has the ability to retain a -# previously fetched result set via a "scan query". This is similar -# to a JDBC cursor. +# previously fetched result set. # -# Note: make sure to copy the scroll id from the result because we're +# Note: make sure to copy the pit id from the result because we're # going to need it in the next step # -GET /wikipedia/_search?search_type=scan&scroll=10m&size=10 +POST /wikipedia/_pit?keep_alive=10m + +# Now use the pit id in a search request +# +GET /_search +{ + "pit": { + "id": "YOUR_PIT_ID", + "keep_alive": "10m" + }, + "_source": ["name", "coordinates"], + "query": { + "bool": { + "must": [ + {"query_string": {"query": "chicago"}} + ] + } + }, + "size": 10 +} + +# Access the next page using search_after +# +GET /_search { - "fields": ["name", "coordinates"], + "pit": { + "id": "YOUR_PIT_ID", + "keep_alive": "10m" + }, + "_source": ["name", "coordinates"], "query": { "bool": { "must": [ {"query_string": {"query": "chicago"}} ] } - } + }, + "size": 10, + "search_after": [YOUR_SORT_VALUES], + "sort": [{"_score": "desc"}, {"_id": "asc"}] } -## Continue our scan query. Use the field _scroll_id from the above -## query in the following query. Note: you will have to manually -## manipulate the query string with the scroll id from the last query. +# Clean up the PIT when done # -GET /_search/scroll?scroll=10m&scroll_id={scroll_id} +DELETE /_pit +{ + "id": "YOUR_PIT_ID" +} # Sort dates chronologically. Sorting documents matching "chicago" by # last update time. 
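# Date fields are stored in doc values as epoch milliseconds, so sorting on
# lastUpdated is a cheap numeric comparison rather than a string sort.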
# GET /wikipedia/_search { - "fields": ["name", "lastUpdated"], + "_source": ["name", "lastUpdated"], "query": { "bool": { "must": [ @@ -88,7 +118,7 @@ GET /wikipedia/_search # GET /wikipedia/_search { - "fields": ["name", "lastUpdated"], + "_source": ["name", "lastUpdated"], "query": { "bool": { "must": [{"query_string": {"query": "chicago"}}] @@ -102,7 +132,7 @@ GET /wikipedia/_search # GET /wikipedia/_search { - "fields": ["name", "coordinates"], + "_source": ["name", "coordinates"], "query": { "bool": { "must": [ @@ -131,9 +161,9 @@ GET /wikipedia/_search # inverted index. Because our name field is tokenized, the first # alphabetical token in the field value determines a field's ranking. # -GET /wikipedia/locations/_search +GET /wikipedia/_search { - "fields": ["name"], + "_source": ["name"], "query": { "bool": { "must": [{ @@ -156,21 +186,25 @@ GET /wikipedia/_settings?name=*.sort*.* # Store the sortable string in a multi-field. For the name field, we # want to do both full-text search and string sorting. So to make sure # we can handle both, we configure name to be a multi-field. -GET /wikipedia/locations/_mapping +GET /wikipedia/_mapping # Testing the sortable analyzer. When creating new analyzers, we # recommend testing them out using the Analyze API to make sure they # work as expected. # -GET /wikipedia/_analyze?field=name.sortable&text=I wanna know what love is; I want you to show me! +GET /wikipedia/_analyze +{ + "field": "name.sortable", + "text": "I wanna know what love is; I want you to show me!" +} # Sort Strings the Right Way. Now that we've done all that, we simply # swap out the sort field from the previous query to use -# `name.sorted`. +# `name.sortable`. -GET /wikipedia/locations/_search +GET /wikipedia/_search { - "fields": ["name"], + "_source": ["name"], "query": { "bool": { "must": [{ diff --git a/exercises/percolators.sense b/exercises/percolators.sense index 2a992c61..fa3bf13e 100644 --- a/exercises/percolators.sense +++ b/exercises/percolators.sense @@ -3,7 +3,7 @@ # Add a sample document. # -PUT /spantree/team/cedric +PUT /spantree/_doc/cedric { "drinks": ["Red Bull"] } @@ -18,73 +18,101 @@ PUT /spantree/team/cedric # process at Spantree, where we'd like to be notified if they modify # our scheduled amazon orders for various teas. # -# First, we will register a percolator for people who like to drink -# earl grey. +# First, we'll create a percolator mapping # -PUT /spantree/.percolator/earl_grey +PUT /percolate_queries { - "query" : { - "match" : { - "drinks": "earl grey" - } - } + "mappings": { + "properties": { + "query": { + "type": "percolator" + }, + "drinks": { + "type": "keyword" + } + } + } +} + +# Register a percolator for people who like to drink earl grey. +# +PUT /percolate_queries/_doc/earl_grey +{ + "query": { + "match": { + "drinks": "earl grey" + } + } } # Add Percolator for Russian Caravan. # -PUT /spantree/.percolator/russian_caravan +PUT /percolate_queries/_doc/russian_caravan { - "query" : { - "match" : { - "drinks": "russian caravan" - } - } + "query": { + "match": { + "drinks": "russian caravan" + } + } } # Onboard Justin with his drink preferences. 
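# With the percolator field type, onboarding is a regular search against
# percolate_queries: the candidate document is embedded in a percolate query
# and every stored query that matches it is returned as a hit.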
# -# -GET /spantree/people/_percolate +GET /percolate_queries/_search { - "doc" : { - "name": "Justin", - "drinks": [ - "Triple Espresso", - "Green Tea with Brown Rice", - "Coconut Water" - ] - } + "query": { + "percolate": { + "field": "query", + "document": { + "name": "Justin", + "drinks": [ + "Triple Espresso", + "Green Tea with Brown Rice", + "Coconut Water" + ] + } + } + } } # Onboard Kevin with his drink preferences. # -GET /spantree/people/_percolate +GET /percolate_queries/_search { - "doc" : { - "name": "Kevin", - "drinks": [ - "Sodastream Energy", - "Dark Magic Coffee", - "Earl Grey Tea" - ] - } + "query": { + "percolate": { + "field": "query", + "document": { + "name": "Kevin", + "drinks": [ + "Sodastream Energy", + "Dark Magic Coffee", + "Earl Grey Tea" + ] + } + } + } } # Review the earl grey percolator. # -GET /spantree/.percolator/earl_grey +GET /percolate_queries/_doc/earl_grey # Onboard Marija with her drink preferences. # -# -GET /spantree/people/_percolate +GET /percolate_queries/_search { - "doc": { - "name": "Marija", - "drinks": [ - "Earl Grey Tea", - "Russian Caravan Tea", - "Assam Tea" - ] - } + "query": { + "percolate": { + "field": "query", + "document": { + "name": "Marija", + "drinks": [ + "Earl Grey Tea", + "Russian Caravan Tea", + "Assam Tea" + ] + } + } + } } diff --git a/exercises/searching.sense b/exercises/searching.sense index dfc768e5..833d959a 100644 --- a/exercises/searching.sense +++ b/exercises/searching.sense @@ -61,7 +61,7 @@ GET /wikipedia/_search "query": "theater OR theatre" } }, - "fields": ["name", "keywords"] + "_source": ["name", "keywords"] } # Search with Lucene boolean syntax. Find both the terms "theater" and @@ -69,7 +69,7 @@ GET /wikipedia/_search # GET /wikipedia/_search { - "fields": ["name", "keywords", "description"], + "_source": ["name", "keywords", "description"], "query": { "query_string": { "fields": ["name", "keywords", "description"], @@ -82,7 +82,7 @@ GET /wikipedia/_search # GET /wikipedia/_search { - "fields": ["name", "keywords", "description"], + "_source": ["name", "keywords", "description"], "query": { "bool": { "must": [ @@ -109,7 +109,7 @@ GET /wikipedia/_search # GET /wikipedia/_search { - "fields": ["name", "keywords", "description"], + "_source": ["name", "keywords", "description"], "query": { "bool": { "must": [ @@ -133,7 +133,7 @@ GET /wikipedia/_search # GET /wikipedia/_search { - "fields": ["name", "keywords", "description"], + "_source": ["name", "keywords", "description"], "query": { "bool": { "should": [ @@ -158,7 +158,7 @@ GET /wikipedia/_search # GET /wikipedia/_search { - "fields": ["name", "description"], + "_source": ["name", "description"], "query": { "bool": { "must": [ @@ -181,7 +181,7 @@ GET /wikipedia/_search # Explain results. The explain endpoint will expose some of the math # behind filtering and scoring. 
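# In 8.x the explain API is addressed as GET /<index>/_explain/<id>, e.g.
# GET /wikipedia/_explain/chicago_shakespeare_theater.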
# -GET /wikipedia/locations/chicago_shakespeare_theater/_explain +GET /wikipedia/_doc/chicago_shakespeare_theater/_explain { "query": { "query_string": { @@ -199,7 +199,7 @@ GET /wikipedia/locations/chicago_shakespeare_theater/_explain # GET /wikipedia/_search { - "fields": ["name", "keywords", "about"], + "_source": ["name", "keywords", "about"], "query": { "query_string": { "fields": ["name^2", "keywords^1.5", "about"], @@ -213,7 +213,7 @@ GET /wikipedia/_search # GET /wikipedia/_search { - "fields": [ + "_source": [ "name", "keywords", "about" @@ -232,8 +232,9 @@ GET /wikipedia/_search } }, "script_score": { - "script": "_score * 2", - "lang": "groovy" + "script": { + "source": "_score * 2" + } } } } @@ -257,18 +258,26 @@ GET /wikipedia/_search # Filter on geo-distance. # -GET /divvy/station/_search +GET /divvy/_search { "query": { - "match_all": {} - }, - "filter": { - "geo_distance" : { - "distance": "1mi", - "location" : { - "lat": 41.886732, - "lon": -87.655979 - } + "bool": { + "filter": [ + { + "term": { + "_index": "divvy" + } + }, + { + "geo_distance": { + "distance": "1mi", + "location": { + "lat": 41.886732, + "lon": -87.655979 + } + } + } + ] } } } diff --git a/exercises/suggestions.sense b/exercises/suggestions.sense index ecec0166..6c003d8e 100644 --- a/exercises/suggestions.sense +++ b/exercises/suggestions.sense @@ -4,12 +4,14 @@ # Build a term suggester. These provide suggestions based on # Levenshtein (edit) distance. # -POST /wikipedia/_suggest +POST /wikipedia/_search { - "term_suggestion": { - "text": "buildng", - "term": { - "field": "description" + "suggest": { + "term_suggestion": { + "text": "buildng", + "term": { + "field": "description" + } } } } @@ -18,30 +20,32 @@ POST /wikipedia/_suggest # suggesters to return entire corrected phrases based on a sequence # of words in the corpus (ngrams). # -POST /wikipedia/_suggest +POST /wikipedia/_search { - "text": "high risk buildng", - "simple_phrase": { - "phrase": { - "field": "description", - "gram_size": 2, - "real_word_error_likelihood": 0.95, - "confidence": 1, - "max_errors": 100, - "size": 1, - "analyzer": "standard", - "shard_size": 5, - "direct_generator": [ - { - "field": "description", - "suggest_mode": "popular", - "max_edits": 2, - "min_word_len": 4, - "max_inspections": 5, - "min_doc_freq": 0, - "max_term_freq": 0 - } - ] + "suggest": { + "simple_phrase": { + "text": "high risk buildng", + "phrase": { + "field": "description", + "gram_size": 2, + "real_word_error_likelihood": 0.95, + "confidence": 1, + "max_errors": 100, + "size": 1, + "analyzer": "standard", + "shard_size": 5, + "direct_generator": [ + { + "field": "description", + "suggest_mode": "popular", + "max_edits": 2, + "min_word_len": 4, + "max_inspections": 5, + "min_doc_freq": 0, + "max_term_freq": 0 + } + ] + } } } } @@ -53,72 +57,72 @@ POST /wikipedia/_suggest # to assign weights and priorities to different options. # PUT /suggestions - -# Create completion suggestion mapping. 
-# -PUT /suggestions/suggestion/_mapping { - "suggestion" : { - "properties" : { - "name" : { "type" : "string" }, - "suggest" : { - "type" : "completion", - "analyzer" : "simple", - "search_analyzer" : "simple", - "payloads" : true - } - } + "mappings": { + "properties": { + "name": { + "type": "text" + }, + "suggest": { + "type": "completion", + "analyzer": "simple", + "search_analyzer": "simple", + "preserve_position_increments": true + } } + } } # Create a completion suggestion for Cedric # -PUT /suggestions/suggestion/1 +PUT /suggestions/_doc/1 { - "name" : "Cedric Hurst", - "suggest" : { - "input": [ "Cedster", "The Ced", "C-Man", "That guy from that one meetup", "Software Engineer" ], - "output": "Cedric", - "payload" : { "title" : "Principal" }, - "weight" : 37 - } + "name": "Cedric Hurst", + "suggest": { + "input": [ "Cedster", "The Ced", "C-Man", "That guy from that one meetup", "Software Engineer" ], + "weight": 37 + }, + "title": "Principal" } # Create a completion suggestion for Kevin. # -PUT /suggestions/suggestion/2 +PUT /suggestions/_doc/2 { - "name" : "Kevin Greene", - "suggest" : { - "input": [ "Kev", "KG", "Greene", "Michigan", "Software Engineer" ], - "output": "Kevin", - "payload" : { "title" : "Senior Software Engineer" }, - "weight" : 101 - } + "name": "Kevin Greene", + "suggest": { + "input": [ "Kev", "KG", "Greene", "Michigan", "Software Engineer" ], + "weight": 101 + }, + "title": "Senior Software Engineer" } # Get a completion suggestion. # -POST /suggestions/_suggest +POST /suggestions/_search { - "completion_suggestion": { - "text": "software", - "completion": { - "field": "suggest" + "suggest": { + "completion_suggestion": { + "prefix": "software", + "completion": { + "field": "suggest" + } } } } # Get a fuzzy completion suggestion. # -POST /suggestions/_suggest +POST /suggestions/_search { - "completion_suggestion": { - "text": "mchgan", - "completion": { - "field": "suggest", - "fuzzy" : { - "fuzziness" : 2 + "suggest": { + "completion_suggestion": { + "prefix": "mchgan", + "completion": { + "field": "suggest", + "fuzzy": { + "fuzziness": 2 + } } } } @@ -130,85 +134,84 @@ POST /suggestions/_suggest # You may want to boost suggestions for people who work in your same # country or office. Context suggesters allow you to combine the two. 
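# In 8.x geo contexts are declared through the contexts option on the
# completion field. An existing field's mapping cannot be changed in place,
# so the context-enabled mapping below assumes the suggestions index is
# deleted and recreated first.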
# -PUT /suggestions/conference/_mapping +PUT /suggestions { - "conference": { - "properties": { - "name": { - "type": "string" - }, - "suggestion": { - "type": "completion", - "context": { - "location": { - "type": "geo", - "precision": "500km", - "neighbors": true, - "default": "u33" - } - } - } + "mappings": { + "properties": { + "name": { + "type": "text" + }, + "suggestion": { + "type": "completion", + "contexts": { + "location": { + "type": "geo", + "precision": "500km", + "neighbors": true + } } + } } + } } ## Create GOTO Chicago conference suggestion # -PUT /suggestions/conference/goto_chicago +PUT /suggestions/_doc/goto_chicago { - "name": "GOTO Chicago", - "suggestion": { - "input": [ - "goto", - "chicago", - "tech conference" - ], - "output": "GOTO Chicago", - "context": { - "location": { - "lat": 41.8927539, - "lon": -87.6191727 - } - } + "name": "GOTO Chicago", + "suggestion": { + "input": [ + "goto", + "chicago", + "tech conference" + ], + "contexts": { + "location": { + "lat": 41.8927539, + "lon": -87.6191727 + } } + } } # Create Strangeloop context suggestion # -PUT /suggestions/conference/strangeloop +PUT /suggestions/_doc/strangeloop { - "name": "Strangeloop Conference", - "suggestion": { - "input": [ - "strange", - "loop", - "tech conference" - ], - "output": "Strangeloop Conference", - "context": { - "location": { - "lat": 38.6537065, - "lon": -90.2477908 - } - } + "name": "Strangeloop Conference", + "suggestion": { + "input": [ + "strange", + "loop", + "tech conference" + ], + "contexts": { + "location": { + "lat": 38.6537065, + "lon": -90.2477908 + } } + } } # Suggest nearby nerdy things when in St Louis. # -POST suggestions/_suggest +POST suggestions/_search { + "suggest": { "context_suggestion": { - "text": "tech", - "completion": { - "field": "suggestion", - "size": 10, - "context": { - "location": { - "lat": 39.626072, - "lon": -90.0769822 - } - } + "prefix": "tech", + "completion": { + "field": "suggestion", + "size": 10, + "contexts": { + "location": { + "lat": 39.626072, + "lon": -90.0769822 + } } + } } + } } diff --git a/extract-data.sh b/extract-data.sh new file mode 100755 index 00000000..df49710d --- /dev/null +++ b/extract-data.sh @@ -0,0 +1,69 @@ +#!/usr/bin/env bash + +set -o errexit # exit when a command fails +set -o pipefail # exit if pipe fails +set -o nounset # exit when script tries to use undeclared variables + +ES_HOST="http://localhost:9201" +OUTPUT_DIR=".data" +SCROLL_TIME="1m" +BATCH_SIZE=1000 + +# Make sure output directory exists +mkdir -p "$OUTPUT_DIR" + +# Get list of indices +INDICES=$(curl -s "${ES_HOST}/_cat/indices?h=index" | grep -v "freebase" | tr -d ' ') + +for INDEX in $INDICES; do + echo "Extracting data from index: $INDEX" + OUTPUT_FILE="${OUTPUT_DIR}/${INDEX}.jsonl" + + # Initialize the scroll + SCROLL_ID=$(curl -s -X POST "${ES_HOST}/${INDEX}/_search?scroll=${SCROLL_TIME}" -d '{ + "size": '${BATCH_SIZE}', + "sort": ["_doc"], + "query": {"match_all": {}} + }' | jq -r '._scroll_id') + + # Get the first batch of results + RESULTS=$(curl -s -X POST "${ES_HOST}/_search/scroll" -d '{ + "scroll": "'${SCROLL_TIME}'", + "scroll_id": "'${SCROLL_ID}'" + }') + + # Extract hits + echo "$RESULTS" | jq -c '.hits.hits[]._source' > "$OUTPUT_FILE" + + # Continue scrolling until no more hits + TOTAL_HITS=$(echo "$RESULTS" | jq '.hits.total') + HITS_COUNT=$(echo "$RESULTS" | jq '.hits.hits | length') + + while [ "$HITS_COUNT" -gt 0 ]; do + echo "Processed $HITS_COUNT documents from $INDEX..." 
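    # Each scroll response can return a fresh _scroll_id, so it is re-captured
    # below before the next page is requested.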
+ + # Get the next batch of results + RESULTS=$(curl -s -X POST "${ES_HOST}/_search/scroll" -d '{ + "scroll": "'${SCROLL_TIME}'", + "scroll_id": "'${SCROLL_ID}'" + }') + + # Extract hits and append to file + echo "$RESULTS" | jq -c '.hits.hits[]._source' >> "$OUTPUT_FILE" + + # Update scroll ID and counts + SCROLL_ID=$(echo "$RESULTS" | jq -r '._scroll_id') + HITS_COUNT=$(echo "$RESULTS" | jq '.hits.hits | length') + done + + # Clean up the scroll + curl -s -X DELETE "${ES_HOST}/_search/scroll" -d '{ + "scroll_id": ["'${SCROLL_ID}'"] + }' > /dev/null + + echo "Completed extracting data from $INDEX" + echo "Data saved to $OUTPUT_FILE" + echo +done + +echo "All data extraction complete!" \ No newline at end of file diff --git a/extract-freebase.sh b/extract-freebase.sh new file mode 100755 index 00000000..b46acccc --- /dev/null +++ b/extract-freebase.sh @@ -0,0 +1,143 @@ +#!/usr/bin/env bash + +set -o errexit # exit when a command fails +set -o pipefail # exit if pipe fails +set -o nounset # exit when script tries to use undeclared variables + +ES_HOST="http://localhost:9201" +OUTPUT_DIR=".data" +OUTPUT_FILE="${OUTPUT_DIR}/freebase.jsonl" +SCROLL_TIME="1m" +BATCH_SIZE=1000 +MAX_ATTEMPTS=5 + +mkdir -p "$OUTPUT_DIR" + +# Function to check if we got valid JSON response +function is_valid_json() { + if echo "$1" | jq -e . >/dev/null 2>&1; then + return 0 + else + return 1 + fi +} + +echo "Attempting to extract data from freebase index..." + +# Try to get a count of documents +COUNT_RESPONSE=$(curl -s "${ES_HOST}/freebase/_count") +if is_valid_json "$COUNT_RESPONSE" && [[ $(echo "$COUNT_RESPONSE" | jq -r '.count // 0') -gt 0 ]]; then + DOC_COUNT=$(echo "$COUNT_RESPONSE" | jq -r '.count') + echo "Found $DOC_COUNT documents in freebase index" +else + echo "Warning: Unable to get document count. Will attempt extraction anyway." +fi + +# Try to get some data +for attempt in $(seq 1 $MAX_ATTEMPTS); do + echo "Attempt $attempt of $MAX_ATTEMPTS to extract data..." + + # Try to get a small batch of data + RESPONSE=$(curl -s "${ES_HOST}/freebase/_search?size=10" -d '{ + "query": {"match_all": {}}, + "sort": ["_doc"] + }') + + if is_valid_json "$RESPONSE" && [[ $(echo "$RESPONSE" | jq -r '.hits.hits | length') -gt 0 ]]; then + echo "Successfully retrieved documents. Starting full extraction..." + break + fi + + echo "No data found or encountered an error. Waiting 5 seconds before retrying..." + sleep 5 + + if [[ $attempt -eq $MAX_ATTEMPTS ]]; then + echo "Failed to extract data after $MAX_ATTEMPTS attempts." + echo "The index may still be initializing or is empty." + exit 1 + fi +done + +# Initialize output file +> "$OUTPUT_FILE" + +# Initialize the scroll +echo "Initializing scroll for bulk extraction..." +SCROLL_RESPONSE=$(curl -s -X POST "${ES_HOST}/freebase/_search?scroll=${SCROLL_TIME}&ignore_unavailable=true" -d '{ + "size": '${BATCH_SIZE}', + "sort": ["_doc"], + "query": {"match_all": {}} +}') + +if ! is_valid_json "$SCROLL_RESPONSE"; then + echo "Error: Invalid response from Elasticsearch." + echo "Response: $SCROLL_RESPONSE" + exit 1 +fi + +# Check if we got any hits at all +HITS_COUNT=$(echo "$SCROLL_RESPONSE" | jq '.hits.hits | length') +if [[ $HITS_COUNT -eq 0 ]]; then + echo "No documents found in the freebase index." 
+ exit 0 +fi + +# Extract hits +echo "$SCROLL_RESPONSE" | jq -c '.hits.hits[]._source' >> "$OUTPUT_FILE" +SCROLL_ID=$(echo "$SCROLL_RESPONSE" | jq -r '._scroll_id') + +# Continue scrolling until no more hits +TOTAL_EXTRACTED=0 +EXTRACTED_THIS_BATCH=$HITS_COUNT +((TOTAL_EXTRACTED += EXTRACTED_THIS_BATCH)) + +echo "Extracted $EXTRACTED_THIS_BATCH documents. Continuing..." + +while [[ $EXTRACTED_THIS_BATCH -gt 0 ]]; do + # Get the next batch of results + SCROLL_RESPONSE=$(curl -s -X POST "${ES_HOST}/_search/scroll" -d '{ + "scroll": "'${SCROLL_TIME}'", + "scroll_id": "'${SCROLL_ID}'" + }') + + if ! is_valid_json "$SCROLL_RESPONSE"; then + echo "Error: Invalid response from scroll request." + break + fi + + # Extract hits and append to file + EXTRACTED_THIS_BATCH=$(echo "$SCROLL_RESPONSE" | jq '.hits.hits | length') + + if [[ $EXTRACTED_THIS_BATCH -gt 0 ]]; then + echo "$SCROLL_RESPONSE" | jq -c '.hits.hits[]._source' >> "$OUTPUT_FILE" + SCROLL_ID=$(echo "$SCROLL_RESPONSE" | jq -r '._scroll_id') + ((TOTAL_EXTRACTED += EXTRACTED_THIS_BATCH)) + echo "Extracted $EXTRACTED_THIS_BATCH more documents. Total: $TOTAL_EXTRACTED" + fi +done + +# Clean up the scroll +curl -s -X DELETE "${ES_HOST}/_search/scroll" -d '{ + "scroll_id": ["'${SCROLL_ID}'"] +}' > /dev/null + +# Check the file size +FILE_SIZE=$(wc -c < "$OUTPUT_FILE") +LINE_COUNT=$(wc -l < "$OUTPUT_FILE") + +echo "Extraction completed." +echo "Extracted $TOTAL_EXTRACTED documents from freebase index." +echo "Output file size: $FILE_SIZE bytes" +echo "Line count: $LINE_COUNT" + +# Compress the file +if [[ $FILE_SIZE -gt 0 ]]; then + gzip -k "$OUTPUT_FILE" + COMPRESSED_SIZE=$(wc -c < "${OUTPUT_FILE}.gz") + echo "Compressed file size: $COMPRESSED_SIZE bytes" + echo "Compression ratio: $(echo "scale=2; $COMPRESSED_SIZE * 100 / $FILE_SIZE" | bc)%" +else + echo "Output file is empty. Skipping compression." +fi + +echo "Extraction process complete." \ No newline at end of file diff --git a/extract-wikipedia.sh b/extract-wikipedia.sh new file mode 100755 index 00000000..f2869f2b --- /dev/null +++ b/extract-wikipedia.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +set -o errexit # exit when a command fails +set -o pipefail # exit if pipe fails +set -o nounset # exit when script tries to use undeclared variables + +ES_HOST="http://localhost:9201" +OUTPUT_DIR=".data" +OUTPUT_FILE="${OUTPUT_DIR}/wikipedia.jsonl" + +mkdir -p "$OUTPUT_DIR" + +# Get all documents from the wikipedia index with a single query +curl -s -X GET "${ES_HOST}/wikipedia/_search?size=100" -d '{ + "query": {"match_all": {}} +}' | jq -c '.hits.hits[]._source' > "$OUTPUT_FILE" + +echo "Data saved to $OUTPUT_FILE" +wc -l "$OUTPUT_FILE" \ No newline at end of file diff --git a/load-sample-snapshots.sh b/load-sample-snapshots.sh index e30f59e8..b27d642a 100755 --- a/load-sample-snapshots.sh +++ b/load-sample-snapshots.sh @@ -5,17 +5,18 @@ set -o errexit # exit when a command fails. 
set -o nounset # exit when your script tries to use undeclared variables REPOSITORY_NAME=sample_readonly +ES_HOST=http://localhost:9201 __dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # Create a read-only snapshot repository to pull files from S3 bucket via HTTPs -curl -X PUT -d '{"type": "url", "settings": {"url": "https://elasticsearch-sample-data.s3.amazonaws.com/"}}' "http://localhost:9200/_snapshot/${REPOSITORY_NAME}" +curl -X PUT -d '{"type": "url", "settings": {"url": "https://elasticsearch-sample-data.s3.amazonaws.com/"}}' "${ES_HOST}/_snapshot/${REPOSITORY_NAME}" # Delete any existing indices -curl -X DELETE "http://localhost:9200/*" +curl -X DELETE "${ES_HOST}/*" # Read snapshots to restore from manifest file for SNAPSHOT_NAME in $(cat ./snapshot-manifest); do # Restore snapshots with one replica - time curl -X POST -d '{"index_settings": {"index.number_of_replicas": 1}}' "http://localhost:9200/_snapshot/${REPOSITORY_NAME}/${SNAPSHOT_NAME}/_restore?wait_for_completion=true" + time curl -X POST -d '{"index_settings": {"index.number_of_replicas": 1}}' "${ES_HOST}/_snapshot/${REPOSITORY_NAME}/${SNAPSHOT_NAME}/_restore?wait_for_completion=true" done
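One step the extraction scripts leave open is getting the exported data back into the 8.18 node on port 9200. A minimal sketch of that reload, assuming each target index is named after its archive, dynamic mapping is acceptable, and an arbitrary 5,000-document chunk keeps every request under the default http.max_content_length:

#!/usr/bin/env bash
# Hypothetical reload helper (not part of the change above): bulk-load the
# extracted .data/*.jsonl.gz files into the Elasticsearch 8.18 node.

set -o errexit
set -o pipefail
set -o nounset

ES_HOST="http://localhost:9200"   # assumed: the 8.18 node from docker-compose.yml

for ARCHIVE in .data/*.jsonl.gz; do
  INDEX=$(basename "$ARCHIVE" .jsonl.gz)   # assumed: index named after the file
  echo "Loading $ARCHIVE into index $INDEX"

  # Emit a two-line bulk entry per document and split into ~5,000-doc chunks
  # so each request stays well under the default http.max_content_length.
  gunzip -c "$ARCHIVE" \
    | awk '{print "{\"index\":{}}"; print}' \
    | split -l 10000 -a 4 - /tmp/bulk-chunk-

  for CHUNK in /tmp/bulk-chunk-*; do
    curl -s -H "Content-Type: application/x-ndjson" \
      -X POST "${ES_HOST}/${INDEX}/_bulk" \
      --data-binary "@${CHUNK}" > /dev/null
    rm "$CHUNK"
  done

  # Make the documents searchable and report how many landed.
  curl -s -X POST "${ES_HOST}/${INDEX}/_refresh" > /dev/null
  curl -s "${ES_HOST}/${INDEX}/_count"
  echo
done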