From 9fe99f7cfb5e3dfe7f5a259d52bd1cff98b4bfa5 Mon Sep 17 00:00:00 2001 From: bitsofinfo Date: Fri, 25 Aug 2017 10:14:28 -0600 Subject: [PATCH 01/18] index bodies option/flag --- README.md | 4 +++- src/index_emails.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a15a516..ad55793 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,8 @@ Set up [Elasticsearch](http://ohardt.us/es-install) and make sure it's running a I use Python and [Tornado](https://github.com/tornadoweb/tornado/) for the scripts to import and query the data. Run `pip install tornado chardet` to install Tornado and chardet. +You may also need to `pip install beautifulsoup4`, for the stripping HTML/JS/CSS via the body indexing flag: `--index_bodies` + #### Aight, where do we start? @@ -194,7 +196,7 @@ You can also quickly query for certain fields via the `q` parameter. This exampl ``` curl "localhost:9200/gmail/email/_search?pretty&q=from:ship-confirm@amazon.com" -``` +``` ##### Aggregation queries diff --git a/src/index_emails.py b/src/index_emails.py index 20515af..867adfb 100644 --- a/src/index_emails.py +++ b/src/index_emails.py @@ -12,6 +12,7 @@ from DelegatingEmailParser import DelegatingEmailParser from AmazonEmailParser import AmazonEmailParser from SteamEmailParser import SteamEmailParser +from bs4 import BeautifulSoup import logging http_client = HTTPClient() @@ -20,6 +21,20 @@ DEFAULT_ES_URL = "http://localhost:9200" DEFAULT_INDEX_NAME = "gmail" +def strip_html_css_js(msg): + soup = BeautifulSoup(msg,"html.parser") # create a new bs4 object from the html data loaded + for script in soup(["script", "style"]): # remove all javascript and stylesheet code + script.extract() + # get text + text = soup.get_text() + # break into lines and remove leading and trailing space on each + lines = (line.strip() for line in text.splitlines()) + # break multi-headlines into a line each + chunks = (phrase.strip() for line in 
lines for phrase in line.split(" ")) + # drop blank lines + text = '\n'.join(chunk for chunk in chunks if chunk) + return text + def delete_index(): try: url = "%s/%s?refresh=true" % (tornado.options.options.es_url, tornado.options.options.index_name) @@ -117,6 +132,20 @@ def convert_msg_to_json(msg): del result["x-gmail-labels"] result['labels'] = labels + # Bodies... + if tornado.options.options.index_bodies: + result['body'] = '' + if msg.is_multipart(): + for mpart in msg.get_payload(): + if mpart is not None: + mpart_payload = mpart.get_payload(decode=True) + if mpart_payload is not None: + result['body'] += strip_html_css_js(mpart_payload) + else: + result['body'] = strip_html_css_js(msg.get_payload(decode=True)) + + result['body_size'] = len(result['body']) + parts = result.get("parts", []) result['content_size_total'] = 0 for part in parts: @@ -183,6 +212,9 @@ def load_from_file(): tornado.options.define("num_of_shards", type=int, default=2, help="Number of shards for ES index") + tornado.options.define("index_bodies", type=bool, default=False, + help="Will index all body content, stripped of HTML/CSS/JS etc. 
Adds fields: 'body' and 'body_size'") + tornado.options.parse_command_line() if tornado.options.options.infile: From a9ff91d7280a9c9be03ec5cfc5afd76fffd12108 Mon Sep 17 00:00:00 2001 From: Oliver Date: Fri, 22 Dec 2017 14:05:40 +0100 Subject: [PATCH 02/18] fix two of the links --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ad55793..46aaad1 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ __Related tutorial:__ [Index and Search Hacker News using Elasticsearch and the #### Prerequisites -Set up [Elasticsearch](http://ohardt.us/es-install) and make sure it's running at [http://localhost:9200](http://localhost:9200) +Set up [Elasticsearch](https://www.elastic.co/guide/en/elasticsearch/guide/current/running-elasticsearch.html) and make sure it's running at [http://localhost:9200](http://localhost:9200) I use Python and [Tornado](https://github.com/tornadoweb/tornado/) for the scripts to import and query the data. Run `pip install tornado chardet` to install Tornado and chardet. @@ -24,7 +24,7 @@ You may also need to `pip install beautifulsoup4`, for the stripping HTML/JS/CSS #### Aight, where do we start? -First, go [here](http://ohardt.us/download-gmail-mailbox) and download your Gmail mailbox, depending on the amount of emails you have accumulated this might take a while. +First, go [here](https://www.google.com/settings/takeout/custom/gmail) and download your Gmail mailbox, depending on the amount of emails you have accumulated this might take a while. The downloaded archive is in the [mbox format](http://en.wikipedia.org/wiki/Mbox) and Python provides libraries to work with the mbox format so that's easy. 
From 0dab6ec992e3f3342a0af8e963aee8dd4e9b726f Mon Sep 17 00:00:00 2001 From: Oliver Date: Fri, 22 Dec 2017 14:06:27 +0100 Subject: [PATCH 03/18] fix third link --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 46aaad1..0362519 100644 --- a/README.md +++ b/README.md @@ -115,7 +115,7 @@ def upload_item_to_es(item): ``` -However, Elasticsearch provides a better method for importing large chunks of data: [bulk indexing](http://ohardt.us/es-bulk-indexing) +However, Elasticsearch provides a better method for importing large chunks of data: [bulk indexing](https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html) Instead of making a HTTP request per document and indexing individually, we batch them in chunks of eg. 1000 documents and then index them.
Bulk messages are of the format: From 1fccb8bb589d3dbf55d5969a47093821fbcdb909 Mon Sep 17 00:00:00 2001 From: cclauss Date: Sat, 23 Dec 2017 18:28:59 +0100 Subject: [PATCH 04/18] print() is a function in modern Python --- src/AmazonEmailParser.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/AmazonEmailParser.py b/src/AmazonEmailParser.py index 69ea211..117c87f 100644 --- a/src/AmazonEmailParser.py +++ b/src/AmazonEmailParser.py @@ -1,3 +1,5 @@ +from __future__ import print_function + import json import re @@ -74,8 +76,8 @@ def parse(self, email): costTotal -= cost if costTotal != 0: - print "Warning order not parsed correctly, order items may be missing, or promotion may have been applied." - print email['order_details'] - print body + print("Warning order not parsed correctly, order items may be missing, or promotion may have been applied.") + print(email['order_details']) + print(body) return email From 5a1813aa044e1943e7a9cf04fe01ec72542fef87 Mon Sep 17 00:00:00 2001 From: cclauss Date: Sat, 23 Dec 2017 18:30:36 +0100 Subject: [PATCH 05/18] print() is a function in modern Python --- src/SteamEmailParser.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/SteamEmailParser.py b/src/SteamEmailParser.py index cf7aef9..601660e 100644 --- a/src/SteamEmailParser.py +++ b/src/SteamEmailParser.py @@ -1,3 +1,5 @@ +from __future__ import print_function + import json import re @@ -54,8 +56,8 @@ def parse(self, email): costTotal -= cost if costTotal != 0: - print "Warning order not parsed correctly, order items may be missing, or promotion may have been applied." 
- print email['order_details'] - print body + print("Warning order not parsed correctly, order items may be missing, or promotion may have been applied.") + print(email['order_details']) + print(body) return email From 0786ee0e1f97eb4d802ad4d7fb7e9e2b509a7fa8 Mon Sep 17 00:00:00 2001 From: Ugo Sangiorgi Date: Fri, 19 Jan 2018 14:37:58 -0500 Subject: [PATCH 06/18] requirements.txt to ease dependency installation --- README.md | 9 +++++---- requirements.txt | 3 +++ 2 files changed, 8 insertions(+), 4 deletions(-) create mode 100644 requirements.txt diff --git a/README.md b/README.md index 0362519..32bc775 100644 --- a/README.md +++ b/README.md @@ -16,10 +16,11 @@ __Related tutorial:__ [Index and Search Hacker News using Elasticsearch and the Set up [Elasticsearch](https://www.elastic.co/guide/en/elasticsearch/guide/current/running-elasticsearch.html) and make sure it's running at [http://localhost:9200](http://localhost:9200) -I use Python and [Tornado](https://github.com/tornadoweb/tornado/) for the scripts to import and query the data. Run `pip install tornado chardet` to install Tornado and chardet. +I use Python and [Tornado](https://github.com/tornadoweb/tornado/) for the scripts to import and query the data. Also `beautifulsoup4` for the stripping HTML/JS/CSS (if you want to use the body indexing flag). -You may also need to `pip install beautifulsoup4`, for the stripping HTML/JS/CSS via the body indexing flag: `--index_bodies` +Install the dependencies by running: +`pip install -r requirements.txt` #### Aight, where do we start? @@ -283,9 +284,9 @@ Result: "key_as_string" : "2004-01-01T00:00:00.000Z", "key" : 1072915200000, "doc_count" : 585 - }, { + }, { ... 
- }, { + }, { "key_as_string" : "2013-01-01T00:00:00.000Z", "key" : 1356998400000, "doc_count" : 12832 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..b4cf28f --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +beautifulsoup4==4.6.0 +chardet==3.0.4 +tornado==4.5.3 From dd46fe189a2be4a024570a823b19357e93da00ba Mon Sep 17 00:00:00 2001 From: Ugo Sangiorgi Date: Fri, 19 Jan 2018 14:46:25 -0500 Subject: [PATCH 07/18] "not in" instead of "not 'string' in" --- src/index_emails.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/index_emails.py b/src/index_emails.py index 867adfb..4103957 100644 --- a/src/index_emails.py +++ b/src/index_emails.py @@ -103,7 +103,7 @@ def normalize_email(email_in): def convert_msg_to_json(msg): result = {'parts': []} - if not 'message-id' in msg: + if 'message-id' not in msg: return None for (k, v) in msg.items(): @@ -160,7 +160,6 @@ def load_from_file(): delete_index() create_index() - if tornado.options.options.skip: logging.info("Skipping first %d messages from mbox file" % tornado.options.options.skip) From ac3527d8024abfc1a99ad72d43a0680bf7aa16fc Mon Sep 17 00:00:00 2001 From: Prateep Bandharangshi Date: Mon, 26 Mar 2018 17:42:37 +0100 Subject: [PATCH 08/18] Update README.md Fix incorrect link --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 32bc775..2d8d826 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ print "Done!" 
#### Ok, tell me more about the details -The full Python code is here: [src/update.py](src/index_emails.py) +The full Python code is here: [src/index_emails.py](src/index_emails.py) ##### Turn mbox into JSON From 8a6bc92155a011b248e5f07025cac330231352da Mon Sep 17 00:00:00 2001 From: Jay Caines-Gooby Date: Wed, 1 Aug 2018 23:33:21 +0100 Subject: [PATCH 09/18] Elastic Search 6 compatibility https://www.elastic.co/blog/strict-content-type-checking-for-elasticsearch-rest-requests Starting from Elasticsearch 6.0, all REST requests that include a body must also provide the correct content-type for that body. --- src/index_emails.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/index_emails.py b/src/index_emails.py index 4103957..e3a4c96 100644 --- a/src/index_emails.py +++ b/src/index_emails.py @@ -38,7 +38,7 @@ def strip_html_css_js(msg): def delete_index(): try: url = "%s/%s?refresh=true" % (tornado.options.options.es_url, tornado.options.options.index_name) - request = HTTPRequest(url, method="DELETE", request_timeout=240) + request = HTTPRequest(url, method="DELETE", request_timeout=240, headers={"Content-Type": "application/json"}) body = {"refresh": True} response = http_client.fetch(request) logging.info('Delete index done %s' % response.body) @@ -71,7 +71,7 @@ def create_index(): body = json.dumps(schema) url = "%s/%s" % (tornado.options.options.es_url, tornado.options.options.index_name) try: - request = HTTPRequest(url, method="PUT", body=body, request_timeout=240) + request = HTTPRequest(url, method="PUT", body=body, request_timeout=240, headers={"Content-Type": "application/json"}) response = http_client.fetch(request) logging.info('Create index done %s' % response.body) except: @@ -86,7 +86,7 @@ def upload_batch(upload_data): upload_data_txt += json.dumps(cmd) + "\n" upload_data_txt += json.dumps(item) + "\n" - request = HTTPRequest(tornado.options.options.es_url + "/_bulk", method="POST", body=upload_data_txt, 
request_timeout=240) + request = HTTPRequest(tornado.options.options.es_url + "/_bulk", method="POST", body=upload_data_txt, request_timeout=240, headers={"Content-Type": "application/json"}) response = http_client.fetch(request) result = json.loads(response.body) From 0a0e4744f5c42784ebda279c89684a852d7b57a1 Mon Sep 17 00:00:00 2001 From: priyadharshinijaganathan <39261947+priyadharshinijaganathan@users.noreply.github.com> Date: Thu, 12 Mar 2020 19:10:38 +0530 Subject: [PATCH 10/18] Changed the code from python 2.x to 3.x --- src/index_emails.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/index_emails.py b/src/index_emails.py index e3a4c96..24788e0 100644 --- a/src/index_emails.py +++ b/src/index_emails.py @@ -21,9 +21,10 @@ DEFAULT_ES_URL = "http://localhost:9200" DEFAULT_INDEX_NAME = "gmail" + def strip_html_css_js(msg): - soup = BeautifulSoup(msg,"html.parser") # create a new bs4 object from the html data loaded - for script in soup(["script", "style"]): # remove all javascript and stylesheet code + soup = BeautifulSoup(msg, "html.parser") # create a new bs4 object from the html data loaded + for script in soup(["script", "style"]): # remove all javascript and stylesheet code script.extract() # get text text = soup.get_text() @@ -35,6 +36,7 @@ def strip_html_css_js(msg): text = '\n'.join(chunk for chunk in chunks if chunk) return text + def delete_index(): try: url = "%s/%s?refresh=true" % (tornado.options.options.es_url, tornado.options.options.index_name) @@ -45,6 +47,7 @@ def delete_index(): except: pass + def create_index(): schema = { @@ -79,6 +82,8 @@ def create_index(): total_uploaded = 0 + + def upload_batch(upload_data): upload_data_txt = "" for item in upload_data: @@ -107,7 +112,7 @@ def convert_msg_to_json(msg): return None for (k, v) in msg.items(): - result[k.lower()] = v.decode('utf-8', 'ignore') + result[k.lower()] = v for k in ['to', 'cc', 'bcc']: if not result.get(k): @@ -166,7 +171,10 @@ def 
load_from_file(): count = 0 upload_data = list() logging.info("Starting import from file %s" % tornado.options.options.infile) - mbox = mailbox.UnixMailbox(open(tornado.options.options.infile, 'rb'), email.message_from_file) + # mbox = mailbox.UnixMailbox(open(tornado.options.options.infile, 'rb'), email.message_from_file) + + # removed the above UnixMailbox which is not supported in python 3.x and replaced it with mailbox.mbox class + mbox = mailbox.mbox(tornado.options.options.infile) emailParser = DelegatingEmailParser([AmazonEmailParser(), SteamEmailParser()]) @@ -175,6 +183,7 @@ def load_from_file(): if count < tornado.options.options.skip: continue item = convert_msg_to_json(msg) + if item: upload_data.append(item) if len(upload_data) == tornado.options.options.batch_size: @@ -211,8 +220,9 @@ def load_from_file(): tornado.options.define("num_of_shards", type=int, default=2, help="Number of shards for ES index") - tornado.options.define("index_bodies", type=bool, default=False, - help="Will index all body content, stripped of HTML/CSS/JS etc. Adds fields: 'body' and 'body_size'") + tornado.options.define("index_bodies", type=bool, default=True, + help="Will index all body content, stripped of HTML/CSS/JS etc. 
Adds fields: 'body' and \ + 'body_size'") tornado.options.parse_command_line() From a7cce066440008c7600f9c3531f84d303d5b156b Mon Sep 17 00:00:00 2001 From: Oliver Date: Sat, 14 Mar 2020 15:53:58 -0400 Subject: [PATCH 11/18] update README, add sample command line and sample mbox file, add docker instructions to run ES --- .gitignore | 1 + README.md | 25 +- sample.mbox | 685 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 707 insertions(+), 4 deletions(-) create mode 100644 .gitignore create mode 100644 sample.mbox diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5ceb386 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +venv diff --git a/README.md b/README.md index 2d8d826..db70861 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ Elasticsearch For Beginners: Indexing your Gmail Inbox ======================= +[![Build Status](https://cloud.drone.io/api/badges/oliver006/elasticsearch-gmail/status.svg)](https://cloud.drone.io/oliver006/elasticsearch-gmail) #### What's this all about? @@ -9,26 +10,42 @@ I recently looked at my Gmail inbox and noticed that I have well over 50k emails Goal of this tutorial is to load an entire Gmail inbox into Elasticsearch using bulk indexing and then start querying the cluster to get a better picture of what's going on. -__Related tutorial:__ [Index and Search Hacker News using Elasticsearch and the HN API](https://github.com/oliver006/elasticsearch-hn) - #### Prerequisites Set up [Elasticsearch](https://www.elastic.co/guide/en/elasticsearch/guide/current/running-elasticsearch.html) and make sure it's running at [http://localhost:9200](http://localhost:9200) +A quick way to run Elasticsearch is using Docker: (the cors settings aren't really needed but come in handy if you want to use e.g. 
[dejavu](https://dejavu.appbase.io/) to explore the index) +``` +docker run --name es -d -p 9200:9200 -e http.port=9200 -e http.cors.enabled=true -e 'http.cors.allow-origin=*' -e http.cors.allow-headers=X-Requested-With,X-Auth-Token,Content-Type,Content-Length,Authorization -e http.cors.allow-credentials=true -e "discovery.type=single-node" docker.elastic.co/elasticsearch/elasticsearch-oss:7.6.1 +``` + I use Python and [Tornado](https://github.com/tornadoweb/tornado/) for the scripts to import and query the data. Also `beautifulsoup4` for the stripping HTML/JS/CSS (if you want to use the body indexing flag). Install the dependencies by running: -`pip install -r requirements.txt` +`pip3 install -r requirements.txt` #### Aight, where do we start? First, go [here](https://www.google.com/settings/takeout/custom/gmail) and download your Gmail mailbox, depending on the amount of emails you have accumulated this might take a while. +There's also a small `sample.mbox` file included in the repo for you to play around with while you're waiting for Google to prepare your download. The downloaded archive is in the [mbox format](http://en.wikipedia.org/wiki/Mbox) and Python provides libraries to work with the mbox format so that's easy. +You can run the code (assuming Elasticsearch is running at localhost:9200) with the sample mbox file like this: +``` +$ python3 src/index_emails.py --infile=sample.mbox +[I index_emails:173] Starting import from file sample.mbox +[I index_emails:101] Upload: OK - upload took: 1033ms, total messages uploaded: 3 +[I index_emails:197] Import done - total count 16 +$ +``` + + +#### The Source Code + The overall program will look something like this: ```python @@ -342,4 +359,4 @@ GET _search #### Feedback -Open pull requests, issues or email me at o@21zoo.com +Open a pull request or an issue! 
diff --git a/sample.mbox b/sample.mbox new file mode 100644 index 0000000..de10312 --- /dev/null +++ b/sample.mbox @@ -0,0 +1,685 @@ + + + +From nobody Mon Sep 17 00:00:00 2001 +From: A (zzz) + U + Thor + (Comment) +Date: Fri, 9 Jun 2006 00:44:16 -0700 +Subject: [PATCH] a commit. + +Here is a patch from A U Thor. + +--- + foo | 2 +- + 1 files changed, 1 insertions(+), 1 deletions(-) + +diff --git a/foo b/foo +index 9123cdc..918dcf8 100644 +--- a/foo ++++ b/foo +@@ -1 +1 @@ +-Fri Jun 9 00:44:04 PDT 2006 ++Fri Jun 9 00:44:13 PDT 2006 +-- +1.4.0.g6f2b + +From nobody Mon Sep 17 00:00:00 2001 +From: A U Thor +Date: Fri, 9 Jun 2006 00:44:16 -0700 +Subject: [PATCH] another patch + +Here is a patch from A U Thor. This addresses the issue raised in the +message: + +From: Nit Picker +Subject: foo is too old +Message-Id: + +Hopefully this would fix the problem stated there. + + +I have included an extra blank line above, but it does not have to be +stripped away here, along with the +whitespaces at the end of the above line. They are expected to be squashed +when the message is made into a commit log by stripspace, +Also, there are three blank lines after this paragraph, +two truly blank and another full of spaces in between. + + + +Hope this helps. + +--- + foo | 2 +- + 1 files changed, 1 insertions(+), 1 deletions(-) + +diff --git a/foo b/foo +index 9123cdc..918dcf8 100644 +--- a/foo ++++ b/foo +@@ -1 +1 @@ +-Fri Jun 9 00:44:04 PDT 2006 ++Fri Jun 9 00:44:13 PDT 2006 +-- +1.4.0.g6f2b + +From nobody Mon Sep 17 00:00:00 2001 +From: Junio C Hamano +Date: Fri, 9 Jun 2006 00:44:16 -0700 +Subject: re: [PATCH] another patch + +From: A U Thor +Subject: [PATCH] third patch + +Here is a patch from A U Thor. This addresses the issue raised in the +message: + +From: Nit Picker +Subject: foo is too old +Message-Id: + +Hopefully this would fix the problem stated there. 
+ +--- + foo | 2 +- + 1 files changed, 1 insertions(+), 1 deletions(-) + +diff --git a/foo b/foo +index 9123cdc..918dcf8 100644 +--- a/foo ++++ b/foo +@@ -1 +1 @@ +-Fri Jun 9 00:44:04 PDT 2006 ++Fri Jun 9 00:44:13 PDT 2006 +-- +1.4.0.g6f2b + +From nobody Sat Aug 27 23:07:49 2005 +Path: news.gmane.org!not-for-mail +Message-ID: <20050721.091036.01119516.yoshfuji@linux-ipv6.org> +From: YOSHIFUJI Hideaki / =?ISO-2022-JP?B?GyRCNUhGIzFRTEAbKEI=?= + +Newsgroups: gmane.comp.version-control.git +Subject: [PATCH 1/2] GIT: Try all addresses for given remote name +Date: Thu, 21 Jul 2005 09:10:36 -0400 (EDT) +Lines: 99 +Organization: USAGI/WIDE Project +Approved: news@gmane.org +NNTP-Posting-Host: main.gmane.org +Mime-Version: 1.0 +Content-Type: Text/Plain; charset=us-ascii +Content-Transfer-Encoding: 7bit +X-Trace: sea.gmane.org 1121951434 29350 80.91.229.2 (21 Jul 2005 13:10:34 GMT) +X-Complaints-To: usenet@sea.gmane.org +NNTP-Posting-Date: Thu, 21 Jul 2005 13:10:34 +0000 (UTC) + +Hello. + +Try all addresses for given remote name until it succeeds. +Also supports IPv6. 
+ +Signed-of-by: Hideaki YOSHIFUJI + +diff --git a/connect.c b/connect.c +--- a/connect.c ++++ b/connect.c +@@ -96,42 +96,57 @@ static enum protocol get_protocol(const + die("I don't handle protocol '%s'", name); + } + +-static void lookup_host(const char *host, struct sockaddr *in) +-{ +- struct addrinfo *res; +- int ret; +- +- ret = getaddrinfo(host, NULL, NULL, &res); +- if (ret) +- die("Unable to look up %s (%s)", host, gai_strerror(ret)); +- *in = *res->ai_addr; +- freeaddrinfo(res); +-} ++#define STR_(s) # s ++#define STR(s) STR_(s) + + static int git_tcp_connect(int fd[2], const char *prog, char *host, char *path) + { +- struct sockaddr addr; +- int port = DEFAULT_GIT_PORT, sockfd; +- char *colon; +- +- colon = strchr(host, ':'); +- if (colon) { +- char *end; +- unsigned long n = strtoul(colon+1, &end, 0); +- if (colon[1] && !*end) { +- *colon = 0; +- port = n; ++ int sockfd = -1; ++ char *colon, *end; ++ char *port = STR(DEFAULT_GIT_PORT); ++ struct addrinfo hints, *ai0, *ai; ++ int gai; ++ ++ if (host[0] == '[') { ++ end = strchr(host + 1, ']'); ++ if (end) { ++ *end = 0; ++ end++; ++ host++; ++ } else ++ end = host; ++ } else ++ end = host; ++ colon = strchr(end, ':'); ++ ++ if (colon) ++ port = colon + 1; ++ ++ memset(&hints, 0, sizeof(hints)); ++ hints.ai_socktype = SOCK_STREAM; ++ hints.ai_protocol = IPPROTO_TCP; ++ ++ gai = getaddrinfo(host, port, &hints, &ai); ++ if (gai) ++ die("Unable to look up %s (%s)", host, gai_strerror(gai)); ++ ++ for (ai0 = ai; ai; ai = ai->ai_next) { ++ sockfd = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol); ++ if (sockfd < 0) ++ continue; ++ if (connect(sockfd, ai->ai_addr, ai->ai_addrlen) < 0) { ++ close(sockfd); ++ sockfd = -1; ++ continue; + } ++ break; + } + +- lookup_host(host, &addr); +- ((struct sockaddr_in *)&addr)->sin_port = htons(port); ++ freeaddrinfo(ai0); + +- sockfd = socket(PF_INET, SOCK_STREAM, IPPROTO_IP); + if (sockfd < 0) + die("unable to create socket (%s)", strerror(errno)); +- if 
(connect(sockfd, (void *)&addr, sizeof(addr)) < 0) +- die("unable to connect (%s)", strerror(errno)); ++ + fd[0] = sockfd; + fd[1] = sockfd; + packet_write(sockfd, "%s %s\n", prog, path); + +-- +YOSHIFUJI Hideaki @ USAGI Project +GPG-FP : 9022 65EB 1ECF 3AD1 0BDF 80D8 4807 F894 E062 0EEA + +From nobody Sat Aug 27 23:07:49 2005 +Path: news.gmane.org!not-for-mail +Message-ID: +From: =?ISO8859-1?Q?David_K=E5gedal?= +Newsgroups: gmane.comp.version-control.git +Subject: [PATCH] Fixed two bugs in git-cvsimport-script. +Date: Mon, 15 Aug 2005 20:18:25 +0200 +Lines: 83 +Approved: news@gmane.org +NNTP-Posting-Host: main.gmane.org +Mime-Version: 1.0 +Content-Type: text/plain; charset=ISO8859-1 +Content-Transfer-Encoding: QUOTED-PRINTABLE +X-Trace: sea.gmane.org 1124130247 31839 80.91.229.2 (15 Aug 2005 18:24:07 GMT) +X-Complaints-To: usenet@sea.gmane.org +NNTP-Posting-Date: Mon, 15 Aug 2005 18:24:07 +0000 (UTC) +Cc: "Junio C. Hamano" +Original-X-From: git-owner@vger.kernel.org Mon Aug 15 20:24:05 2005 + +The git-cvsimport-script had a copule of small bugs that prevented me +from importing a big CVS repository. + +The first was that it didn't handle removed files with a multi-digit +primary revision number. + +The second was that it was asking the CVS server for "F" messages, +although they were not handled. + +I also updated the documentation for that script to correspond to +actual flags. + +Signed-off-by: David K=E5gedal +--- + + Documentation/git-cvsimport-script.txt | 9 ++++++++- + git-cvsimport-script | 4 ++-- + 2 files changed, 10 insertions(+), 3 deletions(-) + +50452f9c0c2df1f04d83a26266ba704b13861632 +diff --git a/Documentation/git-cvsimport-script.txt b/Documentation/git= +-cvsimport-script.txt +--- a/Documentation/git-cvsimport-script.txt ++++ b/Documentation/git-cvsimport-script.txt +@@ -29,6 +29,10 @@ OPTIONS + currently, only the :local:, :ext: and :pserver: access methods=20 + are supported. +=20 ++-C :: ++ The GIT repository to import to. 
If the directory doesn't ++ exist, it will be created. Default is the current directory. ++ + -i:: + Import-only: don't perform a checkout after importing. This option + ensures the working directory and cache remain untouched and will +@@ -44,7 +48,7 @@ OPTIONS +=20 + -p :: + Additional options for cvsps. +- The options '-x' and '-A' are implicit and should not be used here. ++ The options '-u' and '-A' are implicit and should not be used here. +=20 + If you need to pass multiple options, separate them with a comma. +=20 +@@ -57,6 +61,9 @@ OPTIONS + -h:: + Print a short usage message and exit. +=20 ++-z :: ++ Pass the timestamp fuzz factor to cvsps. ++ + OUTPUT + ------ + If '-v' is specified, the script reports what it is doing. +diff --git a/git-cvsimport-script b/git-cvsimport-script +--- a/git-cvsimport-script ++++ b/git-cvsimport-script +@@ -190,7 +190,7 @@ sub conn { + $self->{'socketo'}->write("Root $repo\n"); +=20 + # Trial and error says that this probably is the minimum set +- $self->{'socketo'}->write("Valid-responses ok error Valid-requests Mo= +de M Mbinary E F Checked-in Created Updated Merged Removed\n"); ++ $self->{'socketo'}->write("Valid-responses ok error Valid-requests Mo= +de M Mbinary E Checked-in Created Updated Merged Removed\n"); +=20 + $self->{'socketo'}->write("valid-requests\n"); + $self->{'socketo'}->flush(); +@@ -691,7 +691,7 @@ while() { + unlink($tmpname); + my $mode =3D pmode($cvs->{'mode'}); + push(@new,[$mode, $sha, $fn]); # may be resurrected! 
+- } elsif($state =3D=3D 9 and /^\s+(\S+):\d(?:\.\d+)+->(\d(?:\.\d+)+)\(= +DEAD\)\s*$/) { ++ } elsif($state =3D=3D 9 and /^\s+(\S+):\d+(?:\.\d+)+->(\d+(?:\.\d+)+)= +\(DEAD\)\s*$/) { + my $fn =3D $1; + $fn =3D~ s#^/+##; + push(@old,$fn); + +--=20 +David K=E5gedal +- +To unsubscribe from this list: send the line "unsubscribe git" in +the body of a message to majordomo@vger.kernel.org +More majordomo info at http://vger.kernel.org/majordomo-info.html + +From nobody Mon Sep 17 00:00:00 2001 +From: A U Thor +References: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Date: Fri, 9 Jun 2006 00:44:16 -0700 +Subject: [PATCH] a commit. + +Here is a patch from A U Thor. + +--- + foo | 2 +- + 1 files changed, 1 insertions(+), 1 deletions(-) + +diff --git a/foo b/foo +index 9123cdc..918dcf8 100644 +--- a/foo ++++ b/foo +@@ -1 +1 @@ +-Fri Jun 9 00:44:04 PDT 2006 ++Fri Jun 9 00:44:13 PDT 2006 +-- +1.4.0.g6f2b + +From nobody Mon Sep 17 00:00:00 2001 +From: A U Thor +Date: Fri, 9 Jun 2006 00:44:16 -0700 +Subject: [PATCH] another patch + +Here is an empty patch from A U Thor. + +From nobody Mon Sep 17 00:00:00 2001 +From: Junio C Hamano +Date: Fri, 9 Jun 2006 00:44:16 -0700 +Subject: re: [PATCH] another patch + +From: A U Thor +Subject: [PATCH] another patch +>Here is an empty patch from A U Thor. + +Hey you forgot the patch! + +From nobody Mon Sep 17 00:00:00 2001 +From: A U Thor +Date: Mon, 17 Sep 2001 00:00:00 +0900 +Mime-Version: 1.0 +Content-Type: Text/Plain; charset=us-ascii +Content-Transfer-Encoding: Quoted-Printable + +=0A=0AFrom: F U Bar +Subject: [PATCH] updates=0A=0AThis is to fix diff-format documentation. + +diff --git a/Documentation/diff-format.txt b/Documentation/diff-format.txt +index b426a14..97756ec 100644 +--- a/Documentation/diff-format.txt ++++ b/Documentation/diff-format.txt +@@ -81,7 +81,7 @@ The "diff" formatting options can be customized via the + environment variable 'GIT_DIFF_OPTS'. 
For example, if you + prefer context diff: +=20 +- GIT_DIFF_OPTS=3D-c git-diff-index -p $(cat .git/HEAD) ++ GIT_DIFF_OPTS=3D-c git-diff-index -p HEAD +=20 +=20 + 2. When the environment variable 'GIT_EXTERNAL_DIFF' is set, the +From b9704a518e21158433baa2cc2d591fea687967f6 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Lukas=20Sandstr=C3=B6m?= +Date: Thu, 10 Jul 2008 23:41:33 +0200 +Subject: Re: discussion that lead to this patch +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +[PATCH] git-mailinfo: Fix getting the subject from the body + +"Subject: " isn't in the static array "header", and thus +memcmp("Subject: ", header[i], 7) will never match. + +Signed-off-by: Lukas Sandström +Signed-off-by: Junio C Hamano +--- + builtin-mailinfo.c | 2 +- + 1 files changed, 1 insertions(+), 1 deletions(-) + +diff --git a/builtin-mailinfo.c b/builtin-mailinfo.c +index 962aa34..2d1520f 100644 +--- a/builtin-mailinfo.c ++++ b/builtin-mailinfo.c +@@ -334,7 +334,7 @@ static int check_header(char *line, unsigned linesize, char **hdr_data, int over + return 1; + if (!memcmp("[PATCH]", line, 7) && isspace(line[7])) { + for (i = 0; header[i]; i++) { +- if (!memcmp("Subject: ", header[i], 9)) { ++ if (!memcmp("Subject", header[i], 7)) { + if (! handle_header(line, hdr_data[i], 0)) { + return 1; + } +-- +1.5.6.2.455.g1efb2 + +From nobody Fri Aug 8 22:24:03 2008 +Date: Fri, 8 Aug 2008 13:08:37 +0200 (CEST) +From: A U Thor +Subject: [PATCH 3/3 v2] Xyzzy +MIME-Version: 1.0 +Content-Type: multipart/mixed; boundary="=-=-=" + +--=-=-= +Content-Type: text/plain; charset=ISO8859-15 +Content-Transfer-Encoding: quoted-printable + +Here comes a commit log message, and +its second line is here. 
+--- + builtin-mailinfo.c | 4 ++-- + +diff --git a/builtin-mailinfo.c b/builtin-mailinfo.c +index 3e5fe51..aabfe5c 100644 +--- a/builtin-mailinfo.c ++++ b/builtin-mailinfo.c +@@ -758,8 +758,8 @@ static void handle_body(void) + /* process any boundary lines */ + if (*content_top && is_multipart_boundary(&line)) { + /* flush any leftover */ +- if (line.len) +- handle_filter(&line); ++ if (prev.len) ++ handle_filter(&prev); +=20 + if (!handle_boundary()) + goto handle_body_out; +--=20 +1.6.0.rc2 + +--=-=-=-- + +From bda@mnsspb.ru Wed Nov 12 17:54:41 2008 +From: Dmitriy Blinov +To: navy-patches@dinar.mns.mnsspb.ru +Date: Wed, 12 Nov 2008 17:54:41 +0300 +Message-Id: <1226501681-24923-1-git-send-email-bda@mnsspb.ru> +X-Mailer: git-send-email 1.5.6.5 +MIME-Version: 1.0 +Content-Type: text/plain; + charset=utf-8 +Content-Transfer-Encoding: 8bit +Subject: [Navy-patches] [PATCH] + =?utf-8?b?0JjQt9C80LXQvdGR0L0g0YHQv9C40YHQvtC6INC/0LA=?= + =?utf-8?b?0LrQtdGC0L7QsiDQvdC10L7QsdGF0L7QtNC40LzRi9GFINC00LvRjyA=?= + =?utf-8?b?0YHQsdC+0YDQutC4?= + +textlive-* исправлены на texlive-* +docutils заменён на python-docutils + +Действительно, оказалось, что rest2web вытягивает за собой +python-docutils. В то время как сам rest2web не нужен. + +Signed-off-by: Dmitriy Blinov +--- + howto/build_navy.txt | 6 +++--- + 1 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/howto/build_navy.txt b/howto/build_navy.txt +index 3fd3afb..0ee807e 100644 +--- a/howto/build_navy.txt ++++ b/howto/build_navy.txt +@@ -119,8 +119,8 @@ + - libxv-dev + - libusplash-dev + - latex-make +- - textlive-lang-cyrillic +- - textlive-latex-extra ++ - texlive-lang-cyrillic ++ - texlive-latex-extra + - dia + - python-pyrex + - libtool +@@ -128,7 +128,7 @@ + - sox + - cython + - imagemagick +- - docutils ++ - python-docutils + + #. на машине dinar: добавить свой открытый ssh-ключ в authorized_keys2 пользователя ddev + #. 
на своей машине: отредактировать /etc/sudoers (команда ``visudo``) примерно следующим образом:: +-- +1.5.6.5 +From nobody Mon Sep 17 00:00:00 2001 +From: (A U Thor) +Date: Fri, 9 Jun 2006 00:44:16 -0700 +Subject: [PATCH] a patch + +From nobody Mon Sep 17 00:00:00 2001 +From: Junio Hamano +Date: Thu, 20 Aug 2009 17:18:22 -0700 +Subject: Why doesn't git-am does not like >8 scissors mark? + +Subject: [PATCH] BLAH ONE + +In real life, we will see a discussion that inspired this patch +discussing related and unrelated things around >8 scissors mark +in this part of the message. + +Subject: [PATCH] BLAH TWO + +And then we will see the scissors. + + This line is not a scissors mark -- >8 -- but talks about it. + - - >8 - - please remove everything above this line - - >8 - - + +Subject: [PATCH] Teach mailinfo to ignore everything before -- >8 -- mark +From: Junio C Hamano + +This teaches mailinfo the scissors -- >8 -- mark; the command ignores +everything before it in the message body. + +Signed-off-by: Junio C Hamano +--- + builtin-mailinfo.c | 37 ++++++++++++++++++++++++++++++++++++- + 1 files changed, 36 insertions(+), 1 deletions(-) + +diff --git a/builtin-mailinfo.c b/builtin-mailinfo.c +index b0b5d8f..461c47e 100644 +--- a/builtin-mailinfo.c ++++ b/builtin-mailinfo.c +@@ -712,6 +712,34 @@ static inline int patchbreak(const struct strbuf *line) + return 0; + } + ++static int scissors(const struct strbuf *line) ++{ ++ size_t i, len = line->len; ++ int scissors_dashes_seen = 0; ++ const char *buf = line->buf; ++ ++ for (i = 0; i < len; i++) { ++ if (isspace(buf[i])) ++ continue; ++ if (buf[i] == '-') { ++ scissors_dashes_seen |= 02; ++ continue; ++ } ++ if (i + 1 < len && !memcmp(buf + i, ">8", 2)) { ++ scissors_dashes_seen |= 01; ++ i++; ++ continue; ++ } ++ if (i + 7 < len && !memcmp(buf + i, "cut here", 8)) { ++ i += 7; ++ continue; ++ } ++ /* everything else --- not scissors */ ++ break; ++ } ++ return scissors_dashes_seen == 03; ++} ++ + static int 
handle_commit_msg(struct strbuf *line) + { + static int still_looking = 1; +@@ -723,10 +751,17 @@ static int handle_commit_msg(struct strbuf *line) + strbuf_ltrim(line); + if (!line->len) + return 0; +- if ((still_looking = check_header(line, s_hdr_data, 0)) != 0) ++ still_looking = check_header(line, s_hdr_data, 0); ++ if (still_looking) + return 0; + } + ++ if (scissors(line)) { ++ fseek(cmitmsg, 0L, SEEK_SET); ++ still_looking = 1; ++ return 0; ++ } ++ + /* normalize the log message to UTF-8. */ + if (metainfo_charset) + convert_to_utf8(line, charset.buf); +-- +1.6.4.1 +From nobody Mon Sep 17 00:00:00 2001 +From: A U Thor +Subject: check bogus body header (from) +Date: Fri, 9 Jun 2006 00:44:16 -0700 + +From: bogosity + - a list + - of stuff +--- +diff --git a/foo b/foo +index e69de29..d95f3ad 100644 +--- a/foo ++++ b/foo +@@ -0,0 +1 @@ ++content + +From nobody Mon Sep 17 00:00:00 2001 +From: A U Thor +Subject: check bogus body header (date) +Date: Fri, 9 Jun 2006 00:44:16 -0700 + +Date: bogus + +and some content + +--- +diff --git a/foo b/foo +index e69de29..d95f3ad 100644 +--- a/foo ++++ b/foo +@@ -0,0 +1 @@ ++content + From 5dbd8651350040720b7c4c707788432a76382ff2 Mon Sep 17 00:00:00 2001 From: Oliver Date: Sat, 14 Mar 2020 21:57:23 -0400 Subject: [PATCH 12/18] drone tests --- .drone.yml | 33 +++++++++++++++++++++++++++++++++ .gitignore | 2 ++ 2 files changed, 35 insertions(+) create mode 100644 .drone.yml diff --git a/.drone.yml b/.drone.yml new file mode 100644 index 0000000..a3f0f20 --- /dev/null +++ b/.drone.yml @@ -0,0 +1,33 @@ +kind: pipeline +name: default +type: docker + + +workspace: + base: /go + path: src/github.com/oliver006/elasticsearch-gmail + + +services: + - name: es + image: docker.elastic.co/elasticsearch/elasticsearch-oss:7.6.1 + environment: + http.port: "9200" + discovery.type: "single-node" + ports: + - 9200 + + +steps: + - name: tests + image: "python:3.7" + pull: always + commands: + - pip3 install -r requirements.txt + - sleep 30 + - 
curl -s http://es:9200 + - "python3 src/index_emails.py --infile=sample.mbox --es-url=http://es:9200" + when: + event: + - pull_request + - push diff --git a/.gitignore b/.gitignore index 5ceb386..0231762 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ venv +.idea +*.pyc From 6d31ea3cabd4f0177303d5fbd57662e73a4d925d Mon Sep 17 00:00:00 2001 From: ad-ast <66307268+ad-ast@users.noreply.github.com> Date: Wed, 22 Jul 2020 18:07:12 +0200 Subject: [PATCH 13/18] Added support for MH mailboxes and several other features/improvements/fixes (#22) * Ignore VS Code metadata * Enable loading MH directory mailbox * Enable reading body having multipart containing multiparts * Skip mail when unable to serialize to JSON instead of exiting * Option to filter out non-text content type body parts * Explicitly stringify to deal with bad encodings * Unused * Make indexing x-* header fields optional * Fix deleting index http request * Optimized skipping (skip keys, do not read skipped messages) * Added dry run option/documentation * Fixed recursion for content type multipart * Updated readme to reflect MH support * Uncommented DelegatingEmailParser * Removed unused email parser classes Co-authored-by: adast --- .gitignore | 1 + README.md | 9 ++-- src/AmazonEmailParser.py | 83 --------------------------------- src/DelegatingEmailParser.py | 11 ----- src/SteamEmailParser.py | 63 ------------------------- src/index_emails.py | 90 +++++++++++++++++++++++------------- 6 files changed, 63 insertions(+), 194 deletions(-) delete mode 100644 src/AmazonEmailParser.py delete mode 100644 src/DelegatingEmailParser.py delete mode 100644 src/SteamEmailParser.py diff --git a/.gitignore b/.gitignore index 0231762..1760f4f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ venv .idea *.pyc +.vscode diff --git a/README.md b/README.md index db70861..ad442d1 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -Elasticsearch For Beginners: Indexing your Gmail Inbox +Elasticsearch For 
Beginners: Indexing your Gmail Inbox (and more: Supports any mbox and MH mailboxes) ======================= [![Build Status](https://cloud.drone.io/api/badges/oliver006/elasticsearch-gmail/status.svg)](https://cloud.drone.io/oliver006/elasticsearch-gmail) @@ -43,13 +43,14 @@ $ python3 src/index_emails.py --infile=sample.mbox $ ``` +Note: All examples focus on Gmail inboxes. Substitute any `--infile=` parameters with `--indir=` pointing to an MH directory to make them work with MH mailboxes instead. #### The Source Code The overall program will look something like this: ```python -mbox = mailbox.UnixMailbox(open('emails.mbox', 'rb'), email.message_from_file) +mbox = mailbox.mbox('emails.mbox') # or mailbox.MH('inbox/') for msg in mbox: item = convert_msg_to_json(msg) @@ -63,9 +64,9 @@ print "Done!" The full Python code is here: [src/index_emails.py](src/index_emails.py) -##### Turn mbox into JSON +##### Turn mailbox into JSON -First, we got to turn the mbox format messages into JSON so we can insert it into Elasticsearch. [Here](http://nbviewer.ipython.org/github/furukama/Mining-the-Social-Web-2nd-Edition/blob/master/ipynb/Chapter%206%20-%20Mining%20Mailboxes.ipynb) is some sample code that was very useful when it came to normalizing and cleaning up the data. +First, we got to turn the messages into JSON so we can insert it into Elasticsearch. [Here](http://nbviewer.ipython.org/github/furukama/Mining-the-Social-Web-2nd-Edition/blob/master/ipynb/Chapter%206%20-%20Mining%20Mailboxes.ipynb) is some sample code that was very useful when it came to normalizing and cleaning up the data.
A good first step: diff --git a/src/AmazonEmailParser.py b/src/AmazonEmailParser.py deleted file mode 100644 index 117c87f..0000000 --- a/src/AmazonEmailParser.py +++ /dev/null @@ -1,83 +0,0 @@ -from __future__ import print_function - -import json -import re - -class AmazonEmailParser(object): - - def __init__(self): - self.orderTotalRE = re.compile(r"(?<=Order Total:) (?:.*?)(\d+.\d+)") - self.postageRE = re.compile(r"(?<=Postage & Packing:) (?:.*?)(\d+.\d+)") - self.deliveryRE = re.compile(r"(?<=Delivery & Handling::) (?:.*?)(\d+.\d+)") - self.orderItemsRE = re.compile(r"==========\r\n\r\n") - self.costRE = re.compile(r"(\d+\.\d+)") - - def canParse(self, email): - try: - if 'auto-confirm@amazon' in email['from']: - return True - else: - return False - except: - return False - - def parse(self, email): - body = email['body'] - - if 'Order Confirmation' in body: - postage = 0 - orderTotal = 0 - - result = re.search(self.orderTotalRE, body) - - if result: - orderTotal = float(result.groups()[0]) - - result = re.search(self.postageRE, body) - - if result: - postage = float(result.groups()[0]) - else: - result = re.search(self.deliveryRE, body) - if result: - postage = float(result.groups()[0]) - - email['order_details'] = { - "order_items" : [], - "order_total" : orderTotal, - "postage" : postage, - "merchant" : "amazon" - } - - orders = re.split(self.orderItemsRE, body)[1] - orders = orders.split('\r\n\r\n') - - #Remove first and last 3 items - orders.pop(0) - orders.pop() - orders.pop() - orders.pop() - - costTotal = orderTotal - - for item in orders: - if 'Your estimated delivery date is:' in item or 'Your order will be sent to:' in item: - continue - else: - lines = item.replace('_','').split('\r\n') - if len(lines) < 4: - continue - itemName = lines[0].strip() - cost = float(re.search(self.costRE, lines[1].strip()).groups()[0]) - condition = lines[2].rpartition(':')[2].strip() - seller = lines[3].replace('Sold by', '').strip() - - 
email['order_details']['order_items'].append({"item":itemName, "cost":cost, "condition": condition, "seller": seller}) - costTotal -= cost - - if costTotal != 0: - print("Warning order not parsed correctly, order items may be missing, or promotion may have been applied.") - print(email['order_details']) - print(body) - - return email diff --git a/src/DelegatingEmailParser.py b/src/DelegatingEmailParser.py deleted file mode 100644 index 11c291d..0000000 --- a/src/DelegatingEmailParser.py +++ /dev/null @@ -1,11 +0,0 @@ -class DelegatingEmailParser(object): - - def __init__(self, parsers): - self.parsers = parsers - - def parse(self, email): - for parser in self.parsers: - if parser.canParse(email): - return parser.parse(email) - - return email diff --git a/src/SteamEmailParser.py b/src/SteamEmailParser.py deleted file mode 100644 index 601660e..0000000 --- a/src/SteamEmailParser.py +++ /dev/null @@ -1,63 +0,0 @@ -from __future__ import print_function - -import json -import re - -class SteamEmailParser(object): - - def __init__(self): - self.orderTotalRE = re.compile(r"(?<=Total:)[ \t]+(\d+.\d+)") - self.orderItemsRE = re.compile(r"(?:\.\r\n)+") - self.costRE = re.compile(r"(\d+\.\d+)") - - def canParse(self, email): - try: - if 'noreply@steampowered.com' in email['from']: - return True - else: - return False - except: - return False - - def parse(self, email): - body = email['body'] - - if 'Thank you' in email['subject'] and 'purchase' in body: - orderTotal = 0 - - result = re.search(self.orderTotalRE, body) - - if result: - orderTotal = float(result.groups()[0]) - - email['order_details'] = { - "order_items" : [], - "order_total" : orderTotal, - "merchant" : "steam" - } - - order = re.split(self.orderItemsRE, body)[2].split('\r\n') #This parser to get order total is currently broken, gift purchases are not parsed - - costTotal = orderTotal - - costTotal = orderTotal - - for item in order: - if '-------' in item: - break - else: - if item == '' or ': ' not in item: - 
continue - splitResult = item.rpartition(':') - itemName = splitResult[0].strip() - cost = float(re.match(self.costRE, splitResult[2].strip()).groups()[0]) - - email['order_details']['order_items'].append({"item":itemName, "cost":cost}) - costTotal -= cost - - if costTotal != 0: - print("Warning order not parsed correctly, order items may be missing, or promotion may have been applied.") - print(email['order_details']) - print(body) - - return email diff --git a/src/index_emails.py b/src/index_emails.py index 24788e0..b3693d8 100644 --- a/src/index_emails.py +++ b/src/index_emails.py @@ -9,9 +9,6 @@ import email import quopri import chardet -from DelegatingEmailParser import DelegatingEmailParser -from AmazonEmailParser import AmazonEmailParser -from SteamEmailParser import SteamEmailParser from bs4 import BeautifulSoup import logging @@ -39,9 +36,8 @@ def strip_html_css_js(msg): def delete_index(): try: - url = "%s/%s?refresh=true" % (tornado.options.options.es_url, tornado.options.options.index_name) + url = "%s/%s" % (tornado.options.options.es_url, tornado.options.options.index_name) request = HTTPRequest(url, method="DELETE", request_timeout=240, headers={"Content-Type": "application/json"}) - body = {"refresh": True} response = http_client.fetch(request) logging.info('Delete index done %s' % response.body) except: @@ -85,11 +81,20 @@ def create_index(): def upload_batch(upload_data): + if tornado.options.options.dry_run: + logging.info("Dry run, not uploading") + return upload_data_txt = "" for item in upload_data: cmd = {'index': {'_index': tornado.options.options.index_name, '_type': 'email', '_id': item['message-id']}} - upload_data_txt += json.dumps(cmd) + "\n" - upload_data_txt += json.dumps(item) + "\n" + try: + json_cmd = json.dumps(cmd) + "\n" + json_item = json.dumps(item) + "\n" + except: + logging.warn('Skipping mail with message id %s because of exception converting to JSON (invalid characters?).' 
% item['message-id']) + continue + upload_data_txt += json_cmd + upload_data_txt += json_item request = HTTPRequest(tornado.options.options.es_url + "/_bulk", method="POST", body=upload_data_txt, request_timeout=240, headers={"Content-Type": "application/json"}) response = http_client.fetch(request) @@ -107,6 +112,17 @@ def normalize_email(email_in): def convert_msg_to_json(msg): + + def parse_message_parts(current_msg): + if current_msg.is_multipart(): + for mpart in current_msg.get_payload(): + if mpart is not None: + content_type = str(mpart.get_content_type()) + if not tornado.options.options.text_only or (content_type.startswith("text") or content_type.startswith("multipart")): + parse_message_parts(mpart) + else: + result['body'] += strip_html_css_js(current_msg.get_payload(decode=True)) + result = {'parts': []} if 'message-id' not in msg: return None @@ -117,11 +133,11 @@ def convert_msg_to_json(msg): for k in ['to', 'cc', 'bcc']: if not result.get(k): continue - emails_split = result[k].replace('\n', '').replace('\t', '').replace('\r', '').replace(' ', '').encode('utf8').decode('utf-8', 'ignore').split(',') + emails_split = str(result[k]).replace('\n', '').replace('\t', '').replace('\r', '').replace(' ', '').encode('utf8').decode('utf-8', 'ignore').split(',') result[k] = [normalize_email(e) for e in emails_split] if "from" in result: - result['from'] = normalize_email(result['from']) + result['from'] = normalize_email(str(result['from'])) if "date" in result: try: @@ -140,15 +156,7 @@ def convert_msg_to_json(msg): # Bodies... 
if tornado.options.options.index_bodies: result['body'] = '' - if msg.is_multipart(): - for mpart in msg.get_payload(): - if mpart is not None: - mpart_payload = mpart.get_payload(decode=True) - if mpart_payload is not None: - result['body'] += strip_html_css_js(mpart_payload) - else: - result['body'] = strip_html_css_js(msg.get_payload(decode=True)) - + parse_message_parts(msg) result['body_size'] = len(result['body']) parts = result.get("parts", []) @@ -156,6 +164,9 @@ def convert_msg_to_json(msg): for part in parts: result['content_size_total'] += len(part.get('content', "")) + if not tornado.options.options.index_x_headers: + result = {key: result[key] for key in result if not key.startswith("x-")} + return result @@ -166,22 +177,22 @@ def load_from_file(): create_index() if tornado.options.options.skip: - logging.info("Skipping first %d messages from mbox file" % tornado.options.options.skip) + logging.info("Skipping first %d messages" % tornado.options.options.skip) - count = 0 upload_data = list() - logging.info("Starting import from file %s" % tornado.options.options.infile) - # mbox = mailbox.UnixMailbox(open(tornado.options.options.infile, 'rb'), email.message_from_file) - # removed the above UnixMailbox which is not supported in python 3.x and replaced it with mailbox.mbox class - mbox = mailbox.mbox(tornado.options.options.infile) + if tornado.options.options.infile: + logging.info("Starting import from mbox file %s" % tornado.options.options.infile) + mbox = mailbox.mbox(tornado.options.options.infile) + else: + logging.info("Starting import from MH directory %s" % tornado.options.options.indir) + mbox = mailbox.MH(tornado.options.options.indir, factory=None, create=False) - emailParser = DelegatingEmailParser([AmazonEmailParser(), SteamEmailParser()]) + #Skipping on keys to avoid expensive read operations on skipped messages + msgkeys = mbox.keys()[tornado.options.options.skip:] - for msg in mbox: - count += 1 - if count < 
tornado.options.options.skip: - continue + for msgkey in msgkeys: + msg = mbox[msgkey] item = convert_msg_to_json(msg) if item: @@ -194,7 +205,7 @@ def load_from_file(): if upload_data: upload_batch(upload_data) - logging.info("Import done - total count %d" % count) + logging.info("Import done - total count %d" % len(mbox.keys())) if __name__ == '__main__': @@ -206,7 +217,10 @@ def load_from_file(): help="Name of the index to store your messages") tornado.options.define("infile", type=str, default=None, - help="The mbox input file") + help="Input file (supported mailbox format: mbox). Mutually exclusive to --indir") + + tornado.options.define("indir", type=str, default=None, + help="Input directory (supported mailbox format: mh). Mutually exclusive to --infile") tornado.options.define("init", type=bool, default=False, help="Force deleting and re-initializing the Elasticsearch index") @@ -215,7 +229,7 @@ def load_from_file(): help="Elasticsearch bulk index batch size") tornado.options.define("skip", type=int, default=0, - help="Number of messages to skip from the mbox file") + help="Number of messages to skip from mailbox") tornado.options.define("num_of_shards", type=int, default=2, help="Number of shards for ES index") @@ -224,9 +238,19 @@ def load_from_file(): help="Will index all body content, stripped of HTML/CSS/JS etc. 
Adds fields: 'body' and \ 'body_size'") + tornado.options.define("text_only", type=bool, default=False, + help='Only parse message body multiparts declared as text (ignoring images etc.).') + + tornado.options.define("index_x_headers", type=bool, default=True, + help='Index x-* fields from headers') + + tornado.options.define("dry_run", type=bool, default=False, + help='Do not upload to Elastic Search, just process messages') + tornado.options.parse_command_line() - if tornado.options.options.infile: + #Exactly one of {infile, indir} must be set + if bool(tornado.options.options.infile) ^ bool(tornado.options.options.indir): IOLoop.instance().run_sync(load_from_file) else: tornado.options.print_help() From 53283ad37805dfe62253bde14278cebe706841a5 Mon Sep 17 00:00:00 2001 From: Tim Gates Date: Fri, 25 Dec 2020 08:54:11 +1100 Subject: [PATCH 14/18] docs: fix simple typo, settingsa -> settings (#23) There is a small typo in README.md. Should read `settings` rather than `settingsa`. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ad442d1..c744744 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ Goal of this tutorial is to load an entire Gmail inbox into Elasticsearch using Set up [Elasticsearch](https://www.elastic.co/guide/en/elasticsearch/guide/current/running-elasticsearch.html) and make sure it's running at [http://localhost:9200](http://localhost:9200) -A quick way to run Elasticsearch is using Docker: (the cors settingsa aren't really needed but come in handy if you want to use e.g. [dejavu](https://dejavu.appbase.io/) to explore the index) +A quick way to run Elasticsearch is using Docker: (the cors settings aren't really needed but come in handy if you want to use e.g. 
[dejavu](https://dejavu.appbase.io/) to explore the index) ``` docker run --name es -d -p 9200:9200 -e http.port=9200 -e http.cors.enabled=true -e 'http.cors.allow-origin=*' -e http.cors.allow-headers=X-Requested-With,X-Auth-Token,Content-Type,Content-Length,Authorization -e http.cors.allow-credentials=true -e "discovery.type=single-node" docker.elastic.co/elasticsearch/elasticsearch-oss:7.6.1 ``` From 746ea588e4fa135a219c9fdf39541832faa732dd Mon Sep 17 00:00:00 2001 From: Stephen George Date: Sat, 11 Jun 2022 14:57:21 -0500 Subject: [PATCH 15/18] Fix --index-bodies to be intended default of False. (#24) --- src/index_emails.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/index_emails.py b/src/index_emails.py index b3693d8..0c2b316 100644 --- a/src/index_emails.py +++ b/src/index_emails.py @@ -234,7 +234,7 @@ def load_from_file(): tornado.options.define("num_of_shards", type=int, default=2, help="Number of shards for ES index") - tornado.options.define("index_bodies", type=bool, default=True, + tornado.options.define("index_bodies", type=bool, default=False, help="Will index all body content, stripped of HTML/CSS/JS etc. 
Adds fields: 'body' and \ 'body_size'") From 5cbb522a57407095e17e33a0024a628bd2ac94b9 Mon Sep 17 00:00:00 2001 From: Oliver Date: Sun, 25 Jun 2023 18:08:19 -0400 Subject: [PATCH 16/18] tornado==6.3.2 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b4cf28f..55a9969 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ beautifulsoup4==4.6.0 chardet==3.0.4 -tornado==4.5.3 +tornado==6.3.2 From b6be72600ceaf92212d22a3fc891f520a002dbba Mon Sep 17 00:00:00 2001 From: Oliver Date: Thu, 17 Aug 2023 09:13:11 -0700 Subject: [PATCH 17/18] Update tornado to 6.3.3 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 55a9969..25e9892 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ beautifulsoup4==4.6.0 chardet==3.0.4 -tornado==6.3.2 +tornado==6.3.3 From 25e9cccb321e69c294919aad07eae724fcc968a4 Mon Sep 17 00:00:00 2001 From: Oliver Date: Fri, 7 Jun 2024 22:54:38 -0700 Subject: [PATCH 18/18] tornado 6.4.1 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 25e9892..72d9aef 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ beautifulsoup4==4.6.0 chardet==3.0.4 -tornado==6.3.3 +tornado==6.4.1