From 9fe99f7cfb5e3dfe7f5a259d52bd1cff98b4bfa5 Mon Sep 17 00:00:00 2001
From: bitsofinfo <bitsofinfo.g@gmail.com>
Date: Fri, 25 Aug 2017 10:14:28 -0600
Subject: [PATCH 01/18] index bodies option/flag

---
 README.md           |  4 +++-
 src/index_emails.py | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index a15a516..ad55793 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,8 @@ Set up [Elasticsearch](http://ohardt.us/es-install) and make sure it's running a
 
 I use Python and [Tornado](https://github.com/tornadoweb/tornado/) for the scripts to import and query the data. Run `pip install tornado chardet` to install Tornado and chardet.
 
+You may also need to `pip install beautifulsoup4`, for the stripping HTML/JS/CSS via the body indexing flag: `--index_bodies`
+
 
 
 #### Aight, where do we start?
@@ -194,7 +196,7 @@ You can also quickly query for certain fields via the `q` parameter. This exampl
 
 ```
 curl "localhost:9200/gmail/email/_search?pretty&q=from:ship-confirm@amazon.com"
-``` 
+```
 
 ##### Aggregation queries
 
diff --git a/src/index_emails.py b/src/index_emails.py
index 20515af..867adfb 100644
--- a/src/index_emails.py
+++ b/src/index_emails.py
@@ -12,6 +12,7 @@
 from DelegatingEmailParser import DelegatingEmailParser
 from AmazonEmailParser import AmazonEmailParser
 from SteamEmailParser import SteamEmailParser
+from bs4 import BeautifulSoup
 import logging
 
 http_client = HTTPClient()
@@ -20,6 +21,20 @@
 DEFAULT_ES_URL = "http://localhost:9200"
 DEFAULT_INDEX_NAME = "gmail"
 
+def strip_html_css_js(msg):
+    soup = BeautifulSoup(msg,"html.parser") # create a new bs4 object from the html data loaded
+    for script in soup(["script", "style"]): # remove all javascript and stylesheet code
+        script.extract()
+    # get text
+    text = soup.get_text()
+    # break into lines and remove leading and trailing space on each
+    lines = (line.strip() for line in text.splitlines())
+    # break multi-headlines into a line each
+    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+    # drop blank lines
+    text = '\n'.join(chunk for chunk in chunks if chunk)
+    return text
+
 def delete_index():
     try:
         url = "%s/%s?refresh=true" % (tornado.options.options.es_url, tornado.options.options.index_name)
@@ -117,6 +132,20 @@ def convert_msg_to_json(msg):
         del result["x-gmail-labels"]
     result['labels'] = labels
 
+    # Bodies...
+    if tornado.options.options.index_bodies:
+        result['body'] = ''
+        if msg.is_multipart():
+            for mpart in msg.get_payload():
+                if mpart is not None:
+                    mpart_payload = mpart.get_payload(decode=True)
+                    if mpart_payload is not None:
+                        result['body'] += strip_html_css_js(mpart_payload)
+        else:
+            result['body'] = strip_html_css_js(msg.get_payload(decode=True))
+
+        result['body_size'] = len(result['body'])
+
     parts = result.get("parts", [])
     result['content_size_total'] = 0
     for part in parts:
@@ -183,6 +212,9 @@ def load_from_file():
     tornado.options.define("num_of_shards", type=int, default=2,
                            help="Number of shards for ES index")
 
+    tornado.options.define("index_bodies", type=bool, default=False,
+                           help="Will index all body content, stripped of HTML/CSS/JS etc. Adds fields: 'body' and 'body_size'")
+
     tornado.options.parse_command_line()
 
     if tornado.options.options.infile:

From a9ff91d7280a9c9be03ec5cfc5afd76fffd12108 Mon Sep 17 00:00:00 2001
From: Oliver <oliver.hardt@nytimes.com>
Date: Fri, 22 Dec 2017 14:05:40 +0100
Subject: [PATCH 02/18] fix two of the links

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index ad55793..46aaad1 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ __Related tutorial:__ [Index and Search Hacker News using Elasticsearch and the
 
 #### Prerequisites
 
-Set up [Elasticsearch](http://ohardt.us/es-install) and make sure it's running at [http://localhost:9200](http://localhost:9200)
+Set up [Elasticsearch](https://www.elastic.co/guide/en/elasticsearch/guide/current/running-elasticsearch.html) and make sure it's running at [http://localhost:9200](http://localhost:9200)
 
 I use Python and [Tornado](https://github.com/tornadoweb/tornado/) for the scripts to import and query the data. Run `pip install tornado chardet` to install Tornado and chardet.
 
@@ -24,7 +24,7 @@ You may also need to `pip install beautifulsoup4`, for the stripping HTML/JS/CSS
 
 #### Aight, where do we start?
 
-First, go [here](http://ohardt.us/download-gmail-mailbox) and download your Gmail mailbox, depending on the amount of emails you have accumulated this might take a while.
+First, go [here](https://www.google.com/settings/takeout/custom/gmail) and download your Gmail mailbox, depending on the amount of emails you have accumulated this might take a while.
 
 The downloaded archive is in the [mbox format](http://en.wikipedia.org/wiki/Mbox) and Python provides libraries to work with the mbox format so that's easy.
 

From 0dab6ec992e3f3342a0af8e963aee8dd4e9b726f Mon Sep 17 00:00:00 2001
From: Oliver <oliver.hardt@nytimes.com>
Date: Fri, 22 Dec 2017 14:06:27 +0100
Subject: [PATCH 03/18] fix third link

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 46aaad1..0362519 100644
--- a/README.md
+++ b/README.md
@@ -115,7 +115,7 @@ def upload_item_to_es(item):
 
 ```
 
-However, Elasticsearch provides a better method for importing large chunks of data: [bulk indexing](http://ohardt.us/es-bulk-indexing)
+However, Elasticsearch provides a better method for importing large chunks of data: [bulk indexing](https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html)
 Instead of making a HTTP request per document and indexing individually, we batch them in chunks of eg. 1000 documents and then index them.<br>
 Bulk messages are of the format:
 

From 1fccb8bb589d3dbf55d5969a47093821fbcdb909 Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Sat, 23 Dec 2017 18:28:59 +0100
Subject: [PATCH 04/18] print() is a function in modern Python

---
 src/AmazonEmailParser.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/AmazonEmailParser.py b/src/AmazonEmailParser.py
index 69ea211..117c87f 100644
--- a/src/AmazonEmailParser.py
+++ b/src/AmazonEmailParser.py
@@ -1,3 +1,5 @@
+from __future__ import print_function
+
 import json
 import re
 
@@ -74,8 +76,8 @@ def parse(self, email):
                 costTotal -= cost
 
             if costTotal != 0:
-                print "Warning order not parsed correctly, order items may be missing, or promotion may have been applied."
-                print email['order_details']
-                print body
+                print("Warning order not parsed correctly, order items may be missing, or promotion may have been applied.")
+                print(email['order_details'])
+                print(body)
 
         return email

From 5a1813aa044e1943e7a9cf04fe01ec72542fef87 Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Sat, 23 Dec 2017 18:30:36 +0100
Subject: [PATCH 05/18] print() is a function in modern Python

---
 src/SteamEmailParser.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/SteamEmailParser.py b/src/SteamEmailParser.py
index cf7aef9..601660e 100644
--- a/src/SteamEmailParser.py
+++ b/src/SteamEmailParser.py
@@ -1,3 +1,5 @@
+from __future__ import print_function
+
 import json
 import re
 
@@ -54,8 +56,8 @@ def parse(self, email):
                 costTotal -= cost
 
             if costTotal != 0:
-                print "Warning order not parsed correctly, order items may be missing, or promotion may have been applied."
-                print email['order_details']
-                print body
+                print("Warning order not parsed correctly, order items may be missing, or promotion may have been applied.")
+                print(email['order_details'])
+                print(body)
 
         return email

From 0786ee0e1f97eb4d802ad4d7fb7e9e2b509a7fa8 Mon Sep 17 00:00:00 2001
From: Ugo Sangiorgi <ugo.sangiorgi@gmail.com>
Date: Fri, 19 Jan 2018 14:37:58 -0500
Subject: [PATCH 06/18] requirements.txt to ease dependency installation

---
 README.md        | 9 +++++----
 requirements.txt | 3 +++
 2 files changed, 8 insertions(+), 4 deletions(-)
 create mode 100644 requirements.txt

diff --git a/README.md b/README.md
index 0362519..32bc775 100644
--- a/README.md
+++ b/README.md
@@ -16,10 +16,11 @@ __Related tutorial:__ [Index and Search Hacker News using Elasticsearch and the
 
 Set up [Elasticsearch](https://www.elastic.co/guide/en/elasticsearch/guide/current/running-elasticsearch.html) and make sure it's running at [http://localhost:9200](http://localhost:9200)
 
-I use Python and [Tornado](https://github.com/tornadoweb/tornado/) for the scripts to import and query the data. Run `pip install tornado chardet` to install Tornado and chardet.
+I use Python and [Tornado](https://github.com/tornadoweb/tornado/) for the scripts to import and query the data. Also `beautifulsoup4` for the stripping HTML/JS/CSS (if you want to use the body indexing flag).
 
-You may also need to `pip install beautifulsoup4`, for the stripping HTML/JS/CSS via the body indexing flag: `--index_bodies`
+Install the dependencies by running:
 
+`pip install -r requirements.txt`
 
 
 #### Aight, where do we start?
@@ -283,9 +284,9 @@ Result:
     "key_as_string" : "2004-01-01T00:00:00.000Z",
     "key" : 1072915200000,
     "doc_count" : 585
-  }, {  
+  }, {
 ...
-  }, {  
+  }, {
     "key_as_string" : "2013-01-01T00:00:00.000Z",
     "key" : 1356998400000,
     "doc_count" : 12832
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..b4cf28f
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+beautifulsoup4==4.6.0
+chardet==3.0.4
+tornado==4.5.3

From dd46fe189a2be4a024570a823b19357e93da00ba Mon Sep 17 00:00:00 2001
From: Ugo Sangiorgi <ugo.sangiorgi@gmail.com>
Date: Fri, 19 Jan 2018 14:46:25 -0500
Subject: [PATCH 07/18] "not in" instead of "not 'string' in"

---
 src/index_emails.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/index_emails.py b/src/index_emails.py
index 867adfb..4103957 100644
--- a/src/index_emails.py
+++ b/src/index_emails.py
@@ -103,7 +103,7 @@ def normalize_email(email_in):
 
 def convert_msg_to_json(msg):
     result = {'parts': []}
-    if not 'message-id' in msg:
+    if 'message-id' not in msg:
         return None
 
     for (k, v) in msg.items():
@@ -160,7 +160,6 @@ def load_from_file():
         delete_index()
     create_index()
 
-
     if tornado.options.options.skip:
         logging.info("Skipping first %d messages from mbox file" % tornado.options.options.skip)
 

From ac3527d8024abfc1a99ad72d43a0680bf7aa16fc Mon Sep 17 00:00:00 2001
From: Prateep Bandharangshi <prateep@anicca.net>
Date: Mon, 26 Mar 2018 17:42:37 +0100
Subject: [PATCH 08/18] Update README.md

Fix incorrect link
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 32bc775..2d8d826 100644
--- a/README.md
+++ b/README.md
@@ -43,7 +43,7 @@ print "Done!"
 
 #### Ok, tell me more about the details
 
-The full Python code is here: [src/update.py](src/index_emails.py)
+The full Python code is here: [src/index_emails.py](src/index_emails.py)
 
 
 ##### Turn mbox into JSON

From 8a6bc92155a011b248e5f07025cac330231352da Mon Sep 17 00:00:00 2001
From: Jay Caines-Gooby <jay@gooby.org>
Date: Wed, 1 Aug 2018 23:33:21 +0100
Subject: [PATCH 09/18] Elastic Search 6 compatibility

https://www.elastic.co/blog/strict-content-type-checking-for-elasticsearch-rest-requests

Starting from Elasticsearch 6.0, all REST requests that include a body must also provide the correct content-type for that body.
---
 src/index_emails.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/index_emails.py b/src/index_emails.py
index 4103957..e3a4c96 100644
--- a/src/index_emails.py
+++ b/src/index_emails.py
@@ -38,7 +38,7 @@ def strip_html_css_js(msg):
 def delete_index():
     try:
         url = "%s/%s?refresh=true" % (tornado.options.options.es_url, tornado.options.options.index_name)
-        request = HTTPRequest(url, method="DELETE", request_timeout=240)
+        request = HTTPRequest(url, method="DELETE", request_timeout=240, headers={"Content-Type": "application/json"})
         body = {"refresh": True}
         response = http_client.fetch(request)
         logging.info('Delete index done   %s' % response.body)
@@ -71,7 +71,7 @@ def create_index():
     body = json.dumps(schema)
     url = "%s/%s" % (tornado.options.options.es_url, tornado.options.options.index_name)
     try:
-        request = HTTPRequest(url, method="PUT", body=body, request_timeout=240)
+        request = HTTPRequest(url, method="PUT", body=body, request_timeout=240, headers={"Content-Type": "application/json"})
         response = http_client.fetch(request)
         logging.info('Create index done   %s' % response.body)
     except:
@@ -86,7 +86,7 @@ def upload_batch(upload_data):
         upload_data_txt += json.dumps(cmd) + "\n"
         upload_data_txt += json.dumps(item) + "\n"
 
-    request = HTTPRequest(tornado.options.options.es_url + "/_bulk", method="POST", body=upload_data_txt, request_timeout=240)
+    request = HTTPRequest(tornado.options.options.es_url + "/_bulk", method="POST", body=upload_data_txt, request_timeout=240, headers={"Content-Type": "application/json"})
     response = http_client.fetch(request)
     result = json.loads(response.body)
 

From 0a0e4744f5c42784ebda279c89684a852d7b57a1 Mon Sep 17 00:00:00 2001
From: priyadharshinijaganathan
 <39261947+priyadharshinijaganathan@users.noreply.github.com>
Date: Thu, 12 Mar 2020 19:10:38 +0530
Subject: [PATCH 10/18] Changed the code from python 2.x to 3.x

---
 src/index_emails.py | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/src/index_emails.py b/src/index_emails.py
index e3a4c96..24788e0 100644
--- a/src/index_emails.py
+++ b/src/index_emails.py
@@ -21,9 +21,10 @@
 DEFAULT_ES_URL = "http://localhost:9200"
 DEFAULT_INDEX_NAME = "gmail"
 
+
 def strip_html_css_js(msg):
-    soup = BeautifulSoup(msg,"html.parser") # create a new bs4 object from the html data loaded
-    for script in soup(["script", "style"]): # remove all javascript and stylesheet code
+    soup = BeautifulSoup(msg, "html.parser")  # create a new bs4 object from the html data loaded
+    for script in soup(["script", "style"]):  # remove all javascript and stylesheet code
         script.extract()
     # get text
     text = soup.get_text()
@@ -35,6 +36,7 @@ def strip_html_css_js(msg):
     text = '\n'.join(chunk for chunk in chunks if chunk)
     return text
 
+
 def delete_index():
     try:
         url = "%s/%s?refresh=true" % (tornado.options.options.es_url, tornado.options.options.index_name)
@@ -45,6 +47,7 @@ def delete_index():
     except:
         pass
 
+
 def create_index():
 
     schema = {
@@ -79,6 +82,8 @@ def create_index():
 
 
 total_uploaded = 0
+
+
 def upload_batch(upload_data):
     upload_data_txt = ""
     for item in upload_data:
@@ -107,7 +112,7 @@ def convert_msg_to_json(msg):
         return None
 
     for (k, v) in msg.items():
-        result[k.lower()] = v.decode('utf-8', 'ignore')
+        result[k.lower()] = v
 
     for k in ['to', 'cc', 'bcc']:
         if not result.get(k):
@@ -166,7 +171,10 @@ def load_from_file():
     count = 0
     upload_data = list()
     logging.info("Starting import from file %s" % tornado.options.options.infile)
-    mbox = mailbox.UnixMailbox(open(tornado.options.options.infile, 'rb'), email.message_from_file)
+    # mbox = mailbox.UnixMailbox(open(tornado.options.options.infile, 'rb'), email.message_from_file)
+
+    # removed the above UnixMailbox which is not supported in python 3.x and replaced it with mailbox.mbox class
+    mbox = mailbox.mbox(tornado.options.options.infile)
 
     emailParser = DelegatingEmailParser([AmazonEmailParser(), SteamEmailParser()])
 
@@ -175,6 +183,7 @@ def load_from_file():
         if count < tornado.options.options.skip:
             continue
         item = convert_msg_to_json(msg)
+
         if item:
             upload_data.append(item)
             if len(upload_data) == tornado.options.options.batch_size:
@@ -211,8 +220,9 @@ def load_from_file():
     tornado.options.define("num_of_shards", type=int, default=2,
                            help="Number of shards for ES index")
 
-    tornado.options.define("index_bodies", type=bool, default=False,
-                           help="Will index all body content, stripped of HTML/CSS/JS etc. Adds fields: 'body' and 'body_size'")
+    tornado.options.define("index_bodies", type=bool, default=True,
+                           help="Will index all body content, stripped of HTML/CSS/JS etc. Adds fields: 'body' and \
+                                    'body_size'")
 
     tornado.options.parse_command_line()
 

From a7cce066440008c7600f9c3531f84d303d5b156b Mon Sep 17 00:00:00 2001
From: Oliver <oliver@21zoo.com>
Date: Sat, 14 Mar 2020 15:53:58 -0400
Subject: [PATCH 11/18] update README, add sample command line and sample mbox
 file, add docker instructions to run ES

---
 .gitignore  |   1 +
 README.md   |  25 +-
 sample.mbox | 685 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 707 insertions(+), 4 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 sample.mbox

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..5ceb386
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+venv
diff --git a/README.md b/README.md
index 2d8d826..db70861 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
 Elasticsearch For Beginners: Indexing your Gmail Inbox
 =======================
 
+[![Build Status](https://cloud.drone.io/api/badges/oliver006/elasticsearch-gmail/status.svg)](https://cloud.drone.io/oliver006/elasticsearch-gmail)
 
 
 #### What's this all about?
@@ -9,26 +10,42 @@ I recently looked at my Gmail inbox and noticed that I have well over 50k emails
 
 Goal of this tutorial is to load an entire Gmail inbox into Elasticsearch using bulk indexing and then start querying the cluster to get a better picture of what's going on.
 
-__Related tutorial:__ [Index and Search Hacker News using Elasticsearch and the HN API](https://github.com/oliver006/elasticsearch-hn)
-
 
 #### Prerequisites
 
 Set up [Elasticsearch](https://www.elastic.co/guide/en/elasticsearch/guide/current/running-elasticsearch.html) and make sure it's running at [http://localhost:9200](http://localhost:9200)
 
+A quick way to run Elasticsearch is using Docker: (the CORS settings aren't really needed but come in handy if you want to use e.g. [dejavu](https://dejavu.appbase.io/) to explore the index)
+```
+docker run --name es -d -p 9200:9200 -e http.port=9200 -e http.cors.enabled=true -e 'http.cors.allow-origin=*' -e http.cors.allow-headers=X-Requested-With,X-Auth-Token,Content-Type,Content-Length,Authorization -e http.cors.allow-credentials=true -e "discovery.type=single-node" docker.elastic.co/elasticsearch/elasticsearch-oss:7.6.1
+```
+
 I use Python and [Tornado](https://github.com/tornadoweb/tornado/) for the scripts to import and query the data. Also `beautifulsoup4` for the stripping HTML/JS/CSS (if you want to use the body indexing flag).
 
 Install the dependencies by running:
 
-`pip install -r requirements.txt`
+`pip3 install -r requirements.txt`
 
 
 #### Aight, where do we start?
 
 First, go [here](https://www.google.com/settings/takeout/custom/gmail) and download your Gmail mailbox, depending on the amount of emails you have accumulated this might take a while.
+There's also a small `sample.mbox` file included in the repo for you to play around with while you're waiting for Google to prepare your download.
 
 The downloaded archive is in the [mbox format](http://en.wikipedia.org/wiki/Mbox) and Python provides libraries to work with the mbox format so that's easy.
 
+You can run the code (assuming Elasticsearch is running at localhost:9200) with the sample mbox file like this:
+```
+$ python3 src/index_emails.py --infile=sample.mbox
+[I index_emails:173] Starting import from file sample.mbox
+[I index_emails:101] Upload: OK - upload took: 1033ms, total messages uploaded:      3
+[I index_emails:197] Import done - total count 16
+$
+```
+
+
+#### The Source Code
+
 The overall program will look something like this:
 
 ```python
@@ -342,4 +359,4 @@ GET _search
 
 #### Feedback
 
-Open pull requests, issues or email me at o@21zoo.com
+Open a pull request or an issue!
diff --git a/sample.mbox b/sample.mbox
new file mode 100644
index 0000000..de10312
--- /dev/null
+++ b/sample.mbox
@@ -0,0 +1,685 @@
+    
+	
+    
+From nobody Mon Sep 17 00:00:00 2001
+From: A (zzz)
+      U
+      Thor
+      <a.u.thor@example.com> (Comment)
+Date: Fri, 9 Jun 2006 00:44:16 -0700
+Subject: [PATCH] a commit.
+
+Here is a patch from A U Thor.
+
+---
+ foo |    2 +-
+ 1 files changed, 1 insertions(+), 1 deletions(-)
+
+diff --git a/foo b/foo
+index 9123cdc..918dcf8 100644
+--- a/foo
++++ b/foo
+@@ -1 +1 @@
+-Fri Jun  9 00:44:04 PDT 2006
++Fri Jun  9 00:44:13 PDT 2006
+-- 
+1.4.0.g6f2b
+
+From nobody Mon Sep 17 00:00:00 2001
+From: A U Thor <a.u.thor@example.com>
+Date: Fri, 9 Jun 2006 00:44:16 -0700
+Subject: [PATCH] another patch
+
+Here is a patch from A U Thor.  This addresses the issue raised in the
+message:
+
+From: Nit Picker <nit.picker@example.net>
+Subject: foo is too old
+Message-Id: <nitpicker.12121212@example.net>
+
+Hopefully this would fix the problem stated there.
+
+
+I have included an extra blank line above, but it does not have to be
+stripped away here, along with the               		   
+whitespaces at the end of the above line.  They are expected to be squashed
+when the message is made into a commit log by stripspace,
+Also, there are three blank lines after this paragraph,
+two truly blank and another full of spaces in between.
+
+            
+
+Hope this helps.
+
+---
+ foo |    2 +-
+ 1 files changed, 1 insertions(+), 1 deletions(-)
+
+diff --git a/foo b/foo
+index 9123cdc..918dcf8 100644
+--- a/foo
++++ b/foo
+@@ -1 +1 @@
+-Fri Jun  9 00:44:04 PDT 2006
++Fri Jun  9 00:44:13 PDT 2006
+-- 
+1.4.0.g6f2b
+
+From nobody Mon Sep 17 00:00:00 2001
+From: Junio C Hamano <junio@kernel.org>
+Date: Fri, 9 Jun 2006 00:44:16 -0700
+Subject: re: [PATCH] another patch
+
+From: A U Thor <a.u.thor@example.com>
+Subject: [PATCH] third patch
+
+Here is a patch from A U Thor.  This addresses the issue raised in the
+message:
+
+From: Nit Picker <nit.picker@example.net>
+Subject: foo is too old
+Message-Id: <nitpicker.12121212@example.net>
+
+Hopefully this would fix the problem stated there.
+
+---
+ foo |    2 +-
+ 1 files changed, 1 insertions(+), 1 deletions(-)
+
+diff --git a/foo b/foo
+index 9123cdc..918dcf8 100644
+--- a/foo
++++ b/foo
+@@ -1 +1 @@
+-Fri Jun  9 00:44:04 PDT 2006
++Fri Jun  9 00:44:13 PDT 2006
+-- 
+1.4.0.g6f2b
+
+From nobody Sat Aug 27 23:07:49 2005
+Path: news.gmane.org!not-for-mail
+Message-ID: <20050721.091036.01119516.yoshfuji@linux-ipv6.org>
+From: YOSHIFUJI Hideaki / =?ISO-2022-JP?B?GyRCNUhGIzFRTEAbKEI=?= 
+	<yoshfuji@linux-ipv6.org>
+Newsgroups: gmane.comp.version-control.git
+Subject: [PATCH 1/2] GIT: Try all addresses for given remote name
+Date: Thu, 21 Jul 2005 09:10:36 -0400 (EDT)
+Lines: 99
+Organization: USAGI/WIDE Project
+Approved: news@gmane.org
+NNTP-Posting-Host: main.gmane.org
+Mime-Version: 1.0
+Content-Type: Text/Plain; charset=us-ascii
+Content-Transfer-Encoding: 7bit
+X-Trace: sea.gmane.org 1121951434 29350 80.91.229.2 (21 Jul 2005 13:10:34 GMT)
+X-Complaints-To: usenet@sea.gmane.org
+NNTP-Posting-Date: Thu, 21 Jul 2005 13:10:34 +0000 (UTC)
+
+Hello.
+
+Try all addresses for given remote name until it succeeds.
+Also supports IPv6.
+
+Signed-of-by: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
+
+diff --git a/connect.c b/connect.c
+--- a/connect.c
++++ b/connect.c
+@@ -96,42 +96,57 @@ static enum protocol get_protocol(const 
+ 	die("I don't handle protocol '%s'", name);
+ }
+ 
+-static void lookup_host(const char *host, struct sockaddr *in)
+-{
+-	struct addrinfo *res;
+-	int ret;
+-
+-	ret = getaddrinfo(host, NULL, NULL, &res);
+-	if (ret)
+-		die("Unable to look up %s (%s)", host, gai_strerror(ret));
+-	*in = *res->ai_addr;
+-	freeaddrinfo(res);
+-}
++#define STR_(s)	# s
++#define STR(s)	STR_(s)
+ 
+ static int git_tcp_connect(int fd[2], const char *prog, char *host, char *path)
+ {
+-	struct sockaddr addr;
+-	int port = DEFAULT_GIT_PORT, sockfd;
+-	char *colon;
+-
+-	colon = strchr(host, ':');
+-	if (colon) {
+-		char *end;
+-		unsigned long n = strtoul(colon+1, &end, 0);
+-		if (colon[1] && !*end) {
+-			*colon = 0;
+-			port = n;
++	int sockfd = -1;
++	char *colon, *end;
++	char *port = STR(DEFAULT_GIT_PORT);
++	struct addrinfo hints, *ai0, *ai;
++	int gai;
++
++	if (host[0] == '[') {
++		end = strchr(host + 1, ']');
++		if (end) {
++			*end = 0;
++			end++;
++			host++;
++		} else
++			end = host;
++	} else
++		end = host;
++	colon = strchr(end, ':');
++
++	if (colon)
++		port = colon + 1;
++
++	memset(&hints, 0, sizeof(hints));
++	hints.ai_socktype = SOCK_STREAM;
++	hints.ai_protocol = IPPROTO_TCP;
++
++	gai = getaddrinfo(host, port, &hints, &ai);
++	if (gai)
++		die("Unable to look up %s (%s)", host, gai_strerror(gai));
++
++	for (ai0 = ai; ai; ai = ai->ai_next) {
++		sockfd = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
++		if (sockfd < 0)
++			continue;
++		if (connect(sockfd, ai->ai_addr, ai->ai_addrlen) < 0) {
++			close(sockfd);
++			sockfd = -1;
++			continue;
+ 		}
++		break;
+ 	}
+ 
+-	lookup_host(host, &addr);
+-	((struct sockaddr_in *)&addr)->sin_port = htons(port);
++	freeaddrinfo(ai0);
+ 
+-	sockfd = socket(PF_INET, SOCK_STREAM, IPPROTO_IP);
+ 	if (sockfd < 0)
+ 		die("unable to create socket (%s)", strerror(errno));
+-	if (connect(sockfd, (void *)&addr, sizeof(addr)) < 0)
+-		die("unable to connect (%s)", strerror(errno));
++
+ 	fd[0] = sockfd;
+ 	fd[1] = sockfd;
+ 	packet_write(sockfd, "%s %s\n", prog, path);
+
+-- 
+YOSHIFUJI Hideaki @ USAGI Project  <yoshfuji@linux-ipv6.org>
+GPG-FP  : 9022 65EB 1ECF 3AD1 0BDF  80D8 4807 F894 E062 0EEA
+
+From nobody Sat Aug 27 23:07:49 2005
+Path: news.gmane.org!not-for-mail
+Message-ID: <u5tacjjdpxq.fsf@lysator.liu.se>
+From: =?ISO8859-1?Q?David_K=E5gedal?= <davidk@lysator.liu.se>
+Newsgroups: gmane.comp.version-control.git
+Subject: [PATCH] Fixed two bugs in git-cvsimport-script.
+Date: Mon, 15 Aug 2005 20:18:25 +0200
+Lines: 83
+Approved: news@gmane.org
+NNTP-Posting-Host: main.gmane.org
+Mime-Version: 1.0
+Content-Type: text/plain; charset=ISO8859-1
+Content-Transfer-Encoding: QUOTED-PRINTABLE
+X-Trace: sea.gmane.org 1124130247 31839 80.91.229.2 (15 Aug 2005 18:24:07 GMT)
+X-Complaints-To: usenet@sea.gmane.org
+NNTP-Posting-Date: Mon, 15 Aug 2005 18:24:07 +0000 (UTC)
+Cc: "Junio C. Hamano" <junkio@cox.net>
+Original-X-From: git-owner@vger.kernel.org Mon Aug 15 20:24:05 2005
+
+The git-cvsimport-script had a copule of small bugs that prevented me
+from importing a big CVS repository.
+
+The first was that it didn't handle removed files with a multi-digit
+primary revision number.
+
+The second was that it was asking the CVS server for "F" messages,
+although they were not handled.
+
+I also updated the documentation for that script to correspond to
+actual flags.
+
+Signed-off-by: David K=E5gedal <davidk@lysator.liu.se>
+---
+
+ Documentation/git-cvsimport-script.txt |    9 ++++++++-
+ git-cvsimport-script                   |    4 ++--
+ 2 files changed, 10 insertions(+), 3 deletions(-)
+
+50452f9c0c2df1f04d83a26266ba704b13861632
+diff --git a/Documentation/git-cvsimport-script.txt b/Documentation/git=
+-cvsimport-script.txt
+--- a/Documentation/git-cvsimport-script.txt
++++ b/Documentation/git-cvsimport-script.txt
+@@ -29,6 +29,10 @@ OPTIONS
+ 	currently, only the :local:, :ext: and :pserver: access methods=20
+ 	are supported.
+=20
++-C <target-dir>::
++        The GIT repository to import to.  If the directory doesn't
++        exist, it will be created.  Default is the current directory.
++
+ -i::
+ 	Import-only: don't perform a checkout after importing.  This option
+ 	ensures the working directory and cache remain untouched and will
+@@ -44,7 +48,7 @@ OPTIONS
+=20
+ -p <options-for-cvsps>::
+ 	Additional options for cvsps.
+-	The options '-x' and '-A' are implicit and should not be used here.
++	The options '-u' and '-A' are implicit and should not be used here.
+=20
+ 	If you need to pass multiple options, separate them with a comma.
+=20
+@@ -57,6 +61,9 @@ OPTIONS
+ -h::
+ 	Print a short usage message and exit.
+=20
++-z <fuzz>::
++        Pass the timestamp fuzz factor to cvsps.
++
+ OUTPUT
+ ------
+ If '-v' is specified, the script reports what it is doing.
+diff --git a/git-cvsimport-script b/git-cvsimport-script
+--- a/git-cvsimport-script
++++ b/git-cvsimport-script
+@@ -190,7 +190,7 @@ sub conn {
+ 	$self->{'socketo'}->write("Root $repo\n");
+=20
+ 	# Trial and error says that this probably is the minimum set
+-	$self->{'socketo'}->write("Valid-responses ok error Valid-requests Mo=
+de M Mbinary E F Checked-in Created Updated Merged Removed\n");
++	$self->{'socketo'}->write("Valid-responses ok error Valid-requests Mo=
+de M Mbinary E Checked-in Created Updated Merged Removed\n");
+=20
+ 	$self->{'socketo'}->write("valid-requests\n");
+ 	$self->{'socketo'}->flush();
+@@ -691,7 +691,7 @@ while(<CVS>) {
+ 		unlink($tmpname);
+ 		my $mode =3D pmode($cvs->{'mode'});
+ 		push(@new,[$mode, $sha, $fn]); # may be resurrected!
+-	} elsif($state =3D=3D 9 and /^\s+(\S+):\d(?:\.\d+)+->(\d(?:\.\d+)+)\(=
+DEAD\)\s*$/) {
++	} elsif($state =3D=3D 9 and /^\s+(\S+):\d+(?:\.\d+)+->(\d+(?:\.\d+)+)=
+\(DEAD\)\s*$/) {
+ 		my $fn =3D $1;
+ 		$fn =3D~ s#^/+##;
+ 		push(@old,$fn);
+
+--=20
+David K=E5gedal
+-
+To unsubscribe from this list: send the line "unsubscribe git" in
+the body of a message to majordomo@vger.kernel.org
+More majordomo info at  http://vger.kernel.org/majordomo-info.html
+
+From nobody Mon Sep 17 00:00:00 2001
+From: A U Thor <a.u.thor@example.com>
+References: <Pine.LNX.4.640.0001@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0002@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0003@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0004@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0005@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0006@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0007@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0008@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0009@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0010@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0011@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0012@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0013@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0014@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0015@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0016@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0017@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0018@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0019@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0020@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0021@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0022@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0023@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0024@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0025@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0026@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0027@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0028@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0029@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0030@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0031@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0032@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0033@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0034@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0035@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0036@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0037@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0038@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0039@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0040@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0041@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0042@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0043@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0044@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0045@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0046@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0047@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0048@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0049@woody.linux-foundation.org>
+ <Pine.LNX.4.640.0050@woody.linux-foundation.org>
+Date: Fri, 9 Jun 2006 00:44:16 -0700
+Subject: [PATCH] a commit.
+
+Here is a patch from A U Thor.
+
+---
+ foo |    2 +-
+ 1 files changed, 1 insertions(+), 1 deletions(-)
+
+diff --git a/foo b/foo
+index 9123cdc..918dcf8 100644
+--- a/foo
++++ b/foo
+@@ -1 +1 @@
+-Fri Jun  9 00:44:04 PDT 2006
++Fri Jun  9 00:44:13 PDT 2006
+-- 
+1.4.0.g6f2b
+
+From nobody Mon Sep 17 00:00:00 2001
+From: A U Thor <a.u.thor@example.com>
+Date: Fri, 9 Jun 2006 00:44:16 -0700
+Subject: [PATCH] another patch
+
+Here is an empty patch from A U Thor.
+
+From nobody Mon Sep 17 00:00:00 2001
+From: Junio C Hamano <junio@kernel.org>
+Date: Fri, 9 Jun 2006 00:44:16 -0700
+Subject: re: [PATCH] another patch
+
+From: A U Thor <a.u.thor@example.com>
+Subject: [PATCH] another patch
+>Here is an empty patch from A U Thor.
+
+Hey you forgot the patch!
+
+From nobody Mon Sep 17 00:00:00 2001
+From: A U Thor <a.u.thor@example.com>
+Date: Mon, 17 Sep 2001 00:00:00 +0900
+Mime-Version: 1.0
+Content-Type: Text/Plain; charset=us-ascii
+Content-Transfer-Encoding: Quoted-Printable
+
+=0A=0AFrom: F U Bar <f.u.bar@example.com>
+Subject: [PATCH] updates=0A=0AThis is to fix diff-format documentation.
+
+diff --git a/Documentation/diff-format.txt b/Documentation/diff-format.txt
+index b426a14..97756ec 100644
+--- a/Documentation/diff-format.txt
++++ b/Documentation/diff-format.txt
+@@ -81,7 +81,7 @@ The "diff" formatting options can be customized via the
+ environment variable 'GIT_DIFF_OPTS'.  For example, if you
+ prefer context diff:
+=20
+-      GIT_DIFF_OPTS=3D-c git-diff-index -p $(cat .git/HEAD)
++      GIT_DIFF_OPTS=3D-c git-diff-index -p HEAD
+=20
+=20
+ 2. When the environment variable 'GIT_EXTERNAL_DIFF' is set, the
+From b9704a518e21158433baa2cc2d591fea687967f6 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Lukas=20Sandstr=C3=B6m?= <lukass@etek.chalmers.se>
+Date: Thu, 10 Jul 2008 23:41:33 +0200
+Subject: Re: discussion that lead to this patch
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+[PATCH] git-mailinfo: Fix getting the subject from the body
+
+"Subject: " isn't in the static array "header", and thus
+memcmp("Subject: ", header[i], 7) will never match.
+
+Signed-off-by: Lukas Sandström <lukass@etek.chalmers.se>
+Signed-off-by: Junio C Hamano <gitster@pobox.com>
+---
+ builtin-mailinfo.c |    2 +-
+ 1 files changed, 1 insertions(+), 1 deletions(-)
+
+diff --git a/builtin-mailinfo.c b/builtin-mailinfo.c
+index 962aa34..2d1520f 100644
+--- a/builtin-mailinfo.c
++++ b/builtin-mailinfo.c
+@@ -334,7 +334,7 @@ static int check_header(char *line, unsigned linesize, char **hdr_data, int over
+ 		return 1;
+ 	if (!memcmp("[PATCH]", line, 7) && isspace(line[7])) {
+ 		for (i = 0; header[i]; i++) {
+-			if (!memcmp("Subject: ", header[i], 9)) {
++			if (!memcmp("Subject", header[i], 7)) {
+ 				if (! handle_header(line, hdr_data[i], 0)) {
+ 					return 1;
+ 				}
+-- 
+1.5.6.2.455.g1efb2
+
+From nobody Fri Aug  8 22:24:03 2008
+Date: Fri, 8 Aug 2008 13:08:37 +0200 (CEST)
+From: A U Thor <a.u.thor@example.com>
+Subject: [PATCH 3/3 v2] Xyzzy
+MIME-Version: 1.0
+Content-Type: multipart/mixed; boundary="=-=-="
+
+--=-=-=
+Content-Type: text/plain; charset=ISO8859-15
+Content-Transfer-Encoding: quoted-printable
+
+Here comes a commit log message, and
+its second line is here.
+---
+ builtin-mailinfo.c  |    4 ++--
+
+diff --git a/builtin-mailinfo.c b/builtin-mailinfo.c
+index 3e5fe51..aabfe5c 100644
+--- a/builtin-mailinfo.c
++++ b/builtin-mailinfo.c
+@@ -758,8 +758,8 @@ static void handle_body(void)
+ 		/* process any boundary lines */
+ 		if (*content_top && is_multipart_boundary(&line)) {
+ 			/* flush any leftover */
+-			if (line.len)
+-				handle_filter(&line);
++			if (prev.len)
++				handle_filter(&prev);
+=20
+ 			if (!handle_boundary())
+ 				goto handle_body_out;
+--=20
+1.6.0.rc2
+
+--=-=-=--
+
+From bda@mnsspb.ru Wed Nov 12 17:54:41 2008
+From: Dmitriy Blinov <bda@mnsspb.ru>
+To: navy-patches@dinar.mns.mnsspb.ru
+Date: Wed, 12 Nov 2008 17:54:41 +0300
+Message-Id: <1226501681-24923-1-git-send-email-bda@mnsspb.ru>
+X-Mailer: git-send-email 1.5.6.5
+MIME-Version: 1.0
+Content-Type: text/plain;
+  charset=utf-8
+Content-Transfer-Encoding: 8bit
+Subject: [Navy-patches] [PATCH]
+	=?utf-8?b?0JjQt9C80LXQvdGR0L0g0YHQv9C40YHQvtC6INC/0LA=?=
+	=?utf-8?b?0LrQtdGC0L7QsiDQvdC10L7QsdGF0L7QtNC40LzRi9GFINC00LvRjyA=?=
+	=?utf-8?b?0YHQsdC+0YDQutC4?=
+
+textlive-* исправлены на texlive-*
+docutils заменён на python-docutils
+
+Действительно, оказалось, что rest2web вытягивает за собой
+python-docutils. В то время как сам rest2web не нужен.
+
+Signed-off-by: Dmitriy Blinov <bda@mnsspb.ru>
+---
+ howto/build_navy.txt |    6 +++---
+ 1 files changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/howto/build_navy.txt b/howto/build_navy.txt
+index 3fd3afb..0ee807e 100644
+--- a/howto/build_navy.txt
++++ b/howto/build_navy.txt
+@@ -119,8 +119,8 @@
+    - libxv-dev
+    - libusplash-dev
+    - latex-make
+-   - textlive-lang-cyrillic
+-   - textlive-latex-extra
++   - texlive-lang-cyrillic
++   - texlive-latex-extra
+    - dia
+    - python-pyrex
+    - libtool
+@@ -128,7 +128,7 @@
+    - sox
+    - cython
+    - imagemagick
+-   - docutils
++   - python-docutils
+ 
+ #. на машине dinar: добавить свой открытый ssh-ключ в authorized_keys2 пользователя ddev
+ #. на своей машине: отредактировать /etc/sudoers (команда ``visudo``) примерно следующим образом::
+-- 
+1.5.6.5
+From nobody Mon Sep 17 00:00:00 2001
+From: <a.u.thor@example.com> (A U Thor)
+Date: Fri, 9 Jun 2006 00:44:16 -0700
+Subject: [PATCH] a patch
+
+From nobody Mon Sep 17 00:00:00 2001
+From: Junio Hamano <junkio@cox.net>
+Date: Thu, 20 Aug 2009 17:18:22 -0700
+Subject: Why doesn't git-am does not like >8 scissors mark?
+
+Subject: [PATCH] BLAH ONE
+
+In real life, we will see a discussion that inspired this patch
+discussing related and unrelated things around >8 scissors mark
+in this part of the message.
+
+Subject: [PATCH] BLAH TWO
+
+And then we will see the scissors.
+
+ This line is not a scissors mark -- >8 -- but talks about it.
+ - - >8 - - please remove everything above this line - - >8 - -
+
+Subject: [PATCH] Teach mailinfo to ignore everything before -- >8 -- mark
+From: Junio C Hamano <gitster@pobox.com>
+
+This teaches mailinfo the scissors -- >8 -- mark; the command ignores
+everything before it in the message body.
+
+Signed-off-by: Junio C Hamano <gitster@pobox.com>
+---
+ builtin-mailinfo.c |   37 ++++++++++++++++++++++++++++++++++++-
+ 1 files changed, 36 insertions(+), 1 deletions(-)
+
+diff --git a/builtin-mailinfo.c b/builtin-mailinfo.c
+index b0b5d8f..461c47e 100644
+--- a/builtin-mailinfo.c
++++ b/builtin-mailinfo.c
+@@ -712,6 +712,34 @@ static inline int patchbreak(const struct strbuf *line)
+ 	return 0;
+ }
+ 
++static int scissors(const struct strbuf *line)
++{
++	size_t i, len = line->len;
++	int scissors_dashes_seen = 0;
++	const char *buf = line->buf;
++
++	for (i = 0; i < len; i++) {
++		if (isspace(buf[i]))
++			continue;
++		if (buf[i] == '-') {
++			scissors_dashes_seen |= 02;
++			continue;
++		}
++		if (i + 1 < len && !memcmp(buf + i, ">8", 2)) {
++			scissors_dashes_seen |= 01;
++			i++;
++			continue;
++		}
++		if (i + 7 < len && !memcmp(buf + i, "cut here", 8)) {
++			i += 7;
++			continue;
++		}
++		/* everything else --- not scissors */
++		break;
++	}
++	return scissors_dashes_seen == 03;
++}
++
+ static int handle_commit_msg(struct strbuf *line)
+ {
+ 	static int still_looking = 1;
+@@ -723,10 +751,17 @@ static int handle_commit_msg(struct strbuf *line)
+ 		strbuf_ltrim(line);
+ 		if (!line->len)
+ 			return 0;
+-		if ((still_looking = check_header(line, s_hdr_data, 0)) != 0)
++		still_looking = check_header(line, s_hdr_data, 0);
++		if (still_looking)
+ 			return 0;
+ 	}
+ 
++	if (scissors(line)) {
++		fseek(cmitmsg, 0L, SEEK_SET);
++		still_looking = 1;
++		return 0;
++	}
++
+ 	/* normalize the log message to UTF-8. */
+ 	if (metainfo_charset)
+ 		convert_to_utf8(line, charset.buf);
+-- 
+1.6.4.1
+From nobody Mon Sep 17 00:00:00 2001
+From: A U Thor <a.u.thor@example.com>
+Subject: check bogus body header (from)
+Date: Fri, 9 Jun 2006 00:44:16 -0700
+
+From: bogosity
+  - a list
+  - of stuff
+---
+diff --git a/foo b/foo
+index e69de29..d95f3ad 100644
+--- a/foo
++++ b/foo
+@@ -0,0 +1 @@
++content
+
+From nobody Mon Sep 17 00:00:00 2001
+From: A U Thor <a.u.thor@example.com>
+Subject: check bogus body header (date)
+Date: Fri, 9 Jun 2006 00:44:16 -0700
+
+Date: bogus
+
+and some content
+
+---
+diff --git a/foo b/foo
+index e69de29..d95f3ad 100644
+--- a/foo
++++ b/foo
+@@ -0,0 +1 @@
++content
+

From 5dbd8651350040720b7c4c707788432a76382ff2 Mon Sep 17 00:00:00 2001
From: Oliver <oliver@21zoo.com>
Date: Sat, 14 Mar 2020 21:57:23 -0400
Subject: [PATCH 12/18] drone tests

---
 .drone.yml | 33 +++++++++++++++++++++++++++++++++
 .gitignore |  2 ++
 2 files changed, 35 insertions(+)
 create mode 100644 .drone.yml

diff --git a/.drone.yml b/.drone.yml
new file mode 100644
index 0000000..a3f0f20
--- /dev/null
+++ b/.drone.yml
@@ -0,0 +1,33 @@
+kind: pipeline
+name: default
+type: docker
+
+
+workspace:
+  base: /go
+  path: src/github.com/oliver006/elasticsearch-gmail
+
+
+services:
+  - name: es
+    image: docker.elastic.co/elasticsearch/elasticsearch-oss:7.6.1
+    environment:
+      http.port: "9200"
+      discovery.type: "single-node"
+    ports:
+      - 9200
+
+
+steps:
+  - name: tests
+    image: "python:3.7"
+    pull: always
+    commands:
+      - pip3 install -r requirements.txt
+      - sleep 30
+      - curl -s http://es:9200
+      - "python3 src/index_emails.py --infile=sample.mbox --es-url=http://es:9200"
+    when:
+      event:
+        - pull_request
+        - push
diff --git a/.gitignore b/.gitignore
index 5ceb386..0231762 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,3 @@
 venv
+.idea
+*.pyc

From 6d31ea3cabd4f0177303d5fbd57662e73a4d925d Mon Sep 17 00:00:00 2001
From: ad-ast <66307268+ad-ast@users.noreply.github.com>
Date: Wed, 22 Jul 2020 18:07:12 +0200
Subject: [PATCH 13/18] Added support for MH mailboxes and several other
 features/improvements/fixes (#22)

* Ignore VS Code metadata

* Enable loading MH directory mailbox

* Enable reading body having multipart containing multiparts

* Skip mail when unable to serialize to JSON instead of exiting

* Option to filter out non-text content type body parts

* Explicitly stringify to deal with bad encodings

* Unused

* Make indexing x-* header fields optional

* Fix deleting index http request

* Optimized skipping (skip keys, do not read skipped messages)

* Added dry run option/documentation

* Fixed recursion for content type multipart

* Updated readme to reflect MH support

* Uncommented DelegatingEmailParser

* Removed unused email parser classes

Co-authored-by: adast <adast@localhost>
---
 .gitignore                   |  1 +
 README.md                    |  9 ++--
 src/AmazonEmailParser.py     | 83 ---------------------------------
 src/DelegatingEmailParser.py | 11 -----
 src/SteamEmailParser.py      | 63 -------------------------
 src/index_emails.py          | 90 +++++++++++++++++++++++-------------
 6 files changed, 63 insertions(+), 194 deletions(-)
 delete mode 100644 src/AmazonEmailParser.py
 delete mode 100644 src/DelegatingEmailParser.py
 delete mode 100644 src/SteamEmailParser.py

diff --git a/.gitignore b/.gitignore
index 0231762..1760f4f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 venv
 .idea
 *.pyc
+.vscode
diff --git a/README.md b/README.md
index db70861..ad442d1 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-Elasticsearch For Beginners: Indexing your Gmail Inbox
+Elasticsearch For Beginners: Indexing your Gmail Inbox (and more: Supports any mbox and MH mailboxes)
 =======================
 
 [![Build Status](https://cloud.drone.io/api/badges/oliver006/elasticsearch-gmail/status.svg)](https://cloud.drone.io/oliver006/elasticsearch-gmail)
@@ -43,13 +43,14 @@ $ python3 src/index_emails.py --infile=sample.mbox
 $
 ```
 
+Note: All examples focus on Gmail inboxes. Substitute any `--infile=` parameters with `--indir=` pointing to an MH directory to make them work with MH mailboxes instead.
 
 #### The Source Code
 
 The overall program will look something like this:
 
 ```python
-mbox = mailbox.UnixMailbox(open('emails.mbox', 'rb'), email.message_from_file)
+mbox = mailbox.mbox('emails.mbox')  # or mailbox.MH('inbox/')
 
 for msg in mbox:
     item = convert_msg_to_json(msg)
@@ -63,9 +64,9 @@ print "Done!"
 The full Python code is here: [src/index_emails.py](src/index_emails.py)
 
 
-##### Turn mbox into JSON
+##### Turn mailbox into JSON
 
-First, we got to turn the mbox format messages into JSON so we can insert it into Elasticsearch. [Here](http://nbviewer.ipython.org/github/furukama/Mining-the-Social-Web-2nd-Edition/blob/master/ipynb/Chapter%206%20-%20Mining%20Mailboxes.ipynb) is some sample code that was very useful when it came to normalizing and cleaning up the data.
+First, we have to turn the messages into JSON so we can insert them into Elasticsearch. [Here](http://nbviewer.ipython.org/github/furukama/Mining-the-Social-Web-2nd-Edition/blob/master/ipynb/Chapter%206%20-%20Mining%20Mailboxes.ipynb) is some sample code that was very useful when it came to normalizing and cleaning up the data.
 
 A good first step:
 
diff --git a/src/AmazonEmailParser.py b/src/AmazonEmailParser.py
deleted file mode 100644
index 117c87f..0000000
--- a/src/AmazonEmailParser.py
+++ /dev/null
@@ -1,83 +0,0 @@
-from __future__ import print_function
-
-import json
-import re
-
-class AmazonEmailParser(object):
-
-    def __init__(self):
-        self.orderTotalRE = re.compile(r"(?<=Order Total:) (?:.*?)(\d+.\d+)")
-        self.postageRE = re.compile(r"(?<=Postage & Packing:) (?:.*?)(\d+.\d+)")
-        self.deliveryRE = re.compile(r"(?<=Delivery & Handling::) (?:.*?)(\d+.\d+)")
-        self.orderItemsRE = re.compile(r"==========\r\n\r\n")
-        self.costRE = re.compile(r"(\d+\.\d+)")
-
-    def canParse(self, email):
-        try:
-            if 'auto-confirm@amazon' in email['from']:
-                return True
-            else:
-                return False
-        except:
-            return False
-
-    def parse(self, email):
-        body = email['body']
-
-        if 'Order Confirmation' in body:
-            postage = 0
-            orderTotal = 0
-
-            result = re.search(self.orderTotalRE, body)
-
-            if result:
-                orderTotal = float(result.groups()[0])
-
-            result = re.search(self.postageRE, body)
-
-            if result:
-                postage = float(result.groups()[0])
-            else:
-                result = re.search(self.deliveryRE, body)
-                if result:
-                    postage = float(result.groups()[0])
-
-            email['order_details'] = {
-                "order_items" : [],
-                "order_total" : orderTotal,
-                "postage" : postage,
-                "merchant" : "amazon"
-            }
-
-            orders = re.split(self.orderItemsRE, body)[1]
-            orders = orders.split('\r\n\r\n')
-
-            #Remove first and last 3 items
-            orders.pop(0)
-            orders.pop()
-            orders.pop()
-            orders.pop()
-
-            costTotal = orderTotal
-
-            for item in orders:
-                if 'Your estimated delivery date is:' in item or 'Your order will be sent to:' in item:
-                    continue
-                else:
-                    lines = item.replace('_','').split('\r\n')
-                    if len(lines) < 4:
-                        continue
-                    itemName = lines[0].strip()
-                    cost = float(re.search(self.costRE, lines[1].strip()).groups()[0])
-                    condition = lines[2].rpartition(':')[2].strip()
-                    seller = lines[3].replace('Sold by', '').strip()
-
-                email['order_details']['order_items'].append({"item":itemName, "cost":cost, "condition": condition, "seller": seller})
-                costTotal -= cost
-
-            if costTotal != 0:
-                print("Warning order not parsed correctly, order items may be missing, or promotion may have been applied.")
-                print(email['order_details'])
-                print(body)
-
-        return email
diff --git a/src/DelegatingEmailParser.py b/src/DelegatingEmailParser.py
deleted file mode 100644
index 11c291d..0000000
--- a/src/DelegatingEmailParser.py
+++ /dev/null
@@ -1,11 +0,0 @@
-class DelegatingEmailParser(object):
-
-    def __init__(self, parsers):
-        self.parsers = parsers
-
-    def parse(self, email):
-        for parser in self.parsers:
-            if parser.canParse(email):
-                return parser.parse(email)
-
-        return email
diff --git a/src/SteamEmailParser.py b/src/SteamEmailParser.py
deleted file mode 100644
index 601660e..0000000
--- a/src/SteamEmailParser.py
+++ /dev/null
@@ -1,63 +0,0 @@
-from __future__ import print_function
-
-import json
-import re
-
-class SteamEmailParser(object):
-
-    def __init__(self):
-        self.orderTotalRE = re.compile(r"(?<=Total:)[ \t]+(\d+.\d+)")
-        self.orderItemsRE = re.compile(r"(?:\.\r\n)+")
-        self.costRE = re.compile(r"(\d+\.\d+)")
-
-    def canParse(self, email):
-        try:
-            if 'noreply@steampowered.com' in email['from']:
-                return True
-            else:
-                return False
-        except:
-            return False
-
-    def parse(self, email):
-        body = email['body']
-
-        if 'Thank you' in email['subject'] and 'purchase' in body:
-            orderTotal = 0
-
-            result = re.search(self.orderTotalRE, body)
-
-            if result:
-                orderTotal = float(result.groups()[0])
-
-            email['order_details'] = {
-                "order_items" : [],
-                "order_total" : orderTotal,
-                "merchant" : "steam"
-            }
-
-            order = re.split(self.orderItemsRE, body)[2].split('\r\n') #This parser to get order total is currently broken, gift purchases are not parsed
-
-            costTotal = orderTotal
-
-            costTotal = orderTotal
-
-            for item in order:
-                if '-------' in item:
-                    break
-                else:
-                    if item == '' or ': ' not in item:
-                        continue
-                    splitResult = item.rpartition(':')
-                    itemName = splitResult[0].strip()
-                    cost = float(re.match(self.costRE, splitResult[2].strip()).groups()[0])
-
-                email['order_details']['order_items'].append({"item":itemName, "cost":cost})
-                costTotal -= cost
-
-            if costTotal != 0:
-                print("Warning order not parsed correctly, order items may be missing, or promotion may have been applied.")
-                print(email['order_details'])
-                print(body)
-
-        return email
diff --git a/src/index_emails.py b/src/index_emails.py
index 24788e0..b3693d8 100644
--- a/src/index_emails.py
+++ b/src/index_emails.py
@@ -9,9 +9,6 @@
 import email
 import quopri
 import chardet
-from DelegatingEmailParser import DelegatingEmailParser
-from AmazonEmailParser import AmazonEmailParser
-from SteamEmailParser import SteamEmailParser
 from bs4 import BeautifulSoup
 import logging
 
@@ -39,9 +36,8 @@ def strip_html_css_js(msg):
 
 def delete_index():
     try:
-        url = "%s/%s?refresh=true" % (tornado.options.options.es_url, tornado.options.options.index_name)
+        url = "%s/%s" % (tornado.options.options.es_url, tornado.options.options.index_name)
         request = HTTPRequest(url, method="DELETE", request_timeout=240, headers={"Content-Type": "application/json"})
-        body = {"refresh": True}
         response = http_client.fetch(request)
         logging.info('Delete index done   %s' % response.body)
     except:
@@ -85,11 +81,20 @@ def create_index():
 
 
 def upload_batch(upload_data):
+    if tornado.options.options.dry_run:
+        logging.info("Dry run, not uploading")
+        return
     upload_data_txt = ""
     for item in upload_data:
         cmd = {'index': {'_index': tornado.options.options.index_name, '_type': 'email', '_id': item['message-id']}}
-        upload_data_txt += json.dumps(cmd) + "\n"
-        upload_data_txt += json.dumps(item) + "\n"
+        try:
+            json_cmd = json.dumps(cmd) + "\n"
+            json_item = json.dumps(item) + "\n"
+        except:
+            logging.warn('Skipping mail with message id %s because of exception converting to JSON (invalid characters?).' % item['message-id'])
+            continue
+        upload_data_txt += json_cmd
+        upload_data_txt += json_item
 
     request = HTTPRequest(tornado.options.options.es_url + "/_bulk", method="POST", body=upload_data_txt, request_timeout=240, headers={"Content-Type": "application/json"})
     response = http_client.fetch(request)
@@ -107,6 +112,17 @@ def normalize_email(email_in):
 
 
 def convert_msg_to_json(msg):
+
+    def parse_message_parts(current_msg):
+        if current_msg.is_multipart():
+            for mpart in current_msg.get_payload():
+                if mpart is not None:
+                    content_type = str(mpart.get_content_type())
+                    if not tornado.options.options.text_only or (content_type.startswith("text") or content_type.startswith("multipart")):
+                        parse_message_parts(mpart)
+        else:
+            result['body'] += strip_html_css_js(current_msg.get_payload(decode=True))
+
     result = {'parts': []}
     if 'message-id' not in msg:
         return None
@@ -117,11 +133,11 @@ def convert_msg_to_json(msg):
     for k in ['to', 'cc', 'bcc']:
         if not result.get(k):
             continue
-        emails_split = result[k].replace('\n', '').replace('\t', '').replace('\r', '').replace(' ', '').encode('utf8').decode('utf-8', 'ignore').split(',')
+        emails_split = str(result[k]).replace('\n', '').replace('\t', '').replace('\r', '').replace(' ', '').encode('utf8').decode('utf-8', 'ignore').split(',')
         result[k] = [normalize_email(e) for e in emails_split]
 
     if "from" in result:
-        result['from'] = normalize_email(result['from'])
+        result['from'] = normalize_email(str(result['from']))
 
     if "date" in result:
         try:
@@ -140,15 +156,7 @@ def convert_msg_to_json(msg):
     # Bodies...
     if tornado.options.options.index_bodies:
         result['body'] = ''
-        if msg.is_multipart():
-            for mpart in msg.get_payload():
-                if mpart is not None:
-                    mpart_payload = mpart.get_payload(decode=True)
-                    if mpart_payload is not None:
-                        result['body'] += strip_html_css_js(mpart_payload)
-        else:
-            result['body'] = strip_html_css_js(msg.get_payload(decode=True))
-
+        parse_message_parts(msg)
         result['body_size'] = len(result['body'])
 
     parts = result.get("parts", [])
@@ -156,6 +164,9 @@ def convert_msg_to_json(msg):
     for part in parts:
         result['content_size_total'] += len(part.get('content', ""))
 
+    if not tornado.options.options.index_x_headers:
+        result = {key: result[key] for key in result if not key.startswith("x-")}
+
     return result
 
 
@@ -166,22 +177,22 @@ def load_from_file():
     create_index()
 
     if tornado.options.options.skip:
-        logging.info("Skipping first %d messages from mbox file" % tornado.options.options.skip)
+        logging.info("Skipping first %d messages" % tornado.options.options.skip)
 
-    count = 0
     upload_data = list()
-    logging.info("Starting import from file %s" % tornado.options.options.infile)
-    # mbox = mailbox.UnixMailbox(open(tornado.options.options.infile, 'rb'), email.message_from_file)
 
-    # removed the above UnixMailbox which is not supported in python 3.x and replaced it with mailbox.mbox class
-    mbox = mailbox.mbox(tornado.options.options.infile)
+    if tornado.options.options.infile:
+        logging.info("Starting import from mbox file %s" % tornado.options.options.infile)
+        mbox = mailbox.mbox(tornado.options.options.infile)
+    else:
+        logging.info("Starting import from MH directory %s" % tornado.options.options.indir)
+        mbox = mailbox.MH(tornado.options.options.indir, factory=None, create=False)
 
-    emailParser = DelegatingEmailParser([AmazonEmailParser(), SteamEmailParser()])
+    #Skipping on keys to avoid expensive read operations on skipped messages
+    msgkeys = mbox.keys()[tornado.options.options.skip:]
 
-    for msg in mbox:
-        count += 1
-        if count < tornado.options.options.skip:
-            continue
+    for msgkey in msgkeys:
+        msg = mbox[msgkey]
         item = convert_msg_to_json(msg)
 
         if item:
@@ -194,7 +205,7 @@ def load_from_file():
     if upload_data:
         upload_batch(upload_data)
 
-    logging.info("Import done - total count %d" % count)
+    logging.info("Import done - total count %d" % len(mbox.keys()))
 
 
 if __name__ == '__main__':
@@ -206,7 +217,10 @@ def load_from_file():
                            help="Name of the index to store your messages")
 
     tornado.options.define("infile", type=str, default=None,
-                           help="The mbox input file")
+                           help="Input file (supported mailbox format: mbox). Mutually exclusive to --indir")
+
+    tornado.options.define("indir", type=str, default=None,
+                           help="Input directory (supported mailbox format: mh). Mutually exclusive to --infile")
 
     tornado.options.define("init", type=bool, default=False,
                            help="Force deleting and re-initializing the Elasticsearch index")
@@ -215,7 +229,7 @@ def load_from_file():
                            help="Elasticsearch bulk index batch size")
 
     tornado.options.define("skip", type=int, default=0,
-                           help="Number of messages to skip from the mbox file")
+                           help="Number of messages to skip from mailbox")
 
     tornado.options.define("num_of_shards", type=int, default=2,
                            help="Number of shards for ES index")
@@ -224,9 +238,19 @@ def load_from_file():
                            help="Will index all body content, stripped of HTML/CSS/JS etc. Adds fields: 'body' and \
                                     'body_size'")
 
+    tornado.options.define("text_only", type=bool, default=False,
+                           help='Only parse message body multiparts declared as text (ignoring images etc.).')
+
+    tornado.options.define("index_x_headers", type=bool, default=True,
+                           help='Index x-* fields from headers')
+
+    tornado.options.define("dry_run", type=bool, default=False,
+                           help='Do not upload to Elastic Search, just process messages')
+
     tornado.options.parse_command_line()
 
-    if tornado.options.options.infile:
+    #Exactly one of {infile, indir} must be set
+    if bool(tornado.options.options.infile) ^ bool(tornado.options.options.indir):
         IOLoop.instance().run_sync(load_from_file)
     else:
         tornado.options.print_help()

From 53283ad37805dfe62253bde14278cebe706841a5 Mon Sep 17 00:00:00 2001
From: Tim Gates <tim.gates@iress.com>
Date: Fri, 25 Dec 2020 08:54:11 +1100
Subject: [PATCH 14/18] docs: fix simple typo, settingsa -> settings (#23)

There is a small typo in README.md.

Should read `settings` rather than `settingsa`.
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ad442d1..c744744 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ Goal of this tutorial is to load an entire Gmail inbox into Elasticsearch using
 
 Set up [Elasticsearch](https://www.elastic.co/guide/en/elasticsearch/guide/current/running-elasticsearch.html) and make sure it's running at [http://localhost:9200](http://localhost:9200)
 
-A quick way to run Elasticsearch is using Docker: (the cors settingsa aren't really needed but come in handy if you want to use e.g. [dejavu](https://dejavu.appbase.io/) to explore the index)
+A quick way to run Elasticsearch is using Docker: (the cors settings aren't really needed but come in handy if you want to use e.g. [dejavu](https://dejavu.appbase.io/) to explore the index)
 ```
 docker run --name es -d -p 9200:9200 -e http.port=9200 -e http.cors.enabled=true -e 'http.cors.allow-origin=*' -e http.cors.allow-headers=X-Requested-With,X-Auth-Token,Content-Type,Content-Length,Authorization -e http.cors.allow-credentials=true -e "discovery.type=single-node" docker.elastic.co/elasticsearch/elasticsearch-oss:7.6.1
 ```

From 746ea588e4fa135a219c9fdf39541832faa732dd Mon Sep 17 00:00:00 2001
From: Stephen George <sfgeorge@users.noreply.github.com>
Date: Sat, 11 Jun 2022 14:57:21 -0500
Subject: [PATCH 15/18] Fix --index-bodies to be intended default of False.
 (#24)

---
 src/index_emails.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/index_emails.py b/src/index_emails.py
index b3693d8..0c2b316 100644
--- a/src/index_emails.py
+++ b/src/index_emails.py
@@ -234,7 +234,7 @@ def load_from_file():
     tornado.options.define("num_of_shards", type=int, default=2,
                            help="Number of shards for ES index")
 
-    tornado.options.define("index_bodies", type=bool, default=True,
+    tornado.options.define("index_bodies", type=bool, default=False,
                            help="Will index all body content, stripped of HTML/CSS/JS etc. Adds fields: 'body' and \
                                     'body_size'")
 

From 5cbb522a57407095e17e33a0024a628bd2ac94b9 Mon Sep 17 00:00:00 2001
From: Oliver <oliver@21zoo.com>
Date: Sun, 25 Jun 2023 18:08:19 -0400
Subject: [PATCH 16/18] tornado==6.3.2

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index b4cf28f..55a9969 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,3 @@
 beautifulsoup4==4.6.0
 chardet==3.0.4
-tornado==4.5.3
+tornado==6.3.2

From b6be72600ceaf92212d22a3fc891f520a002dbba Mon Sep 17 00:00:00 2001
From: Oliver <oliver006@users.noreply.github.com>
Date: Thu, 17 Aug 2023 09:13:11 -0700
Subject: [PATCH 17/18] Update tornado to 6.3.3

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 55a9969..25e9892 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,3 @@
 beautifulsoup4==4.6.0
 chardet==3.0.4
-tornado==6.3.2
+tornado==6.3.3

From 25e9cccb321e69c294919aad07eae724fcc968a4 Mon Sep 17 00:00:00 2001
From: Oliver <oliver@21zoo.com>
Date: Fri, 7 Jun 2024 22:54:38 -0700
Subject: [PATCH 18/18] tornado 6.4.1

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 25e9892..72d9aef 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,3 @@
 beautifulsoup4==4.6.0
 chardet==3.0.4
-tornado==6.3.3
+tornado==6.4.1