From 440584f0f9108b8dce275d6fa385c5e989f44f13 Mon Sep 17 00:00:00 2001
From: Sidhant Arora <32369693+Sidhant29@users.noreply.github.com>
Date: Fri, 19 Aug 2022 22:41:10 +1000
Subject: [PATCH] Updated Readme (#54)

---
 README.md | 91 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 88 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 3fe7619..e6855f4 100644
--- a/README.md
+++ b/README.md
@@ -44,6 +44,8 @@
 The Python bindings have a similar API to Tantivy. To create an index, first a schema
 needs to be built. After that, documents can be added to the index and a reader
 can be created to search the index.
 
+## Building an index and populating it
+
 ```python
 import tantivy
@@ -51,29 +53,112 @@ import tantivy
 schema_builder = tantivy.SchemaBuilder()
 schema_builder.add_text_field("title", stored=True)
 schema_builder.add_text_field("body", stored=True)
+schema_builder.add_integer_field("doc_id", stored=True, indexed=True)
 schema = schema_builder.build()
 
-# Creating our index (in memory, but filesystem is available too)
+# Creating our index (in memory)
 index = tantivy.Index(schema)
+```
+
+To have a persistent index, use the path
+parameter to store the index on disk, e.g.:
+
+```python
+import os
+index = tantivy.Index(schema, path=os.getcwd() + '/index')
+```
+
+By default, tantivy offers the following tokenizers,
+which can be used in tantivy-py:
+
+- `default`
+  `default` is the tokenizer that will be used if you do not
+  assign a specific tokenizer to your text field.
+  It will chop your text on punctuation and whitespace,
+  remove tokens that are longer than 40 chars, and lowercase your text.
+
+- `raw`
+  Does not actually tokenize your text. It keeps it entirely unprocessed.
+  It can be useful for indexing UUIDs or URLs, for instance.
+
+- `en_stem`
+  In addition to what `default` does, the `en_stem` tokenizer also
+  applies stemming to your tokens. Stemming consists of trimming words to
+  remove their inflection.
+  This tokenizer is slower than the default one,
+  but is recommended to improve recall.
+
+To use the above tokenizers, simply provide them as a parameter to `add_text_field`, e.g.:
+
+```python
+schema_builder.add_text_field("body", stored=True, tokenizer_name='en_stem')
+```
+
+### Adding one document
+
+```python
 writer = index.writer()
 writer.add_document(tantivy.Document(
+    doc_id=1,
     title=["The Old Man and the Sea"],
     body=["""He was an old man who fished alone in a skiff in
     the Gulf Stream and he had gone eighty-four days now
     without taking a fish."""],
 ))
 # ... and committing
 writer.commit()
+```
+
+## Building and Executing Queries
+
+First, you need to get a searcher for the index:
+
+```python
 # Reload the index to ensure it points to the last commit.
 index.reload()
 searcher = index.searcher()
-query = index.parse_query("fish days", ["title", "body"])
+```
+
+Then you need to get a valid query object by parsing your query on the index:
+
+```python
+query = index.parse_query("fish days", ["title", "body"])
 (best_score, best_doc_address) = searcher.search(query, 3).hits[0]
 best_doc = searcher.doc(best_doc_address)
 assert best_doc["title"] == ["The Old Man and the Sea"]
 print(best_doc)
 ```
+
+### Valid Query Formats
+
+tantivy-py supports the query language used in tantivy.
+Some basic query formats:
+
+- AND and OR conjunctions.
+```python
+query = index.parse_query('(Old AND Man) OR Stream', ["title", "body"])
+(best_score, best_doc_address) = searcher.search(query, 3).hits[0]
+best_doc = searcher.doc(best_doc_address)
+```
+
+- `+` (include) and `-` (exclude) operators.
+```python
+query = index.parse_query('+Old +Man chef -fished', ["title", "body"])
+(best_score, best_doc_address) = searcher.search(query, 3).hits[0]
+best_doc = searcher.doc(best_doc_address)
+```
+Note: in a query like the above, a word without a `+` or `-` acts like an OR.
+
+- Phrase search.
+```python
+query = index.parse_query('"eighty-four days"', ["title", "body"])
+(best_score, best_doc_address) = searcher.search(query, 3).hits[0]
+best_doc = searcher.doc(best_doc_address)
+```
+
+- Integer search.
+```python
+query = index.parse_query('1', ["doc_id"])
+(best_score, best_doc_address) = searcher.search(query, 3).hits[0]
+best_doc = searcher.doc(best_doc_address)
+```
+Note: for integer search, the integer field should be indexed.
+
+For more query formats and options, see the [Tantivy Query Parser Docs](https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html).
\ No newline at end of file