diff --git a/.gitignore b/.gitignore index d89fa7d..b112a40 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,8 @@ # ---> Hugo # Generated files by hugo /public/ -/resources/_gen/ +/brainsteam/public +/brainstorm/resources/_gen/ # Executable may be added to repository hugo.exe diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..1914fe7 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "brainsteam/themes/hugo-ink"] + path = brainsteam/themes/hugo-ink + url = https://github.com/knadh/hugo-ink.git diff --git a/brainsteam/archetypes/default.md b/brainsteam/archetypes/default.md new file mode 100644 index 0000000..00e77bd --- /dev/null +++ b/brainsteam/archetypes/default.md @@ -0,0 +1,6 @@ +--- +title: "{{ replace .Name "-" " " | title }}" +date: {{ .Date }} +draft: true +--- + diff --git a/brainsteam/config.toml b/brainsteam/config.toml new file mode 100644 index 0000000..a49c79e --- /dev/null +++ b/brainsteam/config.toml @@ -0,0 +1,51 @@ +baseURL = "http://brainsteam.co.uk/" +languageCode = "en-us" +title = "Brainsteam" +theme='hugo-ink' + +[markup.goldmark.renderer] +unsafe= true + +[params] + subtitle = "The irregular mental expulsions of a PhD student and CTO of Filament, my views are my own and do not represent my employers in any way." 
+ + avatar = "/images/avatar.png" + +[[menu.main]] +name = "Home" +url = "/" +weight = 1 + +[[menu.main]] +name = "All posts" +url = "/posts" +weight = 2 + +[[menu.main]] +name = "Tags" +url = "/tags" +weight = 4 + +[[menu.main]] +name = "My Home Page" +url = "https://jamesravey.me" +weight = 3 + +[[params.social]] +name = "Twitter" +icon = "twitter" +url = "https://twitter.com/jamesravey/" + +[[params.social]] +name = "Github" +icon = "github" +url = "https://github.com/ravenscroftj" + +[[params.social]] +name = "RSS" +icon = "rss" +url = "/index.xml" + + +[taxonomies] + tag = "tags" \ No newline at end of file diff --git a/brainsteam/content/posts/2015-06-28-bedford-place-vintage-festival.md b/brainsteam/content/posts/2015-06-28-bedford-place-vintage-festival.md new file mode 100644 index 0000000..b6b05d0 --- /dev/null +++ b/brainsteam/content/posts/2015-06-28-bedford-place-vintage-festival.md @@ -0,0 +1,28 @@ +--- +title: Bedford Place Vintage Festival +author: James +type: post +date: 2015-06-28T10:36:28+00:00 +url: /2015/06/28/bedford-place-vintage-festival/ +categories: + - Lindyhop +tags: + - bic + - festival + - lindyhop + - shimsham + - simone + - southampton + - vintage +format: video + +--- +Last week a bunch of my lindyhop group went and performed at the Bedford Place Vintage Festival in Southampton – its an annual event that I’ve been to twice now and we had an absolute ball. + +I think I enjoyed it that much more this year purely because I’ve been dancing twice as long now and I can hold my own on the social dance floor. + +Here’s a video of our crew performing the Shim Sham to “Mama do the hump” + +
#!/bin/bash
+
+for xmlfile in $NEMO_SCRIPT_SELECTED_FILE_PATHS; do
+
+ if [[ $xmlfile == *.xml ]]
+ then
+ xmllint --format $xmlfile > $xmlfile.tmp
+ rm $xmlfile
+ mv $xmlfile.tmp $xmlfile
+ fi
+done
+
+
+Pop that in a file called “Tidy XML” in your ~/.local/share/nemo/scripts directory and when you inspect files with Nemo it should appear in the right-click menu.
\ No newline at end of file
diff --git a/brainsteam/content/posts/2015-07-15-sssplit-improvements.md b/brainsteam/content/posts/2015-07-15-sssplit-improvements.md
new file mode 100644
index 0000000..f64f8fb
--- /dev/null
+++ b/brainsteam/content/posts/2015-07-15-sssplit-improvements.md
@@ -0,0 +1,77 @@
+---
+title: SSSplit Improvements
+author: James
+type: post
+date: 2015-07-15T19:33:29+00:00
+url: /2015/07/15/sssplit-improvements/
+categories:
+ - PhD
+ - Work
+tags:
+ - demo
+ - improvements
+ - java
+ - partridge
+ - python
+ - regex
+ - sapienta
+ - split
+ - sssplit
+ - test
+
+---
+## Introduction
+
+As part of my continuing work on [Partridge][1], I’ve been working on improving the sentence splitting capability of SSSplit – the component used to split academic papers from PLosOne and PubMedCentral into separate sentences.
+
+Papers arrive in our system as big blocks of text with the occasional diagram, formula or citation and in order to apply CoreSC annotations to the sentences we need to know where each sentence starts and ends. Of course that means we also have to take into account the other ‘stuff’ (listed above) floating around in the documents too. We can’t just ignore formulae and citations – they’re pretty important! That’s what SSSplit does. It carves up papers into sentence (_pattern = re.compile('(\.|\?|\!)(?=\s*[A-Z0-9$])|\.$')
+
+ m = pattern.search(txt)
+
+
+Of course this generates lots of false positives – what if we’ve found a decimal point inside a number? What if it’s an abbreviation like e.g. or i.e. or an initial like J. Ravenscroft? There is another regular expression check for decimal points and the string around the punctuation is checked against a list of common abbreviations. There’s also a list of authors both the writers of the paper in question and those who are cited in the paper too. The function checks that the full stop is not part of one of these authors’ names.
+
+There’s an important factor to remember: Text node does not imply finished sentence – they are interspersed with formulae and references as explained above. Therefore we can’t just finish the current sentence when we reach the end of a text node – only when we encounter a full stop (not part of an abbreviation or number), question mark or exclamation mark. We also know that we can complete the current sentence at the end of a p-level container as I explained above.
+
+Every time we start parsing a sentence, text nodes and other ‘stuff’ deemed to be inside that sentence is accumulated into a list. Once we encounter the end of the sentence, the list is glued together and turned into an XML $ git clone git@github.com:ravenscroftj/solrpy.git + $ python setup.py install+ +The next step is to run python and initialise your connection. The URL you should use to initialise your SOLR connection has the following structure: + +
https://gateway.watsonplatform.net/retrieve-and-rank/api/v1/solr_clusters/<CLUSTER_ID>/solr/<COLLECTION_NAME>+ +You will also need the credentials from your bluemix service which should look something like this: + +
{ + "credentials": { + "url": "https://gateway.watsonplatform.net/retrieve-and-rank/api", + "username": "<USERNAME>", + "password": "<PASSWORD>" + } +}+ +In python you should try running the following (I am using the interactive python shell [IDLE][3] for this example) + +
>>> import solr +>>> s = solr.Solr("https://gateway.watsonplatform.net/retrieve-and-rank/api/v1/solr_clusters/<CLUSTER_ID>/solr/<COLLECTION_NAME>", http_user="<USERNAME>", http_pass="<PASSWORD>") +>>> s.search("hello world") +<solr.core.Response object at 0x7ff77f91d7d0>+ +If this worked then you will see something like _**
>>> s.add({"title" : "test", "text" : "this is a test", "id" : 1}) +'<?xml version="1.0" encoding="UTF-8"?>\n<response>\n<lst name="responseHeader"><int name="status">0</int><int name="QTime">167</int></lst>\n</response>\n' +>>> s.commit() +'<?xml version="1.0" encoding="UTF-8"?>\n<response>\n<lst name="responseHeader"><int name="status">0</int><int name="QTime">68</int></lst>\n</response>\n'+ +The XML output shows that the initial add and then commit operations were both successful. + +## Content Management + +You can also add a number of documents – this is specifically useful if you have a large number of python objects to insert into SOLR: + +
>>> s.add_many( [ { "title" : x['title'], "text" : x['text'], "id" : i } for i,x in enumerate(my_list_of_items) ] ) +'<?xml version="1.0" encoding="UTF-8"?>\n<response>\n<lst name="responseHeader"><int name="status">0</int><int name="QTime">20</int></lst>\n</response>\n'+ +Of course you can also delete items via their ID from python too: + +
>>> s.delete(id=1) +'<?xml version="1.0" encoding="UTF-8"?>\n<response>\n<lst name="responseHeader"><int name="status">0</int><int name="QTime">43</int></lst>\n</response>\n'+ +## Querying SOLR (unranked results) + +And you can use SOLR queries too (but importantly note that this does not use the retrieve and rank rankers – this only gives you access to the SOLR rankers.) + +
>>> r = s.select("test") +>>> r.numFound +1L +>>> r.results +[{u'_version_': 1518020997236654080L, u'text': [u'this is a test'], u'score': 0.0, u'id': u'1', u'title': [u'test']}] + ++ +## Querying Rankers + +Provided you have [successfully trained a ranker ][4] and have the ranker ID handy, you can also query your ranker directly from Python using solrpy too. + +
>>> import solr +>>> s = solr.Solr("https://gateway.watsonplatform.net/retrieve-and-rank/api/v1/solr_clusters/<CLUSTER_ID>/solr/<COLLECTION_NAME>", http_user="<USERNAME>", http_pass="<PASSWORD>") +>>> fcselect = solr.SearchHandler(s, "/fcselect") +>>> r = fcselect("my query text", ranker_id="<RANKER-ID>")+ +in this case **r **is the same type as in the above non-ranker example, you can access the results via **r.results.** + +## More information + +For more information on how to use solrpy, visit their documentation page [here][5] + + [1]: http://cmadison.me/2015/10/23/introducing-ibms-retrieve-and-rank-service/ + [2]: https://github.com/edsu/solrpy + [3]: https://en.wikipedia.org/wiki/IDLE_(Python) + [4]: http://www.ibm.com/smarterplanet/us/en/ibmwatson/developercloud/doc/retrieve-rank/get_start.shtml#create-train + [5]: http://pythonhosted.org/solrpy/ \ No newline at end of file diff --git a/brainsteam/content/posts/2015-11-17-spellchecking-in-retrieve-and-rank.md b/brainsteam/content/posts/2015-11-17-spellchecking-in-retrieve-and-rank.md new file mode 100644 index 0000000..c5a1b09 --- /dev/null +++ b/brainsteam/content/posts/2015-11-17-spellchecking-in-retrieve-and-rank.md @@ -0,0 +1,175 @@ +--- +title: Spellchecking in retrieve and rank +author: James +type: post +date: 2015-11-17T21:41:09+00:00 +url: /2015/11/17/spellchecking-in-retrieve-and-rank/ +categories: + - Work +tags: + - checker + - improvements + - rank + - retrieve + - search + - solr + - spell + - spelling + - suggestions + - tuning + - watson + +--- +### Introduction + +Being able to deal with typos and incorrect spellings is an absolute must in any modern search facility. Humans can be lazy and clumsy and I personally often search for things with incorrect terms due to my sausage fingers. 
In this article I will explain how to turn on spelling suggestions in retrieve and rank so that if your users ask your system for something with a clumsy query, you can suggest spelling fixes for them so that they can submit another, more fruitful question to the system. + +Spellchecking is a standard feature of Apache SOLR which is turned off by default with Retrieve and Rank. This post will walk through the process of turning it on for your instance and enabling spell checking suggestions to be returned as part of calls rankers through fcselect. Massive shout out to David Duffett on Stack Overflow who posted [this answer][1] from which most of my blog post is derived. + +### Enabling spell checking in your schema + +The first thing we need to do is set up a spell checker field in our SOLR schema. For the sake of simplicity, the example schema used below only has a title and text field which are used in indexing and querying. However, this methodology can easily be extended to as many fields as your use case requires. + +### Set up field type + +The first thing you need to do is define a “textSpell” field type which SOLR can use to build a field into which it can dump valid words from your corpus that have been preprocessed and made ready for use in the spell checker. Create the following element in **your schema.xml** file: + +
<fieldType name="textSpell" class="solr.TextField" positionIncrementGap="100" omitNorms="true"> + <analyzer type="index"> + <tokenizer class="solr.StandardTokenizerFactory" /> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" /> + <filter class="solr.LowerCaseFilterFactory" /> + <filter class="solr.StandardFilterFactory" /> + </analyzer> + <analyzer type="query"> + <tokenizer class="solr.StandardTokenizerFactory" /> + <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true" /> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" /> + <filter class="solr.LowerCaseFilterFactory" /> + <filter class="solr.StandardFilterFactory" /> + </analyzer> + </fieldType>+ +This field type runs a lower case filter over the words provided in the input and also expands any synonyms defined in synonyms.txt and ignores any stopwords defined in stopwords.txt before storing the output in the field. This should give us a list of lower case words that are useful in search and spell checking. + +### Create a spellcheck copy field in your schema + +The next step is to create a “textSpell” field in your SOLR schema that stores the “suggestions” from the main content to be used by the spellchecker API. + +The following XML defines the field in your schema and should be copied **into schema.xml.** It assumes that you have a couple of content fields called “title” and “text” from which content can be copied and filtered for use in the spell checker. + +
<field name="spell" type="textSpell" indexed="true" stored="false" multiValued="true" /> + + <copyField source="title" dest="spell"/> + <copyField source="text" dest="spell"/>+ +### Defining the spellcheck search component + +Once you have finished setting up your schema, you can define the spellchecker parameters **in solrconfig.xml.** + +The following XML defines two spelling analysers. The DirectSolrSpellChecker which pulls search terms directly from the index adhoc – this means that it does not need to be regularly reindexed/rebuilt and always has up to date spelling suggestions. + +[`WordBreakSolrSpellChecker` offers suggestions by combining adjacent query terms and/or breaking terms into multiple words.][2] This means that it can provide suggestions that DirectSolrSpellChecker might not find where, for example, a user has a spelling mistake in one of the words in a multi-word search term. + +Notice that both lst elements contain a
<searchComponent name="spellcheck" class="solr.SpellCheckComponent"> +<lst name="spellchecker"> + <str name="name">default</str> + <str name="field">spell</str> + <str name="classname">solr.DirectSolrSpellChecker</str> + <str name="distanceMeasure">internal</str> + <float name="accuracy">0.5</float> + <int name="maxEdits">2</int> + <int name="minPrefix">1</int> + <int name="maxInspections">5</int> + <int name="minQueryLength">4</int> + <float name="maxQueryFrequency">0.01</float> + <float name="thresholdTokenFrequency">.01</float> + </lst> + + <lst name="spellchecker"> + <str name="name">wordbreak</str> + <str name="classname">solr.WordBreakSolrSpellChecker</str> + <str name="field">spell</str> + <str name="combineWords">true</str> + <str name="breakWords">true</str> + <int name="maxChanges">10</int> + </lst> +</searchComponent>+ +### Add spelling suggestions to your request handlers + +The default SOLR approach is to add a new request handler that deals with searches on the **/spell** endpoint. However, there is no reason why you can’t add spelling suggestions to any endpoint including **/select** and perhaps more relevently in retrieve and rank **/fcselect**. Below is a snippet of XML for a custom /spell endpoint: + +
<requestHandler name="/spell" class="solr.SearchHandler" startup="lazy"> + <lst name="defaults"> + <!-- Solr will use suggestions from both the 'default' spellchecker + and from the 'wordbreak' spellchecker and combine them. + collations (re-written queries) can include a combination of + corrections from both spellcheckers --> + <str name="spellcheck.dictionary">default</str> + <str name="spellcheck.dictionary">wordbreak</str> + <str name="spellcheck">on</str> + <str name="spellcheck.extendedResults">true</str> + <str name="spellcheck.count">10</str> + <str name="spellcheck.alternativeTermCount">5</str> + <str name="spellcheck.maxResultsForSuggest">5</str> + <str name="spellcheck.collate">true</str> + <str name="spellcheck.collateExtendedResults">true</str> + <str name="spellcheck.maxCollationTries">10</str> + <str name="spellcheck.maxCollations">5</str> + </lst> + <arr name="last-components"> + <str>spellcheck</str> + </arr> + </requestHandler>+ +The following snippet adds spellchecking suggestions to the **/fcselect** endpoint. Simply append the XML inside the _**
<requestHandler name="/fcselect" class="com.ibm.watson.hector.plugins.ss.FCSearchHandler"> + <lst name="defaults"> + <str name="defType">fcQueryParser</str> + <str name="spellcheck.dictionary">default</str> + <str name="spellcheck.dictionary">wordbreak</str> + <str name="spellcheck.count">20</str> + </lst> + <arr name="last-components"> + <str>fcFeatureGenerator</str> + <str>spellcheck</str> + </arr> +</requestHandler>+ +### Create and populate your SOLR index in Retrieve and Rank + +If you haven’t done this before, you should really read the [official documentation][3] and may want to read [my post about using python to do it too.][4] + +You should also [train a ranker][5] so that you can take advantage of the fcselect with spelling suggestions example below. + +### Test your new spelling suggestor + +Once you’ve got your collection up and running you should be able to try out the new spelling suggestor. First we’ll inspect **/spell:** + +
$ curl -u $USER:$PASSWORD "https://gateway.watsonplatform.net/retrieve-and-rank/api/v1/solr_clusters/$CLUSTER_ID/solr/$COLLECTION_NAME/spell?q=businwss&wt=json + +{"responseHeader":{"status":0,"QTime":4},"response":{"numFound":0,"start":0,"docs":[]},"spellcheck":{"suggestions":["businwss",{"numFound":1,"startOffset":0,"endOffset":8,"origFreq":0,"suggestion":[{"word":"business","freq":3}]}],"correctlySpelled":false,"collations":["collation",{"collationQuery":"business","hits":3,"misspellingsAndCorrections":["businwss","business"]}]}} + ++ +As you can see, the system has not found any documents containing the word **businwss. **However, it has identified **businwss **(easily misspelt because ‘e’ and ‘w’ are next to each other) as a typo of **business**. It has also suggested business as a correction. This can be presented back to the user so that they can refine their search and presented with more results. + +Now lets also look at how to use spellcheck with your ranker results. + +
$ curl -u $USER:$PASSWORD "https://gateway.watsonplatform.net/retrieve-and-rank/api/v1/solr_clusters/$CLUSTER_ID/solr/$COLLECTION_NAME/fcselect?ranker_id=$RANKER_ID&q=test+splling+mstaek&wt=json&fl=id,title,score&spellcheck=true&spellcheck=true" + +{"responseHeader":{"status":0,"QTime":4},"response":{"numFound":0,"start":0,"maxScore":0.0,"docs":[]},"spellcheck":{"suggestions":["businwss",{"numFound":1,"startOffset":0,"endOffset":8,"suggestion":["business"]}]}}+ +You should see something similar to the above. The SOLR search failed to return any results for the ranker to rank. However it has come up with a spelling correction which should return more results for ranking next time. + + [1]: http://stackoverflow.com/questions/6653186/solr-suggester-not-returning-any-results + [2]: https://cwiki.apache.org/confluence/display/solr/Spell+Checking + [3]: http://www.ibm.com/smarterplanet/us/en/ibmwatson/developercloud/doc/retrieve-rank/get_start.shtml + [4]: https://brainsteam.co.uk/2015/11/16/retrieve-and-rank-and-python/ + [5]: https://www.ibm.com/smarterplanet/us/en/ibmwatson/developercloud/doc/retrieve-rank/plugin_overview.shtml#generate_queries \ No newline at end of file diff --git a/brainsteam/content/posts/2015-11-21-scrolling-in-elasticsearch.md b/brainsteam/content/posts/2015-11-21-scrolling-in-elasticsearch.md new file mode 100644 index 0000000..faf02db --- /dev/null +++ b/brainsteam/content/posts/2015-11-21-scrolling-in-elasticsearch.md @@ -0,0 +1,30 @@ +--- +title: Scrolling in ElasticSearch +author: James +type: post +date: 2015-11-21T09:41:19+00:00 +url: /2015/11/21/scrolling-in-elasticsearch/ +categories: + - PhD +tags: + - elasticsearch + - lucene + - python + - results + - scan + - scroll + +--- +I know I’m doing a lot of flip-flopping between SOLR and Elastic at the moment – I’m trying to figure out key similarities and differences between them and where one is more suitable than the other. 
+ +The following is an example of how to map a function _**f **_onto an entire set of indexed data in elastic using the scroll API. + +If you use elastic, it is possible to do paging by adding a size and a from parameter. For example if you wanted to retrieve results in pages of 5 starting from the 3rd page (i.e. show results 11-15) you would do: + +
GET /_search?size=5&from=10+ +However this becomes more expensive as you move further and further into the list of results. Each time you make one of these calls you are re-running the search operation – forcing Lucene to go off and re-score all the results, rank them and then discard the first 10 (or 10000 if you get that far). There is an easier option: the scan and scroll API. + +The idea is that you run your actual query once and then Elastic caches the result somewhere gives you an “access token” to go back in and get them. Then you call the scroll API endpoint with said token to get each page of results (a caveat of this is that each time you make a call your token updates and you need to use the new one. My code sample deals with this but it took me a while to figure out what was going on). + +The below code uses the python elasticsearch library to make a scan and scroll call to an index and continues to load results until there are no more hits. For each page it maps a function _**f**_** **onto the results. It would not be hard to modify this code to work on multiple threads/processes using the Python multiprocessing API. Take a look! \ No newline at end of file diff --git a/brainsteam/content/posts/2015-11-22-freecite-python-wrapper.md b/brainsteam/content/posts/2015-11-22-freecite-python-wrapper.md new file mode 100644 index 0000000..761a194 --- /dev/null +++ b/brainsteam/content/posts/2015-11-22-freecite-python-wrapper.md @@ -0,0 +1,25 @@ +--- +title: Freecite python wrapper +author: James +type: post +date: 2015-11-22T19:20:19+00:00 +url: /2015/11/22/freecite-python-wrapper/ +categories: + - PhD +tags: + - citations + - freecite + - python + - rcuk + - ref + - references + +--- +I’ve written a simple wrapper around the Brown University Citation parser [FreeCite][1]. I’m planning to use the service to pull out author names from references in REF impact studies and try to link them back to investigators listed on RCUK funding applications. 
+ +The code is [here][2] and is MIT licensed. It provides a simple method which takes a string representing a reference and returns a dict with each field separated. There is also a parse_many function which takes an array of reference strings and returns an array of dicts. + + + + [1]: http://freecite.library.brown.edu/ + [2]: https://github.com/ravenscroftj/freecite \ No newline at end of file diff --git a/brainsteam/content/posts/2015-11-28-watson-home-automation.md b/brainsteam/content/posts/2015-11-28-watson-home-automation.md new file mode 100644 index 0000000..ead2314 --- /dev/null +++ b/brainsteam/content/posts/2015-11-28-watson-home-automation.md @@ -0,0 +1,50 @@ +--- +title: Home automation with Raspberry Pi and Watson +author: James +type: post +date: 2015-11-28T10:57:14+00:00 +url: /2015/11/28/watson-home-automation/ +categories: + - Work +tags: + - automation + - home + - iot + - jasper + - pi + - raspberry + - speech + - speech-to-text + - stt + - text + - watson + +--- +I’ve recently been playing with trying to build a Watson powered home automation system using my Raspberry Pi and some other electronic bits that I have on hand. + +There are already a lot of people doing work in this space. One of the most successful projects being [JASPER][1] which uses speech to text and an always on background listening microphone to talk to you and carry out actions when you ask it things in natural language like “What’s the weather going to be like tomorrow?” and “What is the meaning of life?” Jasper works using a library called [Sphinx][2] developed by Carnegie Mellon University to do speech recognition. However the models aren’t great – especially if you have a british accent. + +Jasper also allows you to use other speech to text libraries and services too such as the [Google Speech service][3] and the [AT&T speech service][4]. However there is no currently available code for using the Watson speech to text API – until now. 
+ +The below code snippet can be added to your stt.py file in your jasper project. + + + +Then you need to create a Watson speech-to-text instance in bluemix add the following to your JASPER configuration: + +
stt_engine: watson +stt_passive_engine: sphinx +watson-stt: + username: "<Text-to-speech-credentials-username>" + password: "<Text-to-speech-credentials-password>"+ +This configuration will use the local Sphinx engine to listen out for “JASPER” or whatever you choose to call your companion (which it is actually pretty good at) and then send off 10-15s of audio to Watson STT to be analysed more accurately once the trigger word has been detected. Here’s a video of the system in action: + +
{ + "query":{ + "filtered":{ + "query":{ + "match_all":{ + + } + }, + "filter":{ + "term":{ + "UOA":"General Engineering" + } + } + } + } +}+ +For some reason this returns zero results. Now it took me ages to find [this page][1] in the elastic manual which talks about the exact phenomenon I’m running into above. It turns out that the default analyser is tokenizing every text field and so Elastic has no notion of UOA ever containing “General Engineering”. Instead it only knows of a UOA field that contains the word “general” and the word “engineering” independently of each other in the model somewhere (bag-of-words). To solve this you have to + + * Download the existing schema from elastic: + *
curl -XGET "http://localhost:9200/impact_studies/_mapping/study" master [4cb268b] untracked +{"impact_studies":{"mappings":{"study":{"properties":{"CaseStudyId":{"type":"string"},"Continent":{"properties":{"GeoNamesId":{"type":"string"},"Name":{"type":"string"}}},"Country":{"properties":{"GeoNamesId":{"type":"string"},"Name":{"type":"string"}}},"Funders":{"type":"string"},"ImpactDetails":{"type":"string"},"ImpactSummary":{"type":"string"},"ImpactType":{"type":"string"},"Institution":{"type":"string"},"Institutions":{"properties":{"AlternativeName":{"type":"string"},"InstitutionName":{"type":"string"},"PeerGroup":{"type":"string"},"Region":{"type":"string"},"UKPRN":{"type":"long"}}},"Panel":{"type":"string"},"PlaceName":{"properties":{"GeoNamesId":{"type":"string"},"Name":{"type":"string"}}},"References":{"type":"string"},"ResearchSubjectAreas":{"properties":{"Level1":{"type":"string"},"Level2":{"type":"string"},"Subject":{"type":"string"}}},"Sources":{"type":"string"},"Title":{"type":"string"},"UKLocation":{"properties":{"GeoNamesId":{"type":"string"},"Name":{"type":"string"}}},"UKRegion":{"properties":{"GeoNamesId":{"type":"string"},"Name":{"type":"string"}}},"UOA":{"type":"string"},"UnderpinningResearch":{"type":"string"}}}}}}+ + * Delete the schema (unfortunately you can’t make this change on the fly) and then turn off the analyser which tokenizes the values in the field: + +
$ curl -XDELETE "http://localhost:9200/impact_studies"+ + * Then recreate the schema with “index”:”not_analyzed” on the field you are interested in: + +
curl -XPUT "http://localhost:9200/impact_studies/" -d '{"mappings":{"study":{"properties":{"CaseStudyId":{"type":"string"},"Continent":{"properties":{"GeoNamesId":{"type":"string"},"Name":{"type":"string"}}},"Country":{"properties":{"GeoNamesId":{"type":"string"},"Name":{"type":"string"}}},"Funders":{"type":"string"},"ImpactDetails":{"type":"string"},"ImpactSummary":{"type":"string"},"ImpactType":{"type":"string"},"Institution":{"type":"string"},"Institutions":{"properties":{"AlternativeName":{"type":"string"},"InstitutionName":{"type":"string"},"PeerGroup":{"type":"string"},"Region":{"type":"string"},"UKPRN":{"type":"long"}}},"Panel":{"type":"string"},"PlaceName":{"properties":{"GeoNamesId":{"type":"string"},"Name":{"type":"string"}}},"References":{"type":"string"},"ResearchSubjectAreas":{"properties":{"Level1":{"type":"string"},"Level2":{"type":"string"},"Subject":{"type":"string"}}},"Sources":{"type":"string"},"Title":{"type":"string"},"UKLocation":{"properties":{"GeoNamesId":{"type":"string"},"Name":{"type":"string"}}},"UKRegion":{"properties":{"GeoNamesId":{"type":"string"},"Name":{"type":"string"}}},"UOA":{"type":"string", "index" : "not_analyzed"},"UnderpinningResearch":{"type":"string"}}}}}'+ +Once you’ve done this you’re good to go reingesting your data and your filter queries should be much more fruitful. 
+ + [1]: https://www.elastic.co/guide/en/elasticsearch/guide/current/_finding_exact_values.html#_term_filter_with_text \ No newline at end of file diff --git a/brainsteam/content/posts/2016-03-29-cognitive-quality-assurance-an-introduction.md b/brainsteam/content/posts/2016-03-29-cognitive-quality-assurance-an-introduction.md new file mode 100644 index 0000000..3deef14 --- /dev/null +++ b/brainsteam/content/posts/2016-03-29-cognitive-quality-assurance-an-introduction.md @@ -0,0 +1,184 @@ +--- +title: Cognitive Quality Assurance – An Introduction +author: James +type: post +date: 2016-03-29T08:50:29+00:00 +url: /2016/03/29/cognitive-quality-assurance-an-introduction/ +medium_post: + - 'O:11:"Medium_Post":11:{s:16:"author_image_url";s:69:"https://cdn-images-1.medium.com/fit/c/200/200/0*naYvMn9xdbL5qlkJ.jpeg";s:10:"author_url";s:30:"https://medium.com/@jamesravey";s:11:"byline_name";N;s:12:"byline_email";N;s:10:"cross_link";s:2:"no";s:2:"id";s:12:"e20dc490dab8";s:21:"follower_notification";s:3:"yes";s:7:"license";s:19:"all-rights-reserved";s:14:"publication_id";s:12:"22a2beb5a88a";s:6:"status";s:5:"draft";s:3:"url";s:43:"https://medium.com/@jamesravey/e20dc490dab8";}' +categories: + - Uncategorized + - Work +tags: + - assurance + - cognitive + - cqa + - machine learning + - qa + - quality + - watson + +--- + +***EDIT: Hello readers, these articles are now 4 years old and many of the Watson services and APIs have moved or been changed. The concepts discussed in these articles are still relevant but I am working on 2nd editions of them.*** + +
+ Quality assurance is arguably one of the most important parts of the software development lifecycle. In order to release a product that is production ready, it must be put under, and pass, a number of tests – these include unit testing, boundary testing, stress testing and other practices that many software testers are no doubt familiar with. The ways in which traditional software is tested are relatively clear. In a normal system, developers write deterministic functions, that is – if you put an input parameter in, unless there is a bug, you will always get the same output back. This principle makes it.. well not easy… but less difficult to write good test scripts and know that there is a bug or regression in your system if these scripts get a different answer back than usual.
+
+ ++ Cognitive systems are not deterministic in nature. This means that you can receive different results from the same input data when training a system. Such systems tend to be randomly initialised and learn in different, nuanced ways every time they are trained. This is similar to how identical twins who may be biologically identical still learn their own preferences, memories and skillsets. +
+ ++ Thus, a traditional unit testing approach with tests that pass and fail depending on how the output of the system compares to an expected result is not helpful. +
+ +

+ This article is the first in a series on Cognitive Quality Assurance. Or in other words, how to test and validate the performance of non-deterministic, machine learning systems. In today’s article we look at how to build a good quality ground truth, then how to carry out train/test/blind data segmentation, and how you can use your ground truth to verify that a cognitive system is doing its job. +
+ ++ Let’s take a step back for a moment and make sure we’re ok with the concept of ground truth. +
+ ++ In machine learning/cognitive applications, the ground truth is the dataset which you use to train and test the system. You can think of it like a school textbook that the cognitive system treats as the absolute truth and first point of reference for learning the subject at hand. Its structure and layout can vary depending on the nature of the system you are trying to build but it will always abide by a number of rules. As I like to remember them: R-C-S! +
+ ++ +
+ + ++ In a typical work flow you may be training, testing, altering your ground truth to try and improve performance and re-training. This is perfectly normal and it often takes some time to tune and tweak a model in order to get optimal performance. +
+ ++ However, in doing this, you may be inadvertently biasing your model towards the test data – which in itself may change how the model performs in the real world. When you are happy with your test performance, you may wish to benchmark against another third dataset – a blind test set that the machine has not been ‘tweaked’ in order to perform better against. This will give you the most accurate view, with respect to the data available, of how well your classifier is performing in the real world. +
+ +

+ In the case of three data sets (test, train, blind) you should use a similar algorithm/work flow as described in the above section. The important thing is that the three sets must not overlap in any way and should all be representative of the problem you are trying to train on. +
+ +

+ There are a lot of differing opinions on what proportions to separate out the data set into. Some folks advocate 50%, 25%, 25% for train, test, blind respectively, others 70, 20, 10. I personally start at the latter and change these around if they don’t work – your mileage may vary depending on the type of model you are trying to build and the sort of problem you are trying to model. +
+ +
+
Important: once you have done your blind test to get an accurate idea of how well your model performs in the real world, you must not do any more tuning on the model. If you do, your metrics will be meaningless since you are now biasing the new model towards the blind data set. You can, of course, start from scratch and randomly initialise a new set of test, train and blind data sets from your ground truth at any time.
+
+ Hopefully, this article has given you some ideas about how best to start assessing the quality of your cognitive application. In the next article, I cover some more in depth measurements that you can do on your model to find out where it is performing well and where it needs tuning beyond a simple accuracy rating. We will also discuss some other methods for segmenting test and train data for smaller corpuses in a future article. +
+ \ No newline at end of file diff --git a/brainsteam/content/posts/2016-05-01-ibm-watson-its-for-data-scientists-too.md b/brainsteam/content/posts/2016-05-01-ibm-watson-its-for-data-scientists-too.md new file mode 100644 index 0000000..c4a8324 --- /dev/null +++ b/brainsteam/content/posts/2016-05-01-ibm-watson-its-for-data-scientists-too.md @@ -0,0 +1,25 @@ +--- +title: IBM Watson – It’s for data scientists too! +author: James +type: post +date: 2016-05-01T11:28:13+00:00 +url: /2016/05/01/ibm-watson-its-for-data-scientists-too/ +medium_post: + - 'O:11:"Medium_Post":11:{s:16:"author_image_url";N;s:10:"author_url";N;s:11:"byline_name";N;s:12:"byline_email";N;s:10:"cross_link";s:2:"no";s:2:"id";N;s:21:"follower_notification";s:3:"yes";s:7:"license";s:19:"all-rights-reserved";s:14:"publication_id";s:2:"-1";s:6:"status";s:4:"none";s:3:"url";N;}' +categories: + - Work +tags: + - data science + - ibm + - watson + +--- +Last week, my colleague Olly and I gave a talk at a data science meetup on how [IBM Watson can be used for data science applications][1]. + +We had an amazing time and got some really great feedback from the event. We will definitely be doing more talks at events like these in the near future so keep an eye out for us! + +I will also be writing a little bit more about the experiment I did around Core Scientific Concepts and Watson Natural Language Classifier in a future blog post. 
+ + + + [1]: https://skillsmatter.com/skillscasts/8076-ibm-watson-it-s-for-data-scientists-too \ No newline at end of file diff --git a/brainsteam/content/posts/2016-05-29-cognitive-quality-assurance-pt-2-performance-metrics.md b/brainsteam/content/posts/2016-05-29-cognitive-quality-assurance-pt-2-performance-metrics.md new file mode 100644 index 0000000..eb617c6 --- /dev/null +++ b/brainsteam/content/posts/2016-05-29-cognitive-quality-assurance-pt-2-performance-metrics.md @@ -0,0 +1,816 @@ +--- +title: 'Cognitive Quality Assurance Pt 2: Performance Metrics' +author: James +type: post +date: 2016-05-29T09:41:26+00:00 +url: /2016/05/29/cognitive-quality-assurance-pt-2-performance-metrics/ +featured_image: /wp-content/uploads/2016/05/Oma--825x510.png +medium_post: + - 'O:11:"Medium_Post":11:{s:16:"author_image_url";s:69:"https://cdn-images-1.medium.com/fit/c/200/200/0*naYvMn9xdbL5qlkJ.jpeg";s:10:"author_url";s:30:"https://medium.com/@jamesravey";s:11:"byline_name";N;s:12:"byline_email";N;s:10:"cross_link";s:2:"no";s:2:"id";s:12:"1f1de4b3132e";s:21:"follower_notification";s:3:"yes";s:7:"license";s:19:"all-rights-reserved";s:14:"publication_id";s:2:"-1";s:6:"status";s:6:"public";s:3:"url";s:96:"https://medium.com/@jamesravey/cognitive-quality-assurance-pt-2-performance-metrics-1f1de4b3132e";}' +categories: + - Work +tags: + - cognitive + - cqa + - evaluation + - learning + - machine + - rank + - retrieval + - retrieve + - supervised + - watson + +--- + +***EDIT: Hello readers, these articles are now 4 years old and many of the Watson services and APIs have moved or been changed. The concepts discussed in these articles are still relevant but I am working on 2nd editions of them.*** + +[Last time][1] we discussed some good practices for collecting data and then splitting it into test and train in order to create a ground truth for your machine learning system. We then talked about calculating accuracy using test and blind data sets. 
+ +In this post we will talk about some more metrics you can do on your machine learning system including **Precision**, **Recall**, **F-measure** and **confusion matrices.** These metrics give you a much deeper level of insight into how your system is performing and provide hints at how you could improve performance too! + +## A recap – Accuracy calculation + +This is the most simple calculation but perhaps the least interesting. We are just looking at the percentage of times the classifier got it right versus the percentage of times it failed. Simply: + + 1. sum up the number of results (count the rows), + 2. sum up the number of rows where the predicted label and the actual label match. + 3. Calculate percentage accuracy: correct / total * 100. + +This tells you how good the classifier is in general across all classes. It does not help you in understanding how that result is made up. + +## Going above and beyond accuracy: why is it important? + +Imagine that you are a hospital and it is critically important to be able to predict different types of cancer and how urgently they should be treated. Your classifier is 73% accurate overall but that does not tell you anything about it’s ability to predict any one type of cancer. What if the 27% of the answers it got wrong were the cancers that need urgent treatment? We wouldn’t know! + +This is exactly why we need to use measurements like precision, recall and f-measure as well as confusion matrices in order to understand what is really going on inside the classifier and which particular classes (if any) it is really struggling with. + +## Precision, Recall and F-measure and confusion matrices (Grandma’s Memory Game) + +Precision, Recall and F-measure are incredibly useful for getting a deeper understanding of which classes the classifier is struggling with. They can be a little bit tricky to get your head around so lets use a metaphor about Grandma’s memory. + +Imagine Grandma has 24 grandchildren. 
As you can understand it is particularly difficult to remember their names. Thankfully, her 6 children, the grandchildren’s parents all had 4 kids and named them after themselves. Her son Steve has 3 sons: Steve I, Steve II, Steve III and so on. + +This makes things much easier for Grandma, she now only has to remember 6 names: Brian, Steve, Eliza, Diana, Nick and Reggie. The children do not like being called the wrong name so it is vitally important that she correctly classifies the child into the right name group when she sees them at the family reunion every Christmas. + +I will now describe Precision, Recall, F-Measure and confusion matrices in terms of Grandma’s predicament. + +### Some Terminology + +Before we get on to precision and recall, I need to introduce the concepts of true positive, false positive, true negative and false negative. Every time Grandma gets an answer wrong or right, we can talk about it in terms of these labels and this will also help us get to grips with precision and recall later. + +These phrases are in terms of each class – you have TP, FP, FN, TN for each class. In this case we can have TP,FP,FN,TN with respect to Brian, with respect to Steve, with respect to Eliza and so on. + +This table shows how these four labels apply to the class “Brian” – you can create a table will + ++ | + ++ Brian + | + ++ Not Brian + | +
+ Grandma says “Brian” + | + ++ True Positive + | + ++ False Positive + | +
+ Grandma says |
+
+ + False Negative + | + ++ True Negative + | +
+ | + ++ TP + | + ++ FP + | + ++ FN + | +
+ Brian + | + ++ 2 + | + ++ 1 + | + ++ 1 + | +
+ Eliza + | + ++ | + ++ 1 + | + ++ | +
+ Steve + | + ++ 1 + | + ++ | + ++ 1 + | +
+ | + ++ TP + | + ++ FP + | + ++ FN + | + ++ Precision + | +
+ Brian + | + ++ 2 + | + ++ 1 + | + ++ 1 + | + ++ 66% + | +
+ Eliza + | + ++ | + ++ 1 + | + ++ | + ++ N/A + | +
+ Steve + | + ++ 1 + | + ++ | + ++ 1 + | + ++ 100% + | +
+ | + ++ TP + | + ++ FP + | + ++ FN + | + ++ Recall + | +
+ Brian + | + ++ 2 + | + ++ 1 + | + ++ 1 + | + ++ 66.6% + | +
+ Eliza + | + ++ | + ++ 1 + | + ++ | + ++ N/A + | +
+ Steve + | + ++ 1 + | + ++ | + ++ 1 + | + ++ 50% + | +
+ | + ++ TP + | + ++ FP + | + ++ FN + | + ++ Precision + | + ++ Recall + | + ++ F-measure + | +
+ Brian + | + ++ 2 + | + ++ 1 + | + ++ 1 + | + ++ 66.6% + | + ++ 66.6% + | + ++ 66.6% + | +
+ Eliza + | + ++ | + ++ 1 + | + ++ | + ++ N/A + | + ++ N/A + | + ++ N/A + | +
+ Steve + | + ++ 1 + | + ++ | + ++ 1 + | + ++ 1 + | + ++ 0.5 + | + ++ 0.6666666667 + | +
+ | + ++ | + ++ Predictions + | +|||||
+ | + ++ | + ++ Steve + | + ++ Brian + | + ++ Eliza + | + ++ Diana + | + ++ Nick + | + ++ Reggie + | +
+Actual
+
+ +Class |
+
++ Steve + | + ++ 4 + | + ++ 1 + | + ++ | + ++ 1 + | + ++ | + ++ |
+ Brian + | + ++ 1 + | + ++ 3 + | + ++ | + ++ | + ++ 1 + | + ++ 1 + | +|
+ Eliza + | + ++ | + ++ | + ++ 5 + | + ++ 1 + | + ++ | + ++ | +|
+ Diana + | + ++ | + ++ | + ++ 5 + | + ++ 1 + | + ++ | + ++ | +|
+ Nick + | + ++ 1 + | + ++ | + ++ | + ++ | + ++ 5 + | + ++ | +|
+ Reggie + | + ++ | + ++ | + ++ | + ++ | + ++ | + ++ 6 + | +
+ Ok so let’s have a closer look at the above. +
+ ++ Reading across the rows left to right these are the actual examples of each class – in this case there are 6 children with each name so if you sum over the row you will find that they each add up to 6. +
+ ++ Reading down the columns top-to-bottom you will find the predictions – i.e. what Grandma thought each child’s name was. You will find that these columns may add up to more than or less than 6 because Grandma may overfit for one particular name. In this case she seems to think that all her female Grandchildren are called Eliza (she predicted 5/6 Elizas are called Eliza and 5/6 Dianas are also called Eliza). +
+ ++ Reading diagonally where I’ve shaded things in bold gives you the number of correctly predicted examples. In this case Reggie was 100% accurately predicted with 6/6 children called “Reggie” actually being predicted “Reggie”. Diana is the poorest performer with only 1/6 children being correctly identified. This can be explained as above with Grandma over-generalising and calling all female relatives “Eliza”. +
+ ++ +
+ ++ Grandma seems to have gender nailed except in the case of one of the Steves (who in fairness does have a Pony Tail and can sing very high). She is best at predicting Reggies and struggles with Brians (perhaps Brians have the most diverse appearance and look a lot like their respective male cousins). She is also pretty good at Nicks and Steves. +
+ +

+ Grandma is terrible at female grandchildren’s names. If this were a machine learning problem we would need to find a way to make it easier to identify the difference between Dianas and Elizas through some kind of further feature extraction or weighting or through the gathering of additional training data. +
+ ++ Machine learning is definitely no walk in the park. There are a lot of intricacies involved in assessing the effectiveness of a classifier. Accuracy is a great start if until now you’ve been praying to the gods and carrying four-leaf-clovers around with you to improve your cognitive system performance. +
+ ++ However, Precision, Recall, F-Measure and Confusion Matrices really give you the insight you need into which classes your system is struggling with and which classes confuse it the most. +
+ ++ This example is probably directly relevant to those building classification systems (i.e. extracting intent from questions or revealing whether an image contains a particular company’s logo). However all of this stuff works directly for document retrieval use cases too. Consider true positive to be when the first document returned from the query is the correct answer and false negative is when the first document returned is the wrong answer. +
+ +

+ There are also variants on this that consider the top N retrieved answers (Precision@N) that tell you whether your system can predict the correct answer in the top 1, 3, 5 or 10 answers by simply identifying “True Positive” as the document turning up in the top N answers returned by the query. +
+ ++ Overall I hope this tutorial has helped you to understand the ins and outs of machine learning evaluation. +
+ +

+ Next time we look at cross-validation techniques and how to assess small corpora where carving out a 30% chunk of the documents would seriously impact the learning. Stay tuned for more! +
+ + [1]: https://brainsteam.co.uk/2016/03/29/cognitive-quality-assurance-an-introduction/ + [2]: https://upload.wikimedia.org/math/9/9/1/991d55cc29b4867c88c6c22d438265f9.png + [3]: https://en.wikipedia.org/wiki/Harmonic_mean#Harmonic_mean_of_two_numbers \ No newline at end of file diff --git a/brainsteam/content/posts/2016-06-05-blackgangpi-a-raspberry-pi-hack-at-blackgang-chine.md b/brainsteam/content/posts/2016-06-05-blackgangpi-a-raspberry-pi-hack-at-blackgang-chine.md new file mode 100644 index 0000000..671ac10 --- /dev/null +++ b/brainsteam/content/posts/2016-06-05-blackgangpi-a-raspberry-pi-hack-at-blackgang-chine.md @@ -0,0 +1,47 @@ +--- +title: '#BlackgangPi – a Raspberry Pi Hack at Blackgang Chine' +author: James +type: post +date: 2016-06-05T07:59:40+00:00 +url: /2016/06/05/blackgangpi-a-raspberry-pi-hack-at-blackgang-chine/ +medium_post: + - 'O:11:"Medium_Post":11:{s:16:"author_image_url";s:69:"https://cdn-images-1.medium.com/fit/c/200/200/0*naYvMn9xdbL5qlkJ.jpeg";s:10:"author_url";s:30:"https://medium.com/@jamesravey";s:11:"byline_name";N;s:12:"byline_email";N;s:10:"cross_link";s:2:"no";s:2:"id";s:12:"360de275805d";s:21:"follower_notification";s:3:"yes";s:7:"license";s:19:"all-rights-reserved";s:14:"publication_id";s:2:"-1";s:6:"status";s:6:"public";s:3:"url";s:94:"https://medium.com/@jamesravey/blackgangpi-a-raspberry-pi-hack-at-blackgang-chine-360de275805d";}' +categories: + - Work +tags: + - cognitive + - hackathon + - ibm + - watson + +--- +I was very excited to be invited along with some other IBMers to the Blackgang Pi event run by Dr Lucy Rogers on a semi regular basis at the Blackgang Chine theme park on the Isle of Wight. + +[Blackgang Chine ][1]is a theme park on the southern tip of the Isle of Wight and holds the title of oldest theme park in the United Kingdom. 
We were lucky enough to be invited along to help them modernise some of their animatronic exhibits, replacing some of the aging bespoke PCBs and controllers with Raspberry Pis running Node-RED and communicating using MQTT/Watson IOT. + +Over the course of two days, my colleague [James Sutton][2] and I built a talking moose head using some of the IBM Watson Cognitive services. + +We got it talking fairly quickly using IBM text to speech and had it listening for intents like “tell joke” or “check weather” via NLC. + +++ + + +I also built out a dialog that would monitor the state of the conversation and make the user comply with the knock knock joke format (i.e. if you say anything except “who’s there” it will moan and call you a spoil-sport). + +Video we managed to capture before we had to pack up yesterday below + ++ So good so far! A talking Moose head powered by @IBMIoT, @IBMWatson & @NodeRED #BlackgangPi pic.twitter.com/Vhgkr8q9cw +
+ ++ — James Sutton (@jpwsutton) June 4, 2016 +
+
Salesman(SM): Welcome Mr Tycoon, please allow me to introduce to you our master builder. She has over 25 years in the construction industry and qualifications in bricklaying, plumbing and electrics. + +Master Builder (MB): Hi Mr Tycoon, great to meet you *handshake* + +Tycoon(TC): Lovely to meet you both. I'm here today because I want some advice on my latest building project. I've been buying blocks of apartments and letting them out for years. My portfolio is worth £125 Million. However, I want to get into the construction game. + +MB: That's exciting. So how can we help? + +TC: Ok I'm a direct kind of guy and I say things how I see them so I'll cut to the chase. I want to build a house. What tools shall I use? + +MB: Good question... what kind of house are you looking to build? + +TC: Well, whatever house I can build with tools. + +MB: ok... well you know there are a lot of options and it depends on budget. I mean you must have something in mind? Bungalow? 2-Story family house? Manor house? + +TC: Look, I don't see how this is relevant. I've been told that the tools can figure all this stuff out for me. + +SM: Yes MB, we can provide tools that will help TC decide what house to build right? + +MB: That's not really how it works but ok... Let's say for the sake of argument that we're going to help you build a 2 bedroom townhouse. + +TC: Fantastic! Yes, great initiative MB, a townhouse. Will tools help me build a townhouse? + +MB: Yeah... tools will help you build a townhouse... + +TC: That's great! + +MB: TC, do you have any experience building houses? You said you mainly buy houses, not build them. + +T: No not really. However, SM told me that with the right tools, I don't need any experience, the tools will do all the work for me. + +MB: Right... ok... SM did you say that? + +SM: Well, with recent advances in building techniques and our latest generation of tools, anything is possible! + +MB: Yes... that's true tools do make things easier. 
However, you really do need to know how to use the tools. They're not 'magic' - you should understand which ones are useful in different situations + +TC: Oh, that's not the kind of answer I was looking for. SM, you said this wouldn't be a problem. + +SM: It's not going to be a problem is it MB? I mean we can help TC figure out which tools to use? + +MB: I suppose so... + +SM: That's the attitude MB... Tell TC about our services + +MB: Sure, I have had many years of experience building townhouses, we have a great architect at our firm who can design the house for you. My team will take care of the damp coursing, wooden frame, brickwork and plastering and then I will personally oversee the installation of the electrics and pipework. + +TC: Let's not get bogged down in the detail here MB, I just want a townhouse... Now I have a question. Have you heard of mechanical excavators - I think you brits call them "diggers". + +MB: Yes... I have used diggers a number of times in the past. + +TC: Oh that's great. MB, do you think diggers can help me build a house faster? + +MB: Urm, well maybe. It depends on the state of the terrain that you want to build on. + +TC: Oh that's great, some of our potential tenants have heard of diggers and if I tell them we used diggers to build the house they will be so excited. + +MB: Wonderful... + +TC: I've put an order in for 25 diggers - if we have more that means we can build the house faster right? + +MB: Are you serious? + +SM: Of course TC is serious, that's how it works right? + +MB: Not exactly but ok, if you have already ordered them that's fine *tries to figure out what to do with 24 spare diggers* + +TC: Great, it's settled then. One more thing, I don't know if I want to do a townhouse. Can you use diggers to build townhouses? I'm only interested in building things that diggers can build. + +MB: Yes don't worry, you can build townhouses with diggers. I've done it personally a number of times + +TC: I'm not so sure. 
I've heard of this new type of house called a Ford Mustang. Everyone in the industry is talking about how we can drive up ROI by building Ford Mustangs instead of Townhouses. What are your thoughts MB? + +MB: That's not a... diggers... I... I'm really sorry TC, I've just received an urgent text message from one of our foremen at a building site, I have to go and resolve this. Thanks for your time, SM can you wrap up here? *calmly leaves room and breathes into a paper bag* + +SM: Sorry about that TC, anyway yes I really love the Ford mustang idea, what's your budget? + +-FIN-+ +This post is supposed to raise a chuckle and it’s not supposed to offend anyone in particular. However, on a more serious note, there is definitely a problem with buzzwords in machine learning and industry. Let’s try and fix it. + + [1]: http://filament.uk.com/ \ No newline at end of file diff --git a/brainsteam/content/posts/2016-11-23-timetrack-a-simple-time-tracking-application-for-developers.md b/brainsteam/content/posts/2016-11-23-timetrack-a-simple-time-tracking-application-for-developers.md new file mode 100644 index 0000000..2ed3b26 --- /dev/null +++ b/brainsteam/content/posts/2016-11-23-timetrack-a-simple-time-tracking-application-for-developers.md @@ -0,0 +1,27 @@ +--- +title: timetrack – a simple time tracking application for developers +author: James +type: post +date: 2016-11-23T14:43:58+00:00 +url: /2016/11/23/timetrack-a-simple-time-tracking-application-for-developers/ +medium_post: + - 'O:11:"Medium_Post":11:{s:16:"author_image_url";N;s:10:"author_url";N;s:11:"byline_name";N;s:12:"byline_email";N;s:10:"cross_link";s:2:"no";s:2:"id";N;s:21:"follower_notification";s:3:"yes";s:7:"license";s:19:"all-rights-reserved";s:14:"publication_id";s:2:"-1";s:6:"status";s:4:"none";s:3:"url";N;}' +categories: + - Open Source +tags: + - phd + - projects + - python + - time + - tracking + +--- +I’ve written a small command line application for tracking my time on my PhD and other projects. 
We use Harvest at Filament which is great if you’ve got a huge team and want the complexity (and of course license charges) of an online cloud solution for time tracking. + +If, like me, you’re just interested to see how much time you are spending on your different projects and you don’t have any requirement for fancy web interfaces or client billing, then [timetrack][1] might be for you. For me personally, I was wondering how much of my week is spent on my PhD as opposed to Filament client work. I know it’s a fair amount but I want some clear cut numbers. + +[timetrack][1] is a simple application that allows you to log what time you’ve spent and where from the command line with relative ease. It logs everything to a text file which is relatively easy to read by !machines. However it also provides filtering and reporting functions so that you can see how much time you spend on your projects, how much time you used today and how much of your working day is left. + +It’s written in python with minimal dependencies on external libraries (save for progressbar2 which is used for the live tracker). The code is open source and available under an MIT license. 
Download it from [GitHub][1] + + [1]: https://github.com/ravenscroftj/timetrack \ No newline at end of file diff --git a/brainsteam/content/posts/2016-11-27-we-need-to-talk-about-push-notifications-and-why-i-stopped-wearing-my-smartwatch.md b/brainsteam/content/posts/2016-11-27-we-need-to-talk-about-push-notifications-and-why-i-stopped-wearing-my-smartwatch.md new file mode 100644 index 0000000..e6b9893 --- /dev/null +++ b/brainsteam/content/posts/2016-11-27-we-need-to-talk-about-push-notifications-and-why-i-stopped-wearing-my-smartwatch.md @@ -0,0 +1,58 @@ +--- +title: We need to talk about push notifications (and why I stopped wearing my smartwatch) +author: James +type: post +date: 2016-11-27T12:59:22+00:00 +url: /2016/11/27/we-need-to-talk-about-push-notifications-and-why-i-stopped-wearing-my-smartwatch/ +featured_image: /wp-content/uploads/2016/11/IMG_20161127_130808-e1480252170130-576x510.jpg +medium_post: + - 'O:11:"Medium_Post":11:{s:16:"author_image_url";s:69:"https://cdn-images-1.medium.com/fit/c/200/200/0*naYvMn9xdbL5qlkJ.jpeg";s:10:"author_url";s:30:"https://medium.com/@jamesravey";s:11:"byline_name";N;s:12:"byline_email";N;s:10:"cross_link";s:2:"no";s:2:"id";s:12:"3a1b15a3f469";s:21:"follower_notification";s:3:"yes";s:7:"license";s:19:"all-rights-reserved";s:14:"publication_id";s:2:"-1";s:6:"status";s:6:"public";s:3:"url";s:124:"https://medium.com/@jamesravey/we-need-to-talk-about-push-notifications-and-why-i-stopped-wearing-my-smartwatch-3a1b15a3f469";}' +categories: + - Uncategorized +tags: + - multi-tasking + - notifications + - phd + - planning + - work + +--- +I own a Pebble Steel which I got for Christmas a couple of years ago. I’ve been very happy with it so far. I can control my music player from my wrist, get notifications and a summary of my calender. Recently, however I’ve stopped wearing it. 
The reason is that constant streams of notifications stress me out, interrupt my workflow and not wearing it makes me feel more calm and in control and allows me to be more productive. + +As you can imagine, trying to do a PhD and be a CTO at the same time has its challenges. I struggle with the cognitive dissonance between walling off my research days to focus on my PhD and making sure that the developers at work are getting on ok and being productive without me. I have thus far tended to compromise by leaving slack running and fielding the odd question from colleagues even on my off days. + +Conversely, when I’m working for [Filament,][1] I often get requests from University colleagues to produce reports and posters, share research notes and to resolve problems with [SAPIENTA][2] or [Partridge][3] infrastructure (or even run experiments on behalf of other academics). Both of these scenarios play havoc with my prioritisation of todos when I get notified about them. + +## Human Multitasking + +Human Multitasking is something of a myth – as is [the myth that women can multitask and men can’t][4]. It turns out that we are all ([except for a small group of people scientists call “supertaskers”][5]) particularly rubbish at multi-tasking. I am no exception, however much I wish I was. + +When we “multitask” we are actually context switching. Effectively, we’re switching between a number of different tasks very quickly, kind of like how a computer is able to run many applications on the same CPU core by executing different bits of each app – it might deal with an incoming email, then switch to rendering your netflix movie, then switch to continuing to download that email. It does this so quickly that it seems like both activities are happening at once. That’s obviously different for dual or quad core CPUs but that’s not really the point here since our brains are not “quad core”. + +CPUs are really good at context switching very quickly. 
However, the human brain is really rubbish at this. [Joel Spolsky has written a really cool computer analogy on why][6] but if you don’t want to read a long article on it, lets just say that where a computer can context-switch in milliseconds, a human needs a few minutes. + +It also logically follows that the more cognitively intensive a job is, the more time a brain needs to swap contexts. For example, you might be able to press the “next” button on your car stereo while driving at 70 MPH down the motorway, but (aside from the obvious practical implications) you wouldn’t be able to perform brain surgery and drive at the same time . If you consider studying for a PhD and writing machine learning software for a company to be roughly as complex as the above example, you can hopefully understand why I’d struggle. + +## Push Notifications + +The problem I find with “push” notifications is that they force you to context switch. We, as a society, have spent the last couple of decades training ourselves to stop what we are doing and check our phones as soon as that little vibration or bling noise comes through. If you are a paramedic or surgeon with a pager, that’s the best possible use case for this tech, and I’m not saying we should stop push notifications for emergency situations like that. However, when the notification is “check out this dank meme dude” but we are still stimulated into action this can have a very harmful effect on our concentration and ability to focus on the task at hand. + +Mobile phone notifications are bad enough but occasionally, if your phone buzzes in your pocket and you are engrossed in another task, you won’t notice and you’ll check your phone later. Smartwatch notifications seem to get my attention 9 times out of 10 – I guess that’s what they’re designed for. Having something strapped directly to the skin on my wrist is much more distracting than something buzzing through a couple of layers of clothing on my leg. 
+ +I started to find that push notifications forcibly jolt me out of whatever task I’m doing and I immediately feel anxious until I’ve handled the new input stimulus. This means that I will often prioritise unimportant stuff like responding to memes that my colleague has posted in slack over the research paper I’m reading. Maybe this means I miss something crucial, or maybe I just have to go back to the start of the page I’m looking at. Either way, time is a’wastin’. + +## The Solution + +For me, it’s obvious. Push notifications need a huge re-think. I am currently reorganising the way I work, think and plan and ripping out as many push notification mechanisms as I can. [I’ve also started keeping track of how I’m spending my time using a tool I wrote last week.][7] + +I can definitely see a use case for “machine learning” triage of notifications based on intent detection and personal priorities. If a relative is trying to get hold of me because there’s been an emergency, I wouldn’t mind being interrupted during a PhD reading session. If a notification asking for support on Sapienta or a work project comes through, that’s urgent but can probably wait half an hour until I finish my current reading session. If a colleague wants to send me a video of grumpy cat, that should wait in a list of things to check out after 5:30pm. + +Until me, or someone with more time to do so builds a machine learning filter like this one, I’ve stopped wearing my smart watch and my phone is on silent. If you need me and I’m ignoring you, don’t take it personally. I’ll get back to you when I’m done with my current task. If it’s urgent, you’ll just have to try phoning and hoping I notice the buzz in my pocket (until I find a more elegant way to screen urgent calls and messages). 
+ + [1]: http://filament.uk.com + [2]: http://sapienta.papro.org.uk + [3]: http://farnsworth.papro.org.uk/ + [4]: http://link.springer.com/article/10.3758%2FPBR.17.4.479 + [5]: http://link.springer.com/article/10.3758/PBR.17.4.479 + [6]: http://www.joelonsoftware.com/articles/fog0000000022.html + [7]: https://brainsteam.co.uk/2016/11/23/timetrack-a-simple-time-tracking-application-for-developers/ \ No newline at end of file diff --git a/brainsteam/content/posts/2016-12-08-ai-cant-solve-all-our-problems-but-that-doesnt-mean-it-isnt-intelligent.md b/brainsteam/content/posts/2016-12-08-ai-cant-solve-all-our-problems-but-that-doesnt-mean-it-isnt-intelligent.md new file mode 100644 index 0000000..d6e2468 --- /dev/null +++ b/brainsteam/content/posts/2016-12-08-ai-cant-solve-all-our-problems-but-that-doesnt-mean-it-isnt-intelligent.md @@ -0,0 +1,56 @@ +--- +title: AI can’t solve all our problems, but that doesn’t mean it isn’t intelligent +author: James +type: post +date: 2016-12-08T10:08:13+00:00 +url: /2016/12/08/ai-cant-solve-all-our-problems-but-that-doesnt-mean-it-isnt-intelligent/ +medium_post: + - 'O:11:"Medium_Post":11:{s:16:"author_image_url";s:69:"https://cdn-images-1.medium.com/fit/c/200/200/0*naYvMn9xdbL5qlkJ.jpeg";s:10:"author_url";s:30:"https://medium.com/@jamesravey";s:11:"byline_name";N;s:12:"byline_email";N;s:10:"cross_link";s:2:"no";s:2:"id";s:12:"e3e315592001";s:21:"follower_notification";s:3:"yes";s:7:"license";s:19:"all-rights-reserved";s:14:"publication_id";s:12:"6fc55de34f53";s:6:"status";s:6:"public";s:3:"url";s:117:"https://medium.com/@jamesravey/ai-cant-solve-all-our-problems-but-that-doesn-t-mean-it-isn-t-intelligent-e3e315592001";}' +categories: + - PhD + - Work +tags: + - AI + - machine learning + - philosophy + +--- + + +[A recent opinion piece I read on Wired][1] called for us to stop labelling our current specific machine learning models AI because they are not intelligent. I respectfully disagree. + +AI is not a new concept. 
The idea that a computer could ‘think’ like a human and one day pass for a human has been around since Turing and even in some form long before him. The inner workings the human brain and how we carry out computational processes have even been discussed by great philosophers such as Thomas Hobbes who wrote in his book, De Corpore in 1655 that _“by reasoning, I understand computation. And to compute is to collect the sum of many things added together at the same time, or to know the remainder when one thing has been taken from another. To reason therefore is the same as to add or to subtract.”_ Over the years, AI has continued to capture the hearts and minds of great thinkers, scientists and of course creatives and artists. + + + +Visionary Science Fiction authors of the 20th century: Arthur C Clarke, Isaac Asimov and Philip K Dick have built worlds of fantasy inhabited by self-aware artificial intelligence systems and robots, [some of whom could pass for humans unless subject to a very specific and complicated test][3]. Endless films have been released that “sex up” AI. The Terminator series, The Matrix, Ex Machina, the list goes on. However, like all good science fiction, these stories that paint marvellous and thrilling visions of futures that are still in the future even in 2016. + +The science of AI is a hugely exciting place to be too (_I would say that, wouldn’t I). _In the 20th century we’ve mastered speech recognition, optical character recognition and machine translation good enough that I can visit Japan and communicate, via my mobile phone, with a local shop keeper without either party having to learn the language of their counterpart. We have arrived at a point where we can train machine learning models to do some specific tasks better than people (including drive cars and [diagnostic oncology][4]). We call these current generation AI models “weak AI”. 
Computers that can solve any problem we throw at them (in other words, ones that have generalised intelligence and known as “strong AI” systems) are a long way off. However, that shouldn’t detract from what we have solved already with weak AI. + +One of the problems with living in a world of 24/7 new cycles and clickbait titles is that nothing is new or exciting any more. Every small incremental change in the world is reported straight away across the globe. Every new discovery, every fractional increase in performance from AI gets a blog post or a news article. It makes everything seem boring. _Oh Tesla’s cars can drive themselves? So what? Google’s cracked Go? Whatever… _ + + + +If you lose 50kgs in weight over 6 months, your spouse is only going to notice when you buy a new shirt that’s 2 sizes smaller or notice a change in your muscle when you get out of the shower. A friend you meet up with once a year is going to see a huge change because last time they saw you you were twice the size. In this day and age, technology moves on so quickly in tiny increments that we don’t notice the huge changes any more because we’re like the spouse – we constantly see the tiny changes. + +What if we did see huge changes? What if we could cut ourselves off from the world for months at a time? If you went back in time to 1982 and told them that every day you talk to your phone using just your voice and it is able to tell you about your schedule and what restaurant to go to, would anyone question that what you describe is AI? If you told someone from 1995 that you can [buy a self driving car][5] via a small glass tablet you carry around in your pocket, are they not going to wonder at the world that we live in? We have come a long long way and we take it for granted. Most of us use AI on a day to day basis without even questioning it. + +Another common criticism of current weak AI models is the exact lack of general reasoning skills that would make them strong AI. 
+ +> DEEPMIND HAS SURPASSED the human mind on the Go board. Watson has crushed America’s trivia gods on _Jeopardy_. But ask DeepMind to play Monopoly or Watson to play _Family Feud_, and they won’t even know where to start. + +That’s absolutely true. The AI/compsci definition of this constraint is the “no free lunch for optimisation” theorem. That is that you don’t get something for nothing when you train a machine learning model. In training a weak AI model for a specific task, you are necessarily hampering its ability to perform well at other tasks. I guess a human analogy would be the education system. + + + +Aged 14 in a high school in the UK, I was asked which 11 GCSEs I wanted to take. At 16 I had to reduce this scope to 5 A levels, aged 18 I was asked to specify a single degree and aged 21 I had to decide which tiny part of AI/Robotics (which I’d studied at degree level) I wanted to specialise in at PhD level. Now that I’m half way through a PhD in Natural Language Processing in my late 20s, would you suddenly turn around and say “actually you’re not intelligent because if I asked you to diagnose lung cancer in a child you wouldn’t be able to”? Does what I’ve achieved become irrelevant and pale against that which I cannot achieve? I do not believe that any reasonable person would make this argument. + +The AI Singularity has not happened yet and it’s definitely a few years away. However, does that detract from what we have achieved so far? No. No it does not. 
+ + + + [1]: https://www.wired.com/2016/12/artificial-intelligence-artificial-intelligent/ + [2]: https://en.wikipedia.org/wiki/Brain_in_a_vat + [3]: https://en.wikipedia.org/wiki/Do_Androids_Dream_of_Electric_Sheep%3F + [4]: https://www.top500.org/news/watson-proving-better-than-doctors-in-diagnosing-cancer/ + [5]: https://www.tesla.com/en_GB/models \ No newline at end of file diff --git a/brainsteam/content/posts/2016-12-10-timetrack-improvements.md b/brainsteam/content/posts/2016-12-10-timetrack-improvements.md new file mode 100644 index 0000000..eb6437b --- /dev/null +++ b/brainsteam/content/posts/2016-12-10-timetrack-improvements.md @@ -0,0 +1,23 @@ +--- +title: timetrack improvements +author: James +type: post +date: 2016-12-10T09:33:41+00:00 +url: /2016/12/10/timetrack-improvements/ +medium_post: + - 'O:11:"Medium_Post":11:{s:16:"author_image_url";N;s:10:"author_url";N;s:11:"byline_name";N;s:12:"byline_email";N;s:10:"cross_link";s:2:"no";s:2:"id";N;s:21:"follower_notification";s:3:"yes";s:7:"license";s:19:"all-rights-reserved";s:14:"publication_id";s:2:"-1";s:6:"status";s:4:"none";s:3:"url";N;}' +categories: + - Open Source + - PhD +tags: + - python + - timetrack + +--- +I’ve just added a couple of improvements to timetrack that allow you to append to existing time recordings (either with an amount like 15m or using live to time additional minutes spent and append them). + +You can also remove entries using timetrack rm instead of remove – saving keystrokes is what programming is all about. 
+ +You can find the [updated code over at github.][1] + + [1]: https://github.com/ravenscroftj/timetrack \ No newline at end of file diff --git a/brainsteam/content/posts/2017-06-05-exploring-web-archive-data-cdx-files.md b/brainsteam/content/posts/2017-06-05-exploring-web-archive-data-cdx-files.md new file mode 100644 index 0000000..1fc49e9 --- /dev/null +++ b/brainsteam/content/posts/2017-06-05-exploring-web-archive-data-cdx-files.md @@ -0,0 +1,114 @@ +--- +title: Exploring Web Archive Data – CDX Files +author: James +type: post +date: 2017-06-05T07:24:22+00:00 +url: /2017/06/05/exploring-web-archive-data-cdx-files/ +medium_post: + - 'O:11:"Medium_Post":11:{s:16:"author_image_url";N;s:10:"author_url";N;s:11:"byline_name";N;s:12:"byline_email";N;s:10:"cross_link";s:2:"no";s:2:"id";N;s:21:"follower_notification";s:3:"yes";s:7:"license";s:19:"all-rights-reserved";s:14:"publication_id";s:2:"-1";s:6:"status";s:4:"none";s:3:"url";N;}' +categories: + - PhD +tags: + - cdx + - python + - webarchive + +--- +I have recently been working in partnership with [UK Web Archive][1] in order to identify and parse large amounts of historic news data for an NLP task that I will blog about in the future. The NLP portion of this task will surely present its own challenges, but for now there is the small matter of identifying news data amongst the noise of 60TB of [web archive dumps of the rest of the .UK top level domain.][1] + +## WARC and CDX Files + +The web archive project have produced standardized file formats for describing historic web resources in a compressed archive. The website is scraped and the content is stored chronologically in a [WARC][2] file. A CDX index file is also produced describing every URL scraped, the time it was retrieved at and which WARC file the content is in, along with some other metadata. 
+ +Our first task is to identify news content in order to narrow down our search to a subset of WARC files (in order not to fill 60TB of storage or have to traverse that amount of data). The CDX files allow us to do this. These files are available for [free download from the Web Archive website.][3] They are compressed using Gzip compression down to around 10-20GB per file. If you try to expand these files locally, you’re looking at 60-120GB of uncompressed data – a great way to fill up your hard drive. + +## Processing Huge Gzip Files + +Ideally we want to explore these files without having to uncompress them explicitly. This is possible using Python 3’s gzip module but it took me a long time to find the right options. + +Python file i/o typically allows you to read a file in line by line. If you have a text file, you can iterate over the lines using something like the following: + +
with open("my_text_file.txt", "r") as f:
+ for line in f:
+ print(line)
+
+
+
Now clearly trying this approach with a .gz file isn’t going to work. Using the [gzip][4] module we can open and uncompress gz as a stream – examining parts of the file in memory and discarding data that we’ve already seen. This is the most efficient way of dealing with a file of this magnitude that won’t fit into RAM on a modern machine and would fill a hard drive uncompressed.
+
+I tried a number of approaches using the gzip library, trying to run the gzip command line utility using [subprocess][5] and combinations of [TextIOWrapper][6] and [BufferedReader][7] but to no avail.
+
+## The Solution
+
+The solution is actually incredibly simple in Python 3 and I wasn’t far off the money with [TextIOWrapper.][6] The gzip library offers a file read/write flag for accessing gzipped text in a buffered line-by-line fashion as above for the uncompressed text file. Simply passing in “rt” to the gzip.open() function will wrap the input stream from Gzip in a TextIOWrapper and allow you to read the file line by line.
+
import gzip

# Opening with mode "rt" ("read text") makes gzip decompress on the fly and
# hand back a buffered text stream, so a multi-gigabyte archive can be read
# line by line without ever being fully expanded on disk or in RAM.
with gzip.open("2012.cdx.gz", "rt") as cdx_stream:
    for line_number, record in enumerate(cdx_stream):
        if line_number > 10:
            # Stop early -- otherwise this would print the whole file.
            break
        print(record)
+
+If you’re using an older version of Python (2.7 for example) or you would prefer to see what’s going on beneath the covers here explicitly, you can also use the following code:
+
import io
import gzip

# Older-Python equivalent of the snippet above: open the gzip stream in
# binary mode and wrap it in a TextIOWrapper by hand to get buffered,
# line-by-line text access.
with io.TextIOWrapper(gzip.open("2012.cdx.gz", "r")) as cdx_stream:
    for line_number, record in enumerate(cdx_stream):
        if line_number > 10:
            # Stop early -- otherwise this would print the whole file.
            break
        print(record)
+
+And it’s as simple as that. You can now start to break down each line in the file using tools like [urllib][8] to identify content stored in the archive from domains of interest.
+
+## Solving a problem
+
+We may want to understand how much content is available in the archive for a given Domain. To put this another way, which are the domains with the most pages that we have stored in the web archive. In order to answer this, we can run a simple script that parses all of the URLs, examines the domain name and counts instances of each.
+
+import io +import gzip +from collections import Counter +from urllib.parse import urlparse + +urlcounter = Counter() + +with gzip.open("2012.cdx.gz","rt") as gzipped: + + + for i,line in enumerate(gzipped): + + parts = line.split(" ") + + urlbits = urlparse(parts[2]) + + urlcounter[urlbits.netloc] += 1 + +#at the end we print out the top 10 URLs +print(urlcounter.most_common(10))+ +Just to quickly explain what is going on here: + + 1. We load up the CDX file in compressed text mode as described above + 2. We split each line using space characters. This gives us an array of fields, the order and content of which are described by the WebArchive team [here.][3] + 3. We parse the URL (which is at index 2) using the [urlparse][9] function which will break the URL up into things like domain, protocol (HTTP/HTTPS), path, query, fragment. + 4. We increment the counter for the current domain (described in the ‘netloc’ field of the parsed url. + 5. After iterating we print out the domains with the most URLs in the CDX file. + +This will take a long time to complete since we’re iterating over ~60TB of text. I intend to investigate parallel processing of these CDX files as a next step. + +## Conclusion + +We’ve looked into how to dynamically unzip and examine a CDX file in order to understand which domains host the most content. The next step is to identify which WARC files are of interest and request access to them from the Web Archive. 
+ + [1]: https://www.webarchive.org.uk/ukwa/ + [2]: http://commoncrawl.org/2014/04/navigating-the-warc-file-format/ + [3]: http://data.webarchive.org.uk/opendata/ukwa.ds.2/cdx/ + [4]: https://docs.python.org/3.6/library/gzip.html + [5]: https://docs.python.org/3/library/subprocess.html + [6]: https://docs.python.org/3/library/io.html#io.TextIOWrapper + [7]: https://docs.python.org/3/library/io.html#io.BufferedReader + [8]: https://docs.python.org/3/library/urllib.html + [9]: https://docs.python.org/3/library/urllib.parse.html \ No newline at end of file diff --git a/brainsteam/content/posts/2017-07-25-dialect-sensitive-topic-models.md b/brainsteam/content/posts/2017-07-25-dialect-sensitive-topic-models.md new file mode 100644 index 0000000..25e33f6 --- /dev/null +++ b/brainsteam/content/posts/2017-07-25-dialect-sensitive-topic-models.md @@ -0,0 +1,100 @@ +--- +title: Dialect Sensitive Topic Models +author: James +type: post +date: 2017-07-25T11:02:42+00:00 +url: /2017/07/25/dialect-sensitive-topic-models/ +medium_post: + - 'O:11:"Medium_Post":11:{s:16:"author_image_url";N;s:10:"author_url";N;s:11:"byline_name";N;s:12:"byline_email";N;s:10:"cross_link";s:2:"no";s:2:"id";N;s:21:"follower_notification";s:3:"yes";s:7:"license";s:19:"all-rights-reserved";s:14:"publication_id";s:2:"-1";s:6:"status";s:4:"none";s:3:"url";N;}' +categories: + - Open Source + - PhD +tags: + - lda + - machine learning + - python + - topic model + +--- +As part of my PhD I’m currently interested in topic models that can take into account the dialect of the writing. That is, how can we build a model that can compare topics discussed in different dialectical styles, such as scientific papers versus newspaper articles. If you’re new to the concept of topic modelling then [this article][1] can give you a quick primer. + +## Vanilla LDA + + + +Vanilla topic models such as [Blei’s LDA][2] are great but start to fall down when the wording around one particular concept varies too much. 
In a scientific paper you might expect to find words like “gastroenteritis”, “stomach” and “virus” whereas in newspapers discussing the same topic you might find “tummy”, “sick” and “bug”. A vanilla LDA implementation might struggle to understand that these concepts are linked unless the contextual information around the words is similar (e.g. both articles have “uncooked meat” and “symptoms last 24 hours”). + + + +We define a set of toy documents that have 3 main topics around sickness and also around health and going to the gym. Half of the documents are written in “layman’s” english and the other half “scientific” english. The documents are shown below + +
doc1 = ["tummy", "ache", "bad", "food","poisoning", "sick"]
+doc2 = ["pulled","muscle","gym","workout","exercise", "cardio"]
+doc3 = ["diet", "exercise", "carbs", "protein", "food","health"]
+doc4 = ["stomach", "muscle", "ache", "food", "poisoning", "vomit", "nausea"]
+doc5 = ["muscle", "aerobic", "exercise", "cardiovascular", "calories"]
+doc6 = ["carbohydrates", "diet", "food", "ketogenic", "protein", "calories"]
+doc7 = ["gym", "food", "gainz", "protein", "cardio", "muscle"]
+doc8 = ["stomach","crunches", "muscle", "ache", "protein"]
+doc9 = ["gastroenteritis", "stomach", "vomit", "nausea", "dehydrated"]
+doc10 = ["dehydrated", "water", "exercise", "cardiovascular"]
+doc11 = ['drink', 'water', 'daily','diet', 'health']
+
+Using a normal implementation of LDA with 3 topics, we get the following results after 30 iterations:
+
+
+
+It is fair to say that Vanilla LDA didn’t do a terrible job but it did end up with some strange decisions like putting poisoning (as in ‘food poisoning’) in with cardio and calories). The other two topics seem fairly consistent and sensible.
+
+
+
+## DiaTM
+
+Crain et al. 2010 paper [_**“Dialect topic modeling for improved consumer medical**_ search.”][3] proposes a modified LDA that they call “DiaTM”.
+
+
+
+DiaTM works in the same way as LDA but also introduces the concept of collections and dialects. A collection defines a set of documents from the same source or written with a similar dialect – you can imagine having a collection of newspaper articles and a collection of scientific papers for example. Dialects are a bit like topics – each word is effectively “generated” from a dialect and the probability of a dialect being used is defined at collection level.
+
+The handy thing is that words have a probability of appearing in every dialect which is learned by the model. This means that words common to all dialects (such as ‘diet’ or ‘food’) can be weighted as such in the model.
+
+Running DiaTM on the same corpus as above yields the following results:
+
+
+
+You can see how the model has effectively identified the three key topics in the documents above but has also segmented the topics by dialect. Topic 2 is mainly concerned with food poisoning and sickness. In dialect 0 the words “sick” and “bad” appear but in dialect 1 the words “vomit” and “gastroenteritis” appear.
+
+## Open Source Implementation
+
+I have tried to turn my experiment into a Python library that others can make use of. It is currently early stage and a little slow but it works. The code is [available here][4] and pull requests are very welcome.
+
+The library offers a ‘Scikit-Learn-like’ interface where you fit the model to your data like so:
+
+doc1 = ["tummy", "ache", "bad", "food","poisoning", "sick"]
+doc2 = ["pulled","muscle","gym","workout","exercise", "cardio"]
+doc3 = ["diet", "exercise", "carbs", "protein", "food","health"]
+doc4 = ["stomach", "muscle", "ache", "food", "poisoning", "vomit", "nausea"]
+doc5 = ["muscle", "aerobic", "exercise", "cardiovascular", "calories"]
+doc6 = ["carbohydrates", "diet", "food", "ketogenic", "protein", "calories"]
+doc7 = ["gym", "food", "gainz", "protein", "cardio", "muscle"]
+doc8 = ["stomach","crunches", "muscle", "ache", "protein"]
+doc9 = ["gastroenteritis", "stomach", "vomit", "nausea", "dehydrated"]
+doc10 = ["dehydrated", "water", "exercise", "cardiovascular"]
+doc11 = ['drink', 'water', 'daily','diet', 'health']
+
+collection1 = [doc1,doc2,doc3, doc7, doc11]
+# 'scientific' documents
+collection2 = [doc4,doc5,doc6, doc8, doc9, doc10]
+
+collections = [collection1, collection2]
+
+dtm = DiaTM(n_topic=3, n_dialects=2)
+dtm.fit(X)
+
+
+Fitting the model to new documents using transform() will be available soon as will finding the log probability of the current model parameters.
+
+ [1]: http://www.kdnuggets.com/2016/07/text-mining-101-topic-modeling.html
+ [2]: http://dl.acm.org/citation.cfm?id=2133826
+ [3]: http://www.ncbi.nlm.nih.gov/pubmed/21346955
+ [4]: https://github.com/ravenscroftj/diatm
\ No newline at end of file
diff --git a/brainsteam/content/posts/2017-08-03-182.md b/brainsteam/content/posts/2017-08-03-182.md
new file mode 100644
index 0000000..bdd7060
--- /dev/null
+++ b/brainsteam/content/posts/2017-08-03-182.md
@@ -0,0 +1,266 @@
+---
+title: Why I keep going back to Evernote
+author: James
+type: post
+date: 2017-08-03T08:27:53+00:00
+url: /2017/08/03/182/
+featured_image: /wp-content/uploads/2017/08/cahier-spirale-825x510.png
+medium_post:
+ - 'O:11:"Medium_Post":11:{s:16:"author_image_url";s:69:"https://cdn-images-1.medium.com/fit/c/200/200/0*naYvMn9xdbL5qlkJ.jpeg";s:10:"author_url";s:30:"https://medium.com/@jamesravey";s:11:"byline_name";N;s:12:"byline_email";N;s:10:"cross_link";s:2:"no";s:2:"id";s:12:"5ce618eb3174";s:21:"follower_notification";s:3:"yes";s:7:"license";s:19:"all-rights-reserved";s:14:"publication_id";s:2:"-1";s:6:"status";s:6:"public";s:3:"url";s:139:"https://medium.com/@jamesravey/as-the-cto-for-a-london-machine-learning-startup-and-a-phd-student-at-warwick-institute-for-the-5ce618eb3174";}'
+categories:
+ - PhD
+ - Work
+tags:
+ - evernote
+ - filament
+ - knowledge management
+ - markdown
+ - phd
+
+---
+import pyximport; pyximport.install()
+
+Then you can literally just import your library. Imagine your Cython file is called test.pyx, you can just do:
+
+import test
+
+and off you go.
+
+If, like me, you’re a big fan of Jupyter notebooks and using importlib reload to bring in new versions of models you’re developing, Cython and pyximport offer a hack that supports this. When you import pyximport, add reload_support=True to the install function call to enable this.
+
+import pyximport; pyximport.install(reload_support=True)
+
+I found this to be very hacky and that reloading often failed with this method unless preceded by another import statement. Something like this usually works:
+
+from importlib import reload
+import test
+reload(test)
+
+
+## Optimising and Understanding Cython Code
+
+Remember that Cython code is first “re-written” or “transpiled” to C code and then is compiled to machine readable binary by your system’s C compiler. Well written C is still one of the fastest languages you can write an application in (but also complex and easy to cause a crash from). Since Python is an interpreted language that lives inside a virtual environment, each operation – such as adding together two numbers – actually translates to several C expressions.
+
+Well written Cython code can be compiled down to a small number of instructions but badly optimised Cython will just result in lines and lines of C code. In these cases, the benefit you’re going to be getting from having written the module in Cython is likely to be negligible over standard interpreted Python code.
+
+Cython comes with a handy tool which generates a HTML report showing how well optimised your code is. You can run it on your code by doing
+
+cython -a test.pyx
+
+What you should now have is a test.c file and a test.html file in the directory. If you open the HTML file in the browser you’ll see your Cython code and yellow highlights. It’s pretty simple: the brighter/more intense the yellow colouring, the more likely it is that your code is interacting with normal Python objects rather than pure C ones and ergo the more likely it is that you can optimise that code and speed things up*.
+
+*Of course this isn’t always the case. In some cases you will want to be interacting with the Python world like in code that passes the output from a highly optimised C function back into the world of the Python interpreter so that it can be used by normal Python code.
+
+If you’re trying to squeeze loads of performance out of Cython, what you should be aiming for is to get to a point where all your variables have a C type (by using **cdef** to declare them before you use them) and by only applying C operations and functions wherever possible.
+
+For example the code:
+
+i = 0 +while i < 99: + i += 1 ++ +will result in + + [1]: http://cython.org/ \ No newline at end of file diff --git a/brainsteam/content/posts/2017-08-11-machine-learning-and-hardware-requirements.md b/brainsteam/content/posts/2017-08-11-machine-learning-and-hardware-requirements.md new file mode 100644 index 0000000..86dfac8 --- /dev/null +++ b/brainsteam/content/posts/2017-08-11-machine-learning-and-hardware-requirements.md @@ -0,0 +1,87 @@ +--- +title: Machine Learning and Hardware Requirements +author: James +type: post +date: 2017-08-11T17:22:12+00:00 +draft: true +url: /?p=195 +medium_post: + - 'O:11:"Medium_Post":11:{s:16:"author_image_url";s:69:"https://cdn-images-1.medium.com/fit/c/200/200/0*naYvMn9xdbL5qlkJ.jpeg";s:10:"author_url";s:30:"https://medium.com/@jamesravey";s:11:"byline_name";N;s:12:"byline_email";N;s:10:"cross_link";s:2:"no";s:2:"id";s:12:"6e9abb882f26";s:21:"follower_notification";s:3:"yes";s:7:"license";s:19:"all-rights-reserved";s:14:"publication_id";s:2:"-1";s:6:"status";s:6:"public";s:3:"url";s:86:"https://medium.com/@jamesravey/machine-learning-and-hardware-requirements-6e9abb882f26";}' +categories: + - Uncategorized + +--- +_**With recent advances in machine learning techniques, vendors like [Nvidia][1], [Intel][2], [AMD][3] and [IBM][3] are announcing hardware offerings specifically tailored around machine learning. In this post we examine the key differences between “traditional” software and machine learning software and why those differences necessitate a new type of hardware stack.**_ + +Most readers would certainly be forgiven for wondering why NVidia (NVDA on the stock market), a company that rose to prominence for manufacturing and distributing graphics processing chips to video games enthusiasts, are suddenly being mentioned in tandem with machine learning and AI products. You would also be forgiven for wondering why machine learning needs its own hardware at all. Surely a program is a program right? 
To understand how these things are connected, we need to talk a little bit about how software runs and the key differences between a procedural application that you’d run on your smart phone versus a deep neural network. + +## How Traditional (Procedural) Software Works + + + +You can think of software as a series of instructions. In fact, that’s all an algorithm is. A cooking recipe that tells you how to make a cake step-by-step is a real world example of an algorithm that you carry out by hand every day. + +Traditional software is very similar to a food recipe in principle. + + 1. First you define your variables (a recipe tells you what ingredients you need and how much you’ll need for each). + 2. Then you follow a series of instructions. (Measure out the flour, add it to the mixing bowl, measure out the sugar, add that to the bowl). + 3. Somewhere along the way you’re going to encounter conditions (mix in the butter until the mixture is smooth or whip the cream until it is stiff). + 4. At the end you produce a result (i.e. you present the cake to the birthday girl or boy). + +A traditional Central Processing Unit (CPU) that you’d find in your laptop, mobile phone or server is designed to process one instruction at a time. When you are baking a cake that’s fine because often the steps are dependent upon each other. You wouldn’t want to beat the eggs, put them in the oven and start pouring the flour all at the same time because that would make a huge mess. In the same way, it makes no sense to send each character in an email at the same time unless you want the recipient’s message to be garbled. + +## Parallel Processing and “Dual Core” + +Over the last 2 decades, processing speed of CPUs has got faster and faster which effectively means that they are able to do more and more instructions one at a time. Imagine moving from one person making a cake to a machine that makes cakes on a conveyer belt. 
However, consumer computing has also become more and more demanding and with many homes globally connected to high speed internet, multitasking, running more than one application on your laptop at the same time or looking at multiple tabs in your browser, is becoming more and more common. + +Before Parallel Processing (machines that advertise being “dual core”, and more recently “quad core” and even “octo-core”), computers appeared to be running multiple applications at the same time by doing little bits of each of the applications and switching around. Continuing our cake analogy, this would be like putting a chocolate cake in the oven and then proceeding to mix the flour and eggs for a vanilla sponge all the time, periodically checking that the chocolate cake isn’t burning. + +Multi-processing (dual/quad/octo core) allows your computer really run multiple programs at the same time, rather than appearing to. This is because each chip has 2 (duo) 4 (quad) or 8 (octo) CPUs all working on the data at the same time. The cake analogy is that we now have 2 chefs or even 2 conveyer belt factory machines. + +## How [Deep] Neural Networks Work + +Neural Networks are modelled around how the human brain processes and understands information. Like a brain, they consist of neurons which get excited under certain circumstances like observing a particular word or picture and synapses which pass messages between neurons. Training a neural network is about strengthening and weakening the synapses that connect the neurons to manipulate which neurons get excited based on particular inputs. This is more or less how humans learn too! + +The thing about human thinking is that we don’t tend to process the things we see and hear in small chunks, one at a time, like a traditional processor would. We process a whole image in one go, or at least if feels that way right? Our brains do a huge amount of parallel processing. 
Each neuron in our retinas receives a small part of the light coming in through our eyes and through communication via the synapses connecting our brain cells, we assemble a single coherent image. + + + +Simulated neural networks work in the same way. In a model learning to recognise faces in an image, each neuron receives a small part of the picture – usually a single pixel – carries out some operation and passes the message along a synapse to the next neuron which carries out an operation. The calculations that each neuron makes is largely independent unless it is waiting for the output from a neuron the next layer up. That means that while it is possible to simulate a neural network on a single CPU, it is very inefficient because it has to calculate what each neuron’s verdict about it’s pixel is independently. It’s a bit like the end of the Eurovision song contest where each country is asked for its own vote over the course of about an hour. Or if you’re unfamiliar with our wonderful but[ obscure european talent contest][4], you could say its a bit like a government vote where each representative has to say “Yea” or “Ney” one after another. Even with a dual, quad or octo core machine, you can still only simulate a small number of neurons at a time. If only there was a way to do that… + +## Not Just for Gaming: Enter NVidia and GPUs + + + +GPUs or Graphical Processing Units are microprocessors that were historically designed for running graphics-based workloads such as rendering 3D models in video games or animated movies like Toy Story or Shrek. Graphics workloads are also massively parallel in nature. + +An image on a computer is made up of a series of pixels. In order to generate a coherent image, a traditional single-core CPU has to calculate what colour each pixel should be one-by-one. When a modern (1280×1024) laptop screen is made up of 1310720 pixels – that’s 1.3 million pixels. 
If we’re watching a video, which usually runs at 30 frames per second, we’re looking at nearly 40 million pixels per second that have to be processed. That is a LOT of processing. If we’re playing a video game, then on top of this your CPU has to deal with all the maths that comes with running around a virtual environment and the behaviours and actions of the in-game characters. You can see how things could quickly add up and your machine grind to a halt. + +GPUs, unlike CPUs are made up of thousands – that’s right, not duo or octo but thousands of processing cores so that they can do a lot of that pixel rendering in parallel. The below video, which is also hosted on the [NVidia website,][5] gives an amusing example of the differences here. + +
pip install findspark+ +Then you’ll want to export SPARK_HOME environment variable so that findspark knows where to look for the libraries (if you don’t do this, you’ll get an error in your python session. + +
export SPARK_HOME=/usr/local/spark+ +Obviously you’ll want to change this if you’re working with a Spark install at a different location – just point it to the root directory of the Spark installation that you unzipped above. + +a pro-tip here is to actually add this line to your .bashrc or .profile files so that every time you start a new terminal instance, this information is already available. + +## Python and Findspark first steps + +If you did the above properly you can now launch python and start your first Spark job. + +Try running the following: + +
import findspark
+findspark.init()
+import pyspark #if the above didn't work then you'll get an error here
+from pyspark.sql import SQLContext
+if __name__ == "__main__":
+    """
+    Usage: pi [partitions]
+    """
+    sc = pyspark.SparkContext()
+    partitions = int(sys.argv[1]) if len(sys.argv) > 1 else 2
+    n = 100000 * partitions
+def f(_):
+    x = random() * 2 - 1
+    y = random() * 2 - 1
+    return 1 if x ** 2 + y ** 2 <= 1 else 0
+
+count = sc.parallelize(range(1, n + 1), partitions).map(f).reduce(add)
+print("Pi is roughly %f" % (4.0 * count / n))
+
+sc.stop()
+
+
+
+
+ [1]: https://spark.apache.org/downloads.html
+ [2]: https://pypi.python.org/pypi/findspark
\ No newline at end of file
diff --git a/brainsteam/content/posts/2018-01-27-how-i-became-a-gopher.md b/brainsteam/content/posts/2018-01-27-how-i-became-a-gopher.md
new file mode 100644
index 0000000..096d7a1
--- /dev/null
+++ b/brainsteam/content/posts/2018-01-27-how-i-became-a-gopher.md
@@ -0,0 +1,110 @@
+---
+title: How I became a gopher over christmas
+author: James
+type: post
+date: 2018-01-27T10:09:34+00:00
+url: /2018/01/27/how-i-became-a-gopher/
+medium_post:
+ - 'O:11:"Medium_Post":11:{s:16:"author_image_url";s:69:"https://cdn-images-1.medium.com/fit/c/200/200/0*naYvMn9xdbL5qlkJ.jpeg";s:10:"author_url";s:30:"https://medium.com/@jamesravey";s:11:"byline_name";N;s:12:"byline_email";N;s:10:"cross_link";s:2:"no";s:2:"id";s:12:"452cd617afb4";s:21:"follower_notification";s:3:"yes";s:7:"license";s:19:"all-rights-reserved";s:14:"publication_id";s:2:"-1";s:6:"status";s:6:"public";s:3:"url";s:95:"https://medium.com/@jamesravey/how-i-became-a-gopher-and-learned-myself-an-angular-452cd617afb4";}'
+categories:
+ - Uncategorized
+tags:
+ - chatbots
+ - filament
+ - go
+
+---
+Happy new year to one and all. It’s been a while since I posted and life continues onwards at a crazy pace. I meant to publish this post just after Christmas but have only found time to sit down and write now.
+
+If anyone is wondering what’s with the crazy title – a gopher is someone who practices the Go programming language (just as those who write in Python refer to themselves as pythonistas. There’s an interesting list of labels that programmers self-assign [here][1] if you’re interested).
+
+Over Christmas I decided I was going to have a break from working as a [CTO][2] and a [PhD student][3] in order to do something relaxing. That’s why I thought I’d teach myself a new programming language. I also taught myself how to use Angular.js too but I’ll probably write about that separately.
+
+## Go Go Gadget… keyboard?
+
+First on my list is the Go programming language. I decided that I would spend 2 weeks over xmas (in between playing with my nintendo switch and of course spending time with my lovely fiancee and our families) building something useful and practical in the language because if there’s one thing I can’t stand it’s trying to learn to use a programming language by writing yet another [todo list.][4]
+
+
+
+At Filament, we are fast becoming one of the UK’s leading chat-bot providers, working with brands like T-Mobile and Hiscox insurance. We have an excellent team of chat-bot builders run by our very own Mr Chat-bot, [Rory][5] and supported by a very stringent but also very manual QA process. I decided I was going to try and help the team work smarter by building a PoC chatbot testing framework.
+
+The general gist of my tool, named Chatty Cathy, is that I can “record” a conversation with a bot, go make some changes to the intents and flows and then “playback” my conversation to make sure the bot still responds in the way I’d like.
+
+## Why Go?
+
+
+
+If you hadn’t gathered, I’m in to performance-sensitive compute applications: AI, statistical modelling, machine learning, natural language processing. All of these things need a lot of juice. We’re also in the business of writing large-scale web applications that serve these kinds of machine learning models to huge numbers of users. I love python and node.js as much as the next man but most machine learning applications that have interfaces in those languages are written in something low level (C or C++) and bound to these higher level languages for ease of use. I know that Go is still higher level than C or something like Rust* but it outperforms all of the interpreted languages and Java in the [Benchmark Games][6] and is very easy to learn (see below). It is this trade-off relatively-high-performance-versus-ease-of-use that has me so excited.
+
+*Incidentally I spent some time last year working in Rust and even though I loved it, found it very hard going – I’ve been following [Julia Evans’ Ruby Profiler][7] project and it’s got me excited about Rust again so maybe I’ll dive back in some day soon.
+
+### Pros of Working with Go
+
+go +└── src + ├── github.com + │ ├── author1 + │ │ └── project1 + │ │ ├── LICENSE.txt + │ │ ├── README.md + │ │ ├── somefile.go + │ │ ├── more.go + │ ├── author2 + │ │ └── project2 + │ │ ├── LICENSE.txt + │ │ ├── README.md + │ │ ├── somefile.go + │ │ ├── more.go+ +Basically, each project gets stored somewhere hierarchically in the tree and all dependencies of your project end up somewhere in here. This can make it a little bit confusing when you’re working on really large projects but then perhaps this is no more confusing than a python virtualenv or the node_modules directory of a mature node.js app and I’m being silly? + + * Some of the libraries are still a little bit immature. I don’t mean to be disparaging in any way towards the authors of these open source libraries that are doing a fantastic job for absolutely zero payment. However, a lot of the java tooling we use at work has been in development for 2 (in some cases nearly 3) decades and a lot of the crazier use cases I want to try and do are supported out of the box. + * I’m still not a massive fan of the way that [struct tag][8] syntax works. Again, perhaps this is because I am a programming luddite but I definitely prefer Java Annotations and Python Decorators a lot more. + +### Summary + +Go is a really exciting language for me personally but also for many of us still plagued by nightmarish visions of java boilerplate in our sleep. It has a fantastic ecosystem and first-class support for parallel programming and building web services via a really nice set of REST server libraries and JSON serialization libraries that are already built in. There are a few small issues that probably still need addressing but I’m sure they will come out in the wash. + +The most exciting thing for me is how well Go slots into the toolbox of any competent imperative language programmer (e.g. C, C++, Java, Python, Javascript, C#, PHP etc.). 
Whichever language you come from there are likely to be a few minor changes to get used to but the syntax and concepts are familiar enough that you can be up and running in no time at all! + +Is Go ready to be used in prime-time production-ready systems? I definitely think so but if you don’t believe me, why don’t you ask [the Docker Foundation?][9] + + [1]: https://gist.github.com/coolaj86/9256619 + [2]: http://filament.uk.com/ + [3]: https://www.wisc.warwick.ac.uk/people/student-profiles/2015-intake/james-ravenscroft/ + [4]: https://github.com/search?l=JavaScript&q=todo&type=Repositories&utf8=%E2%9C%93 + [5]: https://disruptionhub.com/whats-crappy-chatbots/ + [6]: http://benchmarksgame.alioth.debian.org/u64q/compare.php?lang=go&lang2=node + [7]: https://jvns.ca/blog/2017/12/02/taking-a-sabbatical-to-work-on-ruby-profiling-tools/ + [8]: https://golang.org/ref/spec#Tag + [9]: https://www.slideshare.net/jpetazzo/docker-and-go-why-did-we-decide-to-write-docker-in-go \ No newline at end of file diff --git a/brainsteam/content/posts/2018-01-27-ui-router-misery.md b/brainsteam/content/posts/2018-01-27-ui-router-misery.md new file mode 100644 index 0000000..04b2105 --- /dev/null +++ b/brainsteam/content/posts/2018-01-27-ui-router-misery.md @@ -0,0 +1,13 @@ +--- +title: Upgrading from legacy ui-router +author: James +type: post +date: -001-11-30T00:00:00+00:00 +draft: true +url: /?p=231 +medium_post: + - 'O:11:"Medium_Post":11:{s:16:"author_image_url";N;s:10:"author_url";N;s:11:"byline_name";N;s:12:"byline_email";N;s:10:"cross_link";N;s:2:"id";N;s:21:"follower_notification";N;s:7:"license";N;s:14:"publication_id";N;s:6:"status";N;s:3:"url";N;}' +categories: + - Uncategorized + +--- diff --git a/brainsteam/content/posts/2018-03-21-re-using-machine-learning-models-and-the-no-free-lunch-theorem.md b/brainsteam/content/posts/2018-03-21-re-using-machine-learning-models-and-the-no-free-lunch-theorem.md new file mode 100644 index 0000000..b703024 --- /dev/null +++ 
b/brainsteam/content/posts/2018-03-21-re-using-machine-learning-models-and-the-no-free-lunch-theorem.md @@ -0,0 +1,133 @@ +--- +title: Re-using machine learning models and the “no free lunch” theorem +author: James +type: post +date: 2018-03-21T11:26:27+00:00 +url: /2018/03/21/re-using-machine-learning-models-and-the-no-free-lunch-theorem/ +medium_post: + - 'O:11:"Medium_Post":11:{s:16:"author_image_url";s:69:"https://cdn-images-1.medium.com/fit/c/200/200/0*naYvMn9xdbL5qlkJ.jpeg";s:10:"author_url";s:30:"https://medium.com/@jamesravey";s:11:"byline_name";N;s:12:"byline_email";N;s:10:"cross_link";s:2:"no";s:2:"id";s:12:"dd5196577b34";s:21:"follower_notification";s:3:"yes";s:7:"license";s:19:"all-rights-reserved";s:14:"publication_id";s:2:"-1";s:6:"status";s:6:"public";s:3:"url";s:106:"https://medium.com/@jamesravey/re-using-machine-learning-models-and-the-no-free-lunch-theorem-dd5196577b34";}' +categories: + - Uncategorized + +--- +## Why re-use machine learning models? + +
+ +
+ ++ You can get a lot of value out of training a machine learning model to solve a single use case, like predicting emotion in your customer chatbot transcripts and putting the angry ones through to real humans. However, you might be able to extract even more value out of your model by using it in more than one use case. You could use an emotion model to prioritise customer chat sessions but also to help monitor incoming email inquiries and social media channels too. A model can often be deployed across multiple channels and use cases with significantly less effort than creating a new, complete training set for each problem your business encounters. However there are some caveats that you should be aware of. In particular, the “No Free Lunch Theorem” which is concerned with the theoretical drawbacks of deploying a model across multiple use cases. +
++ +
++ +
+ ++ Not exactly. The theorem says that there’s no correlation between your model’s performance in its intended environment and its performance in a completely new environment. However, it doesn’t rule out the possibility of correlations if we know the nature of the new problems and data. You can think about it in terms of human expertise and specialisation. Humans learn to specialise as they work their way through the education system. A medical doctor and a veterinarian both go through extensive training in order to be able to carry out medical procedures on humans and animals respectively. A veterinarian might be excellent at operating on different species of animals. However, a veterinarian would not be able to operate on an injured human to the same level of professionalism as a medical doctor without some additional training. +
++ +
+ ++ On September 23rd, 1999, a $125 Million Mars lander burned up in orbit around the red planet, much to the dismay of the incredibly talented engineering teams who worked on it. The theory was sound. Diligent checks had been carried out. So why, then, had the mission gone up in flames? A review panel meeting showed that a miscommunication between teams working on the project was the cause. One set of engineers expressed force in pounds, the other team preferred to use newtons. Neither group were wrong but this small miscommunication had everything grinding to a halt.
+ I’ve recently been trying to become intimately familiar with how LSH works in theory and practice in order to solve some prickly comparison problems with O(n²) comparisons. +
+ ++ For the uninitiated, LSH or Locally Sensitive Hashing is a method frequently used for “sketching” large data structures in a way that allows quick comparison and grouping of similar items without having to compare every item with every other item in the dataset. It’s often mentioned in the same breath as “the curse of dimensionality”: the problem of dealing with complex data structures like documents and images that must be represented in terms of the words or pixels that they contain which quickly add up and require enormous amounts of memory and compute time for processing. +
+ ++ The literature on LSH is factually accurate and mathematically complete but at the same time it’s really hard going. At the other end of the spectrum are some incredibly helpful blog posts that tell you how LSH works in practice. This post aims to explain the connections between the two. +
+ +## Nearest-Neighbour (NN) Problem + +The nearest neighbour problem is the issue of finding data points or items “most similar” to a particular starting point or “query”. For example, we are building a music recommendation system and we know that the user likes song **q**. We want to find songs similar to song **q** to recommend to the user. We can represent each song as a vector of their attributes – let’s say for simplicity that we’re using 3 dimensions on a scale 1-10: Tempo of music (slow to fast), Singer Pitch (deep to high) and Heaviness (Pop Rock to Death Metal). If you plot all of the songs in your catalogue in this way, the ones with a similar sound should end up clustered together. + +In order to find the nearest neighbours for a given data point, for example **q is “Van Halen – Jump”** – we have to loop over all items, find the [euclidean distance][1] between the points and then take the point with the smallest distance as the most similar, in this case, it’s Queen’s Boh-rhap! In this case there are 6 songs and therefore only 5 comparisons. What if we have a music library of millions of songs? That’s an awful lot of comparisons! + +It’s also reasonable to assume that we’d be interested in comparing more than 3 attributes of a song – we can’t render more than 3 dimensions on a diagram but as you can imagine if there are 1000 or even 10,000 attributes then working out the euclidean distance becomes much harder. How can we reduce the number of comparisons needed? + +## Approximate Nearest-Neighbour (NN) Problem + +In order to speed up the recommendation process, we need to artificially reduce the number of comparisons that our system has to make. What if we had some prior knowledge about which part of the feature space song **q** is in and chose only to compare it with other songs from that space? 
+ +Drawing on from the example above, let’s imagine that we divide our space into two buckets: ‘rock’ and ‘metal’ – if we already know that Van Halen – Jump is a rock song then we can immediately discount Dragonforce, Slipknot and System of a Down as possible nearest neighbours and compare only with Aerosmith and Queen. + +You may have already noticed that there’s a catch here. We’re at risk of missing potential nearest neighbours that sit on the border of our divisions. Metallica – Nothing Else Matters is an unusually slow balladic number from the thrash metal heavyweights and many people who don’t otherwise like Metallica might enjoy it – especially if they like Aerosmith’s Don’t Wanna’ Miss a Thing and other pop-rock ballads. The trade-off here is one of speed versus accuracy. By drawing lines of division down through our collection, we reduce the number of comparisons we usually need to make but risk missing near neighbours that are “on the edge” in the process. We can somewhat address this problem that by dividing our collection up into buckets in a few different ways and checking a handful of them. For example Queen – Bohemian Rhapsody could belong to “singers with high voices” and “rock ballads”. 
+ + + + + + [1]: https://en.wikipedia.org/wiki/Euclidean_distance \ No newline at end of file diff --git a/brainsteam/content/posts/2018-10-18-dont-forget-your-life-jacket-the-dangers-of-diving-in-deep-at-the-deep-end-with-deep-learning.md b/brainsteam/content/posts/2018-10-18-dont-forget-your-life-jacket-the-dangers-of-diving-in-deep-at-the-deep-end-with-deep-learning.md new file mode 100644 index 0000000..7a4b0c0 --- /dev/null +++ b/brainsteam/content/posts/2018-10-18-dont-forget-your-life-jacket-the-dangers-of-diving-in-deep-at-the-deep-end-with-deep-learning.md @@ -0,0 +1,227 @@ +--- +title: 'Don’t forget your life jacket: the ‘dangers’ of diving in deep at the deep end with deep learning' +author: James +type: post +date: 2018-10-18T14:35:05+00:00 +url: /2018/10/18/dont-forget-your-life-jacket-the-dangers-of-diving-in-deep-at-the-deep-end-with-deep-learning/ +featured_image: /wp-content/uploads/2018/10/livesaver-825x510.png +medium_post: + - 'O:11:"Medium_Post":11:{s:16:"author_image_url";s:69:"https://cdn-images-1.medium.com/fit/c/200/200/0*naYvMn9xdbL5qlkJ.jpeg";s:10:"author_url";s:30:"https://medium.com/@jamesravey";s:11:"byline_name";N;s:12:"byline_email";N;s:10:"cross_link";s:2:"no";s:2:"id";s:12:"735db0cf9d14";s:21:"follower_notification";s:3:"yes";s:7:"license";s:19:"all-rights-reserved";s:14:"publication_id";s:12:"6fc55de34f53";s:6:"status";s:6:"public";s:3:"url";s:137:"https://medium.com/@jamesravey/dont-forget-your-life-jacket-the-dangers-of-diving-in-deep-at-the-deep-end-with-deep-learning-735db0cf9d14";}' +categories: + - PhD + - Work +tags: + - deep learning + - filament + - machine learning + - neural networks + +--- ++ +
+ ++ It’s unquestionable that over the last decade, deep learning has changed machine learning landscape for the better. Deep Neural Networks (DNNs), first popularised by Yan LeCunn, Yoshua Bengio and Geoffrey Hinton, are a family of machine learning models that are capable of learning to see and categorise objects, predict stock market trends, understand written text and even play video games.
+ +
++ +
+ ++ Conversely, one of the most exciting things about “deep learning” is that these models are able to learn complex features for themselves over time. Just like a human brain slowly assigns meaning to the seemingly random photons that hit our retinas, deep networks are able to receive series of pixels from images and slowly learn which patterns of pixels are interesting or predictive. The caveat is that automatically deriving these features requires huge volumes of data to learn from (see point 3). Ultimately a deep learning model may be able to implicitly learn features of the data that human data scientists are unable to isolate but if a classical, hand-engineered model gets you to 90% accuracy, is the extra data gathering and compute power worth it for that 5-7% boost? +
++ +
+ ++ It often makes sense to prefer simpler models in cases where compute resource is at a premium or even not available and where classical models give “good enough” accuracy. For example in an edge computing environment in a factory or in an anti-fraud solution at a retail bank where millions of transactions must be examined in real-time. It would either be impossible or obscenely expensive to run a complex deep learning model on millions of data records in real time. Or, it might not be practical to install a cluster of whirring servers into your working environment. On the other hand, if accuracy is what you need and you have lots of data then maybe its time to buy those GPUs… +
+api error: Authentication required: Authorization error: <REPO_URL>/info/lfs/objects/batch +Check that you have proper access to the repository +batch response: Authentication required: Authorization error: <REPO_URL>/info/lfs/objects/batch +Check that you have proper access to the repository+ +Irritatingly I couldn’t find any references to this particular error message or documentation. But, I had a hunch that the authentication on the LFS upload was timing out because I was able to upload smaller files that don’t take as long to send. + +It turns out that this is exactly what was happening. When you push to a Gitea SSH repository, the server gives your local machine an authorization token that it can use to upload the LFS files. This token has an expiry time which defaults to 20 minutes in the future. If you’re uploading 5GB of data over 100Mb/down 10Mb/up DSL line then you’re gonna have a bad time… + +I had a dig through the Gitea github repository and came across an example [config file][4] which includes a variable called LFS\_HTTP\_AUTH_EXPIRY with a default value of 20m. In your gitea config file you can set this to 120m then you have 2 hours to get that file uploaded. Adjust as you see fit/require. + + [1]: https://gitea.io/en-us/ + [2]: https://pimylifeup.com/raspberry-pi-gitea/ + [3]: https://git-lfs.github.com/ + [4]: https://github.com/go-gitea/gitea/blob/master/custom/conf/app.ini.sample \ No newline at end of file diff --git a/brainsteam/content/posts/2018-11-07-why-is-tmux-crashing-on-start.md b/brainsteam/content/posts/2018-11-07-why-is-tmux-crashing-on-start.md new file mode 100644 index 0000000..9e149b3 --- /dev/null +++ b/brainsteam/content/posts/2018-11-07-why-is-tmux-crashing-on-start.md @@ -0,0 +1,25 @@ +--- +title: Why is Tmux crashing on start? 
+author: James +type: post +date: 2018-11-07T07:40:45+00:00 +url: /2018/11/07/why-is-tmux-crashing-on-start/ +medium_post: + - 'O:11:"Medium_Post":11:{s:16:"author_image_url";N;s:10:"author_url";N;s:11:"byline_name";N;s:12:"byline_email";N;s:10:"cross_link";s:2:"no";s:2:"id";N;s:21:"follower_notification";s:3:"yes";s:7:"license";s:19:"all-rights-reserved";s:14:"publication_id";s:2:"-1";s:6:"status";s:4:"none";s:3:"url";N;}' +categories: + - Open Source +tags: + - linux + - script + - tmux + +--- +I spent several hours trying to get to the bottom of why tmux was crashing as soon as I ran it on Fedora. It turns out there’s a simple fix. When tmux starts it uses /dev/ptmx to create a new TTY (virtual terminal) that the user can interact with. If your user does not have permission to access this device then tmux will silently die. A good way to verify this is to try running [screen][1] too. + +In my case I realised that my user was not a member of the user group “tty” on my system. The answer was therefore simple: + +
sudo usermod -a -G tty james+ +I hope this helps someone avoid spending hours searching for the right incantation. + + [1]: https://en.wikipedia.org/wiki/GNU_Screen \ No newline at end of file diff --git a/brainsteam/content/posts/2018-12-09-🤐🤐can-bots-keep-secrets-the-future-of-chatbot-security-and-conversational-hacks.md b/brainsteam/content/posts/2018-12-09-🤐🤐can-bots-keep-secrets-the-future-of-chatbot-security-and-conversational-hacks.md new file mode 100644 index 0000000..57e6a9d --- /dev/null +++ b/brainsteam/content/posts/2018-12-09-🤐🤐can-bots-keep-secrets-the-future-of-chatbot-security-and-conversational-hacks.md @@ -0,0 +1,121 @@ +--- +title: 🤐🤐Can Bots Keep Secrets? The Future of Chatbot Security and Conversational “Hacks” +author: James +type: post +date: 2018-12-09T10:36:34+00:00 +url: /2018/12/09/🤐🤐can-bots-keep-secrets-the-future-of-chatbot-security-and-conversational-hacks/ +medium_post: + - 'O:11:"Medium_Post":11:{s:16:"author_image_url";s:69:"https://cdn-images-1.medium.com/fit/c/200/200/0*naYvMn9xdbL5qlkJ.jpeg";s:10:"author_url";s:30:"https://medium.com/@jamesravey";s:11:"byline_name";N;s:12:"byline_email";N;s:10:"cross_link";s:3:"yes";s:2:"id";s:12:"8be78d43ff66";s:21:"follower_notification";s:3:"yes";s:7:"license";s:19:"all-rights-reserved";s:14:"publication_id";s:12:"6fc55de34f53";s:6:"status";s:6:"public";s:3:"url";s:121:"https://medium.com/@jamesravey/can-bots-keep-secrets-the-future-of-chatbot-security-and-conversational-hacks-8be78d43ff66";}' +categories: + - Work +tags: + - bots + - chatbots + - nlu + - security + +--- +**As adoption of chatbots and conversational interfaces continues to grow, how will businesses keep their brand safe and their customer’s data safer?** + +From [deliberate infiltration of systems][1] to[ bugs that cause accidental data leakage][2], these days, the exposure or loss of personal data is a large part of what occupies almost every self-respecting CIO’s mind. 
Especially since [the EU has just slapped its first defendant with a GDPR fine.][3] + +Over the last 10-15 years, through the rise of the “interactive” web and social media, many companies have learned the hard way about the importance of techniques like [hashing passwords][4] stored in databases and [sanitising user input before it is used for querying databases][5]. However as the use of chatbots continues to grow, conversational systems are almost certain to become an attractive method of attack for discerning hackers. + +In this article I’m going to talk about some different types of chatbot attacks that we might start to see and what could be done to prevent them. + +## Man in the Middle Attack + +In a man in the middle attack, the adversary intercepts traffic in between the many components that make up a chatbot. Baddies might be able to [inject something into a library][6] that your beautiful UX uses that logs everything that your user is saying or they might not need to change the code at all [if you are not using HTTPS][7]. + +These sorts of attacks are clearly a serious problem for any chatbot that will be talking to users about personal information. Even if your chatbot is designed to answer frequently asked questions without any specific link to personal accounts, vulnerability to this attack could give away personal information that the user has inadvertently shared (From “Do you have kids meals?” and “Do you deliver to Example Street” we can infer that the user has children and lives on Example Street). + +### Mitigation + +Developers of chatbots should make sure that bots are using the [latest security standards][8] – at a minimum [all communication should be encrypted at the transport layer (e.g. HTTPS)][9] but you might also consider [encrypting the actual messages][10] before they are transmitted as well. 
If you’re reliant on external open source libraries then make sure you [regularly run security checks on your codebase][11] to make sure that those external libraries can be trusted. If you are deploying a bot in a commercial context then you should definitely have independent security/penetration testing of chatbots as a key part of your quality assurance process. + +## Exploitation of Third Party Services + +The chatbot has often been seen as the “silver bullet” for quickly acquiring usage. No longer do you need to build an app that users have to install on their devices, simply integrate with the platforms that people already use e.g. Facebook, Google Home, Alexa and others. However, it’s important to remember the security consequences of this approach, especially in use cases with sensitive personal information and high stakes if there was ever a data leak. + +In this scenario your bot’s security is heavily reliant on the security of the messaging platform that you deploy your system onto. For the most part, these platforms typically have[ sensible security procedures][12]. However it’s important to consider that large companies and platforms are desirable targets for hackers due to the huge potential personal data pay off from a successful breach. + +Of course it’s not just the “Messenger Platform” part of this system that’s of interest to attackers. The “External NLU provider” in our diagram above could also be the target of an attack and user utterances stolen. Remember that any external service, whilst useful in many use cases, should be regarded with a healthy scepticism where security is concerned. + +### Mitigation + +If you are building chatbots tied to third party platforms then you can try to mitigate risks by coding defensively and sharing information sparingly. For example, never have your chatbot ask the user for things like passwords or credit card numbers through one of these portals. 
Instead use your companion app or website to gather this information securely and tie the user’s Messenger ID to their user account within your infrastructure. + +When it comes to using external NLU a good practice is to run some [anonymisation, removing things like names, addresses, phone numbers etc,][13] on input utterances before passing them on to the service. You might also consider using on-premise NLU solutions so that chat utterances never have to leave your secure environment once they’ve been received. + +## Webhook Exploits + +When your bot relies on an external messaging platform as in the above scenario, the WebHook can be another point of weakness. If hackers can find the URL of your webhook then [they can probe it and they can send it messages][14] that look like they’re from the messaging platform. + +### Mitigation + +Make sure that your webhook requires authentication and make sure that you follow the guidelines of whichever messenger platform you are using in order to authenticate all incoming messages. Never process messages that fail these checks. + +## Unprotected Device Attacks + +Have you ever left your computer unlocked and gone to the water cooler? How about handing your mobile phone to a friend in order to make a call or look at a funny meme? Most people have done this at least once and if you haven’t, well done! + +You should [be prepared for opportunistic attackers posing as other users when using your chatbot][15]. They might ask probing questions in order to get the user’s information “What delivery address do you have for me again?” or “What credit card am I using?” + +### Mitigation + +Remember to code and design defensively. Responding with something like “I’m sorry I don’t know that but you can find out by logging in to the secure preferences page [URL Here]” would be a relatively good response. 
+ +Of course there’s not much you can do if the user leaves their passwords written down on a sticky note next to the terminal or leaves their password manager app unlocked but by requiring users log in to get access to sensitive personal info we’ve taken some sensible precautions. + +## Brand Poisoning Attacks {#mce_12} + +
+ Looking back at some of the biggest AI and ML developments from 2018 and how they might influence applied AI in the coming year. +
+ +2018 was a pretty exciting year for AI developments. It’s true to say there is still a lot of hype in the space but it feels like people are beginning to really understand where AI can and can’t help them solve practical problems. + +In this article we’ll take a look at some of the AI innovation that came out of academia and research teams in 2018 and how they might affect practical AI use cases in the coming year. + +## More Accurate and Faster-to-Train NLP Models with Pre-trained Language Models + +Imagine if instead of going to school and university you could be given a microchip implant that teaches you most things you need to know about your subject of choice. You’d still need to learn-by-doing when you landed a job with your “instant” degree and fine tune the knowledge that had been given to you but hey, we’re talking about 6-12 months of learning instead of 12-18 years. That’s the essence of what Transfer Learning is all about within the Machine Learning and Deep Learning space. + +BERT is a state-of-the-art neural NLP model [unveiled by Google][1] in November 2018. It, like a number of other models unveiled in 2018 like [ELMo][2] and [ULMFiT][3] can be pre-trained on unlabelled text (think news articles, contracts and legal terms, research papers or even wikipedia) and then used to support supervised/labelled tasks that require much smaller volumes of training data than an end-to-end supervised task. For example we might want to automate trading of stocks and shares based on sentiment about companies in the news. In the old days we’d have to spend weeks having armies of annotators read news articles and highlight companies and indicators of sentiment. A pre-trained language model may already have captured the underlying semantic relationships needed to understand company sentiment so we only need to annotate a fraction of the data that we would if we were training from scratch. 
+ +Of course another benefit of using pre-trained models is reduced training time, compute resources (read: server/energy usage costs). Like half-baked bread, the model still needs some time in the oven to make the connections it needs to perform its final task but this is a relatively short amount of time compared to training from empty. + +In 2019 we’ll be training lots of NLP models a lot more quickly and effectively thanks to these techniques. + + +## Photo-realistic Image Creation + +For those unfamiliar with GANs, we’re talking about unsupervised neural models that can learn to generate photo-realistic images of people, places and things that don’t really exist. Let that sink in for a moment! + +Originally invented by [Ian Goodfellow in 2014][4], GANs back then were able to generate small, pixelated likenesses but they’ve come a long way. [StyleGAN][5] is a paper published by a research team at NVIDIA which came out in December and might have slipped under your radar in the festive mayhem of that month. However StyleGAN represents some serious progress in generated photo-realism. + +Firstly StyleGAN can generate images up to 1024×1024 pixels. That’s still not huge in terms of modern photography but instagram pictures are 1080×1080 and most social media networks will chop your images down to this kind of ballpark in order to save bandwidth so we’re pretty close to having GANs that can generate social-media-ready images. + +The second major leap represented by StyleGAN is the ability to exercise tight control over the style of the image being generated. Previous GAN implementations generated their images at random. StyleGAN uses parameters to control the styles of the output images, changing things like hair colour, whether or not the person is wearing glasses, and other physical properties. 
+ +++ +This isn’t a clear cut right to an explanation for all automated decisions but it does mean that extra diligence should be carried out where possible in order to understand automated decisions that affect users’ legal rights. As the provision says this could massively affect credit scoring bureaus and e-recruitment firms but could also affect car insurance firms who use telemetrics data as part of their decision making process when paying out for claims or retailers that use algorithms to decide whether to accept returned digital or physical goods. + +In 2018 the best practices for model interpretability lay in training a “meta model” that sits on top of your highly accurate deep neural network and tries to guess which features of the data caused it to make a particular decision. These meta-models are normally simple in implementation (e.g. automated decision trees) so that they themselves can be directly inspected and interpreted. + +Whether spurred on by the letter of the law or not, understanding why your model made a particular decision can be useful for diagnosing flaws and undesirable biases in your systems anyway. + +In 2019 we expect that model interpretability will help providers and developers of AI to improve their approach and offer their users more transparency about decisions made. 
+ + [1]: https://ai.googleblog.com/2018/11/open-sourcing-bert-state-of-art-pre.html + [2]: https://arxiv.org/abs/1802.05365 + [3]: https://arxiv.org/abs/1801.06146 + [4]: https://arxiv.org/abs/1406.2661 + [5]: https://arxiv.org/abs/1812.04948 + [6]: https://www.thecut.com/2018/05/lil-miquela-digital-avatar-instagram-influencer.html + [7]: https://ai.googleblog.com/2018/05/duplex-ai-system-for-natural-conversation.html + [8]: https://uk.pcmag.com/opinions/94828/google-duplex-is-classist-heres-how-to-fix-it \ No newline at end of file diff --git a/brainsteam/content/posts/2019-01-15-spacy-link-or-how-not-to-keep-downloading-the-same-files-over-and-over.md b/brainsteam/content/posts/2019-01-15-spacy-link-or-how-not-to-keep-downloading-the-same-files-over-and-over.md new file mode 100644 index 0000000..cf6f31d --- /dev/null +++ b/brainsteam/content/posts/2019-01-15-spacy-link-or-how-not-to-keep-downloading-the-same-files-over-and-over.md @@ -0,0 +1,37 @@ +--- +title: Spacy Link or “How not to keep downloading the same files over and over” +author: James +type: post +date: 2019-01-15T18:14:16+00:00 +url: /2019/01/15/spacy-link-or-how-not-to-keep-downloading-the-same-files-over-and-over/ +medium_post: + - 'O:11:"Medium_Post":11:{s:16:"author_image_url";s:69:"https://cdn-images-1.medium.com/fit/c/200/200/0*naYvMn9xdbL5qlkJ.jpeg";s:10:"author_url";s:30:"https://medium.com/@jamesravey";s:11:"byline_name";N;s:12:"byline_email";N;s:10:"cross_link";s:2:"no";s:2:"id";s:12:"11a44e1c247f";s:21:"follower_notification";s:3:"yes";s:7:"license";s:19:"all-rights-reserved";s:14:"publication_id";s:2:"-1";s:6:"status";s:6:"public";s:3:"url";s:114:"https://medium.com/@jamesravey/spacy-link-or-how-not-to-keep-downloading-the-same-files-over-and-over-11a44e1c247f";}' +categories: + - Uncategorized + +--- +If you’re a frequent user of spacy and virtualenv you might well be all too familiar with the following: + ++ not to be subject to a decision, which may include a measure, evaluating 
personal aspects relating to him or her which is based solely on automated processing and which produces legal effects concerning him or her or similarly significantly affects him or her, such as automatic refusal of an online credit application or e-recruiting practices without any human intervention. +
+ + GDPR Recital 71 +
++ +If you’re lucky and you have a decent internet connection then great, if not it’s time to make a cup of tea. + +Even if your internet connection is good. Did you ever stop to look at how much disk space your python virtual environments were using up? I recently found that about 40GB of disk space on my laptop was being used by spacy models I’d downloaded and forgotten about. + +Fear not – spacy link offers you salvation from this wasteful use of disk space. + +Spacy link essentially allows you to link your virtualenv copy of spacy to a copy of the model you already downloaded. Say you installed your desired spacy model to your global python3 installation – somewhere like** _/usr/lib/python3/site-packages/spacy/data_**** __** + +Spacy link will let you link your existing model into a virtualenv to save redownloading (and using extra disk space). From your virtualenv you can do: + +python -m spacy link ** _/usr/lib/python3/site-packages/spacy/data/+ python -m spacy download en_core_web_lg
+
Collecting en_core_web_lg==2.0.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.0.0/en_core_web_lg-2.0.0.tar.gz#egg=en_core_web_lg==2.0.0
Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.0.0/en_core_web_lg-2.0.0.tar.gz (852.3MB)
5% |█▉ | 49.8MB 11.5MB/s eta 0:01:10 +
pip install torch==1.4.0+cu100 -f https://download.pytorch.org/whl/torch_stable.html
+
+This will install torch 1.4.0 with cuda 10.0 support and it’ll work out which version of Python you’re running for you.
+
+Now if you’re using Pipenv which tries to simplify virtualenv management and package versioning, you’ll quickly see that there is no way to run the above with `pipenv install`. Currently the only solution I can find is to manually run the above command prefixed with `pipenv run`
+
+So far I’ve only found one other person who’s asked about this particular issue on [stackoverflow][2]. I’ve also [opened a github ticket][3] in the pipenv project. I am curious to know if anyone else has run into this issue or has a solution
+
+ [1]: https://download.pytorch.org/whl/torch_stable.html
+ [2]: https://stackoverflow.com/questions/59752559/how-to-specify-pytorch-cuda-version-in-pipenv
+ [3]: https://github.com/pypa/pipenv/issues/4121
\ No newline at end of file
diff --git a/brainsteam/content/posts/2020-09-04-dark-recommendation-engines-algorithmic-curation-as-part-of-a-healthy-information-diet.md b/brainsteam/content/posts/2020-09-04-dark-recommendation-engines-algorithmic-curation-as-part-of-a-healthy-information-diet.md
new file mode 100644
index 0000000..1707fc2
--- /dev/null
+++ b/brainsteam/content/posts/2020-09-04-dark-recommendation-engines-algorithmic-curation-as-part-of-a-healthy-information-diet.md
@@ -0,0 +1,107 @@
+---
+title: '‘Dark’ Recommendation Engines: Algorithmic curation as part of a ‘healthy’ information diet.'
+author: James
+type: post
+date: 2020-09-04T15:30:19+00:00
+url: /2020/09/04/dark-recommendation-engines-algorithmic-curation-as-part-of-a-healthy-information-diet/
+featured_image: /wp-content/uploads/2020/09/maxresdefault-825x510.jpg
+medium_post:
+ - 'O:11:"Medium_Post":11:{s:16:"author_image_url";s:69:"https://cdn-images-1.medium.com/fit/c/200/200/0*naYvMn9xdbL5qlkJ.jpeg";s:10:"author_url";s:30:"https://medium.com/@jamesravey";s:11:"byline_name";N;s:12:"byline_email";N;s:10:"cross_link";s:3:"yes";s:2:"id";s:12:"2969b63de7ec";s:21:"follower_notification";s:3:"yes";s:7:"license";s:19:"all-rights-reserved";s:14:"publication_id";s:2:"-1";s:6:"status";s:6:"public";s:3:"url";s:130:"https://medium.com/@jamesravey/dark-recommendation-engines-algorithmic-curation-as-part-of-a-healthy-information-diet-2969b63de7ec";}'
+categories:
+ - Uncategorized
+
+---
+### In an ever-growing digital landscape filled with more content than a person can consume in their lifetime, recommendation engines are a blessing but can also be a curse and understanding their strengths and weaknesses is a vital skill as part of a balanced media diet.
+
+If you remember when connecting to the internet involved a squawking modem and images that took 5 minutes to load then you probably discovered your favourite musician after hearing them on the radio, reading about them in NME or being told about them by a friend. Likewise you probably discovered your favourite TV show by watching live terrestrial TV, your favourite book by taking a chance at your local library and your favourite movie at a cinema. You only saw the movies that had cool TV ads or rave reviews – you couldn’t afford to take a chance on a dud when one ticket, plus bus fare plus popcorn and a drink cost more than two weeks’ pocket money.
+
+In the year 2020 you can plug your phone into your car, load up Spotify and instantly access over 40 million songs at the touch of a button. You can watch almost any TV show or movie from the last 60 years from your couch. You can read almost any book ever written for free or next to nothing online (especially if your library has free ebook access [like mine][1]). In the space of a few years, our media consumption habits have COMPLETELY changed and that is wonderful and amazing in a kind of utopian star trek “land of plenty” kind of way.
+
+Unfortunately there’s a downside to having access to the entirety of humanity’s collective knowledge at the click of a button. With so much choice and [3 weeks of video content being added to youtube every minute][2] it is easy to become overwhelmed. Humans aren’t good at choices that have too many options. We are overcome with [analysis paralysis][3] and if it is left unchecked, we can waste hours of our lives scrolling netflix, reading show synopses but never watching any shows. After all, time is precious and a 90 minute movie is a sizeable, non-refundable investment. What if you don’t like it when there’s thousands of hours of other movies that you could be watching instead that could be better? Solving this problem across all sorts of media (news articles, movies, songs, video games) was the original motivation behind recommendation systems.
+
+## Recommender Systems 101
+
+Recommendation engines are all about driving people towards a certain type of content – in the use case above, it’s about driving people towards stuff they’ll like so that they feel like they’re getting value out of the platform they’re paying for and they continue to use the platform. There are a few different ways that recommender systems work but here are the basics:
+
+#### Collaborative Recommendation
+
+**_If Bob buys nappies (diapers) and Fred buys diapers AND powdered milk then_ maybe we should recommend powdered milk to Bob**
+
+The above sentence summarises the underlying theory behind collaborative recommenders. We can build a big table of all of our customers and the products that they bought (or movies that they watched) and we can use a technique called [matrix factorization][4] to find sets of products that commonly get consumed together and then finding users who already consumed a subset of these products and recommending the missing piece. The below video explains this concept in more detail.
+
+Collaborative filtering has a neat little surprise up its sleeve: emergent novelty. The chances are that someone you don’t know who has similar taste to you is in a good position to introduce you to new content that you didn’t know you liked. If Bob buys a coffee machine and we recommend it to Fred, the latter user might go “oh wow, I am pretty tired, I hadn’t considered a coffee machine – neat!” Of course this can have the opposite effect too.
+
+#### Content-based Recommendation
+
+**_Bob likes Terminator 2 which has the properties: ‘science fiction’, ’80s movie’_,’directed-by-James-Cameron’ he might also therefore like “Aliens”**,
+
+Content-based recommenders, as the summary above suggests, are all about taking properties of the content and using them to draw similarities with other content that might interest the user. Content-based recommendation is more computationally expensive than collaborative filtering since you need to extract ‘features’ of the things you’re recommending at scale (e.g. you might build an algorithm that looks at every frame of every movie in your collection and checks for cyborgs). It’s also very hard to do feature extraction on physical products and e-commerce sites tend to stick to collaborative approaches.
+
+Content-based recommenders can sometimes get stuck in an echo-chamber mode of recommending very ‘samey’ stuff all the time – there’s no element of surprise or novelty like you’d get with collaborative filtering.
+
+#### Hybrid Content-Collaborative Recommendation
+
+**Bob likes Terminator – an 80s sci-fi movie, Fred likes Terminator- an 80s sci-fi movie and Aliens, Janet likes Ghostbusters, an 80s sci-fi comedy. Recommend Aliens and Terminator to Janet and Ghostbusters to Bob and Fred.**
+
+In this mode of operating, we get the best of both worlds. Terminator and Aliens have a very different tone to Ghostbusters but there’s a decent chance that Bob and Fred would like it and there’s some ‘feature’ overlap between the three movies (80s, sci-fi).
+
+Hybrid recommendation is also pretty useful when you have limited information about your users because they only just joined or they didn’t use your system very much yet (This is known as the cold start problem). For example, if a new user, Rachael, comes along we can’t use collaborative filtering because we don’t know what films she likes and what other users with her taste have watched. However, we could give her an on-boarding questionnaire and if she tells us she likes 80s sci-fi but not comedy then we can recommend Aliens, Terminator and not Ghostbusters. The more we learn about her, the better these recommendations will get.
+
+## Manipulation and ulterior motive: the dark side of recommendation engines
+
+Recommendation engines are a great way to introduce people to movies, songs, news articles and even physical products that they might be interested in. But what if the motivation behind your recommendation system is no longer to make the user happy? As long as we have a large, consistent set of data relating products (movies/songs/books etc) to users we can train a recommendation engine to optimise itself towards that end. We could train a recommender that always makes terrible recommendations by flipping the dataset we collected about what users like – not a particularly useful exercise but it could be fun.
+
+What if the recommendation engine serving up your news articles isn’t optimised to show you what you like but in fact is optimised to show you more of what keeps you engaged? There may be some overlap here but the distinction is **key**. All the system’s owner would need to do is collect a table of content that the user likes or comments on or shares.
+
+The phrase “there’s no such thing as bad press” is a lot older than social media but has never been more relevant. For decades, traditional print media outlets have used bad news and emotive content to sell more papers. Journalists have become experts at politicising and polarising everything from [avocados][5] to [gen z][6]. Online news outlets use a similar mechanism.
+
+Online news outlets don’t make money from selling print media but from selling space on their websites for showing adverts and they get paid for every person who clicks on an advert. It’s probably only 1 in 1000 people that clicks on an ad but if 100,000 people read your article then maybe you’ll get 100 clicks. This has given rise to [“clickbait”][7] headlines that use misleading exaggeration to pull users in to what is more often than not an article of dubious or no interest. Clickbait, at least, is usually fairly easy to detect since the headlines are pretty formulaic and open ended (that’s my one neat trick that journalists hate me for).
+
+Social networks, like online news outlets, also make money from driving users towards adverts. Most people would read a news article once and close the page, 1 in 1000 of them might click a relevant advert while they’re at it. However, users typically spend a lot more time on a social network site, liking their neighbour’s cat picture, wishing their great aunt a happy birthday, getting into arguments and crucially clicking adverts. The longer you spend on the social network site, the more adverts you’re exposed to and maybe, just maybe, if you see the picture of the new coffee machine enough times you’ll finally click buy.
+
+So how can social networks keep users clicking around for as long as possible? Maybe by showing them content that piques their interest, that they respond emotionally to, that they want to share with their friends and comment on. How can they make sure that the content that they show is relevant and engaging? Well they can use recommendation engines!
+
+#### A recipe for a “dark” recommendation engine
+
+In order to train a pretty good hybrid recommendation engine that can combine social recommendations with “features” of the content to get relevant data we need:
+
+ 1. Information about users – what they like, what they dislike – what they had for breakfast (they know it was a muffin and a latte from that cute selfie you uploaded at Starbucks this morning), what your political alignment is (from when you joined “Socialist memes for marxist teens” facebook group) – **CHECK**
+
+ 2. Information about the content – what’s the topic? Does it use emotive words/swears? Does it have a strong political alignment either way? – using Natural Language Processing they can automatically find all of this information for millions of articles at a time – **CHECK**
+
+ 3. Information about users who interact with certain content – they know who commented on what. They know that the photo of your breakfast got 25 likes 2 comments and that the news article in the Washington Post about Trump got 1500 likes, 240 angry reacts and 300 comments. They also know that 250 of the 300 comments were left by people from the left-wing of politics – **CHECK**
+
+That’s all they need to optimise for “engagement”. A hybrid recommendation engine can learn that putting pro-Trump articles in front of people who like “Bernie 2020” is going to drive a lot of “engagement” and it can learn that displaying articles branding millennials as lazy and workshy in front of 20-to-30-somethings is going to drive a lot of “engagement” too.
+
+Recommendation engines can learn to only ever share left wing content with left wing people, likewise for right-wingers – creating an echo-chamber effect. Even worse, articles containing misinformation can be promoted to the top of everyone’s “to read” list because of the controversial response they will receive.
+
+These effects contribute to the often depressing and exhausting experience of spending time on a social media site in 2020. You might come away miserable but the algorithm has done its job – it’s kept a large number of people engaged with the site and exposed them to lots of adverts.
+
+## Good news everyone!
+
+Let’s face it – it’s not all bad – I love pictures of cats sat in boxes and the algorithms have learned this. Spotify has exposed me to a number of bands that I absolutely love and that would never get played on the local terrestrial radio station I periodically listen to in the car. I’ve found shows and books I adore on Netflix and Kindle. I’ve found loads of scientific papers that were very relevant for my research into NLP using sites like [Semantic Scholar][8]
+
+I guess it’s also worth noting that the motivation of media platforms like Netflix and Spotify is to help you enjoy yourself so that you pay your subscription as opposed to ‘free’ social sites that are happy to make you miserable if it means that you’ll use them for longer.
+
+The aim of this article was to show you how recommendation engines work and why the motivation for building them is **SO IMPORTANT.** Secondly, I wanted to show you that it’s important for us to diversify our information intake beyond what the big social media platforms spoon feed us.
+
+You can use sites like reddit where content is aggregated by human votes rather than machines (although fair warning, controversial material can still be disproportionately represented and certain subreddits might depress you more than your social media feed).
+
+You can use chronological social media systems like [mastodon][9] that don’t shuffle content around hoping to get you to bite on something juicy. I can also recommend the use of RSS reader systems like [Feedly][10] which aggregate content from blog sites in chronological order with minimal interference.
+
+Finally I want to issue a rallying cry to fellow machine learning engineers and data scientists to really think about the recommendation systems that you’re building and the optimisation mission you’ve been set. Would you let your family use it or would it make them miserable? Be responsible and be kind.
+
+ [1]: https://www.hants.gov.uk/librariesandarchives/library/whatyoucanborrow/ebooksaudiobooks
+ [2]: https://tubularinsights.com/youtube-300-hours/
+ [3]: https://www.ted.com/talks/barry_schwartz_the_paradox_of_choice
+ [4]: https://en.wikipedia.org/wiki/Matrix_factorization_(recommender_systems)
+ [5]: https://www.theguardian.com/lifeandstyle/2017/may/15/australian-millionaire-millennials-avocado-toast-house
+ [6]: https://www.benzinga.com/fintech/20/09/17377335/the-pandemic-is-contributing-to-financial-scams-and-generation-z-is-especially-vulnerable
+ [7]: https://www.merriam-webster.com/dictionary/clickbait
+ [8]: http://semanticscholar.org/
+ [9]: https://mastodon.social/about
+ [10]: https://feedly.com/
\ No newline at end of file
diff --git a/brainsteam/content/posts/2020-09-11-.md b/brainsteam/content/posts/2020-09-11-.md
new file mode 100644
index 0000000..eff92a5
--- /dev/null
+++ b/brainsteam/content/posts/2020-09-11-.md
@@ -0,0 +1,29 @@
+---
+title: Do more than ‘kick the tires’ of your NLP model
+author: James
+type: post
+date: -001-11-30T00:00:00+00:00
+draft: true
+url: /?p=498
+medium_post:
+ - 'O:11:"Medium_Post":11:{s:16:"author_image_url";N;s:10:"author_url";N;s:11:"byline_name";N;s:12:"byline_email";N;s:10:"cross_link";N;s:2:"id";N;s:21:"follower_notification";N;s:7:"license";N;s:14:"publication_id";N;s:6:"status";N;s:3:"url";N;}'
+categories:
+ - Uncategorized
+
+---
+### _We’ve known for a while that ‘accuracy’ doesn’t tell you much about your machine learning models but now we have a better alternative!_
+
+“So how accurate is it?” – a phrase that many data scientists like myself fear and dread being asked by business stakeholders. It’s not that I fear I’ve done a bad job but that evaluation of model performance is complex and multi-faceted and that summarising it with a single number usually doesn’t do it justice. Accuracy can also be a communications hurdle – it is not an intuitive concept and it can lead to friction and misunderstanding if you’re not ‘in’ with the AI crowd. 50% model accuracy across a model that has 1500 possible answers could be considered pretty good. 80% accuracy in a task setting where data is split 90:10 across two classes is meaningless (that means that randomly guessing is more effective than the model).
+
+I’ve written before about [how we can use finer-grained metrics like Recall, Precision and F1-score to evaluate machine learning models][1]. However, many of us in the AI/NLP community still feel that these metrics are too simplistic and do not adequately describe the characteristics of trained ML models. Unfortunately, we didn’t have many other options for evaluating model performance… until now that is…
+
+## Checklist – When machine learning met test automation
+
+At the Annual Meeting of the Association for Computational Linguistics 2020 – a very popular academic conference on NLP – [Ribeiro et al presented a new method for evaluating NLP models,][2] inspired by principles and techniques that software quality assurance (QA) specialists have been using for years.
+
+The idea is that we should design and implement test cases for NLP models that reflect the tasks that the model will be required to perform “in the wild”. Like software QA, these test cases should include tricky edge cases that may trip the model up in order to understand the practical limitations of the model.
+
+For example, we might train a named entity recognition model that
+
+ [1]: https://brainsteam.co.uk/2016/03/29/cognitive-quality-assurance-an-introduction/
+ [2]: https://www.aclweb.org/anthology/2020.acl-main.442.pdf
\ No newline at end of file
diff --git a/brainsteam/content/posts/2020-11-27-dvc-and-backblaze-b2-for-reliable-reproducible-data-science.md b/brainsteam/content/posts/2020-11-27-dvc-and-backblaze-b2-for-reliable-reproducible-data-science.md
new file mode 100644
index 0000000..9b133ca
--- /dev/null
+++ b/brainsteam/content/posts/2020-11-27-dvc-and-backblaze-b2-for-reliable-reproducible-data-science.md
@@ -0,0 +1,161 @@
+---
+title: 'DVC and Backblaze B2 for Reliable & Reproducible Data Science'
+author: James
+type: post
+date: 2020-11-27T15:43:48+00:00
+url: /2020/11/27/dvc-and-backblaze-b2-for-reliable-reproducible-data-science/
+featured_image: /wp-content/uploads/2020/11/pexels-panumas-nikhomkhai-1148820-825x510.jpg
+medium_post:
+ - 'O:11:"Medium_Post":11:{s:16:"author_image_url";s:69:"https://cdn-images-1.medium.com/fit/c/200/200/0*naYvMn9xdbL5qlkJ.jpeg";s:10:"author_url";s:30:"https://medium.com/@jamesravey";s:11:"byline_name";N;s:12:"byline_email";N;s:10:"cross_link";s:3:"yes";s:2:"id";s:12:"d44d231b648f";s:21:"follower_notification";s:3:"yes";s:7:"license";s:19:"all-rights-reserved";s:14:"publication_id";s:2:"-1";s:6:"status";s:6:"public";s:3:"url";s:103:"https://medium.com/@jamesravey/dvc-and-backblaze-b2-for-reliable-reproducible-data-science-d44d231b648f";}'
+categories:
+ - Uncategorized
+tags:
+ - data science
+ - devops
+ - machine learning
+
+---
+## Introduction
+
+When you’re working with large datasets, storing them in git alongside your source code is usually not an optimal solution. Git is famously, not really suited to large files and whilst general purpose solutions exist ([Git LFS][1] being perhaps the most famous and widely adopted solution), [DVC][2] is a powerful alternative that does not require a dedicated LFS server and can be used directly with a range of cloud storage systems as well as traditional NFS and SFTP-backed filestores all listed out [here.][3]
+
+It’s also worth pointing out that another point in DVC’s favour is its [powerful dependency system][4] and [being able to precisely recreate data science projects down to the command line flag][5] – particularly desirable in academic and commercial R&D settings.
+
+I use data buckets like S3 and Google Cloud Storage at work frequently and they’re very useful as an off-site backup of large quantities of training data. However, in my personal life my favourite S3-like vendor is [BackBlaze][6] who offer a professional, reliable service with [cheaper data access rates than Amazon and Google][7] and [offer an S3-compatible API][8] which you can use in many places – including DVC. If you’re new to remote storage buckets or you want to try-before-you-buy, BackBlaze offer 10GB of remote storage free – plenty of room for a few hundred thousand pictures of [dogs and chicken nuggets][9] to train your classifier with.
+
+## Setting up your DVC Project
+
+Configuring DVC to use B2 instead of S3 is actually a breeze once you find the right incantation in the documentation. Our first step, if you haven’t done it already is to install dvc. You can download an installer bundle/debian package/RPM package from [their website][2] or if you prefer you can install it inside python via `pip install dvc[all]` – the [all] on the end pulls in all the various DVC remote storage libraries – you could swap this for [s3] if you just want to use that.
+
+Next you will want to create your data science project – I usually set mine up like this:
+
+- README.md
+- .gitignore <-- prefilled with pythonic ignore rules
+- environment.yml <-- my conda environment yaml
+- data/
+ - raw/ <-- raw unprocessed data assets go here
+ - processed/ <-- partially processed and pre-processed data assets go here
+-
+
+
+Now we can initialize git and dvc:
+
+git init
+dvc init
+
+## Setting up your Backblaze Bucket and Credentials
+
+Now we’re going to create our bucket in backblaze. Assuming you’ve registered an account, you’ll want to go to “My Account” in the top right hand corner, then click “Create a new bucket”
+
+Enter a bucket name (little gotcha: the name must be unique across the whole of backblaze – not just your account) and click “Create a Bucket” taking the default options on the rest of the fields.
+
+Once your bucket is created you’ll also need to copy down the “endpoint” value that shows up in the information box – we’ll need this later when we set up DVC.
+
+We’re also going to need to create credentials for accessing the bucket. Go back to “My Account” and then “App Keys” and go for “Add a New Application Key”
+
+Here you can enter a memorable name for this key – by convention I normally use the name of the experiment or model that I’m training.
+
+You can leave all of the remaining options with default/empty values or you can use these to lock down your security if you have multiple users accessing your account (or in the event that your key got committed to a public github repo) – for example we could limit this key to only the bucket we just created or only folders with a certain prefix within this bucket. For this tutorial I’m assuming you left these as they were and if you change them, your mileage may vary.
+
+Once you create the key you will need to copy down the keyID and applicationKey values – heed the warning – they will only appear once and as soon as you move off this page it will be gone forever unless you copy the values somewhere safe. It’s not the end of the world since we can create more keys but still a bit annoying to have to go through again.
+
+If you’ve got the name of your bucket, your endpoint, your keyID and applicationKey values stored somewhere safe then we’re done here and we can move on to the next step.
+
+## Configuring your DVC ‘remote’
+
+With our bucket all set up, we can configure DVC to talk to backblaze. First we add a new remote to DVC. The `-d` flag sets this as the default (so that when we push it will send the data to this location by default without being told explicitly).
+
+dvc remote add b2 s3://your-bucket-name/
+
+So DVC knows about our bucket but unless we tell it otherwise it will assume that it’s an Amazon S3 bucket rather than a B2 bucket. We need to tell it our endpoint value:
+
+dvc remote modify b2 endpointurl https://s3.us-west-002.backblazeb2.com
+
+You’ll see that I’ve copied and pasted my endpoint from when I set up my bucket and stuck “https://” on the front which dvc needs to know about to form a valid URL.
+
+## Authenticating DVC
+
+Next we need to tell DVC about our auth keys. [In the DVC manual][10] they show you that you can use the `dvc remote modify` command to permanently store your access credentials in the DVC config file. However this stores your super-duper secret credentials in plain text in a file called `.dvc/config` which gets stored in your git repository meaning that if you’re storing your work on GitHub then Joe Public could come along and start messing with your private bucket.
+
+Instead I advocate the following approach. Firstly, in our `.gitignore` file at the top level of our project (create one if it doesn’t exist) add a line that says `.env`
+
+Now we’re going to create a new file – again in the top level of our project directory called `.env` and paste in the following:
+
+export AWS_ACCESS_KEY_ID='<keyID>'
+export AWS_SECRET_ACCESS_KEY='<applicationKey>'
+
+Replace `<keyID>` and `<applicationKey>` with the values you copied down when you created your application key earlier. With authentication sorted, we can now put our training data under DVC's control by running: dvc add data/raw/training-data
+
+After you run this, you’ll get a message along these lines:
+
+100% Add|████████████████████████████████████████████████████████████|1/1 [00:01, 1.36s/file]
+
+To track the changes with git, run:
+
+ git add data/raw/.gitignore data/raw/training-data/001.jpg
+
+Go ahead and execute the git command now. This will update your git repository so that the actual data (the pictures of dogs and chicken nuggets) will be gitignored but the .dvc files which contain metadata about those files and where to find them will be added to the repository. When you’re ready you can now `git commit` to save the metadata about the data to git permanently.
+
+## Storing DVC data in backblaze
+
+Now we have the acid test: this next step will push your data to your backblaze bucket if we have everything configured correctly. Simply run:
+
+source .env
+dvc push
+
+At this point you’ll either get error messages or a bunch of progress bars that will populate as the images in your folder are uploaded. Once the process is finished you’ll see a summary that says `N files pushed` where N is the number of pictures you had in your folder. If that happened then congratulations you’ve successfully configured DVC and backblaze.
+
+## Getting the data back on another machine
+
+If you want to work on this project with your friends or you want to check out the project on your other laptop then you or they will need to install git and dvc before checking out your project from github (or wherever your project is hosted). Once they have a local copy they should be able to go into the `data/raw/training-data` folder and they will see all of the `*.dvc` files describing where the training data is.
+
+Your git repository should have all of your dvc configuration in it already including the endpoint URL for your bucket. However, in order to check out this data they will first need to create a `.env` file of their own containing a key pair (ideally one that you’ve generated for them that is locked down as much as possible to just the project that you’d like to collaborate with them on). Then they will need to run:
+
+source .env
+dvc pull
+
+This should begin the process of downloading your files from backblaze and making local copies of them in `data/raw/training-data`.
+
+## Streamlining Workflows
+
+One final tip I’d offer is using `dvc install` which will add hooks to git so that every time you push and pull, dvc push and pull are also automatically triggered – saving you from manually running those steps. It will also hook up dvc checkout and git checkout in case you’re working with different data assets on different project branches.
+
+## Final Thoughts
+
+Congratulations, if you got this far it means you’ve configured DVC and Backblaze B2 and have a perfectly reproducible data science workflow at the tips of your fingers. This workflow is well optimised for teams of people working on data science experiments that need to be repeatable or have large volumes of unwieldy data that needs a better home than git.
+
+_If you found this post useful please leave claps and comments or follow me on twitter [@jamesravey][11] for more._
+
+ [1]: https://git-lfs.github.com/
+ [2]: https://dvc.org/
+ [3]: https://dvc.org/doc/command-reference/remote/add
+ [4]: https://dvc.org/doc/command-reference/dag
+ [5]: https://dvc.org/doc/command-reference/run
+ [6]: https://www.backblaze.com/
+ [7]: https://www.backblaze.com/b2/cloud-storage.html
+ [8]: https://www.backblaze.com/b2/docs/s3_compatible_api.html
+ [9]: http://www.mtv.com/news/2752312/bagel-or-dog-or-fried-chicken-or-dog/
+ [10]: https://dvc.org/doc/command-reference/remote/modify#example-customize-an-s3-remote
+ [11]: https://twitter.com/jamesravey
\ No newline at end of file
diff --git a/brainsteam/content/posts/2020-12-16-.md b/brainsteam/content/posts/2020-12-16-.md
new file mode 100644
index 0000000..e6f580d
--- /dev/null
+++ b/brainsteam/content/posts/2020-12-16-.md
@@ -0,0 +1,19 @@
+---
+title: Easy MLFlow Server Hosting with Docker-Compose
+author: James
+type: post
+date: -001-11-30T00:00:00+00:00
+draft: true
+url: /?p=532
+medium_post:
+ - 'O:11:"Medium_Post":11:{s:16:"author_image_url";N;s:10:"author_url";N;s:11:"byline_name";N;s:12:"byline_email";N;s:10:"cross_link";N;s:2:"id";N;s:21:"follower_notification";N;s:7:"license";N;s:14:"publication_id";N;s:6:"status";N;s:3:"url";N;}'
+categories:
+ - Uncategorized
+
+---
+At Filament we’re really big fans of MLFlow for managing our ML model lifecycle from experiment to deployment. I won’t go into the [many advantages][1] of using this software since [many others][2] have done a good job of this before me.
+
+If you’re bought in
+
+ [1]: https://towardsdatascience.com/tracking-ml-experiments-using-mlflow-7910197091bb
+ [2]: https://towardsdatascience.com/5-tips-for-mlflow-experiment-tracking-c70ae117b03f
\ No newline at end of file
diff --git a/brainsteam/static/images/avatar.png b/brainsteam/static/images/avatar.png
new file mode 100644
index 0000000..c9c27f9
Binary files /dev/null and b/brainsteam/static/images/avatar.png differ