Publications markdown generator for academicpages

Takes a set of BibTeX publication entries and converts them into markdown files for use with academicpages.github.io. This is an interactive Jupyter notebook (see more info here).

The core Python code is also in pubsFromBibs.py. Run either one from the markdown_generator folder after updating the publist dictionary with:

  • bib file names
  • specific venue keys based on your bib file preferences
  • any specific pre-text for specific files
  • Collection Name (future feature)

TODO: Make this work with other databases of citations. TODO: Merge this with the existing TSV parsing solution.

In [1]:
from pybtex.database.input import bibtex
import pybtex.database.input.bibtex 
from time import strptime
import string
import html
import os
import re
In [2]:
# TODO: support distinct collection types instead of a single catch-all
# "publications" collection (requires matching changes to the template).
#
# One entry per bibliography source; the generation loop iterates over these.
publist = {
    "proceeding": {
        # BibTeX file the entries are read from.
        "file": "/home/lensenandr/publications/lensen.bib",
        # Candidate fields holding the venue name; the first one present
        # in an entry is used.
        "venuekey": ["booktitle", "journal"],
        # Text placed in front of the venue name.
        "venue-pretext": "",
        # Target collection and the permalink prefix of each generated page.
        "collection": {
            "name": "publications",
            "permalink": "/publication/",
        },
    },
}
In [3]:
# Characters that must be replaced by HTML entities before the text is
# embedded in the generated front matter (which wraps values in quotes).
html_escape_table = {
    "&": "&amp;",
    '"': "&quot;",
    "'": "&apos;",
}


def html_escape(text):
    """Return *text* with ``&``, ``"`` and ``'`` replaced by HTML entities.

    Every other character is passed through unchanged.  The replacement is
    done character by character, so entities produced for one character are
    never re-escaped within the same call.
    """
    return "".join(html_escape_table.get(c, c) for c in text)
In [4]:
# Generate one markdown file per BibTeX entry for every source in `publist`.
#
# For each entry a front-matter block (title, collection, permalink, date,
# venue, citation) followed by a link to the paper PDF is written to
# ../_publications/<date>-<slug>.md.  Entries missing a required field are
# reported and skipped.

def _strip_bibtex_markup(text):
    """Return *text* with BibTeX braces and backslashes removed."""
    return text.replace("{", "").replace("}", "").replace("\\", "")


def _month_number(raw_month):
    """Convert a BibTeX month field to a zero-padded month number string.

    Accepts a decimal month ("4", "04") or a name whose first three letters
    form a month abbreviation ("Apr", "April").  Returns None when the value
    cannot be interpreted as a month.
    """
    month = str(raw_month).strip()
    if month.isdecimal():
        number = int(month)
        return "{:02d}".format(number) if 1 <= number <= 12 else None
    try:
        # NOTE: %b matches month abbreviations of the current locale.
        return "{:02d}".format(strptime(month[:3], '%b').tm_mon)
    except ValueError:
        return None


for pubsource in publist:
    source = publist[pubsource]
    parser = bibtex.Parser()
    bibdata = parser.parse_file(source["file"])

    # loop through the individual references in a given bibtex file
    for bib_id in bibdata.entries:
        # reset default date
        pub_year = "1900"
        pub_month = "01"
        pub_day = "01"

        b = bibdata.entries[bib_id].fields

        try:
            pub_year = f'{b["year"]}'

            if "month" in b:
                parsed_month = _month_number(b["month"])
                if parsed_month is None:
                    # Keep the default month rather than aborting the run.
                    print(f'WARNING Unrecognised month {b["month"]!r} in entry {bib_id}; using {pub_month}')
                else:
                    pub_month = parsed_month
            if "day" in b:
                pub_day = str(b["day"])

            pub_date = pub_year + "-" + pub_month + "-" + pub_day

            # strip out {} as needed (some bibtex entries maintain formatting)
            plain_title = _strip_bibtex_markup(b["title"])
            clean_title = plain_title.replace(" ", "-")

            # Drop bracketed text and anything that is not URL-slug safe.
            url_slug = re.sub(r"\[.*\]|[^a-zA-Z0-9_-]", "", clean_title)
            url_slug = url_slug.replace("--", "-")

            md_filename = (pub_date + "-" + url_slug + ".md").replace("--", "-")
            html_filename = (pub_date + "-" + url_slug).replace("--", "-")

            # Build the citation from raw (unescaped) text; it is escaped
            # exactly once when written to the front matter below, so that
            # "&" does not end up as "&amp;amp;".
            citation = ""

            # citation authors - todo - add highlighting for primary author?
            for author in bibdata.entries[bib_id].persons["author"]:
                # Single-token names may have no first (or last) name parts.
                first = author.first_names[0] if author.first_names else ""
                last = author.last_names[0] if author.last_names else ""
                citation += _strip_bibtex_markup(" " + first + " " + last + ", ")

            # citation title
            citation += "\"" + plain_title + ".\""

            # venue: pre-text plus the first configured venue field present
            venue = source["venue-pretext"]
            for k in source["venuekey"]:
                if k in b:
                    venue = venue + _strip_bibtex_markup(b[k])
                    break

            citation += " " + venue
            citation += ", " + pub_year + "."
            if "note" in b:
                note_text = str(b["note"])
                citation += " " + note_text
                # Avoid a doubled full stop when the note already ends in one.
                if not note_text.endswith("."):
                    citation += "."

            # Every entry is pointed at its PDF under /files/, replacing any
            # url given in the bib file.
            b["url"] = '/files/{}.pdf'.format(bib_id)

            ## YAML variables
            front_matter = [
                "---",
                "title: \"" + html_escape(plain_title) + '"',
                "collection: " + source["collection"]["name"],
                "permalink: " + source["collection"]["permalink"] + html_filename,
                "date: " + pub_date,
                "venue: '" + html_escape(venue) + "'",
                "citation: '" + html_escape(citation) + "'",
                "---",
            ]
            md = "\n".join(front_matter)

            ## Markdown description for individual page
            md += "\n[Access paper here](" + b["url"] + "){:target=\"_blank\"}\n"

            md_filename = os.path.basename(md_filename)

            # Explicit encoding so non-ASCII titles/authors are written the
            # same way on every platform.
            with open("../_publications/" + md_filename, 'w', encoding="utf-8") as f:
                f.write(md)
            print(f'SUCESSFULLY PARSED {bib_id}: \"', b["title"][:60],"..."*(len(b['title'])>60),"\"")
        # field may not exist for a reference
        except KeyError as e:
            # The missing field may be the title itself, so do not index it.
            title = b["title"] if "title" in b else ""
            print(f'WARNING Missing Expected Field {e} from entry {bib_id}: \"', title[:30],"..."*(len(title)>30),"\"")
            continue
SUCESSFULLY PARSED lensen2017New: " New Representations in Genetic Programming for Feature Const ... "
SUCESSFULLY PARSED lensen2016Genetic: " Genetic Programming for Region Detection, Feature Extraction ... "
SUCESSFULLY PARSED lensen2015Genetic: " Genetic Programming for algae detection in river images  "
SUCESSFULLY PARSED lensen2015hybrid: " A hybrid Genetic Programming approach to feature detection a ... "
SUCESSFULLY PARSED lensen2017Improving: " Improving {k}-means clustering with genetic programming for  ... "
SUCESSFULLY PARSED lensen2017GPGC: " {GPGC:} genetic programming for automatic clustering using a ... "
SUCESSFULLY PARSED lensen2017Using: " Using Particle Swarm Optimisation and the Silhouette Metric  ... "
SUCESSFULLY PARSED lensen2016Particle: " Particle Swarm Optimisation Representations for Simultaneous ... "
SUCESSFULLY PARSED lensen2018generating: " Generating Redundant Features with Unsupervised Multi-tree G ... "
SUCESSFULLY PARSED lensen2018automatically: " Automatically Evolving Difficult Benchmark Feature Selection ... "
SUCESSFULLY PARSED oneill2018particle: " Particle Swarm Optimisation for Feature Selection and Weight ... "
SUCESSFULLY PARSED lensen2019can: " Can Genetic Programming Do Manifold Learning Too?  "
SUCESSFULLY PARSED alsahaf2019survey: " A survey on evolutionary machine learning  "
SUCESSFULLY PARSED lensen2019genetic: " Genetic Programming for Evolving Similarity Functions for Cl ... "
SUCESSFULLY PARSED lensen2019multi: " Multi-Objective Genetic Programming for Manifold Learning: B ... "
SUCESSFULLY PARSED lensen2020genetic: " Genetic Programming for Evolving a Front of Interpretable Mo ... "
SUCESSFULLY PARSED schofield2020evolving: " Evolving Simpler Constructed Features for Clustering Problem ... "
In [5]:
# Display the parsed bibliography left over from the generation loop (the
# last source processed) for inspection.
bibdata
Out[5]:
BibliographyData(entries=OrderedCaseInsensitiveDict([('lensen2017New', Entry('inproceedings', fields=[('title', 'New Representations in Genetic Programming for Feature Construction in k-Means Clustering'), ('booktitle', 'Proceedings of the 11th International Conference on Simulated Evolution and Learning ({SEAL})'), ('year', '2017'), ('month', 'November'), ('day', '10'), ('volume', '10593'), ('series', 'Lecture Notes in Computer Science'), ('pages', '543--555'), ('publisher', 'Springer'), ('url', '/files/lensen2017New.pdf')], persons=OrderedCaseInsensitiveDict([('author', [Person('Lensen, Andrew'), Person('Xue, Bing'), Person('Zhang, Mengjie')])]))), ('lensen2016Genetic', Entry('inproceedings', fields=[('title', 'Genetic Programming for Region Detection, Feature Extraction, Feature Construction and Classification in Image Data'), ('booktitle', 'Proceedings of the European Conference on Genetic Programming (EuroGP)'), ('year', '2016'), ('month', 'March'), ('day', '30'), ('volume', '9594'), ('series', 'Lecture Notes in Computer Science'), ('pages', '51--67'), ('publisher', 'Springer'), ('url', '/files/lensen2016Genetic.pdf')], persons=OrderedCaseInsensitiveDict([('author', [Person('Lensen, Andrew'), Person('Al{-}Sahaf, Harith'), Person('Zhang, Mengjie'), Person('Xue, Bing')]), ('editor', [Person('Heywood, Malcolm I.'), Person('McDermott, James'), Person('Castelli, Mauro'), Person('Costa, Ernesto'), Person('Sim, Kevin')])]))), ('lensen2015Genetic', Entry('inproceedings', fields=[('title', 'Genetic Programming for algae detection in river images'), ('booktitle', '{IEEE} Congress on Evolutionary Computation, {CEC} 2015, Sendai, Japan, May 25-28, 2015'), ('year', '2015'), ('month', 'November'), ('day', '23'), ('pages', '2468--2475'), ('publisher', '{IEEE}'), ('bibsource', 'dblp computer science bibliography, http://dblp.org'), ('biburl', 'http://dblp.org/rec/bib/conf/cec/LensenAZV15'), ('doi', '10.1109/CEC.2015.7257191'), ('owner', 'lensenandr'), ('timestamp', 
'2017.11.09'), ('url', '/files/lensen2015Genetic.pdf')], persons=OrderedCaseInsensitiveDict([('author', [Person('Lensen, Andrew'), Person('Al{-}Sahaf, Harith'), Person('Zhang, Mengjie'), Person('Verma, Brijesh')])]))), ('lensen2015hybrid', Entry('inproceedings', fields=[('title', 'A hybrid Genetic Programming approach to feature detection and image classification'), ('booktitle', '2015 International Conference on Image and Vision Computing New Zealand, {IVCNZ} 2015, Auckland, New Zealand, November 23-24, 2015'), ('year', '2015'), ('pages', '1--6'), ('publisher', '{IEEE}'), ('bibsource', 'dblp computer science bibliography, http://dblp.org'), ('biburl', 'http://dblp.org/rec/bib/conf/ivcnz/LensenAZX15'), ('doi', '10.1109/IVCNZ.2015.7761564'), ('owner', 'lensenandr'), ('timestamp', '2017.11.09'), ('url', '/files/lensen2015hybrid.pdf')], persons=OrderedCaseInsensitiveDict([('author', [Person('Lensen, Andrew'), Person('Al{-}Sahaf, Harith'), Person('Zhang, Mengjie'), Person('Xue, Bing')])]))), ('lensen2017Improving', Entry('inproceedings', fields=[('title', 'Improving {k}-means clustering with genetic programming for feature construction'), ('booktitle', 'Genetic and Evolutionary Computation Conference, Berlin, Germany, July 15-19, 2017, Companion Material Proceedings'), ('year', '2017'), ('month', 'April'), ('day', '19'), ('pages', '237--238'), ('publisher', '{ACM}'), ('owner', 'lensenandr'), ('timestamp', '2017.11.09'), ('url', '/files/lensen2017Improving.pdf')], persons=OrderedCaseInsensitiveDict([('author', [Person('Lensen, Andrew'), Person('Xue, Bing'), Person('Zhang, Mengjie')]), ('editor', [Person('Bosman, Peter A. 
N.')])]))), ('lensen2017GPGC', Entry('inproceedings', fields=[('title', '{GPGC:} genetic programming for automatic clustering using a flexible non-hyper-spherical graph-based approach'), ('booktitle', 'Proceedings of the Genetic and Evolutionary Computation Conference, {GECCO}.'), ('year', '2017'), ('month', 'July'), ('day', '15'), ('pages', '449--456'), ('publisher', '{ACM}'), ('owner', 'lensenandr'), ('timestamp', '2017.11.09'), ('url', '/files/lensen2017GPGC.pdf')], persons=OrderedCaseInsensitiveDict([('author', [Person('Lensen, Andrew'), Person('Xue, Bing'), Person('Zhang, Mengjie')])]))), ('lensen2017Using', Entry('inproceedings', fields=[('title', 'Using Particle Swarm Optimisation and the Silhouette Metric to Estimate the Number of Clusters, Select Features, and Perform Clustering'), ('booktitle', 'Proceedings of the 20th European Conference on the Applications of Evolutionary Computation (EvoApplications), Part {I}'), ('year', '2017'), ('month', 'April'), ('day', '19'), ('volume', '10199'), ('series', 'Lecture Notes in Computer Science'), ('pages', '538--554'), ('publisher', 'Springer'), ('owner', 'lensenandr'), ('timestamp', '2017.11.09'), ('url', '/files/lensen2017Using.pdf')], persons=OrderedCaseInsensitiveDict([('author', [Person('Lensen, Andrew'), Person('Xue, Bing'), Person('Zhang, Mengjie')])]))), ('lensen2016Particle', Entry('inproceedings', fields=[('title', 'Particle Swarm Optimisation Representations for Simultaneous Clustering and Feature Selection'), ('booktitle', 'Proceedings of the Symposium Series on Computational Intelligence'), ('year', '2016'), ('month', 'December'), ('day', '6'), ('pages', '1--8'), ('publisher', '{IEEE}'), ('owner', 'lensenandr'), ('timestamp', '2017.11.09'), ('url', '/files/lensen2016Particle.pdf')], persons=OrderedCaseInsensitiveDict([('author', [Person('Lensen, Andrew'), Person('Xue, Bing'), Person('Zhang, Mengjie')])]))), ('lensen2018generating', Entry('inproceedings', fields=[('title', 'Generating Redundant Features 
with Unsupervised Multi-tree Genetic Programming'), ('booktitle', 'Proceedings of the European Conference on Genetic Programming (EuroGP)'), ('pages', '84--100'), ('year', '2018'), ('month', 'April'), ('day', '4'), ('series', 'Lecture Notes in Computer Science'), ('volume', '10781'), ('publisher', 'Springer'), ('url', '/files/lensen2018generating.pdf')], persons=OrderedCaseInsensitiveDict([('author', [Person('Lensen, Andrew'), Person('Xue, Bing'), Person('Zhang, Mengjie')])]))), ('lensen2018automatically', Entry('inproceedings', fields=[('title', 'Automatically Evolving Difficult Benchmark Feature Selection Datasets with Genetic Programming'), ('booktitle', 'Proceedings of the Genetic and Evolutionary Computation Conference, {GECCO}'), ('pages', '458--465'), ('year', '2018'), ('month', 'July'), ('day', '15'), ('publisher', '{ACM}'), ('url', '/files/lensen2018automatically.pdf')], persons=OrderedCaseInsensitiveDict([('author', [Person('Lensen, Andrew'), Person('Xue, Bing'), Person('Zhang, Mengjie')])]))), ('oneill2018particle', Entry('inproceedings', fields=[('title', 'Particle Swarm Optimisation for Feature Selection and Weighting in High-Dimensional Clustering'), ('booktitle', 'Proceedings of the {IEEE} Congress on Evolutionary Computation, {CEC}'), ('year', '2018'), ('month', 'July'), ('day', '8'), ('publisher', '{IEEE}'), ('pages', '1--8'), ('url', '/files/oneill2018particle.pdf')], persons=OrderedCaseInsensitiveDict([('author', [Person("O'Neill, Damien"), Person('Lensen, Andrew'), Person('Xue, Bing'), Person('Zhang, Mengjie')])]))), ('lensen2019can', Entry('inproceedings', fields=[('title', 'Can Genetic Programming Do Manifold Learning Too?'), ('booktitle', 'Proceedings of the European Conference on Genetic Programming (EuroGP)'), ('year', '2019'), ('month', 'April'), ('day', '24'), ('volume', '11451'), ('series', 'Lecture Notes in Computer Science'), ('publisher', 'Springer'), ('pages', '114--130'), ('note', 'Best paper.'), ('url', 
'/files/lensen2019can.pdf')], persons=OrderedCaseInsensitiveDict([('author', [Person('Lensen, Andrew'), Person('Xue, Bing'), Person('Zhang, Mengjie')])]))), ('alsahaf2019survey', Entry('article', fields=[('title', 'A survey on evolutionary machine learning'), ('journal', 'Journal of the Royal Society of New Zealand'), ('volume', '49'), ('number', '2'), ('pages', '205-228'), ('year', '2019'), ('month', 'April'), ('day', '15'), ('publisher', 'Taylor & Francis'), ('doi', '10.1080/03036758.2019.1609052'), ('url', '/files/alsahaf2019survey.pdf')], persons=OrderedCaseInsensitiveDict([('author', [Person('Al-Sahaf, Harith'), Person('Bi, Ying'), Person('Chen, Qi'), Person('Lensen, Andrew'), Person('Mei, Yi'), Person('Sun, Yanan'), Person('Tran, Binh'), Person('Xue, Bing'), Person('Zhang, Mengjie')])]))), ('lensen2019genetic', Entry('article', fields=[('title', 'Genetic Programming for Evolving Similarity Functions for Clustering: Representations and Analysis'), ('journal', 'Evolutionary Computation'), ('volume', '0'), ('number', 'ja'), ('pages', '1--31'), ('year', '2019'), ('month', 'October'), ('day', '10'), ('note', 'Early Access'), ('publisher', 'MIT Press'), ('url', '/files/lensen2019genetic.pdf')], persons=OrderedCaseInsensitiveDict([('author', [Person('Lensen, Andrew'), Person('Xue, Bing'), Person('Zhang, Mengjie')])]))), ('lensen2019multi', Entry('article', fields=[('title', 'Multi-Objective Genetic Programming for Manifold Learning: Balancing Quality and Dimensionality'), ('journal', 'Genetic Programming and Evolvable Machines'), ('volume', '21'), ('pages', '399--431'), ('year', '2020'), ('url', '/files/lensen2019multi.pdf')], persons=OrderedCaseInsensitiveDict([('author', [Person('Lensen, Andrew'), Person('Zhang, Mengjie'), Person('Xue, Bing')])]))), ('lensen2020genetic', Entry('article', fields=[('title', 'Genetic Programming for Evolving a Front of Interpretable Models for Data Visualisation'), ('journal', '{IEEE} Trans. 
Cybernetics'), ('volume', '0'), ('pages', '1--15'), ('year', '2020'), ('month', 'February'), ('note', 'Early Access'), ('publisher', 'IEEE'), ('url', '/files/lensen2020genetic.pdf')], persons=OrderedCaseInsensitiveDict([('author', [Person('Lensen, Andrew'), Person('Xue, Bing'), Person('Zhang, Mengjie')])]))), ('schofield2020evolving', Entry('inproceedings', fields=[('title', 'Evolving Simpler Constructed Features for Clustering Problems with Genetic Programming'), ('booktitle', 'Proceedings of the {IEEE} Congress on Evolutionary Computation, {CEC}'), ('year', '2020'), ('month', 'July'), ('day', '19'), ('publisher', '{IEEE}'), ('pages', '1--8'), ('url', '/files/schofield2020evolving.pdf')], persons=OrderedCaseInsensitiveDict([('author', [Person('Schofield, Finn'), Person('Lensen, Andrew')])])))]), preamble=[])