Nicky Nicolson committed
Commit e6d4f1a
Parent: 7ba1d6e

Initial revision

Files changed (8)
  1. .gitignore +1 -1
  2. Dockerfile +9 -17
  3. README.md +2 -2
  4. extractcollectorname.py +0 -36
  5. getDownloadMetadata.py +0 -28
  6. metadata.json +5 -17
  7. requirements.txt +2 -8
  8. tab2csv.py +0 -112
.gitignore CHANGED
@@ -1,2 +1,2 @@
  env
- data
+ data
Dockerfile CHANGED
@@ -2,32 +2,24 @@ FROM python:3.11

  # Download ID is set as a space variable
  # By default it is a download of all Solanum preserved specimen records (c600K)
- ARG GBIF_DOWNLOAD_ID=$GBIF_DOWNLOAD_ID
+ ARG IHDELTA_REPO_URL=$IHDELTA_REPO_URL
+ ARG START_AFTER_COMMIT=$START_AFTER_COMMIT
  WORKDIR /code

  COPY ./requirements.txt /code/requirements.txt

  RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

- # Download GBIF occurrences and prepare for use with datasette
+ # Clone ihdelta repo
  RUN mkdir /data
- ADD https://api.gbif.org/v1/occurrence/download/request/${GBIF_DOWNLOAD_ID}.zip /data/gbif-occs.zip
- RUN ls -lh /data
- RUN unzip /data/gbif-occs.zip -d /data
- RUN ls -lh /data
- COPY ./tab2csv.py /code/tab2csv.py
+ RUN mkdir /data/ihdelta
+ RUN git clone ${IHDELTA_REPO_URL} /data/ihdelta
+ RUN git-history file --start-after ${START_AFTER_COMMIT} --csv --dialect unix --repo /data/ihdelta --id irn /code/ihdelta.db /data/ihdelta/ih-institutions.csv

-
- RUN python tab2csv.py --createcols /data ${GBIF_DOWNLOAD_ID} gbifocc.csv
- RUN csvs-to-sqlite /data/gbifocc.csv /code/gbifocc.db
- RUN ls -l /code
- RUN sqlite-utils tables /code/gbifocc.db --counts
- RUN sqlite-utils enable-fts /code/gbifocc.db gbifocc collectorNameAndNumber
- RUN chmod 755 /code/gbifocc.db
+ RUN sqlite-utils tables /code/ihdelta.db --counts
+ RUN chmod 755 /code/ihdelta.db

  # Create datasette metadata file
- COPY ./getDownloadMetadata.py /code/getDownloadMetadata.py
  COPY ./metadata.json /code/metadata.json
- RUN python getDownloadMetadata.py /code/metadata.json /code/metadata.json --download_id=$GBIF_DOWNLOAD_ID

- CMD ["datasette", "/code/gbifocc.db", "-m", "/code/metadata.json", "--host", "0.0.0.0", "--port", "7860", "--setting", "sql_time_limit_ms", "3500"]
+ CMD ["datasette", "/code/ihdelta.db", "-m", "/code/metadata.json", "--host", "0.0.0.0", "--port", "7860", "--setting", "sql_time_limit_ms", "3500"]
README.md CHANGED
@@ -1,6 +1,6 @@
  ---
- title: GBIF occurrence Datasette
- emoji: 🌱
+ title: Index Herbariorum
+ emoji: 🏛
  colorFrom: red
  colorTo: purple
  sdk: docker
extractcollectorname.py DELETED
@@ -1,36 +0,0 @@
- import argparse
- import pandas as pd
- import bananompy
- from tqdm import tqdm
- tqdm.pandas()
-
- def getFirstFamilyName(s):
-     firstFamilyName = None
-     parsed = bananompy.parse(s)
-     try:
-         firstFamilyName = parsed[0]['parsed'][0]['family']
-     except:
-         pass
-     return firstFamilyName
-
- if __name__ == '__main__':
-     parser = argparse.ArgumentParser()
-     parser.add_argument("inputfile")
-     parser.add_argument("outputfile")
-     args = parser.parse_args()
-
-     df = pd.read_csv(args.inputfile,
-                      encoding='utf8',
-                      keep_default_na=False,
-                      na_values=['NONE',''],
-                      on_bad_lines='skip',
-                      sep=',')
-     # Extract unique recordedBy values
-     df_rb = df[['recordedBy']].drop_duplicates()
-     df_rb['recordedBy_first_familyname'] = df_rb.recordedBy.progress_apply(getFirstFamilyName)
-     # Apply back to main dataframe
-     df = pd.merge(left = df, right=df_rb, left_on='recordedBy', right_on='recordedBy', how='left')
-     # Add column holding collector name and number
-     mask = (df.recordNumber.notnull())
-     df.loc[mask,'collectorNameAndNumber']=df[mask].apply(lambda row: '{} {}'.format(row['recordedBy_first_familyname'],row['recordNumber']),axis=1)
-     df.to_csv(args.outputfile, index=False, sep=',')
getDownloadMetadata.py DELETED
@@ -1,28 +0,0 @@
- import argparse
- from pygbif import occurrences as occ
- import json
-
-
- licenses = {'http://creativecommons.org/licenses/by-nc/4.0/legalcode':'CC BY-NC 4.0'}
- if __name__ == '__main__':
-     parser = argparse.ArgumentParser()
-     parser.add_argument("inputfile")
-     parser.add_argument("--download_id", type=str)
-     parser.add_argument("outputfile")
-
-     args = parser.parse_args()
-
-     datasette_metadata = None
-     with open(args.inputfile, 'r') as f_in:
-         datasette_metadata = json.load(f_in)
-
-     gbif_metadata = occ.download_meta(key = args.download_id)
-     license_url = gbif_metadata['license']
-     if license_url in licenses:
-         datasette_metadata['license'] = licenses[license_url]
-         datasette_metadata['license_url'] = license_url
-     datasette_metadata['source_url'] = 'https://doi.org/{}'.format(gbif_metadata['doi'])
-
-     datasette_metadata_json = json.dumps(datasette_metadata, indent=4)
-     with open(args.outputfile, 'w') as f_out:
-         f_out.write(datasette_metadata_json)
metadata.json CHANGED
@@ -1,27 +1,15 @@
  {
-     "title": "GBIF-mediated specimen occurrences",
-     "description": "This is a datasette instance containing GBIF-mediated specimen occurrences. It can be used to browse specimen records (with options to filter and facet records) and to run SQL queries. It is also configured to run an Open Refine compatible reconciliation service on collector name and number, allowing a user to easily link specimen references (as found in taxonomic literature) to these specimen records.",
-     "source": "Global Biodiversity Information Facility (GBIF)",
+     "title": "Revision controlled Index Herbariorum",
+     "description": "This is a datasette instance containing Index Herbariorum, gathered each week using the git scraping data pattern.",
+     "source": "Index Herbariorum (NYBG)",
      "databases": {
          "gbifocc": {
              "tables": {
                  "gbifocc": {
                      "plugins": {
-                         "datasette-reconcile": {
-                             "id_field": "gbifID",
-                             "name_field": "collectorNameAndNumber",
-                             "type_field": "basisOfRecord",
-                             "type_default": [{
-                                 "id": "basisOfRecord",
-                                 "name": "PRESERVED_SPECIMEN"
-                             }],
-                             "max_limit": 5,
-                             "service_name": "GBIF specimens reconciliation",
-                             "view_url": "https://gbif.org/occurrence/{{id}}"
-                         },
                          "datasette-cluster-map": {
-                             "latitude_column": "decimalLatitude",
-                             "longitude_column": "decimalLongitude"
+                             "latitude_column": "location.lat",
+                             "longitude_column": "location.lon"
                          }
                      }
                  }
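A related sketch (again not part of the commit, with paths taken from the Dockerfile above): cross-check the latitude/longitude column names that metadata.json configures for datasette-cluster-map against the columns actually present in the built database. How the scraped CSV's location fields end up named depends on how git-history flattens them, so treat this purely as a local sanity check.

import json

import sqlite_utils

# Paths follow the Dockerfile above; everything else is illustrative only.
with open("/code/metadata.json") as f:
    metadata = json.load(f)

# Collect the lat/lon column names configured for datasette-cluster-map.
configured = set()
for database in metadata.get("databases", {}).values():
    for table in database.get("tables", {}).values():
        plugin = table.get("plugins", {}).get("datasette-cluster-map", {})
        configured.update(
            value
            for key, value in plugin.items()
            if key in ("latitude_column", "longitude_column")
        )

# Report which tables in the built database actually contain those columns.
db = sqlite_utils.Database("/code/ihdelta.db")
for table in db.tables:
    present = configured & {column.name for column in table.columns}
    print(table.name, sorted(present) if present else "no cluster-map columns")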
requirements.txt CHANGED
@@ -1,10 +1,4 @@
  datasette
- datasette-reconcile
+ git-history
  datasette-cluster-map
- sqlite-utils
- csvs-to-sqlite
- pandas==1.5.3
- bananompy
- datasette-jellyfish
- tqdm
- pygbif
+ sqlite-utils
tab2csv.py DELETED
@@ -1,112 +0,0 @@
- import argparse
- import pandas as pd
- import requests
- from pygbif import occurrences as occ
- from tqdm import tqdm
- tqdm.pandas()
- import os.path
-
- def getFirstFamilyName(recordedBy):
-     firstFamilyName = None
-     parsed = bananompy.parse(recordedBy)
-     try:
-         firstFamilyName = parsed[0]['parsed'][0]['family']
-     except:
-         pass
-     return firstFamilyName
-
- def getFirstFamilyNames(recordedBy_l):
-     # post to bionomia
-     bionomia_parse_endpoint_url = "https://api.bionomia.net/parse.json"
-     data = dict()
-     data['names'] = '\r\n'.join(recordedBy_l)
-     r = requests.post(bionomia_parse_endpoint_url, data=data)
-     parsed_results = r.json()
-     results = dict()
-     for parsed_result in parsed_results:
-         try:
-             results[parsed_result['original']] = parsed_result['parsed'][0]['family']
-         except:
-             results[parsed_result['original']] = None
-     return results
-
- def getFirstFamilyNameBulk(df,
-                            recordedByColName="recordedBy",
-                            firstFamilyNameColName="recordedBy_first_familyname",
-                            batchsize=500):
-     results = dict()
-     recordedBy_l = []
-     for s in tqdm(df[recordedByColName].values):
-         if len(recordedBy_l) == batchsize:
-             # send it
-             results.update(getFirstFamilyNames(recordedBy_l))
-             # clear for next iteration
-             recordedBy_l = []
-         recordedBy_l.append(s)
-     if len(recordedBy_l) > 0:
-         results.update(getFirstFamilyNames(recordedBy_l))
-     df[firstFamilyNameColName] = df[recordedByColName].map(results)
-     return df
-
- GBIF_DOWNLOAD_DESCRIBE_URL_SIMPLE_CSV = 'https://api.gbif.org/v1/occurrence/download/describe/simpleCsv'
- GBIF_DOWNLOAD_DESCRIBE_URL_DWCA = 'https://api.gbif.org/v1/occurrence/download/describe/dwca'
-
- def getGbifDownloadColumnNames(download_format):
-     column_names = None
-     if download_format == 'SIMPLE_CSV':
-         r = requests.get(GBIF_DOWNLOAD_DESCRIBE_URL_SIMPLE_CSV)
-         columns_metadata = r.json()
-         column_names = [column_metadata['name'] for column_metadata in columns_metadata['fields']]
-     elif download_format == 'DWCA':
-         r = requests.get(GBIF_DOWNLOAD_DESCRIBE_URL_DWCA)
-         columns_metadata = r.json()
-         column_names = [column_metadata['name'] for column_metadata in columns_metadata['verbatim']['fields']]
-     return column_names
-
-
- if __name__ == '__main__':
-     parser = argparse.ArgumentParser()
-     parser.add_argument("data_dir")
-     parser.add_argument("download_id")
-     parser.add_argument("-c","--createcols", action='store_true')
-     parser.add_argument("-l","--limit", type=int)
-     parser.add_argument("outputfilename")
-     args = parser.parse_args()
-
-     # Determine format of datafile by accessing download metadata from GBIF API
-     gbif_metadata = occ.download_meta(key = args.download_id)
-     download_format = gbif_metadata['request']['format']
-     # The GBIF download format determines:
-     # (1) the columns in the download, SIMPLE_CSV being a much restricted set
-     #     of columns than DWCA
-     # (2) The name of the occurrence data file, SIMPLE_CSV : '[download_id].csv'
-     #     DWCA : 'occurrence.txt'
-     inputfilename = None
-     column_names_simple_csv = getGbifDownloadColumnNames('SIMPLE_CSV')
-     column_names = None
-     if download_format == 'SIMPLE_CSV':
-         inputfilename = '{}.csv'.format(args.download_id)
-         column_names = column_names_simple_csv
-     elif download_format == 'DWCA':
-         inputfilename = 'occurrence.txt'
-         column_names_dwca = getGbifDownloadColumnNames('DWCA')
-         column_names = [column_name for column_name in column_names_dwca if column_name in column_names_simple_csv]
-
-     df = pd.read_csv(os.path.join(args.data_dir,inputfilename),
-                      encoding='utf8',
-                      keep_default_na=False,
-                      on_bad_lines='skip',
-                      sep='\t',
-                      usecols=column_names,
-                      nrows=args.limit)
-     if args.createcols:
-         # Extract unique recordedBy values
-         df_rb = df[['recordedBy']].drop_duplicates()
-         df_rb = getFirstFamilyNameBulk(df_rb)
-         #df_rb['recordedBy_first_familyname'] = df_rb.recordedBy.progress_apply(getFirstFamilyName)
-         # Apply back to main dataframe
-         df = pd.merge(left = df, right=df_rb, left_on='recordedBy', right_on='recordedBy', how='left')
-         # Add column holding collector name and number
-         mask = (df.recordNumber.notnull())
-         df.loc[mask,'collectorNameAndNumber']=df[mask].apply(lambda row: '{} {}'.format(row['recordedBy_first_familyname'],row['recordNumber']),axis=1)
-     df.to_csv(os.path.join(args.data_dir,args.outputfilename), index=False, sep=',')