doi: add new doi backend

Add a new backend to support mounting datasets published with a digital
object identifier (DOI).
This commit is contained in:
Flora Thiebaut 2025-04-15 15:46:00 +02:00
parent 0b9671313b
commit 0a0fbe965f
13 changed files with 1383 additions and 0 deletions

View File

@ -14,6 +14,7 @@ import (
_ "github.com/rclone/rclone/backend/combine"
_ "github.com/rclone/rclone/backend/compress"
_ "github.com/rclone/rclone/backend/crypt"
_ "github.com/rclone/rclone/backend/doi"
_ "github.com/rclone/rclone/backend/drive"
_ "github.com/rclone/rclone/backend/dropbox"
_ "github.com/rclone/rclone/backend/fichier"

View File

@ -0,0 +1,38 @@
// Type definitions specific to Dataverse
package api
// DataverseDatasetResponse is returned by the Dataverse dataset API
type DataverseDatasetResponse struct {
Status string `json:"status"`
Data DataverseDataset `json:"data"`
}
// DataverseDataset is the representation of a dataset
type DataverseDataset struct {
LatestVersion DataverseDatasetVersion `json:"latestVersion"`
}
// DataverseDatasetVersion is the representation of a dataset version
type DataverseDatasetVersion struct {
LastUpdateTime string `json:"lastUpdateTime"`
Files []DataverseFile `json:"files"`
}
// DataverseFile is the representation of a file found in a dataset
type DataverseFile struct {
DirectoryLabel string `json:"directoryLabel"`
DataFile DataverseDataFile `json:"dataFile"`
}
// DataverseDataFile represents file metadata details
type DataverseDataFile struct {
ID int64 `json:"id"`
Filename string `json:"filename"`
ContentType string `json:"contentType"`
FileSize int64 `json:"filesize"`
OriginalFileFormat string `json:"originalFileFormat"`
OriginalFileSize int64 `json:"originalFileSize"`
OriginalFileName string `json:"originalFileName"`
MD5 string `json:"md5"`
}

View File

@ -0,0 +1,34 @@
// Type definitions specific to InvenioRDM
package api
// InvenioRecordResponse is the representation of a record stored in InvenioRDM
type InvenioRecordResponse struct {
Links InvenioRecordResponseLinks `json:"links"`
// Metadata InvenioRecordMetadata `json:"metadata"`
}
// InvenioRecordResponseLinks represents a record's links
type InvenioRecordResponseLinks struct {
Self string `json:"self"`
}
// InvenioFilesResponse is the representation of a record's files
type InvenioFilesResponse struct {
Entries []InvenioFilesResponseEntry `json:"entries"`
}
// InvenioFilesResponseEntry is the representation of a file entry
type InvenioFilesResponseEntry struct {
Key string `json:"key"`
Checksum string `json:"checksum"`
Size int64 `json:"size"`
Updated string `json:"updated"`
MimeType string `json:"mimetype"`
Links InvenioFilesResponseEntryLinks `json:"links"`
}
// InvenioFilesResponseEntryLinks represents file links details
type InvenioFilesResponseEntryLinks struct {
Content string `json:"content"`
}

26
backend/doi/api/types.go Normal file
View File

@ -0,0 +1,26 @@
// Package api has general type definitions for doi
package api
// DoiResolverResponse is returned by the DOI resolver API
//
// Reference: https://www.doi.org/the-identifier/resources/factsheets/doi-resolution-documentation
type DoiResolverResponse struct {
ResponseCode int `json:"responseCode"`
Handle string `json:"handle"`
Values []DoiResolverResponseValue `json:"values"`
}
// DoiResolverResponseValue is a single handle record value
type DoiResolverResponseValue struct {
Index int `json:"index"`
Type string `json:"type"`
Data DoiResolverResponseValueData `json:"data"`
TTL int `json:"ttl"`
Timestamp string `json:"timestamp"`
}
// DoiResolverResponseValueData is the data held in a handle value
type DoiResolverResponseValueData struct {
Format string `json:"format"`
Value any `json:"value"`
}

139
backend/doi/dataverse.go Normal file
View File

@ -0,0 +1,139 @@
// Implementation for Dataverse
package doi
import (
"context"
"fmt"
"net/http"
"net/url"
"path"
"strings"
"time"
"github.com/rclone/rclone/backend/doi/api"
"github.com/rclone/rclone/fs"
"github.com/rclone/rclone/lib/rest"
)
// Returns true if resolvedURL is likely a DOI hosted on a Dataverse intallation
func activateDataverse(resolvedURL *url.URL) (isActive bool) {
queryValues := resolvedURL.Query()
persistentID := queryValues.Get("persistentId")
return persistentID != ""
}
// Resolve the main API endpoint for a DOI hosted on a Dataverse installation
func resolveDataverseEndpoint(resolvedURL *url.URL) (provider Provider, endpoint *url.URL, err error) {
queryValues := resolvedURL.Query()
persistentID := queryValues.Get("persistentId")
query := url.Values{}
query.Add("persistentId", persistentID)
endpointURL := resolvedURL.ResolveReference(&url.URL{Path: "/api/datasets/:persistentId/", RawQuery: query.Encode()})
return Dataverse, endpointURL, nil
}
// Implements Fs.List() for Dataverse installations
func (f *Fs) listDataverse(ctx context.Context, dir string) (entries fs.DirEntries, err error) {
fileEntries, err := f.listDataverseDoiFiles(ctx)
if err != nil {
return nil, fmt.Errorf("error listing %q: %w", dir, err)
}
fullDir := path.Join(f.root, dir)
if fullDir != "" {
fullDir = fullDir + "/"
}
dirPaths := map[string]bool{}
for _, entry := range fileEntries {
// First, filter out files not in `fullDir`
if !strings.HasPrefix(entry.remote, fullDir) {
continue
}
// Then, find entries in subfolers
remotePath := entry.remote
if fullDir != "" {
remotePath = strings.TrimLeft(strings.TrimPrefix(remotePath, fullDir), "/")
}
parts := strings.SplitN(remotePath, "/", 2)
if len(parts) == 1 {
newEntry := *entry
newEntry.remote = path.Join(dir, remotePath)
entries = append(entries, &newEntry)
} else {
dirPaths[path.Join(dir, parts[0])] = true
}
}
for dirPath := range dirPaths {
entry := fs.NewDir(dirPath, time.Time{})
entries = append(entries, entry)
}
return entries, nil
}
// List the files contained in the DOI
func (f *Fs) listDataverseDoiFiles(ctx context.Context) (entries []*Object, err error) {
// Use the cache if populated
cachedEntries, found := f.cache.GetMaybe("files")
if found {
parsedEntries, ok := cachedEntries.([]Object)
if ok {
for _, entry := range parsedEntries {
newEntry := entry
entries = append(entries, &newEntry)
}
return entries, nil
}
}
filesURL := f.endpoint
var res *http.Response
var result api.DataverseDatasetResponse
opts := rest.Opts{
Method: "GET",
Path: strings.TrimLeft(filesURL.EscapedPath(), "/"),
Parameters: filesURL.Query(),
}
err = f.pacer.Call(func() (bool, error) {
res, err = f.srv.CallJSON(ctx, &opts, nil, &result)
return shouldRetry(ctx, res, err)
})
if err != nil {
return nil, fmt.Errorf("readDir failed: %w", err)
}
modTime, modTimeErr := time.Parse(time.RFC3339, result.Data.LatestVersion.LastUpdateTime)
if modTimeErr != nil {
fs.Logf(f, "error: could not parse last update time %v", modTimeErr)
modTime = timeUnset
}
for _, file := range result.Data.LatestVersion.Files {
contentURLPath := fmt.Sprintf("/api/access/datafile/%d", file.DataFile.ID)
query := url.Values{}
query.Add("format", "original")
contentURL := f.endpoint.ResolveReference(&url.URL{Path: contentURLPath, RawQuery: query.Encode()})
entry := &Object{
fs: f,
remote: path.Join(file.DirectoryLabel, file.DataFile.Filename),
contentURL: contentURL.String(),
size: file.DataFile.FileSize,
modTime: modTime,
md5: file.DataFile.MD5,
contentType: file.DataFile.ContentType,
}
if file.DataFile.OriginalFileName != "" {
entry.remote = path.Join(file.DirectoryLabel, file.DataFile.OriginalFileName)
entry.size = file.DataFile.OriginalFileSize
entry.contentType = file.DataFile.OriginalFileFormat
}
entries = append(entries, entry)
}
// Populate the cache
cacheEntries := []Object{}
for _, entry := range entries {
cacheEntries = append(cacheEntries, *entry)
}
f.cache.Put("files", cacheEntries)
return entries, nil
}

599
backend/doi/doi.go Normal file
View File

@ -0,0 +1,599 @@
// Package doi provides a filesystem interface for digital objects identified by DOIs.
//
// See: https://www.doi.org/the-identifier/what-is-a-doi/
package doi
import (
"context"
"errors"
"fmt"
"io"
"net/http"
"net/url"
"path"
"strings"
"time"
"github.com/rclone/rclone/backend/doi/api"
"github.com/rclone/rclone/fs"
"github.com/rclone/rclone/fs/config/configmap"
"github.com/rclone/rclone/fs/config/configstruct"
"github.com/rclone/rclone/fs/fserrors"
"github.com/rclone/rclone/fs/fshttp"
"github.com/rclone/rclone/fs/hash"
"github.com/rclone/rclone/lib/cache"
"github.com/rclone/rclone/lib/pacer"
"github.com/rclone/rclone/lib/rest"
)
const (
// the URL of the DOI resolver
//
// Reference: https://www.doi.org/the-identifier/resources/factsheets/doi-resolution-documentation
doiResolverAPIURL = "https://doi.org/api"
minSleep = 10 * time.Millisecond
maxSleep = 2 * time.Second
decayConstant = 2 // bigger for slower decay, exponential
)
var (
errorReadOnly = errors.New("doi remotes are read only")
timeUnset = time.Unix(0, 0)
)
func init() {
fsi := &fs.RegInfo{
Name: "doi",
Description: "DOI datasets",
NewFs: NewFs,
CommandHelp: commandHelp,
Options: []fs.Option{{
Name: "doi",
Help: "The DOI or the doi.org URL.",
Required: true,
}, {
Name: fs.ConfigProvider,
Help: `DOI provider.
The DOI provider can be set when rclone does not automatically recognize a supported DOI provider.`,
Examples: []fs.OptionExample{
{
Value: "auto",
Help: "Auto-detect provider",
},
{
Value: string(Zenodo),
Help: "Zenodo",
}, {
Value: string(Dataverse),
Help: "Dataverse",
}, {
Value: string(Invenio),
Help: "Invenio",
}},
Required: false,
Advanced: true,
}},
}
fs.Register(fsi)
}
// Provider defines the type of provider hosting the DOI
type Provider string
const (
// Zenodo provider, see https://zenodo.org
Zenodo Provider = "zenodo"
// Dataverse provider, see https://dataverse.harvard.edu
Dataverse Provider = "dataverse"
// Invenio provider, see https://inveniordm.docs.cern.ch
Invenio Provider = "invenio"
)
// Options defines the configuration for this backend
type Options struct {
Doi string `config:"doi"` // The DOI, a digital identifier of an object, usually a dataset
Provider string `config:"provider"` // The DOI provider
}
// Fs stores the interface to the remote HTTP files
type Fs struct {
name string // name of this remote
root string // the path we are working on
provider Provider // the DOI provider
features *fs.Features // optional features
opt Options // options for this backend
ci *fs.ConfigInfo // global config
endpoint *url.URL // the main API endpoint for this remote
endpointURL string // endpoint as a string
srv *rest.Client // the connection to the server
pacer *fs.Pacer // pacer for API calls
cache *cache.Cache // a cache for the remote metadata
}
// Object is a remote object that has been stat'd (so it exists, but is not necessarily open for reading)
type Object struct {
fs *Fs // what this object is part of
remote string // the remote path
contentURL string // the URL where the contents of the file can be downloaded
size int64 // size of the object
modTime time.Time // modification time of the object
contentType string // content type of the object
md5 string // MD5 hash of the object content
}
// Parse the input string as a DOI
// Examples:
// 10.1000/182 -> 10.1000/182
// https://doi.org/10.1000/182 -> 10.1000/182
// doi:10.1000/182 -> 10.1000/182
func parseDoi(doi string) string {
doiURL, err := url.Parse(doi)
if err != nil {
return doi
}
if doiURL.Scheme == "doi" {
return strings.TrimLeft(strings.TrimPrefix(doi, "doi:"), "/")
}
if strings.HasSuffix(doiURL.Hostname(), "doi.org") {
return strings.TrimLeft(doiURL.Path, "/")
}
return doi
}
// Resolve a DOI to a URL
// Reference: https://www.doi.org/the-identifier/resources/factsheets/doi-resolution-documentation
func resolveDoiURL(ctx context.Context, srv *rest.Client, pacer *fs.Pacer, opt *Options) (doiURL *url.URL, err error) {
var result api.DoiResolverResponse
params := url.Values{}
params.Add("index", "1")
opts := rest.Opts{
Method: "GET",
RootURL: doiResolverAPIURL,
Path: "/handles/" + opt.Doi,
Parameters: params,
}
err = pacer.Call(func() (bool, error) {
res, err := srv.CallJSON(ctx, &opts, nil, &result)
return shouldRetry(ctx, res, err)
})
if err != nil {
return nil, err
}
if result.ResponseCode != 1 {
return nil, fmt.Errorf("could not resolve DOI (error code %d)", result.ResponseCode)
}
resolvedURLStr := ""
for _, value := range result.Values {
if value.Type == "URL" && value.Data.Format == "string" {
valueStr, ok := value.Data.Value.(string)
if !ok {
return nil, fmt.Errorf("could not resolve DOI (incorrect response format)")
}
resolvedURLStr = valueStr
}
}
resolvedURL, err := url.Parse(resolvedURLStr)
if err != nil {
return nil, err
}
return resolvedURL, nil
}
// Resolve the passed configuration into a provider and enpoint
func resolveEndpoint(ctx context.Context, srv *rest.Client, pacer *fs.Pacer, opt *Options) (provider Provider, endpoint *url.URL, err error) {
resolvedURL, err := resolveDoiURL(ctx, srv, pacer, opt)
if err != nil {
return "", nil, err
}
switch opt.Provider {
case string(Dataverse):
return resolveDataverseEndpoint(resolvedURL)
case string(Invenio):
return resolveInvenioEndpoint(ctx, srv, pacer, resolvedURL)
case string(Zenodo):
return resolveZenodoEndpoint(ctx, srv, pacer, resolvedURL, opt.Doi)
}
hostname := strings.ToLower(resolvedURL.Hostname())
if hostname == "dataverse.harvard.edu" || activateDataverse(resolvedURL) {
return resolveDataverseEndpoint(resolvedURL)
}
if hostname == "zenodo.org" || strings.HasSuffix(hostname, ".zenodo.org") {
return resolveZenodoEndpoint(ctx, srv, pacer, resolvedURL, opt.Doi)
}
if activateInvenio(ctx, srv, pacer, resolvedURL) {
return resolveInvenioEndpoint(ctx, srv, pacer, resolvedURL)
}
return "", nil, fmt.Errorf("provider '%s' is not supported", resolvedURL.Hostname())
}
// Make the http connection from the passed options
func (f *Fs) httpConnection(ctx context.Context, opt *Options) (isFile bool, err error) {
provider, endpoint, err := resolveEndpoint(ctx, f.srv, f.pacer, opt)
if err != nil {
return false, err
}
// Update f with the new parameters
f.srv.SetRoot(endpoint.ResolveReference(&url.URL{Path: "/"}).String())
f.endpoint = endpoint
f.endpointURL = endpoint.String()
f.provider = provider
f.opt.Provider = string(provider)
// Determine if the root is a file
switch f.provider {
case Dataverse:
entries, err := f.listDataverseDoiFiles(ctx)
if err != nil {
return false, err
}
for _, entry := range entries {
if entry.remote == f.root {
isFile = true
break
}
}
case Invenio, Zenodo:
isFile = f.root != ""
}
return isFile, nil
}
// retryErrorCodes is a slice of error codes that we will retry
var retryErrorCodes = []int{
429, // Too Many Requests.
500, // Internal Server Error
502, // Bad Gateway
503, // Service Unavailable
504, // Gateway Timeout
509, // Bandwidth Limit Exceeded
}
// shouldRetry returns a boolean as to whether this resp and err
// deserve to be retried. It returns the err as a convenience
func shouldRetry(ctx context.Context, res *http.Response, err error) (bool, error) {
if fserrors.ContextError(ctx, &err) {
return false, err
}
return fserrors.ShouldRetry(err) || fserrors.ShouldRetryHTTP(res, retryErrorCodes), err
}
// NewFs creates a new Fs object from the name and root. It connects to
// the host specified in the config file.
func NewFs(ctx context.Context, name, root string, m configmap.Mapper) (fs.Fs, error) {
root = strings.Trim(root, "/")
// Parse config into Options struct
opt := new(Options)
err := configstruct.Set(m, opt)
if err != nil {
return nil, err
}
opt.Doi = parseDoi(opt.Doi)
client := fshttp.NewClient(ctx)
ci := fs.GetConfig(ctx)
f := &Fs{
name: name,
root: root,
opt: *opt,
ci: ci,
srv: rest.NewClient(client),
pacer: fs.NewPacer(ctx, pacer.NewDefault(pacer.MinSleep(minSleep), pacer.MaxSleep(maxSleep), pacer.DecayConstant(decayConstant))),
cache: cache.New(),
}
f.features = (&fs.Features{
CanHaveEmptyDirectories: true,
}).Fill(ctx, f)
isFile, err := f.httpConnection(ctx, opt)
if err != nil {
return nil, err
}
if isFile {
// return an error with an fs which points to the parent
newRoot := path.Dir(f.root)
if newRoot == "." {
newRoot = ""
}
f.root = newRoot
return f, fs.ErrorIsFile
}
return f, nil
}
// Name returns the configured name of the file system
func (f *Fs) Name() string {
return f.name
}
// Root returns the root for the filesystem
func (f *Fs) Root() string {
return f.root
}
// String returns the URL for the filesystem
func (f *Fs) String() string {
return fmt.Sprintf("DOI %s", f.opt.Doi)
}
// Features returns the optional features of this Fs
func (f *Fs) Features() *fs.Features {
return f.features
}
// Precision is the remote http file system's modtime precision, which we have no way of knowing. We estimate at 1s
func (f *Fs) Precision() time.Duration {
return time.Second
}
// Hashes returns hash.HashNone to indicate remote hashing is unavailable
func (f *Fs) Hashes() hash.Set {
return hash.Set(hash.MD5)
// return hash.Set(hash.None)
}
// Mkdir makes the root directory of the Fs object
func (f *Fs) Mkdir(ctx context.Context, dir string) error {
return errorReadOnly
}
// Remove a remote http file object
func (o *Object) Remove(ctx context.Context) error {
return errorReadOnly
}
// Rmdir removes the root directory of the Fs object
func (f *Fs) Rmdir(ctx context.Context, dir string) error {
return errorReadOnly
}
// NewObject creates a new remote http file object
func (f *Fs) NewObject(ctx context.Context, remote string) (fs.Object, error) {
var entries []*Object
var err error
switch f.provider {
case Dataverse:
entries, err = f.listDataverseDoiFiles(ctx)
case Invenio, Zenodo:
entries, err = f.listInvevioDoiFiles(ctx)
default:
err = fmt.Errorf("provider type '%s' not supported", f.provider)
}
if err != nil {
return nil, err
}
for _, entry := range entries {
if entry.Remote() == remote {
return entry, nil
}
}
return nil, fs.ErrorObjectNotFound
}
// List the objects and directories in dir into entries. The
// entries can be returned in any order but should be for a
// complete directory.
//
// dir should be "" to list the root, and should not have
// trailing slashes.
//
// This should return ErrDirNotFound if the directory isn't
// found.
func (f *Fs) List(ctx context.Context, dir string) (entries fs.DirEntries, err error) {
switch f.provider {
case Dataverse:
return f.listDataverse(ctx, dir)
case Invenio, Zenodo:
return f.listInvenio(ctx, dir)
default:
return nil, fmt.Errorf("provider type '%s' not supported", f.provider)
}
}
// Put in to the remote path with the modTime given of the given size
//
// May create the object even if it returns an error - if so
// will return the object and the error, otherwise will return
// nil and the error
func (f *Fs) Put(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) (fs.Object, error) {
return nil, errorReadOnly
}
// PutStream uploads to the remote path with the modTime given of indeterminate size
func (f *Fs) PutStream(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) (fs.Object, error) {
return nil, errorReadOnly
}
// Fs is the filesystem this remote http file object is located within
func (o *Object) Fs() fs.Info {
return o.fs
}
// String returns the URL to the remote HTTP file
func (o *Object) String() string {
if o == nil {
return "<nil>"
}
return o.remote
}
// Remote the name of the remote HTTP file, relative to the fs root
func (o *Object) Remote() string {
return o.remote
}
// Hash returns "" since HTTP (in Go or OpenSSH) doesn't support remote calculation of hashes
func (o *Object) Hash(ctx context.Context, t hash.Type) (string, error) {
if t != hash.MD5 {
return "", hash.ErrUnsupported
}
return o.md5, nil
}
// Size returns the size in bytes of the remote http file
func (o *Object) Size() int64 {
return o.size
}
// ModTime returns the modification time of the remote http file
func (o *Object) ModTime(ctx context.Context) time.Time {
return o.modTime
}
// SetModTime sets the modification and access time to the specified time
//
// it also updates the info field
func (o *Object) SetModTime(ctx context.Context, modTime time.Time) error {
return errorReadOnly
}
// Storable returns whether the remote http file is a regular file (not a directory, symbolic link, block device, character device, named pipe, etc.)
func (o *Object) Storable() bool {
return true
}
// Open a remote http file object for reading. Seek is supported
func (o *Object) Open(ctx context.Context, options ...fs.OpenOption) (in io.ReadCloser, err error) {
fs.FixRangeOption(options, o.size)
opts := rest.Opts{
Method: "GET",
RootURL: o.contentURL,
Options: options,
}
var res *http.Response
err = o.fs.pacer.Call(func() (bool, error) {
res, err = o.fs.srv.Call(ctx, &opts)
return shouldRetry(ctx, res, err)
})
if err != nil {
return nil, fmt.Errorf("Open failed: %w", err)
}
// Handle non-compliant redirects
if res.Header.Get("Location") != "" {
newURL, err := res.Location()
if err == nil {
opts.RootURL = newURL.String()
err = o.fs.pacer.Call(func() (bool, error) {
res, err = o.fs.srv.Call(ctx, &opts)
return shouldRetry(ctx, res, err)
})
if err != nil {
return nil, fmt.Errorf("Open failed: %w", err)
}
}
}
return res.Body, nil
}
// Update in to the object with the modTime given of the given size
func (o *Object) Update(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) error {
return errorReadOnly
}
// MimeType of an Object if known, "" otherwise
func (o *Object) MimeType(ctx context.Context) string {
return o.contentType
}
var commandHelp = []fs.CommandHelp{{
Name: "metadata",
Short: "Show metadata about the DOI.",
Long: `This command returns a JSON object with some information about the DOI.
rclone backend medatadata doi:
It returns a JSON object representing metadata about the DOI.
`,
}, {
Name: "set",
Short: "Set command for updating the config parameters.",
Long: `This set command can be used to update the config parameters
for a running doi backend.
Usage Examples:
rclone backend set doi: [-o opt_name=opt_value] [-o opt_name2=opt_value2]
rclone rc backend/command command=set fs=doi: [-o opt_name=opt_value] [-o opt_name2=opt_value2]
rclone rc backend/command command=set fs=doi: -o doi=NEW_DOI
The option keys are named as they are in the config file.
This rebuilds the connection to the doi backend when it is called with
the new parameters. Only new parameters need be passed as the values
will default to those currently in use.
It doesn't return anything.
`,
}}
// Command the backend to run a named command
//
// The command run is name
// args may be used to read arguments from
// opts may be used to read optional arguments from
//
// The result should be capable of being JSON encoded
// If it is a string or a []string it will be shown to the user
// otherwise it will be JSON encoded and shown to the user like that
func (f *Fs) Command(ctx context.Context, name string, arg []string, opt map[string]string) (out interface{}, err error) {
switch name {
case "metadata":
return f.ShowMetadata(ctx)
case "set":
newOpt := f.opt
err := configstruct.Set(configmap.Simple(opt), &newOpt)
if err != nil {
return nil, fmt.Errorf("reading config: %w", err)
}
_, err = f.httpConnection(ctx, &newOpt)
if err != nil {
return nil, fmt.Errorf("updating session: %w", err)
}
f.opt = newOpt
keys := []string{}
for k := range opt {
keys = append(keys, k)
}
fs.Logf(f, "Updated config values: %s", strings.Join(keys, ", "))
return nil, nil
default:
return nil, fs.ErrorCommandNotFound
}
}
// ShowMetadata returns some metadata about the corresponding DOI
func (f *Fs) ShowMetadata(ctx context.Context) (metadata interface{}, err error) {
doiURL, err := url.Parse("https://doi.org/" + f.opt.Doi)
if err != nil {
return nil, err
}
info := map[string]any{}
info["DOI"] = f.opt.Doi
info["URL"] = doiURL.String()
info["metadataURL"] = f.endpointURL
info["provider"] = f.provider
return info, nil
}
// Check the interfaces are satisfied
var (
_ fs.Fs = (*Fs)(nil)
_ fs.PutStreamer = (*Fs)(nil)
_ fs.Commander = (*Fs)(nil)
_ fs.Object = (*Object)(nil)
_ fs.MimeTyper = (*Object)(nil)
)

View File

@ -0,0 +1,34 @@
package doi
import (
"testing"
"github.com/stretchr/testify/assert"
)
func TestParseDoi(t *testing.T) {
// 10.1000/182 -> 10.1000/182
doi := "10.1000/182"
parsed := parseDoi(doi)
assert.Equal(t, "10.1000/182", parsed)
// https://doi.org/10.1000/182 -> 10.1000/182
doi = "https://doi.org/10.1000/182"
parsed = parseDoi(doi)
assert.Equal(t, "10.1000/182", parsed)
// https://dx.doi.org/10.1000/182 -> 10.1000/182
doi = "https://dxdoi.org/10.1000/182"
parsed = parseDoi(doi)
assert.Equal(t, "10.1000/182", parsed)
// doi:10.1000/182 -> 10.1000/182
doi = "doi:10.1000/182"
parsed = parseDoi(doi)
assert.Equal(t, "10.1000/182", parsed)
// doi://10.1000/182 -> 10.1000/182
doi = "doi://10.1000/182"
parsed = parseDoi(doi)
assert.Equal(t, "10.1000/182", parsed)
}

16
backend/doi/doi_test.go Normal file
View File

@ -0,0 +1,16 @@
// Test DOI filesystem interface
package doi
import (
"testing"
"github.com/rclone/rclone/fstest/fstests"
)
// TestIntegration runs integration tests against the remote
func TestIntegration(t *testing.T) {
fstests.Run(t, &fstests.Opt{
RemoteName: "TestDoi:",
NilObject: (*Object)(nil),
})
}

169
backend/doi/invenio.go Normal file
View File

@ -0,0 +1,169 @@
// Implementation for InvenioRDM
package doi
import (
"context"
"fmt"
"net/http"
"net/url"
"regexp"
"strings"
"time"
"github.com/rclone/rclone/backend/doi/api"
"github.com/rclone/rclone/fs"
"github.com/rclone/rclone/lib/rest"
)
var invenioRecordRegex = regexp.MustCompile(`\/records?\/(.+)`)
// Returns true if resolvedURL is likely a DOI hosted on an InvenioRDM intallation
func activateInvenio(ctx context.Context, srv *rest.Client, pacer *fs.Pacer, resolvedURL *url.URL) (isActive bool) {
_, _, err := resolveInvenioEndpoint(ctx, srv, pacer, resolvedURL)
return err == nil
}
// Resolve the main API endpoint for a DOI hosted on an InvenioRDM installation
func resolveInvenioEndpoint(ctx context.Context, srv *rest.Client, pacer *fs.Pacer, resolvedURL *url.URL) (provider Provider, endpoint *url.URL, err error) {
var res *http.Response
opts := rest.Opts{
Method: "GET",
RootURL: resolvedURL.String(),
}
err = pacer.Call(func() (bool, error) {
res, err = srv.Call(ctx, &opts)
return shouldRetry(ctx, res, err)
})
if err != nil {
return "", nil, err
}
// First, attempt to grab the API URL from the headers
var linksetURL *url.URL
links := parseLinkHeader(res.Header.Get("Link"))
for _, link := range links {
if link.Rel == "linkset" && link.Type == "application/linkset+json" {
parsed, err := url.Parse(link.Href)
if err == nil {
linksetURL = parsed
break
}
}
}
if linksetURL != nil {
endpoint, err = checkInvenioAPIURL(ctx, srv, pacer, linksetURL)
if err == nil {
return Invenio, endpoint, nil
}
fs.Logf(nil, "using linkset URL failed: %s", err.Error())
}
// If there is no linkset header, try to grab the record ID from the URL
recordID := ""
resURL := res.Request.URL
match := invenioRecordRegex.FindStringSubmatch(resURL.EscapedPath())
if match != nil {
recordID = match[1]
guessedURL := res.Request.URL.ResolveReference(&url.URL{
Path: "/api/records/" + recordID,
})
endpoint, err = checkInvenioAPIURL(ctx, srv, pacer, guessedURL)
if err == nil {
return Invenio, endpoint, nil
}
fs.Logf(nil, "guessing the URL failed: %s", err.Error())
}
return "", nil, fmt.Errorf("could not resolve the Invenio API endpoint for '%s'", resolvedURL.String())
}
func checkInvenioAPIURL(ctx context.Context, srv *rest.Client, pacer *fs.Pacer, resolvedURL *url.URL) (endpoint *url.URL, err error) {
var result api.InvenioRecordResponse
opts := rest.Opts{
Method: "GET",
RootURL: resolvedURL.String(),
}
err = pacer.Call(func() (bool, error) {
res, err := srv.CallJSON(ctx, &opts, nil, &result)
return shouldRetry(ctx, res, err)
})
if err != nil {
return nil, err
}
if result.Links.Self == "" {
return nil, fmt.Errorf("could not parse API response from '%s'", resolvedURL.String())
}
return url.Parse(result.Links.Self)
}
// Implements Fs.List() for Invenio
func (f *Fs) listInvenio(ctx context.Context, dir string) (entries fs.DirEntries, err error) {
if dir != "" {
return nil, fs.ErrorDirNotFound
}
fileEntries, err := f.listInvevioDoiFiles(ctx)
if err != nil {
return nil, fmt.Errorf("error listing %q: %w", dir, err)
}
for _, entry := range fileEntries {
entries = append(entries, entry)
}
return entries, nil
}
// List the files contained in the DOI
func (f *Fs) listInvevioDoiFiles(ctx context.Context) (entries []*Object, err error) {
// Use the cache if populated
cachedEntries, found := f.cache.GetMaybe("files")
if found {
parsedEntries, ok := cachedEntries.([]Object)
if ok {
for _, entry := range parsedEntries {
newEntry := entry
entries = append(entries, &newEntry)
}
return entries, nil
}
}
filesURL := f.endpoint.JoinPath("files")
var result api.InvenioFilesResponse
opts := rest.Opts{
Method: "GET",
Path: strings.TrimLeft(filesURL.EscapedPath(), "/"),
}
err = f.pacer.Call(func() (bool, error) {
res, err := f.srv.CallJSON(ctx, &opts, nil, &result)
return shouldRetry(ctx, res, err)
})
if err != nil {
return nil, fmt.Errorf("readDir failed: %w", err)
}
for _, file := range result.Entries {
modTime, modTimeErr := time.Parse(time.RFC3339, file.Updated)
if modTimeErr != nil {
fs.Logf(f, "error: could not parse last update time %v", modTimeErr)
modTime = timeUnset
}
entry := &Object{
fs: f,
remote: file.Key,
contentURL: file.Links.Content,
size: file.Size,
modTime: modTime,
contentType: file.MimeType,
md5: strings.TrimPrefix(file.Checksum, "md5:"),
}
entries = append(entries, entry)
}
// Populate the cache
cacheEntries := []Object{}
for _, entry := range entries {
cacheEntries = append(cacheEntries, *entry)
}
f.cache.Put("files", cacheEntries)
return entries, nil
}

View File

@ -0,0 +1,75 @@
package doi
import (
"regexp"
"strings"
)
var linkRegex = regexp.MustCompile(`^<(.+)>$`)
var valueRegex = regexp.MustCompile(`^"(.+)"$`)
// headerLink represents a link as presented in HTTP headers
// MDN Reference: https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Headers/Link
type headerLink struct {
Href string
Rel string
Type string
Extras map[string]string
}
func parseLinkHeader(header string) (links []headerLink) {
for _, link := range strings.Split(header, ",") {
link = strings.TrimSpace(link)
parsed := parseLink(link)
if parsed != nil {
links = append(links, *parsed)
}
}
return links
}
func parseLink(link string) (parsedLink *headerLink) {
var parts []string
for _, part := range strings.Split(link, ";") {
parts = append(parts, strings.TrimSpace(part))
}
match := linkRegex.FindStringSubmatch(parts[0])
if match == nil {
return nil
}
result := &headerLink{
Href: match[1],
Extras: map[string]string{},
}
for _, keyValue := range parts[1:] {
parsed := parseKeyValue(keyValue)
if parsed != nil {
key, value := parsed[0], parsed[1]
switch strings.ToLower(key) {
case "rel":
result.Rel = value
case "type":
result.Type = value
default:
result.Extras[key] = value
}
}
}
return result
}
func parseKeyValue(keyValue string) []string {
parts := strings.SplitN(keyValue, "=", 2)
if parts[0] == "" || len(parts) < 2 {
return nil
}
match := valueRegex.FindStringSubmatch(parts[1])
if match != nil {
parts[1] = match[1]
return parts
}
return parts
}

View File

@ -0,0 +1,44 @@
package doi
import (
"testing"
"github.com/stretchr/testify/assert"
)
func TestParseLinkHeader(t *testing.T) {
header := "<https://zenodo.org/api/records/15063252> ; rel=\"linkset\" ; type=\"application/linkset+json\""
links := parseLinkHeader(header)
expected := headerLink{
Href: "https://zenodo.org/api/records/15063252",
Rel: "linkset",
Type: "application/linkset+json",
Extras: map[string]string{},
}
assert.Contains(t, links, expected)
header = "<https://api.example.com/issues?page=2>; rel=\"prev\", <https://api.example.com/issues?page=4>; rel=\"next\", <https://api.example.com/issues?page=10>; rel=\"last\", <https://api.example.com/issues?page=1>; rel=\"first\""
links = parseLinkHeader(header)
expectedList := []headerLink{{
Href: "https://api.example.com/issues?page=2",
Rel: "prev",
Type: "",
Extras: map[string]string{},
}, {
Href: "https://api.example.com/issues?page=4",
Rel: "next",
Type: "",
Extras: map[string]string{},
}, {
Href: "https://api.example.com/issues?page=10",
Rel: "last",
Type: "",
Extras: map[string]string{},
}, {
Href: "https://api.example.com/issues?page=1",
Rel: "first",
Type: "",
Extras: map[string]string{},
}}
assert.Equal(t, links, expectedList)
}

47
backend/doi/zenodo.go Normal file
View File

@ -0,0 +1,47 @@
// Implementation for Zenodo
package doi
import (
"context"
"fmt"
"net/url"
"regexp"
"github.com/rclone/rclone/backend/doi/api"
"github.com/rclone/rclone/fs"
"github.com/rclone/rclone/lib/rest"
)
var zenodoRecordRegex = regexp.MustCompile(`zenodo[.](.+)`)
// Resolve the main API endpoint for a DOI hosted on Zenodo
func resolveZenodoEndpoint(ctx context.Context, srv *rest.Client, pacer *fs.Pacer, resolvedURL *url.URL, doi string) (provider Provider, endpoint *url.URL, err error) {
match := zenodoRecordRegex.FindStringSubmatch(doi)
if match == nil {
return "", nil, fmt.Errorf("could not derive API endpoint URL from '%s'", resolvedURL.String())
}
recordID := match[1]
endpointURL := resolvedURL.ResolveReference(&url.URL{Path: "/api/records/" + recordID})
var result api.InvenioRecordResponse
opts := rest.Opts{
Method: "GET",
RootURL: endpointURL.String(),
}
err = pacer.Call(func() (bool, error) {
res, err := srv.CallJSON(ctx, &opts, nil, &result)
return shouldRetry(ctx, res, err)
})
if err != nil {
return "", nil, err
}
endpointURL, err = url.Parse(result.Links.Self)
if err != nil {
return "", nil, err
}
return Zenodo, endpointURL, nil
}

161
docs/content/doi.md Normal file
View File

@ -0,0 +1,161 @@
---
title: "DOI"
description: "Rclone docs for DOI"
versionIntroduced: "?"
---
# {{< icon "fa fa-building-columns" >}} DOI
The DOI remote is a read only remote for reading files from digital object identifiers (DOI).
Currently, the DOI backend supports supports DOIs hosted with:
- [InvenioRDM](https://inveniosoftware.org/products/rdm/)
- [Zenodo](https://zenodo.org)
- [CaltechDATA](https://data.caltech.edu)
- [Other InvenioRDM repositories](https://inveniosoftware.org/showcase/)
- [Dataverse](https://dataverse.org)
- [Harvard Dataverse](https://dataverse.harvard.edu)
- [Other Dataverse repositories](https://dataverse.org/installations)
Paths are specified as `remote:path`
Paths may be as deep as required, e.g. `remote:directory/subdirectory`.
## Configuration
Here is an example of how to make a remote called `remote`. First run:
rclone config
This will guide you through an interactive setup process:
```
No remotes found, make a new one?
n) New remote
s) Set configuration password
q) Quit config
n/s/q> n
Enter name for new remote.
name> remote
Type of storage to configure.
Choose a number from below, or type in your own value
[snip]
XX / DOI datasets
\ (doi)
[snip]
Storage> doi
Option doi.
The DOI or the doi.org URL.
Enter a value.
doi> 10.5281/zenodo.5876941
Edit advanced config?
y) Yes
n) No (default)
y/n> n
Configuration complete.
Options:
- type: doi
- doi: 10.5281/zenodo.5876941
Keep this "remote" remote?
y) Yes this is OK (default)
e) Edit this remote
d) Delete this remote
y/e/d> y
```
{{< rem autogenerated options start" - DO NOT EDIT - instead edit fs.RegInfo in backend/doi/doi.go then run make backenddocs" >}}
### Standard options
Here are the Standard options specific to doi (DOI datasets).
#### --doi-doi
The DOI or the doi.org URL.
Properties:
- Config: doi
- Env Var: RCLONE_DOI_DOI
- Type: string
- Required: true
### Advanced options
Here are the Advanced options specific to doi (DOI datasets).
#### --doi-provider
DOI provider.
The DOI provider can be set when rclone does not automatically recognize a supported DOI provider.
Properties:
- Config: provider
- Env Var: RCLONE_DOI_PROVIDER
- Type: string
- Required: false
- Examples:
- "auto"
- Auto-detect provider
- "zenodo"
- Zenodo
- "dataverse"
- Dataverse
- "invenio"
- Invenio
## Backend commands
Here are the commands specific to the doi backend.
Run them with
rclone backend COMMAND remote:
The help below will explain what arguments each command takes.
See the [backend](/commands/rclone_backend/) command for more
info on how to pass options and arguments.
These can be run on a running backend using the rc command
[backend/command](/rc/#backend-command).
### metadata
Show metadata about the DOI.
rclone backend metadata remote: [options] [<arguments>+]
This command returns a JSON object with some information about the DOI.
rclone backend medatadata doi:
It returns a JSON object representing metadata about the DOI.
### set
Set command for updating the config parameters.
rclone backend set remote: [options] [<arguments>+]
This set command can be used to update the config parameters
for a running doi backend.
Usage Examples:
rclone backend set doi: [-o opt_name=opt_value] [-o opt_name2=opt_value2]
rclone rc backend/command command=set fs=doi: [-o opt_name=opt_value] [-o opt_name2=opt_value2]
rclone rc backend/command command=set fs=doi: -o doi=NEW_DOI
The option keys are named as they are in the config file.
This rebuilds the connection to the doi backend when it is called with
the new parameters. Only new parameters need be passed as the values
will default to those currently in use.
It doesn't return anything.
{{< rem autogenerated options stop >}}