Changes required to integrate libefi into Linux.

The major change here is to fix up libefi to be linux aware.  For
the most part this wasn't too hard but there were a few major issues.

First off I needed to handle the DKIOCGMEDIAINFO and DKIOCINFO ioctls.
There is no direct equivalent for these ioctls under Linux.  To handle
this I added wrapper functions which under Solaris simply call the ioctls,
but under Linux dig around the system a little bit to gather the info
needed to fill in the requested structures.

Secondly the efi_ioctl() call was adapted such that under Linux it directly
reads or writes out the partition table.  Under Solaris this work was
handed off to the kernel via an ioctl.  In the efi_write() case we also
ensure we prompt the kernel via BLKRRPART to re-scan the new partition
table.  The libefi generated partition tables are correct but older
versions of ~parted-1.8.1 cannot read them without a small patch.
The kernel and fdisk are able to read them just fine.

Thirdly efi_alloc_and_init() which is used by zpool to determine if a
device is a 'wholedisk' was updated to be Linux aware.  This check is
performed by using the partition number for the device: when the
partition number is 0 on Linux it is a 'wholedisk'.  However, certain
device types such as the loopback and ram disks needed to be excluded
because they do not support partitioning.

Fourthly the zpool command was made symlink aware so it can correctly
resolve udev entries such as /dev/disk/by-*/*.  These symlinks are
fully expanded ensuring all block devices are recognized.  When a
'wholedisk' block device is detected we now properly write
out an efi label and place zfs in the first partition (0th slice).
This partition is created 1MiB into the disk to ensure it is aligned
nicely with all high end block devices I'm aware of.

This all works for me now but it did take quite a bit of work to get
it all sorted out.  It would not surprise me if certain special cases
were missed so we should keep an eye out for any odd behavior.
This commit is contained in:
Brian Behlendorf 2009-10-14 16:07:48 -07:00
parent 245e7692f7
commit 992be351d5
4 changed files with 326 additions and 52 deletions

View File

@ -400,17 +400,23 @@ make_leaf_vdev(const char *arg, uint64_t is_log)
if (arg[0] == '/') {
/*
* Complete device or file path. Exact type is determined by
* examining the file descriptor afterwards.
* examining the file descriptor afterwards. Symbolic links
* are resolved to their real paths for the is_whole_disk()
* and S_ISBLK/S_ISREG type checks.
*/
wholedisk = is_whole_disk(arg);
if (!wholedisk && (stat64(arg, &statbuf) != 0)) {
if (realpath(arg, path) == NULL) {
(void) fprintf(stderr,
gettext("cannot open '%s': %s\n"),
arg, strerror(errno));
gettext("cannot resolve path '%s'\n"), arg);
return (NULL);
}
(void) strlcpy(path, arg, sizeof (path));
wholedisk = is_whole_disk(path);
if (!wholedisk && (stat64(path, &statbuf) != 0)) {
(void) fprintf(stderr,
gettext("cannot open '%s': %s\n"),
path, strerror(errno));
return (NULL);
}
} else {
/*
* This may be a short path for a device, or it could be total
@ -888,8 +894,10 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv)
uint64_t wholedisk;
int fd;
int ret;
#if defined(__sun__) || defined(__sun)
ddi_devid_t devid;
char *minor = NULL, *devid_str = NULL;
#endif
verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
@ -918,7 +926,7 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv)
/*
* Fill in the devid, now that we've labeled the disk.
*/
(void) snprintf(buf, sizeof (buf), "%ss0", path);
(void) snprintf(buf, sizeof (buf), "%s%s", path, FIRST_SLICE);
if ((fd = open(buf, O_RDONLY)) < 0) {
(void) fprintf(stderr,
gettext("cannot open '%s': %s\n"),
@ -926,6 +934,7 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv)
return (-1);
}
#if defined(__sun__) || defined(__sun)
if (devid_get(fd, &devid) == 0) {
if (devid_get_minor_name(fd, &minor) == 0 &&
(devid_str = devid_str_encode(devid, minor)) !=
@ -939,6 +948,7 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv)
devid_str_free(minor);
devid_free(devid);
}
#endif
/*
* Update the path to refer to the 's0' slice. The presence of

View File

@ -30,6 +30,7 @@
#include <strings.h>
#include <unistd.h>
#include <uuid/uuid.h>
#include <zlib.h>
#include <libintl.h>
#include <sys/types.h>
#include <sys/dkio.h>
@ -39,7 +40,9 @@
#include <sys/dktp/fdisk.h>
#include <sys/efi_partition.h>
#include <sys/byteorder.h>
#include <sys/ddi.h>
#if defined(__linux__)
#include <linux/fs.h>
#endif
static struct uuid_to_ptag {
struct uuid uuid;
@ -50,11 +53,11 @@ static struct uuid_to_ptag {
{ EFI_SWAP },
{ EFI_USR },
{ EFI_BACKUP },
{ 0 }, /* STAND is never used */
{ EFI_UNUSED }, /* STAND is never used */
{ EFI_VAR },
{ EFI_HOME },
{ EFI_ALTSCTR },
{ 0 }, /* CACHE (cachefs) is never used */
{ EFI_UNUSED }, /* CACHE (cachefs) is never used */
{ EFI_RESERVED },
{ EFI_SYSTEM },
{ EFI_LEGACY_MBR },
@ -108,21 +111,115 @@ int efi_debug = 1;
int efi_debug = 0;
#endif
extern unsigned int efi_crc32(const unsigned char *, unsigned int);
static int efi_read(int, struct dk_gpt *);
/*
 * Compute the 32-bit CRC of the first 'size' bytes of 'buf' using
 * zlib's crc32().  The running value is seeded by calling crc32() with
 * a Z_NULL buffer; the pre- and post- one's complement conditioning is
 * left to crc32() itself, so no extra inversion is done here.
 */
static uint32_t
efi_crc32(const unsigned char *buf, unsigned int size)
{
	uint32_t seed;

	seed = crc32(0, Z_NULL, 0);

	return (crc32(seed, buf, size));
}
static int
read_disk_info(int fd, diskaddr_t *capacity, uint_t *lbsize)
{
struct dk_minfo disk_info;
int sector_size;
unsigned long long capacity_size;
if (ioctl(fd, BLKSSZGET, &sector_size) < 0)
return (-1);
if (ioctl(fd, BLKGETSIZE64, &capacity_size) < 0)
return (-1);
*lbsize = (uint_t)sector_size;
*capacity = (diskaddr_t)(capacity_size / sector_size);
if ((ioctl(fd, DKIOCGMEDIAINFO, (caddr_t)&disk_info)) == -1)
return (errno);
*capacity = disk_info.dki_capacity;
*lbsize = disk_info.dki_lbsize;
return (0);
}
/*
 * Fill in *dki_info with device information for the block device that
 * is open on 'fd'.
 *
 * On Linux the structure is zeroed and then populated by hand (device
 * name, partition number, controller name and controller type); on
 * other platforms the DKIOCINFO ioctl is issued instead.
 *
 * Returns 0 on success.  On failure returns VT_EIO, VT_EINVAL or
 * VT_ERROR depending on errno.
 */
static int
efi_get_info(int fd, struct dk_cinfo *dki_info)
{
#if defined(__linux__)
	char path[PATH_MAX];
	char name[64];
	char *dev_path;
	int rval;

	/*
	 * The simplest way to get the partition number under linux is
	 * to parse it out of the /dev/<disk><partition> block device name.
	 * The kernel creates this using the partition number when it
	 * populates /dev/ so it may be trusted.  Another issue is that
	 * the libefi API only provides the open fd and not the
	 * file path.  To handle this realpath(3) is used to resolve
	 * the block device name from /proc/self/fd/<fd>.  Aside from
	 * the partition number we collect some additional device info.
	 */
	memset(dki_info, 0, sizeof (*dki_info));

	(void) snprintf(path, sizeof (path), "/proc/self/fd/%d", fd);
	if ((dev_path = realpath(path, NULL)) == NULL)
		goto error;

	/*
	 * Scan the alphabetic device name into a bounded local buffer
	 * (the field width must stay one less than sizeof (name)) rather
	 * than straight into dki_dname, whose size is not visible here.
	 */
	name[0] = '\0';
	rval = sscanf(dev_path, "/dev/%63[a-zA-Z]%hu", name,
	    &dki_info->dki_partition);

	/* realpath() allocated dev_path; release it on every path. */
	free(dev_path);

	/* Neither 0 nor EOF gives us a usable /dev/<name>. */
	if (rval < 1) {
		errno = EINVAL;
		goto error;
	}

	/* No trailing number means this is the whole device. */
	if (rval == 1)
		dki_info->dki_partition = 0;

	/* Bounded copy; assumes dki_dname is a char array -- TODO confirm. */
	(void) snprintf(dki_info->dki_dname, sizeof (dki_info->dki_dname),
	    "%s", name);

	/* Derive the controller name and type from the device name prefix. */
	if ((strncmp(dki_info->dki_dname, "sd", 2) == 0)) {
		strcpy(dki_info->dki_cname, "sd");
		dki_info->dki_ctype = DKC_SCSI_CCS;
	} else if ((strncmp(dki_info->dki_dname, "hd", 2) == 0)) {
		strcpy(dki_info->dki_cname, "hd");
		dki_info->dki_ctype = DKC_DIRECT;
	} else if ((strncmp(dki_info->dki_dname, "md", 2) == 0)) {
		strcpy(dki_info->dki_cname, "pseudo");
		dki_info->dki_ctype = DKC_MD;
	} else if ((strncmp(dki_info->dki_dname, "ram", 3) == 0)) {
		strcpy(dki_info->dki_cname, "pseudo");
		dki_info->dki_ctype = DKC_PCMCIA_MEM;
	} else if ((strncmp(dki_info->dki_dname, "loop", 4) == 0)) {
		strcpy(dki_info->dki_cname, "pseudo");
		dki_info->dki_ctype = DKC_VBD;
	} else {
		strcpy(dki_info->dki_cname, "unknown");
		dki_info->dki_ctype = DKC_UNKNOWN;
	}
#else
	if (ioctl(fd, DKIOCINFO, (caddr_t)dki_info) == -1)
		goto error;
#endif
	return (0);
error:
	if (efi_debug)
		(void) fprintf(stderr, "DKIOCINFO errno 0x%x\n", errno);

	/* Translate errno into the VT_* error codes. */
	switch (errno) {
	case EIO:
		return (VT_EIO);
	case EINVAL:
		return (VT_EINVAL);
	default:
		return (VT_ERROR);
	}
}
/*
* the number of blocks the EFI label takes up (round up to nearest
* block)
@ -136,12 +233,13 @@ read_disk_info(int fd, diskaddr_t *capacity, uint_t *lbsize)
int
efi_alloc_and_init(int fd, uint32_t nparts, struct dk_gpt **vtoc)
{
diskaddr_t capacity;
uint_t lbsize;
diskaddr_t capacity = 0;
uint_t lbsize = 0;
uint_t nblocks;
size_t length;
struct dk_gpt *vptr;
struct uuid uuid;
struct dk_cinfo dki_info;
if (read_disk_info(fd, &capacity, &lbsize) != 0) {
if (efi_debug)
@ -149,6 +247,31 @@ efi_alloc_and_init(int fd, uint32_t nparts, struct dk_gpt **vtoc)
"couldn't read disk information\n");
return (-1);
}
#if defined(__linux__)
if (efi_get_info(fd, &dki_info) != 0) {
if (efi_debug)
(void) fprintf(stderr,
"couldn't read disk information\n");
return (-1);
}
if (dki_info.dki_partition != 0) {
if (efi_debug)
(void) fprintf(stderr,
"can only partition whole devices\n");
return (-1);
}
if ((dki_info.dki_ctype == DKC_PCMCIA_MEM) ||
(dki_info.dki_ctype == DKC_VBD) ||
(dki_info.dki_ctype == DKC_UNKNOWN)) {
if (efi_debug)
(void) fprintf(stderr,
"unpartitionable device type %d\n",
dki_info.dki_ctype);
return (-1);
}
#endif
nblocks = NBLOCKS(nparts, lbsize);
if ((nblocks * lbsize) < EFI_MIN_ARRAY_SIZE + lbsize) {
@ -244,14 +367,117 @@ efi_ioctl(int fd, int cmd, dk_efi_t *dk_ioc)
{
void *data = dk_ioc->dki_data;
int error;
#if defined(__linux__)
diskaddr_t capacity;
uint_t lbsize;
/*
* When the IO is not being performed in kernel as an ioctl we need
* to know the sector size so we can seek to the proper byte offset.
*/
if (read_disk_info(fd, &capacity, &lbsize) == -1) {
if (efi_debug)
fprintf(stderr,"unable to read disk info: %d",errno);
errno = EIO;
return -1;
}
switch (cmd) {
case DKIOCGETEFI:
if (lbsize == 0) {
if (efi_debug)
(void) fprintf(stderr, "DKIOCGETEFI assuming "
"LBA %d bytes\n", DEV_BSIZE);
lbsize = DEV_BSIZE;
}
error = lseek(fd, dk_ioc->dki_lba * lbsize, SEEK_SET);
if (error == -1)
return error;
error = read(fd, data, dk_ioc->dki_length);
if (error == -1)
return error;
if (error != dk_ioc->dki_length) {
if (efi_debug)
(void) fprintf(stderr, "DKIOCGETEFI short "
"read of %d bytes\n", error);
errno = EIO;
return -1;
}
error = 0;
break;
case DKIOCSETEFI:
if (lbsize == 0) {
if (efi_debug)
(void) fprintf(stderr, "DKIOCSETEFI unknown "
"LBA size\n");
errno = EIO;
return -1;
}
error = lseek(fd, dk_ioc->dki_lba * lbsize, SEEK_SET);
if (error == -1)
return error;
error = write(fd, data, dk_ioc->dki_length);
if (error == -1)
return error;
if (error != dk_ioc->dki_length) {
if (efi_debug)
(void) fprintf(stderr, "DKIOCSETEFI short "
"write of %d bytes\n", error);
errno = EIO;
return -1;
}
error = fdatasync(fd);
if (error == -1)
return error;
error = 0;
break;
default:
if (efi_debug)
(void) fprintf(stderr, "unsupported ioctl()\n");
errno = EIO;
return -1;
}
#else
dk_ioc->dki_data_64 = (uint64_t)(uintptr_t)data;
error = ioctl(fd, cmd, (void *)dk_ioc);
dk_ioc->dki_data = data;
#endif
return (error);
}
#if defined(__linux__)
/*
 * Ask the kernel, via the BLKRRPART ioctl, to re-read the partition
 * table of the device open on 'fd'.  The request is attempted up to
 * five times back to back.
 *
 * Returns 0 as soon as one attempt succeeds.  If every attempt fails a
 * message containing errno is printed to stderr and -1 is returned.
 */
static int
efi_rescan(int fd)
{
	int attempts_left;

	for (attempts_left = 5; attempts_left > 0; attempts_left--) {
		if (ioctl(fd, BLKRRPART) == 0)
			return (0);
	}

	(void) fprintf(stderr, "the kernel failed to rescan "
	    "the partition table: %d\n", errno);

	return (-1);
}
#endif
static int
check_label(int fd, dk_efi_t *dk_ioc)
{
@ -306,6 +532,8 @@ efi_read(int fd, struct dk_gpt *vtoc)
int rval = 0;
int md_flag = 0;
int vdc_flag = 0;
diskaddr_t capacity = 0;
uint_t lbsize = 0;
struct dk_minfo disk_info;
dk_efi_t dk_ioc;
efi_gpt_t *efi;
@ -317,19 +545,9 @@ efi_read(int fd, struct dk_gpt *vtoc)
/*
* get the partition number for this file descriptor.
*/
if (ioctl(fd, DKIOCINFO, (caddr_t)&dki_info) == -1) {
if (efi_debug) {
(void) fprintf(stderr, "DKIOCINFO errno 0x%x\n", errno);
}
switch (errno) {
case EIO:
return (VT_EIO);
case EINVAL:
return (VT_EINVAL);
default:
return (VT_ERROR);
}
}
if ((rval = efi_get_info(fd, &dki_info)) != 0)
return rval;
if ((strncmp(dki_info.dki_cname, "pseudo", 7) == 0) &&
(strncmp(dki_info.dki_dname, "md", 3) == 0)) {
md_flag++;
@ -343,14 +561,18 @@ efi_read(int fd, struct dk_gpt *vtoc)
}
/* get the LBA size */
if (ioctl(fd, DKIOCGMEDIAINFO, (caddr_t)&disk_info) == -1) {
if (read_disk_info(fd, &capacity, &lbsize) == -1) {
if (efi_debug) {
(void) fprintf(stderr,
"assuming LBA 512 bytes %d\n",
errno);
"unable to read disk info: %d",
errno);
}
disk_info.dki_lbsize = DEV_BSIZE;
return (VT_EINVAL);
}
disk_info.dki_lbsize = lbsize;
disk_info.dki_capacity = capacity;
if (disk_info.dki_lbsize == 0) {
if (efi_debug) {
(void) fprintf(stderr,
@ -840,22 +1062,13 @@ efi_write(int fd, struct dk_gpt *vtoc)
efi_gpe_t *efi_parts;
int i, j;
struct dk_cinfo dki_info;
int rval;
int md_flag = 0;
int nblocks;
diskaddr_t lba_backup_gpt_hdr;
if (ioctl(fd, DKIOCINFO, (caddr_t)&dki_info) == -1) {
if (efi_debug)
(void) fprintf(stderr, "DKIOCINFO errno 0x%x\n", errno);
switch (errno) {
case EIO:
return (VT_EIO);
case EINVAL:
return (VT_EINVAL);
default:
return (VT_ERROR);
}
}
if ((rval = efi_get_info(fd, &dki_info)) != 0)
return rval;
/* check if we are dealing with a metadevice */
if ((strncmp(dki_info.dki_cname, "pseudo", 7) == 0) &&
@ -942,6 +1155,10 @@ efi_write(int fd, struct dk_gpt *vtoc)
return (VT_EINVAL);
}
/* Zeros should be written for empty partitions */
if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED)
continue;
efi_parts[i].efi_gpe_StartingLBA =
LE_64(vtoc->efi_parts[i].p_start);
efi_parts[i].efi_gpe_EndingLBA =
@ -1033,6 +1250,13 @@ efi_write(int fd, struct dk_gpt *vtoc)
/* write the PMBR */
(void) write_pmbr(fd, vtoc);
free(dk_ioc.dki_data);
#if defined(__linux__)
rval = efi_rescan(fd);
if (rval)
return (VT_ERROR);
#endif
return (0);
}
@ -1050,6 +1274,7 @@ efi_free(struct dk_gpt *ptr)
int
efi_type(int fd)
{
#if 0
struct vtoc vtoc;
struct extvtoc extvtoc;
@ -1063,6 +1288,9 @@ efi_type(int fd)
}
}
return (0);
#else
return (ENOSYS);
#endif
}
void
@ -1176,7 +1404,7 @@ efi_auto_sense(int fd, struct dk_gpt **vtoc)
return (-1);
}
for (i = 0; i < min((*vtoc)->efi_nparts, V_NUMPAR); i++) {
for (i = 0; i < MIN((*vtoc)->efi_nparts, V_NUMPAR); i++) {
(*vtoc)->efi_parts[i].p_tag = default_vtoc_map[i].p_tag;
(*vtoc)->efi_parts[i].p_flag = default_vtoc_map[i].p_flag;
(*vtoc)->efi_parts[i].p_start = 0;

View File

@ -56,11 +56,15 @@ extern "C" {
/*
 * Platform-specific device directory roots and slice/partition name
 * suffixes.
 *
 * DISK_ROOT / RDISK_ROOT  directories under which disk device nodes live.
 * FIRST_SLICE             suffix appended to a whole-disk path to name its
 *                         first slice/partition.
 * BACKUP_SLICE            suffix for the backup slice (presumably the
 *                         Solaris whole-disk "s2" slice -- confirm against
 *                         callers).
 */
#if defined(__sun__) || defined(__sun)
#define DISK_ROOT "/dev/dsk"
#define RDISK_ROOT "/dev/rdsk"
#define FIRST_SLICE "s0"
#define BACKUP_SLICE "s2"
#endif
#ifdef __linux__
#define DISK_ROOT "/dev"
/* Linux uses the same directory for RDISK_ROOT as for DISK_ROOT. */
#define RDISK_ROOT DISK_ROOT
#define FIRST_SLICE "1"
/* Empty on Linux: no suffix is appended. */
#define BACKUP_SLICE ""
#endif
/*

View File

@ -634,9 +634,12 @@ zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp)
/*
* Don't start the slice at the default block of 34; many storage
* devices will use a stripe width of 128k, so start there instead.
* devices will use a stripe width of 128k, other vendors prefer a 1m
* alignment. It is best to play it safe and ensure a 1m alignment
* given 512b blocks. When the block size is larger by a power of 2
* we will still be 1m aligned.
*/
#define NEW_START_BLOCK 256
#define NEW_START_BLOCK 2048
/*
* Validate the given pool name, optionally putting an extended error message in
@ -1758,6 +1761,7 @@ is_guid_type(zpool_handle_t *zhp, uint64_t guid, const char *type)
static int
zpool_relabel_disk(libzfs_handle_t *hdl, const char *name)
{
#if 0
char path[MAXPATHLEN];
char errbuf[1024];
int fd, error;
@ -1788,6 +1792,12 @@ zpool_relabel_disk(libzfs_handle_t *hdl, const char *name)
return (zfs_error(hdl, EZFS_NOCAP, errbuf));
}
return (0);
#else
char errbuf[1024];
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
"relabel '%s/%s': libefi is unsupported"), DISK_ROOT, name);
return (zfs_error(hdl, EZFS_NOTSUP, errbuf));
#endif
}
/*
@ -3117,7 +3127,10 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name)
uint64_t slice_size;
diskaddr_t start_block;
char errbuf[1024];
#if defined(__linux__)
struct stat64 statbuf;
int i;
#endif
/* prepare an error message just in case */
(void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN, "cannot label '%s'"), name);
@ -3153,6 +3166,7 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name)
* This shouldn't happen. We've long since verified that this
* is a valid device.
*/
printf("errno =%d\n", errno);
zfs_error_aux(hdl,
dgettext(TEXT_DOMAIN, "unable to open device"));
return (zfs_error(hdl, EZFS_OPENFAILED, errbuf));
@ -3214,6 +3228,24 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name)
(void) close(fd);
efi_free(vtoc);
#if defined(__linux__)
/*
* The efi partition table has been successfully written and the
* kernel notified. However, it still may take a moment for udev
* to notice the devfs update and properly populate /dev/. We will
* wait up to 3 seconds which is far far far longer than needed.
*/
(void) snprintf(path, sizeof (path), "%s/%s%s", RDISK_ROOT, name,
FIRST_SLICE);
for (i = 0; i < 3000; i++) {
if (stat64(path, &statbuf) == 0 || errno != ENOENT)
break;
usleep(1000);
}
#endif
return (0);
}