diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index f2270a3f7a..b29fadeb40 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -406,17 +406,23 @@ make_leaf_vdev(const char *arg, uint64_t is_log) if (arg[0] == '/') { /* * Complete device or file path. Exact type is determined by - * examining the file descriptor afterwards. + * examining the file descriptor afterwards. Symbolic links + * are resolved to their real paths for the is_whole_disk() + * and S_ISBLK/S_ISREG type checks. */ - wholedisk = is_whole_disk(arg); - if (!wholedisk && (stat64(arg, &statbuf) != 0)) { + if (realpath(arg, path) == NULL) { (void) fprintf(stderr, - gettext("cannot open '%s': %s\n"), - arg, strerror(errno)); + gettext("cannot resolve path '%s'\n"), arg); return (NULL); } - (void) strlcpy(path, arg, sizeof (path)); + wholedisk = is_whole_disk(path); + if (!wholedisk && (stat64(path, &statbuf) != 0)) { + (void) fprintf(stderr, + gettext("cannot open '%s': %s\n"), + path, strerror(errno)); + return (NULL); + } } else { /* * This may be a short path for a device, or it could be total @@ -894,8 +900,10 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv) uint64_t wholedisk; int fd; int ret; +#if defined(__sun__) || defined(__sun) ddi_devid_t devid; char *minor = NULL, *devid_str = NULL; +#endif verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); @@ -924,7 +932,7 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv) /* * Fill in the devid, now that we've labeled the disk. 
*/ - (void) snprintf(buf, sizeof (buf), "%ss0", path); + (void) snprintf(buf, sizeof (buf), "%s%s", path, FIRST_SLICE); if ((fd = open(buf, O_RDONLY)) < 0) { (void) fprintf(stderr, gettext("cannot open '%s': %s\n"), @@ -932,6 +940,7 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv) return (-1); } +#if defined(__sun__) || defined(__sun) if (devid_get(fd, &devid) == 0) { if (devid_get_minor_name(fd, &minor) == 0 && (devid_str = devid_str_encode(devid, minor)) != @@ -945,6 +954,7 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv) devid_str_free(minor); devid_free(devid); } +#endif /* * Update the path to refer to the 's0' slice. The presence of diff --git a/lib/libefi/rdwr_efi.c b/lib/libefi/rdwr_efi.c index 31eb3d3f61..ae6138db8f 100644 --- a/lib/libefi/rdwr_efi.c +++ b/lib/libefi/rdwr_efi.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -39,7 +40,9 @@ #include #include #include -#include +#if defined(__linux__) +#include +#endif static struct uuid_to_ptag { struct uuid uuid; @@ -50,11 +53,11 @@ static struct uuid_to_ptag { { EFI_SWAP }, { EFI_USR }, { EFI_BACKUP }, - { 0 }, /* STAND is never used */ + { EFI_UNUSED }, /* STAND is never used */ { EFI_VAR }, { EFI_HOME }, { EFI_ALTSCTR }, - { 0 }, /* CACHE (cachefs) is never used */ + { EFI_UNUSED }, /* CACHE (cachefs) is never used */ { EFI_RESERVED }, { EFI_SYSTEM }, { EFI_LEGACY_MBR }, @@ -108,21 +111,115 @@ int efi_debug = 1; int efi_debug = 0; #endif -extern unsigned int efi_crc32(const unsigned char *, unsigned int); static int efi_read(int, struct dk_gpt *); +/* + * Return a 32-bit CRC of the contents of the buffer. Pre-and-post + * one's conditioning will be handled by crc32() internally. 
+ */ +static uint32_t +efi_crc32(const unsigned char *buf, unsigned int size) +{ + uint32_t crc = crc32(0, Z_NULL, 0); + + crc = crc32(crc, buf, size); + + return (crc); +} + static int read_disk_info(int fd, diskaddr_t *capacity, uint_t *lbsize) { - struct dk_minfo disk_info; + int sector_size; + unsigned long long capacity_size; + + if (ioctl(fd, BLKSSZGET, &sector_size) < 0) + return (-1); + + if (ioctl(fd, BLKGETSIZE64, &capacity_size) < 0) + return (-1); + + *lbsize = (uint_t)sector_size; + *capacity = (diskaddr_t)(capacity_size / sector_size); - if ((ioctl(fd, DKIOCGMEDIAINFO, (caddr_t)&disk_info)) == -1) - return (errno); - *capacity = disk_info.dki_capacity; - *lbsize = disk_info.dki_lbsize; return (0); } +static int +efi_get_info(int fd, struct dk_cinfo *dki_info) +{ +#if defined(__linux__) + char path[PATH_MAX]; + char *dev_path; + int rval; + + /* + * The simplest way to get the partition number under linux is + * to parse it out of the /dev/ block device name. + * The kernel creates this using the partition number when it + * populates /dev/ so it may be trusted. Another issue is that + * the libefi API only provides the open fd and not the + * file path. To handle this realpath(3) is used to resolve + * the block device name from /proc/self/fd/<fd>. Aside from + * the partition number we collect some additional device info. 
+ */ + memset(dki_info, 0, sizeof(*dki_info)); + (void) sprintf(path, "/proc/self/fd/%d", fd); + if ((dev_path = realpath(path, NULL)) == NULL) + goto error; + + rval = sscanf(dev_path, "/dev/%[a-zA-Z]%hu", + dki_info->dki_dname, + &dki_info->dki_partition); + + switch (rval) { + case 0: + errno = EINVAL; + goto error; + case 1: + dki_info->dki_partition = 0; + } + + if ((strncmp(dki_info->dki_dname, "sd", 2) == 0)) { + strcpy(dki_info->dki_cname, "sd"); + dki_info->dki_ctype = DKC_SCSI_CCS; + } else if ((strncmp(dki_info->dki_dname, "hd", 2) == 0)) { + strcpy(dki_info->dki_cname, "hd"); + dki_info->dki_ctype = DKC_DIRECT; + } else if ((strncmp(dki_info->dki_dname, "md", 2) == 0)) { + strcpy(dki_info->dki_cname, "pseudo"); + dki_info->dki_ctype = DKC_MD; + } else if ((strncmp(dki_info->dki_dname, "ram", 3) == 0)) { + strcpy(dki_info->dki_cname, "pseudo"); + dki_info->dki_ctype = DKC_PCMCIA_MEM; + } else if ((strncmp(dki_info->dki_dname, "loop", 4) == 0)) { + strcpy(dki_info->dki_cname, "pseudo"); + dki_info->dki_ctype = DKC_VBD; + } else { + strcpy(dki_info->dki_cname, "unknown"); + dki_info->dki_ctype = DKC_UNKNOWN; + } + + free(dev_path); +#else + if (ioctl(fd, DKIOCINFO, (caddr_t)dki_info) == -1) + goto error; +#endif + return (0); +error: + if (efi_debug) + (void) fprintf(stderr, "DKIOCINFO errno 0x%x\n", errno); + + switch (errno) { + case EIO: + return (VT_EIO); + case EINVAL: + return (VT_EINVAL); + default: + return (VT_ERROR); + } +} + /* * the number of blocks the EFI label takes up (round up to nearest * block) @@ -136,12 +233,13 @@ read_disk_info(int fd, diskaddr_t *capacity, uint_t *lbsize) int efi_alloc_and_init(int fd, uint32_t nparts, struct dk_gpt **vtoc) { - diskaddr_t capacity; - uint_t lbsize; + diskaddr_t capacity = 0; + uint_t lbsize = 0; uint_t nblocks; size_t length; struct dk_gpt *vptr; struct uuid uuid; + struct dk_cinfo dki_info; if (read_disk_info(fd, &capacity, &lbsize) != 0) { if (efi_debug) @@ -149,6 +247,31 @@ efi_alloc_and_init(int 
fd, uint32_t nparts, struct dk_gpt **vtoc) "couldn't read disk information\n"); return (-1); } +#if defined(__linux__) + if (efi_get_info(fd, &dki_info) != 0) { + if (efi_debug) + (void) fprintf(stderr, + "couldn't read disk information\n"); + return (-1); + } + + if (dki_info.dki_partition != 0) { + if (efi_debug) + (void) fprintf(stderr, + "can only partition whole devices\n"); + return (-1); + } + + if ((dki_info.dki_ctype == DKC_PCMCIA_MEM) || + (dki_info.dki_ctype == DKC_VBD) || + (dki_info.dki_ctype == DKC_UNKNOWN)) { + if (efi_debug) + (void) fprintf(stderr, + "unpartitionable device type %d\n", + dki_info.dki_ctype); + return (-1); + } +#endif nblocks = NBLOCKS(nparts, lbsize); if ((nblocks * lbsize) < EFI_MIN_ARRAY_SIZE + lbsize) { @@ -244,14 +367,117 @@ efi_ioctl(int fd, int cmd, dk_efi_t *dk_ioc) { void *data = dk_ioc->dki_data; int error; +#if defined(__linux__) + diskaddr_t capacity; + uint_t lbsize; + /* + * When the IO is not being performed in kernel as an ioctl we need + * to know the sector size so we can seek to the proper byte offset. 
+ */ + if (read_disk_info(fd, &capacity, &lbsize) == -1) { + if (efi_debug) + fprintf(stderr,"unable to read disk info: %d",errno); + + errno = EIO; + return -1; + } + + switch (cmd) { + case DKIOCGETEFI: + if (lbsize == 0) { + if (efi_debug) + (void) fprintf(stderr, "DKIOCGETEFI assuming " + "LBA %d bytes\n", DEV_BSIZE); + + lbsize = DEV_BSIZE; + } + + error = lseek(fd, dk_ioc->dki_lba * lbsize, SEEK_SET); + if (error == -1) + return error; + + error = read(fd, data, dk_ioc->dki_length); + if (error == -1) + return error; + + if (error != dk_ioc->dki_length) { + if (efi_debug) + (void) fprintf(stderr, "DKIOCGETEFI short " + "read of %d bytes\n", error); + errno = EIO; + return -1; + } + error = 0; + break; + + case DKIOCSETEFI: + if (lbsize == 0) { + if (efi_debug) + (void) fprintf(stderr, "DKIOCSETEFI unknown " + "LBA size\n"); + errno = EIO; + return -1; + } + + error = lseek(fd, dk_ioc->dki_lba * lbsize, SEEK_SET); + if (error == -1) + return error; + + error = write(fd, data, dk_ioc->dki_length); + if (error == -1) + return error; + + if (error != dk_ioc->dki_length) { + if (efi_debug) + (void) fprintf(stderr, "DKIOCSETEFI short " + "write of %d bytes\n", error); + errno = EIO; + return -1; + } + + error = fdatasync(fd); + if (error == -1) + return error; + + error = 0; + break; + + default: + if (efi_debug) + (void) fprintf(stderr, "unsupported ioctl()\n"); + + errno = EIO; + return -1; + } +#else dk_ioc->dki_data_64 = (uint64_t)(uintptr_t)data; error = ioctl(fd, cmd, (void *)dk_ioc); dk_ioc->dki_data = data; - +#endif return (error); } +#if defined(__linux__) +static int +efi_rescan(int fd) +{ + int retry = 5; + int error; + + /* Notify the kernel a device's partition table has been updated */ + while ((error = ioctl(fd, BLKRRPART)) != 0) { + if (--retry == 0) { + (void) fprintf(stderr, "the kernel failed to rescan " + "the partition table: %d\n", errno); + return (-1); + } + } + + return (0); +} +#endif + static int check_label(int fd, dk_efi_t *dk_ioc) { @@ 
-306,6 +532,8 @@ efi_read(int fd, struct dk_gpt *vtoc) int rval = 0; int md_flag = 0; int vdc_flag = 0; + diskaddr_t capacity = 0; + uint_t lbsize = 0; struct dk_minfo disk_info; dk_efi_t dk_ioc; efi_gpt_t *efi; @@ -317,19 +545,9 @@ efi_read(int fd, struct dk_gpt *vtoc) /* * get the partition number for this file descriptor. */ - if (ioctl(fd, DKIOCINFO, (caddr_t)&dki_info) == -1) { - if (efi_debug) { - (void) fprintf(stderr, "DKIOCINFO errno 0x%x\n", errno); - } - switch (errno) { - case EIO: - return (VT_EIO); - case EINVAL: - return (VT_EINVAL); - default: - return (VT_ERROR); - } - } + if ((rval = efi_get_info(fd, &dki_info)) != 0) + return rval; + if ((strncmp(dki_info.dki_cname, "pseudo", 7) == 0) && (strncmp(dki_info.dki_dname, "md", 3) == 0)) { md_flag++; @@ -343,14 +561,18 @@ efi_read(int fd, struct dk_gpt *vtoc) } /* get the LBA size */ - if (ioctl(fd, DKIOCGMEDIAINFO, (caddr_t)&disk_info) == -1) { + if (read_disk_info(fd, &capacity, &lbsize) == -1) { if (efi_debug) { (void) fprintf(stderr, - "assuming LBA 512 bytes %d\n", - errno); + "unable to read disk info: %d", + errno); } - disk_info.dki_lbsize = DEV_BSIZE; + return (VT_EINVAL); } + + disk_info.dki_lbsize = lbsize; + disk_info.dki_capacity = capacity; + if (disk_info.dki_lbsize == 0) { if (efi_debug) { (void) fprintf(stderr, @@ -840,22 +1062,13 @@ efi_write(int fd, struct dk_gpt *vtoc) efi_gpe_t *efi_parts; int i, j; struct dk_cinfo dki_info; + int rval; int md_flag = 0; int nblocks; diskaddr_t lba_backup_gpt_hdr; - if (ioctl(fd, DKIOCINFO, (caddr_t)&dki_info) == -1) { - if (efi_debug) - (void) fprintf(stderr, "DKIOCINFO errno 0x%x\n", errno); - switch (errno) { - case EIO: - return (VT_EIO); - case EINVAL: - return (VT_EINVAL); - default: - return (VT_ERROR); - } - } + if ((rval = efi_get_info(fd, &dki_info)) != 0) + return rval; /* check if we are dealing wih a metadevice */ if ((strncmp(dki_info.dki_cname, "pseudo", 7) == 0) && @@ -942,6 +1155,10 @@ efi_write(int fd, struct dk_gpt *vtoc) return 
(VT_EINVAL); } + /* Zeros should be written for empty partitions */ + if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED) + continue; + efi_parts[i].efi_gpe_StartingLBA = LE_64(vtoc->efi_parts[i].p_start); efi_parts[i].efi_gpe_EndingLBA = @@ -1033,6 +1250,13 @@ efi_write(int fd, struct dk_gpt *vtoc) /* write the PMBR */ (void) write_pmbr(fd, vtoc); free(dk_ioc.dki_data); + +#if defined(__linux__) + rval = efi_rescan(fd); + if (rval) + return (VT_ERROR); +#endif + return (0); } @@ -1050,6 +1274,7 @@ efi_free(struct dk_gpt *ptr) int efi_type(int fd) { +#if 0 struct vtoc vtoc; struct extvtoc extvtoc; @@ -1063,6 +1288,9 @@ efi_type(int fd) } } return (0); +#else + return (ENOSYS); +#endif } void @@ -1176,7 +1404,7 @@ efi_auto_sense(int fd, struct dk_gpt **vtoc) return (-1); } - for (i = 0; i < min((*vtoc)->efi_nparts, V_NUMPAR); i++) { + for (i = 0; i < MIN((*vtoc)->efi_nparts, V_NUMPAR); i++) { (*vtoc)->efi_parts[i].p_tag = default_vtoc_map[i].p_tag; (*vtoc)->efi_parts[i].p_flag = default_vtoc_map[i].p_flag; (*vtoc)->efi_parts[i].p_start = 0; diff --git a/lib/libzfs/include/libzfs.h b/lib/libzfs/include/libzfs.h index 22e3e78cb7..37d95d92b9 100644 --- a/lib/libzfs/include/libzfs.h +++ b/lib/libzfs/include/libzfs.h @@ -56,11 +56,15 @@ extern "C" { #if defined(__sun__) || defined(__sun) #define DISK_ROOT "/dev/dsk" #define RDISK_ROOT "/dev/rdsk" +#define FIRST_SLICE "s0" #define BACKUP_SLICE "s2" #endif #ifdef __linux__ #define DISK_ROOT "/dev" +#define RDISK_ROOT DISK_ROOT +#define FIRST_SLICE "1" +#define BACKUP_SLICE "" #endif /* diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index bae4a62d23..4f669bc17d 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -634,9 +634,12 @@ zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp) /* * Don't start the slice at the default block of 34; many storage - * devices will use a stripe width of 128k, so start there instead. 
+ * devices will use a stripe width of 128k, other vendors prefer a 1m + * alignment. It is best to play it safe and ensure a 1m alignment + * given 512b blocks. When the block size is larger by a power of 2 + * we will still be 1m aligned. */ -#define NEW_START_BLOCK 256 +#define NEW_START_BLOCK 2048 /* * Validate the given pool name, optionally putting an extended error message in @@ -1758,6 +1761,7 @@ is_guid_type(zpool_handle_t *zhp, uint64_t guid, const char *type) static int zpool_relabel_disk(libzfs_handle_t *hdl, const char *name) { +#if 0 char path[MAXPATHLEN]; char errbuf[1024]; int fd, error; @@ -1788,6 +1792,12 @@ zpool_relabel_disk(libzfs_handle_t *hdl, const char *name) return (zfs_error(hdl, EZFS_NOCAP, errbuf)); } return (0); +#else + char errbuf[1024]; + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " + "relabel '%s/%s': libefi is unsupported"), DISK_ROOT, name); + return (zfs_error(hdl, EZFS_NOTSUP, errbuf)); +#endif } /* @@ -3119,7 +3129,10 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name) uint64_t slice_size; diskaddr_t start_block; char errbuf[1024]; - +#if defined(__linux__) + struct stat64 statbuf; + int i; +#endif /* prepare an error message just in case */ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot label '%s'"), name); @@ -3155,6 +3168,7 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name) * This shouldn't happen. We've long since verified that this * is a valid device. */ + printf("errno =%d\n", errno); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "unable to open device")); return (zfs_error(hdl, EZFS_OPENFAILED, errbuf)); @@ -3216,6 +3230,24 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name) (void) close(fd); efi_free(vtoc); + +#if defined(__linux__) + /* + * The efi partition table has been successfully written and the + * kernel notified. However, it still may take a moment for udev + * to notice the devfs update and properly populate /dev/. 
We will + * wait up to 3 seconds which is far far far longer than needed. + */ + (void) snprintf(path, sizeof (path), "%s/%s%s", RDISK_ROOT, name, + FIRST_SLICE); + for (i = 0; i < 3000; i++) { + if (stat64(path, &statbuf) == 0 || errno != ENOENT) + break; + + usleep(1000); + } +#endif + return (0); } diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index 40b7530261..62ea804101 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -76,6 +76,19 @@ vdev_bdev_mode(int smode) } #endif /* HAVE_OPEN_BDEV_EXCLUSIVE */ +static uint64_t +bdev_capacity(struct block_device *bdev) +{ + struct hd_struct *part = bdev->bd_part; + + /* The partition capacity referenced by the block device */ + if (part) + return part->nr_sects; + + /* Otherwise assume the full device capacity */ + return get_capacity(bdev->bd_disk); +} + static int vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift) { @@ -122,15 +135,11 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift) /* Clear the nowritecache bit, causes vdev_reopen() to try again. */ v->vdev_nowritecache = B_FALSE; - /* Determine the actual size of the device (in bytes) - * - * XXX: SECTOR_SIZE is defined to 512b which may not be true for - * your device, we must use the actual hardware sector size. 
- */ - *psize = get_capacity(bdev->bd_disk) * SECTOR_SIZE; + /* Physical volume size in bytes */ + *psize = bdev_capacity(bdev) * bdev_hardsect_size(bdev); /* Based on the minimum sector size set the block size */ - *ashift = highbit(MAX(SECTOR_SIZE, SPA_MINBLOCKSIZE)) - 1; + *ashift = highbit(MAX(bdev_hardsect_size(bdev), SPA_MINBLOCKSIZE)) - 1; return 0; } @@ -314,7 +323,7 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, uint64_t bio_offset; int i, error = 0, bio_count, bio_size; - ASSERT3S(kbuf_offset % SECTOR_SIZE, ==, 0); + ASSERT3S(kbuf_offset % bdev_hardsect_size(bdev), ==, 0); q = bdev_get_queue(bdev); if (!q) return ENXIO; @@ -558,7 +567,7 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) if (IS_ERR(bdev)) return -PTR_ERR(bdev); - s = get_capacity(bdev->bd_disk) * SECTOR_SIZE; + s = bdev_capacity(bdev) * bdev_hardsect_size(bdev); if (s == 0) { vdev_bdev_close(bdev, vdev_bdev_mode(FREAD)); return EIO;