hello,
i wrote a method with Neon for accelerate my code..
I don't have the expected result
only simples instructions and, or, shift... on 64bits
no gain speed. nada :-(
i dont understand... how programm this method for speed?
original code :
unsigned long long RXBitBoard::get_legal_moves(const unsigned long long p_discs, const unsigned long long o_discs) {
const unsigned long long inner_o_discs = o_discs & 0x7E7E7E7E7E7E7E7EULL;
/* direction W */
unsigned long long
flipped = (p_discs >> 1) & inner_o_discs;
flipped |= (flipped >> 1) & inner_o_discs;
unsigned long long adjacent_o_discs = inner_o_discs & (inner_o_discs >> 1);
flipped |= (flipped >> 2) & adjacent_o_discs;
flipped |= (flipped >> 2) & adjacent_o_discs;
unsigned long long legals = flipped >> 1;
// /* direction _E*/
// flipped = (p_discs << 1) & inner_o_discs;
// flipped |= (flipped << 1) & inner_o_discs;
//
// adjacent_o_discs = inner_o_discs & (inner_o_discs << 1);
//
// flipped |= (flipped << 2) & adjacent_o_discs;
// flipped |= (flipped << 2) & adjacent_o_discs;
//
// legals |= flipped << 1;
// trick
/* direction _E */
flipped = (p_discs << 1);
legals |= ((flipped + inner_o_discs) & ~flipped);
/* direction S */
flipped = (p_discs >> 8) & o_discs;
flipped |= (flipped >> 8) & o_discs;
adjacent_o_discs = o_discs & (o_discs >> 8);
flipped |= (flipped >> 16) & adjacent_o_discs;
flipped |= (flipped >> 16) & adjacent_o_discs;
legals |= flipped >> 8;
/* direction N */
flipped = (p_discs << 8) & o_discs;
flipped |= (flipped << 8) & o_discs;
adjacent_o_discs = o_discs & (o_discs << 8);
flipped |= (flipped << 16) & adjacent_o_discs;
flipped |= (flipped << 16) & adjacent_o_discs;
legals |= flipped << 8;
/* direction NE */
flipped = (p_discs >> 7) & inner_o_discs;
flipped |= (flipped >> 7) & inner_o_discs;
adjacent_o_discs = inner_o_discs & (inner_o_discs >> 7);
flipped |= (flipped >> 14) & adjacent_o_discs;
flipped |= (flipped >> 14) & adjacent_o_discs;
legals |= flipped >> 7;
/* direction SW */
flipped = (p_discs << 7) & inner_o_discs;
flipped |= (flipped << 7) & inner_o_discs;
adjacent_o_discs = inner_o_discs & (inner_o_discs << 7);
flipped |= (flipped << 14) & adjacent_o_discs;
flipped |= (flipped << 14) & adjacent_o_discs;
legals |= flipped << 7;
/* direction NW */
flipped = (p_discs >> 9) & inner_o_discs;
flipped |= (flipped >> 9) & inner_o_discs;
adjacent_o_discs = inner_o_discs & (inner_o_discs >> 9);
flipped |= (flipped >> 18) & adjacent_o_discs;
flipped |= (flipped >> 18) & adjacent_o_discs;
legals |= flipped >> 9;
/* direction SE */
flipped = (p_discs << 9) & inner_o_discs;
flipped |= (flipped << 9) & inner_o_discs;
adjacent_o_discs = inner_o_discs & (inner_o_discs << 9);
flipped |= (flipped << 18) & adjacent_o_discs;
flipped |= (flipped << 18) & adjacent_o_discs;
legals |= flipped << 9;
//Removes existing discs
legals &= ~(p_discs | o_discs);
return legals;
}
my neon code
const uint64x2_t pp_discs = vdupq_n_u64(p_discs);
const uint64x2_t oo_discs = vdupq_n_u64(o_discs);
const uint64x2_t inner_oo_discs = vdupq_n_u64(o_discs & 0x7E7E7E7E7E7E7E7EULL);
//horizontals directions -1, +1
static const int64x2_t shift_1 = {-1, 1};
static const int64x2_t shift_2 = {-2, 2};
uint64x2_t
flipped = vandq_u64(vshlq_u64(pp_discs, shift_1), inner_oo_discs);
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_1), inner_oo_discs));
uint64x2_t
adjacent_oo_discs = vandq_u64(inner_oo_discs, vshlq_u64(inner_oo_discs, shift_1));
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_2), adjacent_oo_discs));
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_2), adjacent_oo_discs));
uint64x2_t legals = vshlq_u64(flipped, shift_1);
//verticals directions -8 , +8
static const int64x2_t shift_8 = {-8, 8};
static const int64x2_t shift_16 = {-16, 16};
flipped = vandq_u64(vshlq_u64(pp_discs, shift_8), oo_discs);
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_8), oo_discs));
adjacent_oo_discs = vandq_u64(oo_discs, vshlq_u64(oo_discs, shift_8));
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_16), adjacent_oo_discs));
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_16), adjacent_oo_discs));
legals = vorrq_u64(legals, vshlq_u64(flipped, shift_8));
//diagonals directions -7 , +7
static const int64x2_t shift_7 = {-7, 7};
static const int64x2_t shift_14 = {-14, 14};
flipped = vandq_u64(vshlq_u64(pp_discs, shift_7), inner_oo_discs);
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_7), inner_oo_discs));
adjacent_oo_discs = vandq_u64(inner_oo_discs, vshlq_u64(inner_oo_discs, shift_7));
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_14), adjacent_oo_discs));
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_14), adjacent_oo_discs));
legals = vorrq_u64(legals, vshlq_u64(flipped, shift_7));
//diagonals directions -9 , +9
static const int64x2_t shift_9 = {-9, 9};
static const int64x2_t shift_18 = {-18, 18};
flipped = vandq_u64(vshlq_u64(pp_discs, shift_9), inner_oo_discs);
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_9), inner_oo_discs));
adjacent_oo_discs = vandq_u64(inner_oo_discs, vshlq_u64(inner_oo_discs, shift_9));
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_18), adjacent_oo_discs));
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_18), adjacent_oo_discs));
legals = vorrq_u64(legals, vshlq_u64(flipped, shift_9));
return ((vgetq_lane_u64(legals, 0) | vgetq_lane_u64(legals, 1)) & ~(p_discs | o_discs));
}
i even wrote a interleave version
I can expect a speed gain, no?
How did we do? We’d love to know your thoughts on this year’s conference. Take the survey here
Apple Silicon
RSS for tagBuild apps, libraries, frameworks, plug-ins, and other executable code that run natively on Apple silicon.
Posts under Apple Silicon tag
32 Posts
Sort by:
Post
Replies
Boosts
Views
Activity
I have an Intel based Macbook Pro, which I use for iOS development.
Recently I have the need to test my apps on M-series Apple Silicon based simulator images (specifically, 13 inch versions of iPads) which have no "A" series equivalent (iPad Air 13, iPad Pro 13, etc).
I have "created" simulators for these devices in Xcode Simulators and Devices, but they do never appear under the File > Open Simulator menu.
I have confirmed that creating a new simulator with any A-class device does not exhibit this issue.
Interestingly, these device types do NOT appear under the same "Device Type" menu when attempting to create a "New Simulator" from within the Simulator itself.
Creating a Simulator in "Device and Simulators"
The same menu from inside the "Simulator" app - missing M-class devices:
Can anyone confirm, preferably with a link to the documentation, that this is intended behavior?
Apple expects us to support M class devices, on still-supported Intel devices, without access to appropriate simulators?
Or is this just my machine?
System:
16-inch, 2019 MacBook Pro
Sonoma 14.0
Xcode 15.4 (15F31d)
Thank you
Investigating a kernel panic, I discovered that Apple Silicon Panic traces are not working with how I know to symbolicate the panic information. I have not found proper documentation that corrects this situation.
Attached file is an indentity-removed panic, received from causing an intentional panic (dereferencing nullptr), so that I know what functions to expect in the call stack. This is cut-and-pasted from the "Report To Apple" dialog that appears after the reboot:
panic_1_4_21_b.txt
To start, I download and install the matching KDK (in this case KDK_14.6.1_23G93.kdk), identified from this line:
OS version: 23G93
Kernel version: Darwin Kernel Version 23.6.0: Mon Jul 29 21:14:04 PDT 2024; root:xnu-10063.141.2~1/RELEASE_ARM64_T8122
Then start lldb from Terminal, using this command:
bash_prompt % lldb -arch arm64e /Library/Developer/KDKs/KDK_14.6.1_23G93.kdk/System/Library/Kernels/kernel.release.t8122
Next I load the remaining scripts per the instructions from lldb:
(lldb) settings set target.load-script-from-symbol-file true
I need to know what address to load my kext symbols to, which I read from this line of the panic log, after the @ symbol:
com.company.product(1.4.21d119)[92BABD94-80A4-3F6D-857A-3240E4DA8009]@0xfffffe001203bfd0->0xfffffe00120533ab
I am using a debug build of my kext, so the DWARF symbols are part of the binary. I use this line to load the symbols into the lldb session:
(lldb) addkext -F /Library/Extensions/KextName.kext/Contents/MacOS/KextName 0xfffffe001203bfd0
And now I should be able to use lldb image lookup to identify pointers on the stack that land within my kext. For example, the current PC at the moment of the crash lands within the kext (expected, because it was intentional):
(lldb) image lookup -a 0xfffffe001203fe10
Which gives the following incorrect result:
Address: KextName[0x0000000000003e40] (KextName.__TEXT.__cstring + 14456)
Summary: "ffer has %d retains\n"
That's not even a program instruction - that's within a cstring. No, that cstring isn't involved in anything pertaining to the intentional panic I am expecting to see.
Can someone please explain what I'm doing wrong and provide instructions that will give symbol information from a panic trace on an Apple Silicon Mac?
Disclaimers:
Yes I know IOPCIFamily is deprecated, I am in process of transitioning to DriverKit Dext from IOKit kext. Until then I must maintain the kext.
Terminal command "atos" provides similar incorrect results, and seems to not work with debug-built-binaries (only dSYM files)
Yes this is an intentional panic so that I can verify the symbolicate process before I move on to investigating an unexpected panic
I have set nvram boot-args to include keepsyms=1
I have tried (lldb) command script import lldb.macosx but get a result of error: no images in crash log (after the nvram settings)
I want to get MacOS Version from Objective-C build for iOS Apps to run on Apple Silicon Mac.
Because AppStore In-App Purchase on Apple Silicon Mac had issues on MacOS version 13.0 and was fixed at version 13.3.1.
I need to know the exact version of the MacOS to prevent users from buying In-App Purchases.
I found that I can get iOS version that the app is emulating on and assume
MacOS version from iOS version. But I think it's not 100% guaranteed way, because no documentation says anything about it.
Is there a way to get MacOS Version in iOS Apps running on Apple Silicon Mac?
Helpppp.
I installed Krita from the Appstore, it works. Then install ai_diffusion and I got :
xcrun: error: cannot be used within an App Sandbox.
Can anybody help me?
Thanks.
AttributeError
Python 3.10.7: /Applications/krita.app/Contents/MacOS/krita
Sat Aug 3 18:15:59 2024
A problem occurred in a Python script. Here is the sequence of
function calls leading up to the error, in the order they occurred.
/Users/alejandropereira/Library/Containers/org.kde.krita/Data/Library/Application Support/krita/pykrita/ai_diffusion/ui/region.py in update_settings(self=<ai_diffusion.ui.region.ActiveRegionWidget object>, key='prompt_translation', value=None)
345 self._layout_language_button()
346 elif key == "prompt_translation":
347 self._update_language()
348
349 async def _replace_with_translation(self, client: Client):
self = <ai_diffusion.ui.region.ActiveRegionWidget object>
self._update_language = >
/Users/alejandropereira/Library/Containers/org.kde.krita/Data/Library/Application Support/krita/pykrita/ai_diffusion/ui/region.py in _update_language(self=<ai_diffusion.ui.region.ActiveRegionWidget object>)
381 enabled = self._root._model.translation_enabled
382 lang = settings.prompt_translation if enabled else "en"
383 self._language_button.setText(lang.upper())
384 if enabled:
385 text = self._lang_help_enabled
self = <ai_diffusion.ui.region.ActiveRegionWidget object>
self._language_button = <PyQt5.QtWidgets.QToolButton object>
self._language_button.setText =
lang = None
lang.upper undefined
AttributeError: 'NoneType' object has no attribute 'upper'
cause = None
class = <class 'AttributeError'>
context = None
delattr = <method-wrapper 'delattr' of AttributeError object>
dict = {}
dir =
doc = 'Attribute not found.'
eq = <method-wrapper 'eq' of AttributeError object>
format =
ge = <method-wrapper 'ge' of AttributeError object>
getattribute = <method-wrapper 'getattribute' of AttributeError object>
gt = <method-wrapper 'gt' of AttributeError object>
hash = <method-wrapper 'hash' of AttributeError object>
init = <method-wrapper 'init' of AttributeError object>
init_subclass =
le = <method-wrapper 'le' of AttributeError object>
lt = <method-wrapper 'lt' of AttributeError object>
ne = <method-wrapper 'ne' of AttributeError object>
new =
reduce =
reduce_ex =
repr = <method-wrapper 'repr' of AttributeError object>
setattr = <method-wrapper 'setattr' of AttributeError object>
setstate =
sizeof =
str = <method-wrapper 'str' of AttributeError object>
subclasshook =
suppress_context = False
traceback =
args = ("'NoneType' object has no attribute 'upper'",)
name = 'upper'
obj = None
with_traceback =
The above is a description of an error in a Python program. Here is
the original traceback:
Traceback (most recent call last):
File "/Users/alejandropereira/Library/Containers/org.kde.krita/Data/Library/Application Support/krita/pykrita/ai_diffusion/ui/region.py", line 347, in update_settings
self._update_language()
File "/Users/alejandropereira/Library/Containers/org.kde.krita/Data/Library/Application Support/krita/pykrita/ai_diffusion/ui/region.py", line 383, in _update_language
self._language_button.setText(lang.upper())
AttributeError: 'NoneType' object has no attribute 'upper'
Hello,
I have an iPad app that users are running on their M1 / M2 MacBooks thanks to the "Designed for iPad" feature.
Some of them reported to me that the pressed keyboard keys were not recognized.
According to my source code, it seems that my custom views (that are set as firstResponder) do not get pressesBegan events.
While I could not reproduce this specific problem on my M1 Macbook, I found a strange problem that may be related.
My view also supports interaction with game controllers.
I found that the emulated controller (which is using the keyboard, when the controller emulation feature of the Designed for iPad app is set to On) has some problems.
The valueChangedHandler of the GCExtendedGamepad from the GCGameController is only fired if the app is compiled with the "Requires full screen" option set to On.
If Requires full screen is unchecked in XCode, the Emulated Controller is still present in the list of game controllers, but no button callbacks are triggered.
Note that a real game controller connected via USB will work correctly no matter how Requires full screen is set.
Could there be a focus bug of the keyboard not sending events when the app is not a full screen app (resizable on mac, and multitask on iPad)?
Or is there something I can change to avoid this problem?
I'm lost.
Our apps can currently be installed on Apple Silicon Macs via the iPad app on Mac feature (“Designed for iPad”). Now we are working on “proper” (universal) Catalyst-based Mac apps that will be available on the Mac App Store.
How does the transition work for users that currently have the iPad version installed? Will they automatically update to the Mac Catalyst app once it’s available, or do they need to re-install the app from the Mac App Store?
Topic:
App Store Distribution & Marketing
SubTopic:
General
Tags:
Universal Apps
Mac Catalyst
Apple Silicon
Recently I started getting emails from AppStoreConnect when I submit new builds The email states:
ITMS-90863: Macs with Apple silicon support issue - The app links with libraries that aren’t present in macOS: /usr/lib/swift/libswiftCloudKit.dylib
I can run this app on apple silicon from TestFlight or directly from Xcode and it runs just fine including all iCloud functions. This app has been using iCloud for several years now.
So my question is: Should I just ignore the email or do I need to change something to bring this app into compliance?
Hello,
I have an iOS app that is recording audio that is working fine on iPads/iPhones. It asks for microphone permission and after that recording works.
I installed the same app on my M3 MacBook via TestFlight since iPad apps are supposed to work without a change that way. The app starts fine and everything, but it never asks for Microphone permission, so I can't record.
Do I need to do something to make this happen (this is not macCatalyst, its running the arm64 iPhone binary on macOS)
thanks
It seems that Xcode Cloud currently only uses Intel machines for running the workflow jobs.
When will Apple Silicon machines be available and supported?
Topic:
Developer Tools & Services
SubTopic:
Xcode
Tags:
Xcode
Apple Silicon
Xcode Cloud
wwdc2023-10224
Call to std::remainder(double(411.0), int(365)); results in a crash due to a nan in libsystem_m.dylib. MCVE program is provided + lldb backtrace and system report.
$ clang++ -g -arch arm64 -std=c++20 main.cpp -o test
$ ./test
ori_fpcr=0, new_fpcr=1792
std::fmod(simTimeInDays, numDays) = 46
Illegal instruction: 4
main.cpp
#include <cassert>
#include <cfenv>
#include <cmath>
#include <iostream>
#if !defined(__arm64__) || !defined(__APPLE__)
# error "Meant to be run on arm64 apple"
#endif
inline int feenableexcept(unsigned int excepts) {
static fenv_t fenv;
if (std::fegetenv(&fenv) != 0) {
return -1;
}
const unsigned long long old_fpcr = fenv.__fpcr;
const unsigned int old_excepts = (old_fpcr >> 8u) & unsigned(FE_ALL_EXCEPT);
// Check the bits passed are valid, and bit shift them
const unsigned int new_excepts = excepts & unsigned(FE_ALL_EXCEPT);
const unsigned long long new_fpcr = new_excepts << 8u;
// Set the new bits
fenv.__fpcr = fenv.__fpcr | new_fpcr;
return (std::fesetenv(&fenv) != 0) ? -1 : static_cast<int>(old_excepts);
}
int main([[maybe_unused]] int argc, [[maybe_unused]] const char** argv) {
constexpr unsigned int flags = FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW;
static_assert(flags == 7);
constexpr uint32_t fpcr_flags_shifted = flags << 8;
constexpr uint32_t fpcr_flags = (__fpcr_trap_divbyzero | __fpcr_trap_invalid | __fpcr_trap_overflow);
static_assert(fpcr_flags_shifted == fpcr_flags);
static_assert(fpcr_flags_shifted == 1792);
uint32_t ori_fpcr = __builtin_arm_rsr("fpcr");
feenableexcept(flags);
uint32_t new_fpcr = __builtin_arm_rsr("fpcr");
// std::cout << "(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW) = " << flags << '\n';
// std::cout << "((FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW) << 8) = " << fpcr_flags_shifted << '\n';
// std::cout << "(__fpcr_trap_divbyzero | __fpcr_trap_invalid | __fpcr_trap_overflow) = " << fpcr_flags << '\n';
std::cout << "ori_fpcr=" << ori_fpcr << ", new_fpcr=" << new_fpcr << '\n';
const double simTimeInDays = 411.0;
const int numDays = 365;
// This is fine
std::cout << "std::fmod(simTimeInDays, numDays) = " << std::fmod(simTimeInDays, numDays) << '\n';
// This isn't
std::cout << "std::fmod(simTimeInDays, numDays) = " << std::remainder(simTimeInDays, numDays) << '\n';
return 0;
}
backtrace: see attachment
lldb_backtrace.txt
$ system_profiler SPSoftwareDataType SPHardwareDataType
Software:
System Software Overview:
System Version: macOS 13.2 (22D49)
Kernel Version: Darwin 22.3.0
Boot Volume: Macintosh HD
Boot Mode: Normal
Secure Virtual Memory: Enabled
System Integrity Protection: Enabled
Time since boot: 7 hours, 58 minutes
Hardware:
Hardware Overview:
Model Name: MacBook Pro
Model Identifier: MacBookPro18,2
Model Number: Z14V000NBFN/A
Chip: Apple M1 Max
Total Number of Cores: 10 (8 performance and 2 efficiency)
Memory: 64 GB
System Firmware Version: 8419.80.7
OS Loader Version: 8419.80.7
Activation Lock Status: Enabled
$ otool -L test
test:
/usr/lib/libc++.1.dylib (compatibility version 1.0.0, current version 1300.36.0)
/usr/lib/libSystem.B.dylib (compatibility version 1.0.0, current version 1319.0.0
$ clang++ --version
Apple clang version 14.0.0 (clang-1400.0.29.202)
Target: arm64-apple-darwin22.3.0
Thread model: posix
InstalledDir: /Library/Developer/CommandLineTools/usr/bin
I'm using M1pro and have successfully installed Numpy with Accelerate following, and it really speedup my programs. I also ran np.test() to check the correctness and every test passed.
However, I can't install Scipy with Accelerate, since the official document said Accelerate has a LAPACK of too old version. I can't even find a scipy that can pass scipy.test(). I tried the codes below:
conda install numpy 'libblas=*=*accelerate'
conda install scipy
np.test() as fails, sp.test() can't even finish
conda install numpy 'libblas=*=*openblas'
conda install scipy
Both np.test() and sp.test() can finish, but with many failures. I believe the bugs are due to Conda.
pip install --no-binary :all: --no-use-pep517 numpy
pip install scipy
np.test() has no failure and went fast, sp.test() uses OpenBLAS and has 3 failures. This is the best version I have found.
So my question is: can we find a reliable version of scipy on M1? Considering the popularity of scipy, I think it's not a high-living expectation.
And a question for Apple: is there really a plan to upgrade the LAPACK in Accelerate?
Topic:
Developer Tools & Services
SubTopic:
Xcode
Tags:
Developer Tools
Accelerate
Mac
Apple Silicon